File: include/llvm/Analysis/TargetTransformInfoImpl.h
Warning: line 694, column 52: Called C++ object pointer is null
//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a concrete CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem, as that was the first CPU to support that feature level
/// and thus most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                    divss     sqrtss     rsqrtss
///   AMD K7           11-16     19         3
///   Piledriver       9-24      13-15      5
///   Jaguar           14        16         2
///   Pentium II,III   18        30         2
///   Nehalem          7-14      7-18       3
///   Haswell          10-13     11         5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as
/// throughput, code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  if (Vector) {
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return 512;
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return 256;
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}
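
// For example (illustrative): on an AVX-512 subtarget compiled with
// -mprefer-vector-width=256, getPreferVectorWidth() returns 256, so the
// query above reports 256-bit vectors even though 512-bit ZMM registers
// exist; with the default preference of 512 it reports the full 512 bits.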

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64,   33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->isGLM())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16,  2 }, // pmullw
    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL, MVT::f64,    2 }, // mulsd
    { ISD::FMUL, MVT::v2f64,  4 }, // mulpd
    { ISD::FMUL, MVT::v4f32,  2 }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64,  2 }, // addpd
    { ISD::FSUB, MVT::v2f64,  2 }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long multiplies(3),
    // shifts(3) and adds(2); see the sketch after this table. On SLM the
    // muldq throughput is 2 and the addq throughput is 4, thus:
    // 3x2 (muldq throughput) + 3x1 (shift throughput) +
    // 2x4 (addq throughput) = 17.
    { ISD::MUL,  MVT::v2i64, 17 },
    // SLM addq/subq throughput is 4.
    { ISD::ADD,  MVT::v2i64,  4 },
    { ISD::SUB,  MVT::v2i64,  4 },
  };
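
  // Illustrative sketch of the v2i64 multiply lowering behind the cost
  // above (names are for exposition only, not actual DAG nodes):
  //   LoLo = pmuludq(A, B)                    // lo32(A) * lo32(B)
  //   LoHi = pmuludq(A, psrlq(B, 32))         // lo32(A) * hi32(B)
  //   HiLo = pmuludq(psrlq(A, 32), B)         // hi32(A) * lo32(B)
  //   Res  = paddq(LoLo, psllq(paddq(LoHi, HiLo), 32))
  // That is 3 multiplies, 3 shifts and 2 adds, matching the breakdown in
  // the table comment.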

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool signedMode = Op1Signed | Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!signedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!signedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
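
    // For example, multiplying two v4i32 values that are both known to fit
    // in 8 unsigned bits (OpMinSize == 8, signedMode == false) is costed as
    // the cheap 3-instruction pmullw/zext sequence instead of falling
    // through to the 11-cycle pmulld entry in the SLM table above.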

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a constant power of two is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the
      // previous operation; conservatively assume OP_None.
      int Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C).
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
      }

      return Cost;
    }
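
    // Sketch of the SRA + SRL + ADD + SRA expansion costed above, for an
    // N-bit lane divided by 2^K (illustrative pseudocode, not the exact
    // DAG output):
    //   Sign = X >>s (N - 1)    // SRA: all-ones if X is negative, else 0.
    //   Bias = Sign >>u (N - K) // SRL: 2^K - 1 if X is negative, else 0.
    //   Tmp  = X + Bias         // ADD: bias so the truncating shift
    //                           //      rounds towards zero.
    //   Res  = Tmp >>s K        // SRA: the division itself.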

    // Vector unsigned division/remainder will be simplified to shifts/masks.
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (ISD == ISD::UREM)
      return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }
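
  // For example, with C == 2^K: (X udiv C) becomes (X u>> K) and is costed
  // above as a single LShr, while (X urem C) becomes (X & (C - 1)) and is
  // costed as a single And.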

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA, MVT::v2i64,   1 },
    { ISD::SRA, MVT::v4i64,   1 },
    { ISD::SRA, MVT::v8i64,   1 },
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA, MVT::v4i64,   4 }, // 2 x psrad + shuffle.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld.
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 1 },
    { ISD::MUL, MVT::v4i64, 1 },
    { ISD::MUL, MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v8i16,   1 }, // vpsllvw
    { ISD::SRL, MVT::v8i16,   1 }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,   1 }, // vpsravw

    { ISD::SHL, MVT::v16i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v16i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v16i16,  1 }, // vpsravw

    { ISD::SHL, MVT::v32i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v32i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v32i16,  1 }, // vpsravw

    { ISD::SHL, MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8,  24 }, // vpblendvb sequence.

    { ISD::MUL, MVT::v64i8,  11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL, MVT::v32i8,   4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL, MVT::v16i8,   4 }, // extend/pmullw/trunc sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v32i8,  13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,   5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/

    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 are legal on AVX2, even though we mark them as
    // custom so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 1 },
    { ISD::SRA, MVT::v4i32, 1 },
    { ISD::SHL, MVT::v8i32, 1 },
    { ISD::SRL, MVT::v8i32, 1 },
    { ISD::SRA, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 1 },
    { ISD::SHL, MVT::v4i64, 1 },
    { ISD::SRL, MVT::v4i64, 1 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,    1 },
    { ISD::SRL, MVT::v16i8,    2 },
    { ISD::SRA, MVT::v16i8,    2 },
    { ISD::SHL, MVT::v8i16,    1 },
    { ISD::SRL, MVT::v8i16,    2 },
    { ISD::SRA, MVT::v8i16,    2 },
    { ISD::SHL, MVT::v4i32,    1 },
    { ISD::SRL, MVT::v4i32,    2 },
    { ISD::SRA, MVT::v4i32,    2 },
    { ISD::SHL, MVT::v2i64,    1 },
    { ISD::SRL, MVT::v2i64,    2 },
    { ISD::SRA, MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }
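
  // Background for the ShiftISD remapping above: XOP's variable shifts
  // (vpshlb/vpshlw/vpshld/vpshlq and their arithmetic forms) shift left for
  // positive amounts and right for negative ones, so a right shift needs
  // its amount negated first. When the amount is constant the negated
  // constant can be materialized directly, making a constant right shift
  // as cheap as the SHL table entry.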

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
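
  // The "2*psrad + shuffle" entries above reflect that SSE2/AVX2 have no
  // packed 64-bit arithmetic shift, so SRA of i64 lanes is emulated with
  // 32-bit arithmetic shifts plus shuffles to recombine the halves. A very
  // rough sketch for a uniform amount K < 32 (not the exact lowering):
  //   Hi  = psrad(X, K)      // sign-correct upper 32 bits of each lane.
  //   Lo  = psrlq(X, K)      // logical shift supplies the lower bits.
  //   Res = shuffle(Lo, Hi)  // merge low bits with sign-extended high bits.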

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply; see the example below.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
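
  // For example, shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> is
  // equivalent to mul %x, <i32 2, i32 4, i32 8, i32 16>: shifting left by K
  // multiplies by 2^K, and the power-of-two vector is itself a constant, so
  // the shift is costed as a MUL by the tables below.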

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,  24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v32i8,  17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,   7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2).
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,  MVT::v4i64,  18 },

    { ISD::MUL,  MVT::v32i8,  26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };
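
  // Working through the v4i64 MUL entry above: each v2i64 half costs
  // 3 multiplies + 3 shifts + 2 adds = 8, two halves give 16, and the
  // vextractf128/vinsertf128 pair needed to split and rejoin the YMM
  // register brings the total to the 18 in the table.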

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8,      11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL, MVT::v16i8,      12 }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32,      11 }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8,      24 }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32,      12 }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL, MVT::v4i32,       2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,     26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,    2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,      4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,  2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,     26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,     16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,      4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,  2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,     54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,     16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,     12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence + split.

    { ISD::MUL,  MVT::v16i8,     12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,      1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,      6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,       23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,     39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     69 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,        2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,        2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,        2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,        2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::ADD,  MVT::i8,     1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i16,    1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i32,    1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB,  MVT::i8,     1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i16,    1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i32,    1 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular registers.
  // The overhead of division is going to dominate most kernels anyway, so
  // try hard to prevent vectorization of division - it is generally a bad
  // idea. Assume somewhat arbitrarily that we have to be able to hide "20
  // cycles" for each lane.
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    int ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }
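
  // As a worked example (illustrative numbers): a <4 x i32> sdiv by a
  // non-constant divisor legalizes to a single v4i32 (LT.first == 1, four
  // lanes), so with a scalar sdiv cost of S the returned cost is
  // 20 * 1 * 4 * S - deliberately large so the vectorizers keep division
  // scalar.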

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}

int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so we only need to reference that input; all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }
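
  // For example, on an AVX target, extracting the upper <4 x float> half of
  // an <8 x float> (Index == 4) is not aligned to the start of the legal
  // v8f32 register (4 % 8 != 0), but it is aligned to the v4f32 subvector
  // size (4 % 4 == 0 and 8 % 4 == 0), so it costs the cheap SubLT.first;
  // extracting at Index 0 would be free outright.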

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. We provide an accurate cost only for splits
  // where the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(Tp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
                                         LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }
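
  // Worked example for the split above: permuting a single <32 x i16>
  // source on AVX2 legalizes to two v16i16 registers (NumOfSrcs == 2,
  // NumOfDests == 2). Each destination may need elements from both source
  // halves, giving (2 - 1) * 2 == 2 two-source v16i16 shuffles, each costed
  // recursively via SK_PermuteTwoSrc.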

  // For 2-input shuffles, we must account for splitting the 2 inputs into
  // many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
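
  // For example, when LT.first == 2 the two inputs span four legal
  // registers; each of the two destinations may draw from all four, which
  // takes 2 * 2 - 1 == 3 two-source shuffles per destination, so LT.first
  // becomes 2 * 3 == 6.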

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
    {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb

    {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
    {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

    {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
    {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
    {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

    {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
    {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
    {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

    {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1},  // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16
    {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3},  // vpermw + zext/trunc

    {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
    {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
    {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpermt2w
    {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3},  // zext + vpermt2w + trunc
    {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
    {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}   // zext + vpermt2w + trunc
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
    {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
    {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
    {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd

    {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
    {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
    {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
    {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd

    {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb

    {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
    {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
    {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
    {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
    {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
    {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
    {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
    {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
    {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
    {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
    {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
    {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}   // vpermt2d
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
    {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
    {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
    {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
    {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
    {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb

    {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
    {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
    {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
    {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
    {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
    {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb

    {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
    {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb

    {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                // + vpblendvb
    {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
                                                // + vpblendvb

    {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
    {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
    {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
    {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
    {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                             // + vpblendvb
    {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
                                             // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
    {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
    {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
    {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
    {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
                                                // + vinsertf128
    {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
                                                // + vinsertf128

    {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
                                             // + vinsertf128
    {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
    {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
                                             // + vinsertf128
    {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
    {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
    {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
    {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
    {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
    {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
    {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128

    {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
    {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
    {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
    {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
    {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
                                       // + vinsertf128
    {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
                                       // + vinsertf128

    {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
    {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
    {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
    {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
    {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
    {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor

    {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
    {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
    {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
                                                // + 2*por + vinsertf128
    {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
                                                // + 2*por + vinsertf128

    {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
    {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
    {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
                                              // + 4*por + vinsertf128
    {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
                                              // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

1150 | static const CostTblEntry SSE41ShuffleTbl[] = { | |||
1151 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | |||
1152 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1153 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | |||
1154 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | |||
1155 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | |||
1156 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | |||
1157 | }; | |||
1158 | ||||
1159 | if (ST->hasSSE41()) | |||
1160 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | |||
1161 | return LT.first * Entry->Cost; | |||
1162 | ||||
1163 | static const CostTblEntry SSSE3ShuffleTbl[] = { | |||
1164 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | |||
1165 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | |||
1166 | ||||
1167 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | |||
1168 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | |||
1169 | ||||
1170 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | |||
1171 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | |||
1172 | ||||
1173 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | |||
1174 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | |||
1175 | ||||
1176 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | |||
1177 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | |||
1178 | }; | |||
1179 | ||||
1180 | if (ST->hasSSSE3()) | |||
1181 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | |||
1182 | return LT.first * Entry->Cost; | |||
1183 | ||||
1184 | static const CostTblEntry SSE2ShuffleTbl[] = { | |||
1185 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | |||
1186 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | |||
1187 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | |||
1188 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | |||
1189 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | |||
1190 | ||||
1191 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | |||
1192 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | |||
1193 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | |||
1194 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | |||
1195 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | |||
1196 | // + 2*pshufd + 2*unpck + packus | |||
1197 | ||||
1198 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | |||
1199 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1200 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | |||
1201 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | |||
1202 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | |||
1203 | ||||
1204 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | |||
1205 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | |||
1206 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | |||
1207 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | |||
1208 | // + pshufd/unpck | |||
1209 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw | |||
1210 | // + 2*pshufd + 2*unpck + 2*packus | |||
1211 | ||||
1212 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd | |||
1213 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd | |||
1214 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} | |||
1215 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute | |||
1216 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute | |||
1217 | }; | |||
1218 | ||||
1219 | if (ST->hasSSE2()) | |||
1220 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | |||
1221 | return LT.first * Entry->Cost; | |||
1222 | ||||
1223 | static const CostTblEntry SSE1ShuffleTbl[] = { | |||
1224 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | |||
1225 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | |||
1226 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | |||
1227 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | |||
1228 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | |||
1229 | }; | |||
1230 | ||||
1231 | if (ST->hasSSE1()) | |||
1232 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | |||
1233 | return LT.first * Entry->Cost; | |||
1234 | ||||
1235 | return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); | |||
1236 | } | |||
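// Illustrative note (not part of the upstream source): every branch above
// follows the same pattern -- legalize the type, then scale the per-register
// table cost by the number of legalized registers. For example, assuming an
// AVX1-only subtarget, a SK_PermuteSingleSrc shuffle of v16f32 legalizes to
// 2 x v8f32 (LT.first == 2); the AVX1 table prices v8f32 at 4
// (2*vperm2f128 + 2*vshufps), so the returned cost is 2 * 4 = 8. Treating a
// cross-register permute as LT.first independent copies is an approximation.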
1237 | ||||
1238 | int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, | |||
1239 | const Instruction *I) { | |||
1240 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1241 | assert(ISD && "Invalid opcode"); | |||
1242 | ||||
1243 | // FIXME: Need a better design of the cost table to handle non-simple types and | |||
1244 | // the potentially massive number of combinations (elem_num x src_type x dst_type). | |||
1245 | ||||
1246 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] = { | |||
1247 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
1248 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
1249 | ||||
1250 | // Mask sign extend has an instruction. | |||
1251 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
1252 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
1253 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
1254 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
1255 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | |||
1256 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | |||
1257 | ||||
1258 | // Mask zero extend is a load + broadcast. | |||
1259 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
1260 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
1261 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
1262 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
1263 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | |||
1264 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | |||
1265 | }; | |||
1266 | ||||
1267 | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { | |||
1268 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | |||
1269 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
1270 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | |||
1271 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | |||
1272 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
1273 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
1274 | ||||
1275 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | |||
1276 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
1277 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | |||
1278 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | |||
1279 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
1280 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
1281 | ||||
1282 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, | |||
1283 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, | |||
1284 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
1285 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1286 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, | |||
1287 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
1288 | ||||
1289 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, | |||
1290 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, | |||
1291 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
1292 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1293 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, | |||
1294 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
1295 | }; | |||
1296 | ||||
1297 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | |||
1298 | // 256-bit wide vectors. | |||
1299 | ||||
1300 | static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { | |||
1301 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, | |||
1302 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, | |||
1303 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, | |||
1304 | ||||
1305 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, | |||
1306 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, | |||
1307 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, | |||
1308 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, | |||
1309 | ||||
1310 | // v16i1 -> v16i32 - load + broadcast | |||
1311 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, | |||
1312 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, | |||
1313 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
1314 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
1315 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
1316 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
1317 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
1318 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
1319 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
1320 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
1321 | ||||
1322 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
1323 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
1324 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, | |||
1325 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, | |||
1326 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
1327 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, | |||
1328 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
1329 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
1330 | ||||
1331 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
1332 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
1333 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, | |||
1334 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, | |||
1335 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, | |||
1336 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, | |||
1337 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, | |||
1338 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, | |||
1339 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, | |||
1340 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | |||
1341 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
1342 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, | |||
1343 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, | |||
1344 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, | |||
1345 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
1346 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | |||
1347 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | |||
1348 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
1349 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
1350 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, | |||
1351 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, | |||
1352 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | |||
1353 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, | |||
1354 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, | |||
1355 | ||||
1356 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, | |||
1357 | ||||
1358 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1359 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1360 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, | |||
1361 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, | |||
1362 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, | |||
1363 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, | |||
1364 | { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, | |||
1365 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 }, | |||
1366 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, | |||
1367 | }; | |||
1368 | ||||
1369 | static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { | |||
1370 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | |||
1371 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | |||
1372 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | |||
1373 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | |||
1374 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, | |||
1375 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, | |||
1376 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
1377 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
1378 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | |||
1379 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | |||
1380 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1381 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1382 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | |||
1383 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | |||
1384 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | |||
1385 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | |||
1386 | ||||
1387 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, | |||
1388 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, | |||
1389 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, | |||
1390 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, | |||
1391 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, | |||
1392 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, | |||
1393 | ||||
1394 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, | |||
1395 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, | |||
1396 | ||||
1397 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, | |||
1398 | }; | |||
1399 | ||||
1400 | static const TypeConversionCostTblEntry AVXConversionTbl[] = { | |||
1401 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, | |||
1402 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, | |||
1403 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, | |||
1404 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, | |||
1405 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, | |||
1406 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, | |||
1407 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, | |||
1408 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, | |||
1409 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, | |||
1410 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, | |||
1411 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, | |||
1412 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1413 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, | |||
1414 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, | |||
1415 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, | |||
1416 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, | |||
1417 | ||||
1418 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, | |||
1419 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, | |||
1420 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
1421 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, | |||
1422 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, | |||
1423 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, | |||
1424 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, | |||
1425 | ||||
1426 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | |||
1427 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, | |||
1428 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, | |||
1429 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
1430 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, | |||
1431 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, | |||
1432 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, | |||
1433 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, | |||
1434 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, | |||
1435 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
1436 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | |||
1437 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | |||
1438 | ||||
1439 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, | |||
1440 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, | |||
1441 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, | |||
1442 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, | |||
1443 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, | |||
1444 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, | |||
1445 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1446 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, | |||
1447 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, | |||
1448 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 }, | |||
1449 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, | |||
1450 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, | |||
1451 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, | |||
1452 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | |||
1453 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 }, | |||
1454 | // The generic code to compute the scalar overhead is currently broken. | |||
1455 | // Work around this limitation by estimating the scalarization overhead | |||
1456 | // here. We have roughly 10 instructions per scalar element. | |||
1457 | // Multiply that by the vector width. | |||
1458 | // FIXME: remove this when PR19268 is fixed. | |||
1459 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, | |||
1461 | ||||
1462 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, | |||
1463 | { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, | |||
1464 | // This node is expanded into scalarized operations, but BasicTTI is overly | |||
1465 | // optimistic when estimating its cost: it computes 3 per element (one | |||
1466 | // vector-extract, one scalar conversion and one vector-insert). The problem | |||
1467 | // is that the inserts form a read-modify-write chain, so latency should be | |||
1468 | // factored in too; inflate the cost per element by 1 (worked example below). | |||
1469 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, | |||
1470 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, | |||
1471 | ||||
1472 | { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, | |||
1473 | { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, | |||
1474 | }; | |||
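// Worked example for the inflated FP_TO_UINT entries above (illustrative):
// for v8f32 -> v8i32, BasicTTI would charge 3 ops per element (extract,
// convert, insert); adding 1 for the insert chain's read-modify-write
// latency gives 4 per element, and 8 elements * 4 = 32 -- hence the 8*4.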
1475 | ||||
1476 | static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { | |||
1477 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, | |||
1478 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, | |||
1479 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, | |||
1480 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, | |||
1481 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
1482 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
1483 | ||||
1484 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, | |||
1485 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, | |||
1486 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, | |||
1487 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, | |||
1488 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
1489 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
1490 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, | |||
1491 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, | |||
1492 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
1493 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
1494 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, | |||
1495 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, | |||
1496 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
1497 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
1498 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
1499 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
1500 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, | |||
1501 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, | |||
1502 | ||||
1503 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, | |||
1504 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, | |||
1505 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, | |||
1506 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, | |||
1507 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, | |||
1508 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, | |||
1509 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, | |||
1510 | ||||
1511 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, | |||
1512 | }; | |||
1513 | ||||
1514 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | |||
1515 | // These are somewhat magic numbers, justified by looking at the output of | |||
1516 | // Intel's IACA, running some kernels, and making sure that, once legalization | |||
1517 | // is taken into account, the throughput is overestimated (example below). | |||
1518 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, | |||
1519 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, | |||
1520 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, | |||
1521 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, | |||
1522 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | |||
1523 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, | |||
1524 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, | |||
1525 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, | |||
1526 | ||||
1527 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, | |||
1528 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, | |||
1529 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, | |||
1530 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, | |||
1531 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, | |||
1532 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, | |||
1533 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 }, | |||
1534 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, | |||
1535 | ||||
1536 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, | |||
1537 | ||||
1538 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 }, | |||
1539 | ||||
1540 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, | |||
1541 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, | |||
1542 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | |||
1543 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, | |||
1544 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, | |||
1545 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, | |||
1546 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
1547 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, | |||
1548 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, | |||
1549 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, | |||
1550 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | |||
1551 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, | |||
1552 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, | |||
1553 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, | |||
1554 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
1555 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, | |||
1556 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1557 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 }, | |||
1558 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | |||
1559 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, | |||
1560 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, | |||
1561 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, | |||
1562 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | |||
1563 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, | |||
1564 | ||||
1565 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, | |||
1566 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, | |||
1567 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | |||
1568 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, | |||
1569 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, | |||
1570 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, | |||
1571 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | |||
1572 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
1573 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, | |||
1574 | }; | |||
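// Worked example for the scalarization-style entries above (illustrative):
// SINT_TO_FP v2f64 <- v4i32 is priced at 4*10 = 40, i.e. four scalarized
// conversions at roughly ten instructions each.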
1575 | ||||
1576 | std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); | |||
1577 | std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); | |||
1578 | ||||
1579 | if (ST->hasSSE2() && !ST->hasAVX()) { | |||
1580 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
1581 | LTDest.second, LTSrc.second)) | |||
1582 | return LTSrc.first * Entry->Cost; | |||
1583 | } | |||
1584 | ||||
1585 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
1586 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
1587 | ||||
1588 | // The function getSimpleVT only handles simple value types. | |||
1589 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | |||
1590 | return BaseT::getCastInstrCost(Opcode, Dst, Src, I); | |||
1591 | ||||
1592 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | |||
1593 | MVT SimpleDstTy = DstTy.getSimpleVT(); | |||
1594 | ||||
1595 | // Make sure that neither type is going to be split before using the | |||
1596 | // AVX512 tables. This handles -mprefer-vector-width=256 | |||
1597 | // with -min-legal-vector-width<=256 | |||
1598 | if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector && | |||
1599 | TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) { | |||
1600 | if (ST->hasBWI()) | |||
1601 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD, | |||
1602 | SimpleDstTy, SimpleSrcTy)) | |||
1603 | return Entry->Cost; | |||
1604 | ||||
1605 | if (ST->hasDQI()) | |||
1606 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, | |||
1607 | SimpleDstTy, SimpleSrcTy)) | |||
1608 | return Entry->Cost; | |||
1609 | ||||
1610 | if (ST->hasAVX512()) | |||
1611 | if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, | |||
1612 | SimpleDstTy, SimpleSrcTy)) | |||
1613 | return Entry->Cost; | |||
1614 | } | |||
1615 | ||||
1616 | if (ST->hasAVX2()) { | |||
1617 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
1618 | SimpleDstTy, SimpleSrcTy)) | |||
1619 | return Entry->Cost; | |||
1620 | } | |||
1621 | ||||
1622 | if (ST->hasAVX()) { | |||
1623 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
1624 | SimpleDstTy, SimpleSrcTy)) | |||
1625 | return Entry->Cost; | |||
1626 | } | |||
1627 | ||||
1628 | if (ST->hasSSE41()) { | |||
1629 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
1630 | SimpleDstTy, SimpleSrcTy)) | |||
1631 | return Entry->Cost; | |||
1632 | } | |||
1633 | ||||
1634 | if (ST->hasSSE2()) { | |||
1635 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
1636 | SimpleDstTy, SimpleSrcTy)) | |||
1637 | return Entry->Cost; | |||
1638 | } | |||
1639 | ||||
1640 | return BaseT::getCastInstrCost(Opcode, Dst, Src, I); | |||
1641 | } | |||
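// Illustrative example: "sitofp <4 x i32> %v to <4 x float>" on a pre-AVX
// SSE2 subtarget is already legal (LTSrc.first == 1) and hits the SSE2
// fast path above, returning the table cost of 5. Note that the AVX512
// tables are deliberately skipped whenever either type would be split, so
// narrow -mprefer-vector-width configurations fall through to the
// AVX2/AVX/SSE tables instead.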
1642 | ||||
1643 | int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, | |||
1644 | const Instruction *I) { | |||
1645 | // Legalize the type. | |||
1646 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1647 | ||||
1648 | MVT MTy = LT.second; | |||
1649 | ||||
1650 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1651 | assert(ISD && "Invalid opcode"); | |||
1652 | ||||
1653 | unsigned ExtraCost = 0; | |||
1654 | if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) { | |||
1655 | // Some vector comparison predicates cost extra instructions. | |||
1656 | if (MTy.isVector() && | |||
1657 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || | |||
1658 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || | |||
1659 | ST->hasBWI())) { | |||
1660 | switch (cast<CmpInst>(I)->getPredicate()) { | |||
1661 | case CmpInst::Predicate::ICMP_NE: | |||
1662 | // xor(cmpeq(x,y),-1) | |||
1663 | ExtraCost = 1; | |||
1664 | break; | |||
1665 | case CmpInst::Predicate::ICMP_SGE: | |||
1666 | case CmpInst::Predicate::ICMP_SLE: | |||
1667 | // xor(cmpgt(x,y),-1) | |||
1668 | ExtraCost = 1; | |||
1669 | break; | |||
1670 | case CmpInst::Predicate::ICMP_ULT: | |||
1671 | case CmpInst::Predicate::ICMP_UGT: | |||
1672 | // cmpgt(xor(x,signbit),xor(y,signbit)) | |||
1673 | // xor(cmpeq(pmaxu(x,y),x),-1) | |||
1674 | ExtraCost = 2; | |||
1675 | break; | |||
1676 | case CmpInst::Predicate::ICMP_ULE: | |||
1677 | case CmpInst::Predicate::ICMP_UGE: | |||
1678 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || | |||
1679 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { | |||
1680 | // cmpeq(psubus(x,y),0) | |||
1681 | // cmpeq(pminu(x,y),x) | |||
1682 | ExtraCost = 1; | |||
1683 | } else { | |||
1684 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) | |||
1685 | ExtraCost = 3; | |||
1686 | } | |||
1687 | break; | |||
1688 | default: | |||
1689 | break; | |||
1690 | } | |||
1691 | } | |||
1692 | } | |||
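// Illustrative example: with plain SSE2 there is no unsigned vector compare,
// so "icmp ult <4 x i32> %a, %b" is emulated by biasing both sides:
//   cmpgt(xor(x, signbit), xor(y, signbit))
// i.e. two instructions beyond the compare itself, which is why ICMP_ULT
// above sets ExtraCost = 2.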
1693 | ||||
1694 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
1695 | { ISD::SETCC, MVT::v32i16, 1 }, | |||
1696 | { ISD::SETCC, MVT::v64i8, 1 }, | |||
1697 | ||||
1698 | { ISD::SELECT, MVT::v32i16, 1 }, | |||
1699 | { ISD::SELECT, MVT::v64i8, 1 }, | |||
1700 | }; | |||
1701 | ||||
1702 | static const CostTblEntry AVX512CostTbl[] = { | |||
1703 | { ISD::SETCC, MVT::v8i64, 1 }, | |||
1704 | { ISD::SETCC, MVT::v16i32, 1 }, | |||
1705 | { ISD::SETCC, MVT::v8f64, 1 }, | |||
1706 | { ISD::SETCC, MVT::v16f32, 1 }, | |||
1707 | ||||
1708 | { ISD::SELECT, MVT::v8i64, 1 }, | |||
1709 | { ISD::SELECT, MVT::v16i32, 1 }, | |||
1710 | { ISD::SELECT, MVT::v8f64, 1 }, | |||
1711 | { ISD::SELECT, MVT::v16f32, 1 }, | |||
1712 | }; | |||
1713 | ||||
1714 | static const CostTblEntry AVX2CostTbl[] = { | |||
1715 | { ISD::SETCC, MVT::v4i64, 1 }, | |||
1716 | { ISD::SETCC, MVT::v8i32, 1 }, | |||
1717 | { ISD::SETCC, MVT::v16i16, 1 }, | |||
1718 | { ISD::SETCC, MVT::v32i8, 1 }, | |||
1719 | ||||
1720 | { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb | |||
1721 | { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb | |||
1722 | { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb | |||
1723 | { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb | |||
1724 | }; | |||
1725 | ||||
1726 | static const CostTblEntry AVX1CostTbl[] = { | |||
1727 | { ISD::SETCC, MVT::v4f64, 1 }, | |||
1728 | { ISD::SETCC, MVT::v8f32, 1 }, | |||
1729 | // AVX1 does not support 8-wide integer compare. | |||
1730 | { ISD::SETCC, MVT::v4i64, 4 }, | |||
1731 | { ISD::SETCC, MVT::v8i32, 4 }, | |||
1732 | { ISD::SETCC, MVT::v16i16, 4 }, | |||
1733 | { ISD::SETCC, MVT::v32i8, 4 }, | |||
1734 | ||||
1735 | { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd | |||
1736 | { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps | |||
1737 | { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd | |||
1738 | { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps | |||
1739 | { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps | |||
1740 | { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps | |||
1741 | }; | |||
1742 | ||||
1743 | static const CostTblEntry SSE42CostTbl[] = { | |||
1744 | { ISD::SETCC, MVT::v2f64, 1 }, | |||
1745 | { ISD::SETCC, MVT::v4f32, 1 }, | |||
1746 | { ISD::SETCC, MVT::v2i64, 1 }, | |||
1747 | }; | |||
1748 | ||||
1749 | static const CostTblEntry SSE41CostTbl[] = { | |||
1750 | { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd | |||
1751 | { ISD::SELECT, MVT::v4f32, 1 }, // blendvps | |||
1752 | { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb | |||
1753 | { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb | |||
1754 | { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb | |||
1755 | { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb | |||
1756 | }; | |||
1757 | ||||
1758 | static const CostTblEntry SSE2CostTbl[] = { | |||
1759 | { ISD::SETCC, MVT::v2f64, 2 }, | |||
1760 | { ISD::SETCC, MVT::f64, 1 }, | |||
1761 | { ISD::SETCC, MVT::v2i64, 8 }, | |||
1762 | { ISD::SETCC, MVT::v4i32, 1 }, | |||
1763 | { ISD::SETCC, MVT::v8i16, 1 }, | |||
1764 | { ISD::SETCC, MVT::v16i8, 1 }, | |||
1765 | ||||
1766 | { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd | |||
1767 | { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por | |||
1768 | { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por | |||
1769 | { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por | |||
1770 | { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por | |||
1771 | }; | |||
1772 | ||||
1773 | static const CostTblEntry SSE1CostTbl[] = { | |||
1774 | { ISD::SETCC, MVT::v4f32, 2 }, | |||
1775 | { ISD::SETCC, MVT::f32, 1 }, | |||
1776 | ||||
1777 | { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps | |||
1778 | }; | |||
1779 | ||||
1780 | if (ST->hasBWI()) | |||
1781 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
1782 | return LT.first * (ExtraCost + Entry->Cost); | |||
1783 | ||||
1784 | if (ST->hasAVX512()) | |||
1785 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
1786 | return LT.first * (ExtraCost + Entry->Cost); | |||
1787 | ||||
1788 | if (ST->hasAVX2()) | |||
1789 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
1790 | return LT.first * (ExtraCost + Entry->Cost); | |||
1791 | ||||
1792 | if (ST->hasAVX()) | |||
1793 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
1794 | return LT.first * (ExtraCost + Entry->Cost); | |||
1795 | ||||
1796 | if (ST->hasSSE42()) | |||
1797 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
1798 | return LT.first * (ExtraCost + Entry->Cost); | |||
1799 | ||||
1800 | if (ST->hasSSE41()) | |||
1801 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
1802 | return LT.first * (ExtraCost + Entry->Cost); | |||
1803 | ||||
1804 | if (ST->hasSSE2()) | |||
1805 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
1806 | return LT.first * (ExtraCost + Entry->Cost); | |||
1807 | ||||
1808 | if (ST->hasSSE1()) | |||
1809 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
1810 | return LT.first * (ExtraCost + Entry->Cost); | |||
1811 | ||||
1812 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); | |||
1813 | } | |||
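// Worked example of the final cost math (illustrative): "icmp ule <16 x i16>"
// on a plain SSE2 target legalizes to 2 x v8i16 (LT.first == 2), ICMP_ULE on
// 16-bit elements takes the psubus/pminu path (ExtraCost == 1), and the SSE2
// table prices SETCC v8i16 at 1 -- so the returned cost is 2 * (1 + 1) = 4.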
1814 | ||||
1815 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } | |||
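// The 16-byte cap presumably corresponds to the widest lock-free x86 atomic
// access (cmpxchg16b operates on 128 bits); wider elements would have to be
// split and could not be accessed atomically.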
1816 | ||||
1817 | int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, | |||
1818 | ArrayRef<Type *> Tys, FastMathFlags FMF, | |||
1819 | unsigned ScalarizationCostPassed) { | |||
1820 | // Costs should match the codegen from: | |||
1821 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | |||
1822 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | |||
1823 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | |||
1824 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | |||
1825 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | |||
1826 | static const CostTblEntry AVX512CDCostTbl[] = { | |||
1827 | { ISD::CTLZ, MVT::v8i64, 1 }, | |||
1828 | { ISD::CTLZ, MVT::v16i32, 1 }, | |||
1829 | { ISD::CTLZ, MVT::v32i16, 8 }, | |||
1830 | { ISD::CTLZ, MVT::v64i8, 20 }, | |||
1831 | { ISD::CTLZ, MVT::v4i64, 1 }, | |||
1832 | { ISD::CTLZ, MVT::v8i32, 1 }, | |||
1833 | { ISD::CTLZ, MVT::v16i16, 4 }, | |||
1834 | { ISD::CTLZ, MVT::v32i8, 10 }, | |||
1835 | { ISD::CTLZ, MVT::v2i64, 1 }, | |||
1836 | { ISD::CTLZ, MVT::v4i32, 1 }, | |||
1837 | { ISD::CTLZ, MVT::v8i16, 4 }, | |||
1838 | { ISD::CTLZ, MVT::v16i8, 4 }, | |||
1839 | }; | |||
1840 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
1841 | { ISD::BITREVERSE, MVT::v8i64, 5 }, | |||
1842 | { ISD::BITREVERSE, MVT::v16i32, 5 }, | |||
1843 | { ISD::BITREVERSE, MVT::v32i16, 5 }, | |||
1844 | { ISD::BITREVERSE, MVT::v64i8, 5 }, | |||
1845 | { ISD::CTLZ, MVT::v8i64, 23 }, | |||
1846 | { ISD::CTLZ, MVT::v16i32, 22 }, | |||
1847 | { ISD::CTLZ, MVT::v32i16, 18 }, | |||
1848 | { ISD::CTLZ, MVT::v64i8, 17 }, | |||
1849 | { ISD::CTPOP, MVT::v8i64, 7 }, | |||
1850 | { ISD::CTPOP, MVT::v16i32, 11 }, | |||
1851 | { ISD::CTPOP, MVT::v32i16, 9 }, | |||
1852 | { ISD::CTPOP, MVT::v64i8, 6 }, | |||
1853 | { ISD::CTTZ, MVT::v8i64, 10 }, | |||
1854 | { ISD::CTTZ, MVT::v16i32, 14 }, | |||
1855 | { ISD::CTTZ, MVT::v32i16, 12 }, | |||
1856 | { ISD::CTTZ, MVT::v64i8, 9 }, | |||
1857 | { ISD::SADDSAT, MVT::v32i16, 1 }, | |||
1858 | { ISD::SADDSAT, MVT::v64i8, 1 }, | |||
1859 | { ISD::SSUBSAT, MVT::v32i16, 1 }, | |||
1860 | { ISD::SSUBSAT, MVT::v64i8, 1 }, | |||
1861 | { ISD::UADDSAT, MVT::v32i16, 1 }, | |||
1862 | { ISD::UADDSAT, MVT::v64i8, 1 }, | |||
1863 | { ISD::USUBSAT, MVT::v32i16, 1 }, | |||
1864 | { ISD::USUBSAT, MVT::v64i8, 1 }, | |||
1865 | }; | |||
1866 | static const CostTblEntry AVX512CostTbl[] = { | |||
1867 | { ISD::BITREVERSE, MVT::v8i64, 36 }, | |||
1868 | { ISD::BITREVERSE, MVT::v16i32, 24 }, | |||
1869 | { ISD::CTLZ, MVT::v8i64, 29 }, | |||
1870 | { ISD::CTLZ, MVT::v16i32, 35 }, | |||
1871 | { ISD::CTPOP, MVT::v8i64, 16 }, | |||
1872 | { ISD::CTPOP, MVT::v16i32, 24 }, | |||
1873 | { ISD::CTTZ, MVT::v8i64, 20 }, | |||
1874 | { ISD::CTTZ, MVT::v16i32, 28 }, | |||
1875 | { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd | |||
1876 | { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq | |||
1877 | { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq | |||
1878 | { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq | |||
1879 | { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd | |||
1880 | { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq | |||
1881 | { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq | |||
1882 | { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq | |||
1883 | }; | |||
1884 | static const CostTblEntry XOPCostTbl[] = { | |||
1885 | { ISD::BITREVERSE, MVT::v4i64, 4 }, | |||
1886 | { ISD::BITREVERSE, MVT::v8i32, 4 }, | |||
1887 | { ISD::BITREVERSE, MVT::v16i16, 4 }, | |||
1888 | { ISD::BITREVERSE, MVT::v32i8, 4 }, | |||
1889 | { ISD::BITREVERSE, MVT::v2i64, 1 }, | |||
1890 | { ISD::BITREVERSE, MVT::v4i32, 1 }, | |||
1891 | { ISD::BITREVERSE, MVT::v8i16, 1 }, | |||
1892 | { ISD::BITREVERSE, MVT::v16i8, 1 }, | |||
1893 | { ISD::BITREVERSE, MVT::i64, 3 }, | |||
1894 | { ISD::BITREVERSE, MVT::i32, 3 }, | |||
1895 | { ISD::BITREVERSE, MVT::i16, 3 }, | |||
1896 | { ISD::BITREVERSE, MVT::i8, 3 } | |||
1897 | }; | |||
1898 | static const CostTblEntry AVX2CostTbl[] = { | |||
1899 | { ISD::BITREVERSE, MVT::v4i64, 5 }, | |||
1900 | { ISD::BITREVERSE, MVT::v8i32, 5 }, | |||
1901 | { ISD::BITREVERSE, MVT::v16i16, 5 }, | |||
1902 | { ISD::BITREVERSE, MVT::v32i8, 5 }, | |||
1903 | { ISD::BSWAP, MVT::v4i64, 1 }, | |||
1904 | { ISD::BSWAP, MVT::v8i32, 1 }, | |||
1905 | { ISD::BSWAP, MVT::v16i16, 1 }, | |||
1906 | { ISD::CTLZ, MVT::v4i64, 23 }, | |||
1907 | { ISD::CTLZ, MVT::v8i32, 18 }, | |||
1908 | { ISD::CTLZ, MVT::v16i16, 14 }, | |||
1909 | { ISD::CTLZ, MVT::v32i8, 9 }, | |||
1910 | { ISD::CTPOP, MVT::v4i64, 7 }, | |||
1911 | { ISD::CTPOP, MVT::v8i32, 11 }, | |||
1912 | { ISD::CTPOP, MVT::v16i16, 9 }, | |||
1913 | { ISD::CTPOP, MVT::v32i8, 6 }, | |||
1914 | { ISD::CTTZ, MVT::v4i64, 10 }, | |||
1915 | { ISD::CTTZ, MVT::v8i32, 14 }, | |||
1916 | { ISD::CTTZ, MVT::v16i16, 12 }, | |||
1917 | { ISD::CTTZ, MVT::v32i8, 9 }, | |||
1918 | { ISD::SADDSAT, MVT::v16i16, 1 }, | |||
1919 | { ISD::SADDSAT, MVT::v32i8, 1 }, | |||
1920 | { ISD::SSUBSAT, MVT::v16i16, 1 }, | |||
1921 | { ISD::SSUBSAT, MVT::v32i8, 1 }, | |||
1922 | { ISD::UADDSAT, MVT::v16i16, 1 }, | |||
1923 | { ISD::UADDSAT, MVT::v32i8, 1 }, | |||
1924 | { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd | |||
1925 | { ISD::USUBSAT, MVT::v16i16, 1 }, | |||
1926 | { ISD::USUBSAT, MVT::v32i8, 1 }, | |||
1927 | { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd | |||
1928 | { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | |||
1929 | { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | |||
1930 | { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | |||
1931 | { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | |||
1932 | { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | |||
1933 | { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | |||
1934 | }; | |||
1935 | static const CostTblEntry AVX1CostTbl[] = { | |||
1936 | { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert | |||
1937 | { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert | |||
1938 | { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert | |||
1939 | { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert | |||
1940 | { ISD::BSWAP, MVT::v4i64, 4 }, | |||
1941 | { ISD::BSWAP, MVT::v8i32, 4 }, | |||
1942 | { ISD::BSWAP, MVT::v16i16, 4 }, | |||
1943 | { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert | |||
1944 | { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert | |||
1945 | { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert | |||
1946 | { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | |||
1947 | { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert | |||
1948 | { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert | |||
1949 | { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert | |||
1950 | { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert | |||
1951 | { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert | |||
1952 | { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert | |||
1953 | { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert | |||
1954 | { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | |||
1955 | { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
1956 | { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
1957 | { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
1958 | { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
1959 | { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
1960 | { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
1961 | { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert | |||
1962 | { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
1963 | { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
1964 | { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert | |||
1965 | { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ | |||
1966 | { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | |||
1967 | { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | |||
1968 | { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ | |||
1969 | { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ | |||
1970 | { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ | |||
1971 | }; | |||
1972 | static const CostTblEntry GLMCostTbl[] = { | |||
1973 | { ISD::FSQRT, MVT::f32, 19 }, // sqrtss | |||
1974 | { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps | |||
1975 | { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd | |||
1976 | { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd | |||
1977 | }; | |||
1978 | static const CostTblEntry SLMCostTbl[] = { | |||
1979 | { ISD::FSQRT, MVT::f32, 20 }, // sqrtss | |||
1980 | { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps | |||
1981 | { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd | |||
1982 | { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd | |||
1983 | }; | |||
1984 | static const CostTblEntry SSE42CostTbl[] = { | |||
1985 | { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd | |||
1986 | { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd | |||
1987 | { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ | |||
1988 | { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ | |||
1989 | }; | |||
1990 | static const CostTblEntry SSSE3CostTbl[] = { | |||
1991 | { ISD::BITREVERSE, MVT::v2i64, 5 }, | |||
1992 | { ISD::BITREVERSE, MVT::v4i32, 5 }, | |||
1993 | { ISD::BITREVERSE, MVT::v8i16, 5 }, | |||
1994 | { ISD::BITREVERSE, MVT::v16i8, 5 }, | |||
1995 | { ISD::BSWAP, MVT::v2i64, 1 }, | |||
1996 | { ISD::BSWAP, MVT::v4i32, 1 }, | |||
1997 | { ISD::BSWAP, MVT::v8i16, 1 }, | |||
1998 | { ISD::CTLZ, MVT::v2i64, 23 }, | |||
1999 | { ISD::CTLZ, MVT::v4i32, 18 }, | |||
2000 | { ISD::CTLZ, MVT::v8i16, 14 }, | |||
2001 | { ISD::CTLZ, MVT::v16i8, 9 }, | |||
2002 | { ISD::CTPOP, MVT::v2i64, 7 }, | |||
2003 | { ISD::CTPOP, MVT::v4i32, 11 }, | |||
2004 | { ISD::CTPOP, MVT::v8i16, 9 }, | |||
2005 | { ISD::CTPOP, MVT::v16i8, 6 }, | |||
2006 | { ISD::CTTZ, MVT::v2i64, 10 }, | |||
2007 | { ISD::CTTZ, MVT::v4i32, 14 }, | |||
2008 | { ISD::CTTZ, MVT::v8i16, 12 }, | |||
2009 | { ISD::CTTZ, MVT::v16i8, 9 } | |||
2010 | }; | |||
2011 | static const CostTblEntry SSE2CostTbl[] = { | |||
2012 | { ISD::BITREVERSE, MVT::v2i64, 29 }, | |||
2013 | { ISD::BITREVERSE, MVT::v4i32, 27 }, | |||
2014 | { ISD::BITREVERSE, MVT::v8i16, 27 }, | |||
2015 | { ISD::BITREVERSE, MVT::v16i8, 20 }, | |||
2016 | { ISD::BSWAP, MVT::v2i64, 7 }, | |||
2017 | { ISD::BSWAP, MVT::v4i32, 7 }, | |||
2018 | { ISD::BSWAP, MVT::v8i16, 7 }, | |||
2019 | { ISD::CTLZ, MVT::v2i64, 25 }, | |||
2020 | { ISD::CTLZ, MVT::v4i32, 26 }, | |||
2021 | { ISD::CTLZ, MVT::v8i16, 20 }, | |||
2022 | { ISD::CTLZ, MVT::v16i8, 17 }, | |||
2023 | { ISD::CTPOP, MVT::v2i64, 12 }, | |||
2024 | { ISD::CTPOP, MVT::v4i32, 15 }, | |||
2025 | { ISD::CTPOP, MVT::v8i16, 13 }, | |||
2026 | { ISD::CTPOP, MVT::v16i8, 10 }, | |||
2027 | { ISD::CTTZ, MVT::v2i64, 14 }, | |||
2028 | { ISD::CTTZ, MVT::v4i32, 18 }, | |||
2029 | { ISD::CTTZ, MVT::v8i16, 16 }, | |||
2030 | { ISD::CTTZ, MVT::v16i8, 13 }, | |||
2031 | { ISD::SADDSAT, MVT::v8i16, 1 }, | |||
2032 | { ISD::SADDSAT, MVT::v16i8, 1 }, | |||
2033 | { ISD::SSUBSAT, MVT::v8i16, 1 }, | |||
2034 | { ISD::SSUBSAT, MVT::v16i8, 1 }, | |||
2035 | { ISD::UADDSAT, MVT::v8i16, 1 }, | |||
2036 | { ISD::UADDSAT, MVT::v16i8, 1 }, | |||
2037 | { ISD::USUBSAT, MVT::v8i16, 1 }, | |||
2038 | { ISD::USUBSAT, MVT::v16i8, 1 }, | |||
2039 | { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ | |||
2040 | { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ | |||
2041 | }; | |||
2042 | static const CostTblEntry SSE1CostTbl[] = { | |||
2043 | { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ | |||
2044 | { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ | |||
2045 | }; | |||
2046 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | |||
2047 | { ISD::BITREVERSE, MVT::i64, 14 }, | |||
2048 | { ISD::SADDO, MVT::i64, 1 }, | |||
2049 | { ISD::UADDO, MVT::i64, 1 }, | |||
2050 | }; | |||
2051 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
2052 | { ISD::BITREVERSE, MVT::i32, 14 }, | |||
2053 | { ISD::BITREVERSE, MVT::i16, 14 }, | |||
2054 | { ISD::BITREVERSE, MVT::i8, 11 }, | |||
2055 | { ISD::SADDO, MVT::i32, 1 }, | |||
2056 | { ISD::SADDO, MVT::i16, 1 }, | |||
2057 | { ISD::SADDO, MVT::i8, 1 }, | |||
2058 | { ISD::UADDO, MVT::i32, 1 }, | |||
2059 | { ISD::UADDO, MVT::i16, 1 }, | |||
2060 | { ISD::UADDO, MVT::i8, 1 }, | |||
2061 | }; | |||
2062 | ||||
2063 | Type *OpTy = RetTy; | |||
2064 | unsigned ISD = ISD::DELETED_NODE; | |||
2065 | switch (IID) { | |||
2066 | default: | |||
2067 | break; | |||
2068 | case Intrinsic::bitreverse: | |||
2069 | ISD = ISD::BITREVERSE; | |||
2070 | break; | |||
2071 | case Intrinsic::bswap: | |||
2072 | ISD = ISD::BSWAP; | |||
2073 | break; | |||
2074 | case Intrinsic::ctlz: | |||
2075 | ISD = ISD::CTLZ; | |||
2076 | break; | |||
2077 | case Intrinsic::ctpop: | |||
2078 | ISD = ISD::CTPOP; | |||
2079 | break; | |||
2080 | case Intrinsic::cttz: | |||
2081 | ISD = ISD::CTTZ; | |||
2082 | break; | |||
2083 | case Intrinsic::sadd_sat: | |||
2084 | ISD = ISD::SADDSAT; | |||
2085 | break; | |||
2086 | case Intrinsic::ssub_sat: | |||
2087 | ISD = ISD::SSUBSAT; | |||
2088 | break; | |||
2089 | case Intrinsic::uadd_sat: | |||
2090 | ISD = ISD::UADDSAT; | |||
2091 | break; | |||
2092 | case Intrinsic::usub_sat: | |||
2093 | ISD = ISD::USUBSAT; | |||
2094 | break; | |||
2095 | case Intrinsic::sqrt: | |||
2096 | ISD = ISD::FSQRT; | |||
2097 | break; | |||
2098 | case Intrinsic::sadd_with_overflow: | |||
2099 | case Intrinsic::ssub_with_overflow: | |||
2100 | // SSUBO has the same costs, so don't duplicate. | |||
2101 | ISD = ISD::SADDO; | |||
2102 | OpTy = RetTy->getContainedType(0); | |||
2103 | break; | |||
2104 | case Intrinsic::uadd_with_overflow: | |||
2105 | case Intrinsic::usub_with_overflow: | |||
2106 | // USUBO has the same costs, so don't duplicate. | |||
2107 | ISD = ISD::UADDO; | |||
2108 | OpTy = RetTy->getContainedType(0); | |||
2109 | break; | |||
2110 | } | |||
2111 | ||||
2112 | if (ISD != ISD::DELETED_NODE) { | |||
2113 | // Legalize the type. | |||
2114 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); | |||
2115 | MVT MTy = LT.second; | |||
2116 | ||||
2117 | // Attempt to lookup cost. | |||
2118 | if (ST->isGLM()) | |||
2119 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | |||
2120 | return LT.first * Entry->Cost; | |||
2121 | ||||
2122 | if (ST->isSLM()) | |||
2123 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | |||
2124 | return LT.first * Entry->Cost; | |||
2125 | ||||
2126 | if (ST->hasCDI()) | |||
2127 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | |||
2128 | return LT.first * Entry->Cost; | |||
2129 | ||||
2130 | if (ST->hasBWI()) | |||
2131 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
2132 | return LT.first * Entry->Cost; | |||
2133 | ||||
2134 | if (ST->hasAVX512()) | |||
2135 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
2136 | return LT.first * Entry->Cost; | |||
2137 | ||||
2138 | if (ST->hasXOP()) | |||
2139 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
2140 | return LT.first * Entry->Cost; | |||
2141 | ||||
2142 | if (ST->hasAVX2()) | |||
2143 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
2144 | return LT.first * Entry->Cost; | |||
2145 | ||||
2146 | if (ST->hasAVX()) | |||
2147 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
2148 | return LT.first * Entry->Cost; | |||
2149 | ||||
2150 | if (ST->hasSSE42()) | |||
2151 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
2152 | return LT.first * Entry->Cost; | |||
2153 | ||||
2154 | if (ST->hasSSSE3()) | |||
2155 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | |||
2156 | return LT.first * Entry->Cost; | |||
2157 | ||||
2158 | if (ST->hasSSE2()) | |||
2159 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
2160 | return LT.first * Entry->Cost; | |||
2161 | ||||
2162 | if (ST->hasSSE1()) | |||
2163 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
2164 | return LT.first * Entry->Cost; | |||
2165 | ||||
2166 | if (ST->is64Bit()) | |||
2167 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
2168 | return LT.first * Entry->Cost; | |||
2169 | ||||
2170 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
2171 | return LT.first * Entry->Cost; | |||
2172 | } | |||
2173 | ||||
2174 | return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); | |||
2175 | } | |||
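// Worked example (illustrative): @llvm.ctpop.v8i64 on an AVX2 subtarget
// legalizes to 2 x v4i64 (LT.first == 2) and the AVX2 table prices CTPOP
// v4i64 at 7, so the returned cost is 2 * 7 = 14. Note that the CPU-specific
// GLM and SLM tables are consulted before the feature-level tables.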
2176 | ||||
2177 | int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, | |||
2178 | ArrayRef<Value *> Args, FastMathFlags FMF, | |||
2179 | unsigned VF) { | |||
2180 | static const CostTblEntry AVX512CostTbl[] = { | |||
2181 | { ISD::ROTL, MVT::v8i64, 1 }, | |||
2182 | { ISD::ROTL, MVT::v4i64, 1 }, | |||
2183 | { ISD::ROTL, MVT::v2i64, 1 }, | |||
2184 | { ISD::ROTL, MVT::v16i32, 1 }, | |||
2185 | { ISD::ROTL, MVT::v8i32, 1 }, | |||
2186 | { ISD::ROTL, MVT::v4i32, 1 }, | |||
2187 | { ISD::ROTR, MVT::v8i64, 1 }, | |||
2188 | { ISD::ROTR, MVT::v4i64, 1 }, | |||
2189 | { ISD::ROTR, MVT::v2i64, 1 }, | |||
2190 | { ISD::ROTR, MVT::v16i32, 1 }, | |||
2191 | { ISD::ROTR, MVT::v8i32, 1 }, | |||
2192 | { ISD::ROTR, MVT::v4i32, 1 } | |||
2193 | }; | |||
2194 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) | |||
2195 | static const CostTblEntry XOPCostTbl[] = { | |||
2196 | { ISD::ROTL, MVT::v4i64, 4 }, | |||
2197 | { ISD::ROTL, MVT::v8i32, 4 }, | |||
2198 | { ISD::ROTL, MVT::v16i16, 4 }, | |||
2199 | { ISD::ROTL, MVT::v32i8, 4 }, | |||
2200 | { ISD::ROTL, MVT::v2i64, 1 }, | |||
2201 | { ISD::ROTL, MVT::v4i32, 1 }, | |||
2202 | { ISD::ROTL, MVT::v8i16, 1 }, | |||
2203 | { ISD::ROTL, MVT::v16i8, 1 }, | |||
2204 | { ISD::ROTR, MVT::v4i64, 6 }, | |||
2205 | { ISD::ROTR, MVT::v8i32, 6 }, | |||
2206 | { ISD::ROTR, MVT::v16i16, 6 }, | |||
2207 | { ISD::ROTR, MVT::v32i8, 6 }, | |||
2208 | { ISD::ROTR, MVT::v2i64, 2 }, | |||
2209 | { ISD::ROTR, MVT::v4i32, 2 }, | |||
2210 | { ISD::ROTR, MVT::v8i16, 2 }, | |||
2211 | { ISD::ROTR, MVT::v16i8, 2 } | |||
2212 | }; | |||
2213 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | |||
2214 | { ISD::ROTL, MVT::i64, 1 }, | |||
2215 | { ISD::ROTR, MVT::i64, 1 }, | |||
2216 | { ISD::FSHL, MVT::i64, 4 } | |||
2217 | }; | |||
2218 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
2219 | { ISD::ROTL, MVT::i32, 1 }, | |||
2220 | { ISD::ROTL, MVT::i16, 1 }, | |||
2221 | { ISD::ROTL, MVT::i8, 1 }, | |||
2222 | { ISD::ROTR, MVT::i32, 1 }, | |||
2223 | { ISD::ROTR, MVT::i16, 1 }, | |||
2224 | { ISD::ROTR, MVT::i8, 1 }, | |||
2225 | { ISD::FSHL, MVT::i32, 4 }, | |||
2226 | { ISD::FSHL, MVT::i16, 4 }, | |||
2227 | { ISD::FSHL, MVT::i8, 4 } | |||
2228 | }; | |||
2229 | ||||
2230 | unsigned ISD = ISD::DELETED_NODE; | |||
2231 | switch (IID) { | |||
2232 | default: | |||
2233 | break; | |||
2234 | case Intrinsic::fshl: | |||
2235 | ISD = ISD::FSHL; | |||
2236 | if (Args[0] == Args[1]) | |||
2237 | ISD = ISD::ROTL; | |||
2238 | break; | |||
2239 | case Intrinsic::fshr: | |||
2240 | // FSHR has the same costs, so don't duplicate. | |||
2241 | ISD = ISD::FSHL; | |||
2242 | if (Args[0] == Args[1]) | |||
2243 | ISD = ISD::ROTR; | |||
2244 | break; | |||
2245 | } | |||
2246 | ||||
2247 | if (ISD != ISD::DELETED_NODE) { | |||
2248 | // Legalize the type. | |||
2249 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
2250 | MVT MTy = LT.second; | |||
2251 | ||||
2252 | // Attempt to lookup cost. | |||
2253 | if (ST->hasAVX512()) | |||
2254 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
2255 | return LT.first * Entry->Cost; | |||
2256 | ||||
2257 | if (ST->hasXOP()) | |||
2258 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
2259 | return LT.first * Entry->Cost; | |||
2260 | ||||
2261 | if (ST->is64Bit()) | |||
2262 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
2263 | return LT.first * Entry->Cost; | |||
2264 | ||||
2265 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
2266 | return LT.first * Entry->Cost; | |||
2267 | } | |||
2268 | ||||
2269 | return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); | |||
2270 | } | |||
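// Illustrative IR example: a funnel shift whose data operands coincide is a
// rotate, so
//   %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %s)
// maps to ISD::ROTL and costs 1 per the X86 table above, while a genuine
// two-input fshl on i32 costs 4.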
2271 | ||||
2272 | int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { | |||
2273 | assert(Val->isVectorTy() && "This must be a vector type"); | |||
2274 | ||||
2275 | Type *ScalarType = Val->getScalarType(); | |||
2276 | ||||
2277 | if (Index != -1U) { | |||
2278 | // Legalize the type. | |||
2279 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); | |||
2280 | ||||
2281 | // This type is legalized to a scalar type. | |||
2282 | if (!LT.second.isVector()) | |||
2283 | return 0; | |||
2284 | ||||
2285 | // The type may be split. Normalize the index to the new type. | |||
2286 | unsigned Width = LT.second.getVectorNumElements(); | |||
2287 | Index = Index % Width; | |||
2288 | ||||
2289 | // Floating point scalars are already located in index #0. | |||
2290 | if (ScalarType->isFloatingPointTy() && Index == 0) | |||
2291 | return 0; | |||
2292 | } | |||
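// Worked example of the normalization above: on SSE2 a <8 x i64> vector
// legalizes to four v2i64 parts (Width == 2), so extracting element 5 is
// costed as extracting element 5 % 2 == 1 from one of the split parts.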
2293 | ||||
2294 | // Add to the base cost if we know that the extracted element of a vector is | |||
2295 | // destined to be moved to and used in the integer register file. | |||
2296 | int RegisterFileMoveCost = 0; | |||
2297 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | |||
2298 | RegisterFileMoveCost = 1; | |||
2299 | ||||
2300 | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; | |||
2301 | } | |||
2302 | ||||
2303 | int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, | |||
2304 | unsigned AddressSpace, const Instruction *I) { | |||
2305 | // Handle non-power-of-two vectors such as <3 x float> | |||
2306 | if (VectorType *VTy = dyn_cast<VectorType>(Src)) { | |||
2307 | unsigned NumElem = VTy->getVectorNumElements(); | |||
2308 | ||||
2309 | // Handle a few common cases: | |||
2310 | // <3 x float> | |||
2311 | if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) | |||
2312 | // Cost = 64 bit store + extract + 32 bit store. | |||
2313 | return 3; | |||
2314 | ||||
2315 | // <3 x double> | |||
2316 | if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) | |||
2317 | // Cost = 128 bit store + unpack + 64 bit store. | |||
2318 | return 3; | |||
2319 | ||||
2320 | // Assume that all other non-power-of-two numbers are scalarized. | |||
2321 | if (!isPowerOf2_32(NumElem)) { | |||
2322 | int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, | |||
2323 | AddressSpace); | |||
2324 | int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, | |||
2325 | Opcode == Instruction::Store); | |||
2326 | return NumElem * Cost + SplitCost; | |||
2327 | } | |||
2328 | } | |||
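// For example, a load of <5 x i32> is costed as five scalar i32 loads plus
// the scalarization overhead of inserting each element into the result
// vector.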
2329 | ||||
2330 | // Legalize the type. | |||
2331 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | |||
2332 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && | |||
2333 | "Invalid Opcode"); | |||
2334 | ||||
2335 | // Each load/store unit costs 1. | |||
2336 | int Cost = LT.first * 1; | |||
2337 | ||||
2338 | // This isn't exactly right. We're using slow unaligned 32-byte accesses as a | |||
2339 | // proxy for a double-pumped AVX memory interface such as on Sandy Bridge. | |||
2340 | if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow()) | |||
2341 | Cost *= 2; | |||
2342 | ||||
2343 | return Cost; | |||
2344 | } | |||
2345 | ||||
2346 | int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, | |||
2347 | unsigned Alignment, | |||
2348 | unsigned AddressSpace) { | |||
2349 | bool IsLoad = (Instruction::Load == Opcode); | |||
2350 | bool IsStore = (Instruction::Store == Opcode); | |||
2351 | ||||
2352 | VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); | |||
2353 | if (!SrcVTy) | |||
2354 | // To compute the scalar cost, take the regular memory-op cost, without a mask. | |||
2355 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); | |||
2356 | ||||
2357 | unsigned NumElem = SrcVTy->getVectorNumElements(); | |||
2358 | VectorType *MaskTy = | |||
2359 | VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); | |||
2360 | if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) || | |||
2361 | (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { | |||
2362 | // Scalarization | |||
2363 | int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); | |||
2364 | int ScalarCompareCost = getCmpSelInstrCost( | |||
2365 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr); | |||
2366 | int BranchCost = getCFInstrCost(Instruction::Br); | |||
2367 | int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); | |||
2368 | ||||
2369 | int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore); | |||
2370 | int MemopCost = | |||
2371 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
2372 | Alignment, AddressSpace); | |||
2373 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; | |||
2374 | } | |||
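// As a rough sketch of the formula above, a masked load of <4 x i32> that
// must be scalarized costs: 4 scalar loads, plus the overhead of extracting
// the 4 mask bits, plus 4 * (icmp + branch) to test each bit, plus the
// overhead of inserting the 4 loaded elements into the result vector.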
2375 | ||||
2376 | // Legalize the type. | |||
2377 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); | |||
2378 | auto VT = TLI->getValueType(DL, SrcVTy); | |||
2379 | int Cost = 0; | |||
2380 | if (VT.isSimple() && LT.second != VT.getSimpleVT() && | |||
2381 | LT.second.getVectorNumElements() == NumElem) | |||
2382 | // Promotion requires expand/truncate for data and a shuffle for mask. | |||
2383 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) + | |||
2384 | getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr); | |||
2385 | ||||
2386 | else if (LT.second.getVectorNumElements() > NumElem) { | |||
2387 | VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), | |||
2388 | LT.second.getVectorNumElements()); | |||
2389 | // Expanding requires filling the mask with zeroes. | |||
2390 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); | |||
2391 | } | |||
2392 | ||||
2393 | // Pre-AVX512: each maskmov load costs ~2 and each maskmov store costs ~8. | |||
2394 | if (!ST->hasAVX512()) | |||
2395 | return Cost + LT.first * (IsLoad ? 2 : 8); | |||
2396 | ||||
2397 | // AVX-512 masked load/store is cheaper. | |||
2398 | return Cost + LT.first; | |||
2399 | } | |||
2400 | ||||
2401 | int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, | |||
2402 | const SCEV *Ptr) { | |||
2403 | // Address computations in vectorized code with non-consecutive addresses will | |||
2404 | // likely result in more instructions compared to scalar code where the | |||
2405 | // computation can more often be folded into the addressing mode. The resulting | |||
2406 | // extra micro-ops can significantly decrease throughput. | |||
2407 | const unsigned NumVectorInstToHideOverhead = 10; | |||
2408 | ||||
2409 | // The cost of computing a strided access is hidden by X86's addressing | |||
2410 | // modes, regardless of the stride value. We don't believe there is a | |||
2411 | // difference between constant strided access in general and a constant | |||
2412 | // stride value that is less than or equal to 64. | |||
2413 | // Even in the case of a (loop-invariant) stride whose value is not known | |||
2414 | // at compile time, the address computation will not incur more than one | |||
2415 | // extra ADD instruction. | |||
2416 | if (Ty->isVectorTy() && SE) { | |||
2417 | if (!BaseT::isStridedAccess(Ptr)) | |||
2418 | return NumVectorInstToHideOverhead; | |||
2419 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | |||
2420 | return 1; | |||
2421 | } | |||
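// For example, a vectorized access whose pointers follow no common stride
// pays the full overhead of 10, while one with a loop-invariant stride that
// is unknown at compile time pays only the single extra ADD (cost 1).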
2422 | ||||
2423 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | |||
2424 | } | |||
2425 | ||||
2426 | int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, | |||
2427 | bool IsPairwise) { | |||
2428 | ||||
2429 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
2430 | ||||
2431 | MVT MTy = LT.second; | |||
2432 | ||||
2433 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
2434 | assert(ISD && "Invalid opcode")((ISD && "Invalid opcode") ? static_cast<void> ( 0) : __assert_fail ("ISD && \"Invalid opcode\"", "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/X86/X86TargetTransformInfo.cpp" , 2434, __PRETTY_FUNCTION__)); | |||
2435 | ||||
2436 | // We use the Intel Architecture Code Analyzer (IACA) to measure the | |||
2437 | // throughput and use it as the cost. | |||
2438 | ||||
2439 | static const CostTblEntry SSE42CostTblPairWise[] = { | |||
2440 | { ISD::FADD, MVT::v2f64, 2 }, | |||
2441 | { ISD::FADD, MVT::v4f32, 4 }, | |||
2442 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". | |||
2443 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". | |||
2444 | { ISD::ADD, MVT::v8i16, 5 }, | |||
2445 | }; | |||
2446 | ||||
2447 | static const CostTblEntry AVX1CostTblPairWise[] = { | |||
2448 | { ISD::FADD, MVT::v4f32, 4 }, | |||
2449 | { ISD::FADD, MVT::v4f64, 5 }, | |||
2450 | { ISD::FADD, MVT::v8f32, 7 }, | |||
2451 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". | |||
2452 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". | |||
2453 | { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". | |||
2454 | { ISD::ADD, MVT::v8i16, 5 }, | |||
2455 | { ISD::ADD, MVT::v8i32, 5 }, | |||
2456 | }; | |||
2457 | ||||
2458 | static const CostTblEntry SSE42CostTblNoPairWise[] = { | |||
2459 | { ISD::FADD, MVT::v2f64, 2 }, | |||
2460 | { ISD::FADD, MVT::v4f32, 4 }, | |||
2461 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". | |||
2462 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". | |||
2463 | { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". | |||
2464 | }; | |||
2465 | ||||
2466 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | |||
2467 | { ISD::FADD, MVT::v4f32, 3 }, | |||
2468 | { ISD::FADD, MVT::v4f64, 3 }, | |||
2469 | { ISD::FADD, MVT::v8f32, 4 }, | |||
2470 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". | |||
2471 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". | |||
2472 | { ISD::ADD, MVT::v4i64, 3 }, | |||
2473 | { ISD::ADD, MVT::v8i16, 4 }, | |||
2474 | { ISD::ADD, MVT::v8i32, 5 }, | |||
2475 | }; | |||
2476 | ||||
2477 | if (IsPairwise) { | |||
2478 | if (ST->hasAVX()) | |||
2479 | if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) | |||
2480 | return LT.first * Entry->Cost; | |||
2481 | ||||
2482 | if (ST->hasSSE42()) | |||
2483 | if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) | |||
2484 | return LT.first * Entry->Cost; | |||
2485 | } else { | |||
2486 | if (ST->hasAVX()) | |||
2487 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
2488 | return LT.first * Entry->Cost; | |||
2489 | ||||
2490 | if (ST->hasSSE42()) | |||
2491 | if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) | |||
2492 | return LT.first * Entry->Cost; | |||
2493 | } | |||
2494 | ||||
2495 | static const CostTblEntry AVX2BoolReduction[] = { | |||
2496 | { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
2497 | { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
2498 | { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
2499 | { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
2500 | }; | |||
2501 | ||||
2502 | static const CostTblEntry AVX1BoolReduction[] = { | |||
2503 | { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
2504 | { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
2505 | { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
2506 | { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
2507 | { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
2508 | { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
2509 | { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
2510 | { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
2511 | }; | |||
2512 | ||||
2513 | static const CostTblEntry SSE2BoolReduction[] = { | |||
2514 | { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp | |||
2515 | { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp | |||
2516 | { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
2517 | { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
2518 | { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp | |||
2519 | { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp | |||
2520 | { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
2521 | { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
2522 | }; | |||
2523 | ||||
2524 | // Handle bool allof/anyof patterns. | |||
2525 | if (ValTy->getVectorElementType()->isIntegerTy(1)) { | |||
2526 | if (ST->hasAVX2()) | |||
2527 | if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) | |||
2528 | return LT.first * Entry->Cost; | |||
2529 | if (ST->hasAVX()) | |||
2530 | if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) | |||
2531 | return LT.first * Entry->Cost; | |||
2532 | if (ST->hasSSE2()) | |||
2533 | if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) | |||
2534 | return LT.first * Entry->Cost; | |||
2535 | } | |||
2536 | ||||
2537 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); | |||
2538 | } | |||
2539 | ||||
2540 | int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, | |||
2541 | bool IsPairwise, bool IsUnsigned) { | |||
2542 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
2543 | ||||
2544 | MVT MTy = LT.second; | |||
2545 | ||||
2546 | int ISD; | |||
2547 | if (ValTy->isIntOrIntVectorTy()) { | |||
2548 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | |||
2549 | } else { | |||
2550 | assert(ValTy->isFPOrFPVectorTy() && | |||
2551 | "Expected floating point or integer vector type."); | |||
2552 | ISD = ISD::FMINNUM; | |||
2553 | } | |||
2554 | ||||
2555 | // We use the Intel Architecture Code Analyzer (IACA) to measure the | |||
2556 | // throughput and use it as the cost. | |||
2557 | ||||
2558 | static const CostTblEntry SSE1CostTblPairWise[] = { | |||
2559 | {ISD::FMINNUM, MVT::v4f32, 4}, | |||
2560 | }; | |||
2561 | ||||
2562 | static const CostTblEntry SSE2CostTblPairWise[] = { | |||
2563 | {ISD::FMINNUM, MVT::v2f64, 3}, | |||
2564 | {ISD::SMIN, MVT::v2i64, 6}, | |||
2565 | {ISD::UMIN, MVT::v2i64, 8}, | |||
2566 | {ISD::SMIN, MVT::v4i32, 6}, | |||
2567 | {ISD::UMIN, MVT::v4i32, 8}, | |||
2568 | {ISD::SMIN, MVT::v8i16, 4}, | |||
2569 | {ISD::UMIN, MVT::v8i16, 6}, | |||
2570 | {ISD::SMIN, MVT::v16i8, 8}, | |||
2571 | {ISD::UMIN, MVT::v16i8, 6}, | |||
2572 | }; | |||
2573 | ||||
2574 | static const CostTblEntry SSE41CostTblPairWise[] = { | |||
2575 | {ISD::FMINNUM, MVT::v4f32, 2}, | |||
2576 | {ISD::SMIN, MVT::v2i64, 9}, | |||
2577 | {ISD::UMIN, MVT::v2i64, 10}, | |||
2578 | {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" | |||
2579 | {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" | |||
2580 | {ISD::SMIN, MVT::v8i16, 2}, | |||
2581 | {ISD::UMIN, MVT::v8i16, 2}, | |||
2582 | {ISD::SMIN, MVT::v16i8, 3}, | |||
2583 | {ISD::UMIN, MVT::v16i8, 3}, | |||
2584 | }; | |||
2585 | ||||
2586 | static const CostTblEntry SSE42CostTblPairWise[] = { | |||
2587 | {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" | |||
2588 | {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" | |||
2589 | }; | |||
2590 | ||||
2591 | static const CostTblEntry AVX1CostTblPairWise[] = { | |||
2592 | {ISD::FMINNUM, MVT::v4f32, 1}, | |||
2593 | {ISD::FMINNUM, MVT::v4f64, 1}, | |||
2594 | {ISD::FMINNUM, MVT::v8f32, 2}, | |||
2595 | {ISD::SMIN, MVT::v2i64, 3}, | |||
2596 | {ISD::UMIN, MVT::v2i64, 3}, | |||
2597 | {ISD::SMIN, MVT::v4i32, 1}, | |||
2598 | {ISD::UMIN, MVT::v4i32, 1}, | |||
2599 | {ISD::SMIN, MVT::v8i16, 1}, | |||
2600 | {ISD::UMIN, MVT::v8i16, 1}, | |||
2601 | {ISD::SMIN, MVT::v16i8, 2}, | |||
2602 | {ISD::UMIN, MVT::v16i8, 2}, | |||
2603 | {ISD::SMIN, MVT::v4i64, 7}, | |||
2604 | {ISD::UMIN, MVT::v4i64, 7}, | |||
2605 | {ISD::SMIN, MVT::v8i32, 3}, | |||
2606 | {ISD::UMIN, MVT::v8i32, 3}, | |||
2607 | {ISD::SMIN, MVT::v16i16, 3}, | |||
2608 | {ISD::UMIN, MVT::v16i16, 3}, | |||
2609 | {ISD::SMIN, MVT::v32i8, 3}, | |||
2610 | {ISD::UMIN, MVT::v32i8, 3}, | |||
2611 | }; | |||
2612 | ||||
2613 | static const CostTblEntry AVX2CostTblPairWise[] = { | |||
2614 | {ISD::SMIN, MVT::v4i64, 2}, | |||
2615 | {ISD::UMIN, MVT::v4i64, 2}, | |||
2616 | {ISD::SMIN, MVT::v8i32, 1}, | |||
2617 | {ISD::UMIN, MVT::v8i32, 1}, | |||
2618 | {ISD::SMIN, MVT::v16i16, 1}, | |||
2619 | {ISD::UMIN, MVT::v16i16, 1}, | |||
2620 | {ISD::SMIN, MVT::v32i8, 2}, | |||
2621 | {ISD::UMIN, MVT::v32i8, 2}, | |||
2622 | }; | |||
2623 | ||||
2624 | static const CostTblEntry AVX512CostTblPairWise[] = { | |||
2625 | {ISD::FMINNUM, MVT::v8f64, 1}, | |||
2626 | {ISD::FMINNUM, MVT::v16f32, 2}, | |||
2627 | {ISD::SMIN, MVT::v8i64, 2}, | |||
2628 | {ISD::UMIN, MVT::v8i64, 2}, | |||
2629 | {ISD::SMIN, MVT::v16i32, 1}, | |||
2630 | {ISD::UMIN, MVT::v16i32, 1}, | |||
2631 | }; | |||
2632 | ||||
2633 | static const CostTblEntry SSE1CostTblNoPairWise[] = { | |||
2634 | {ISD::FMINNUM, MVT::v4f32, 4}, | |||
2635 | }; | |||
2636 | ||||
2637 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | |||
2638 | {ISD::FMINNUM, MVT::v2f64, 3}, | |||
2639 | {ISD::SMIN, MVT::v2i64, 6}, | |||
2640 | {ISD::UMIN, MVT::v2i64, 8}, | |||
2641 | {ISD::SMIN, MVT::v4i32, 6}, | |||
2642 | {ISD::UMIN, MVT::v4i32, 8}, | |||
2643 | {ISD::SMIN, MVT::v8i16, 4}, | |||
2644 | {ISD::UMIN, MVT::v8i16, 6}, | |||
2645 | {ISD::SMIN, MVT::v16i8, 8}, | |||
2646 | {ISD::UMIN, MVT::v16i8, 6}, | |||
2647 | }; | |||
2648 | ||||
2649 | static const CostTblEntry SSE41CostTblNoPairWise[] = { | |||
2650 | {ISD::FMINNUM, MVT::v4f32, 3}, | |||
2651 | {ISD::SMIN, MVT::v2i64, 9}, | |||
2652 | {ISD::UMIN, MVT::v2i64, 11}, | |||
2653 | {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" | |||
2654 | {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" | |||
2655 | {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" | |||
2656 | {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" | |||
2657 | {ISD::SMIN, MVT::v16i8, 3}, | |||
2658 | {ISD::UMIN, MVT::v16i8, 3}, | |||
2659 | }; | |||
2660 | ||||
2661 | static const CostTblEntry SSE42CostTblNoPairWise[] = { | |||
2662 | {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" | |||
2663 | {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" | |||
2664 | }; | |||
2665 | ||||
2666 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | |||
2667 | {ISD::FMINNUM, MVT::v4f32, 1}, | |||
2668 | {ISD::FMINNUM, MVT::v4f64, 1}, | |||
2669 | {ISD::FMINNUM, MVT::v8f32, 1}, | |||
2670 | {ISD::SMIN, MVT::v2i64, 3}, | |||
2671 | {ISD::UMIN, MVT::v2i64, 3}, | |||
2672 | {ISD::SMIN, MVT::v4i32, 1}, | |||
2673 | {ISD::UMIN, MVT::v4i32, 1}, | |||
2674 | {ISD::SMIN, MVT::v8i16, 1}, | |||
2675 | {ISD::UMIN, MVT::v8i16, 1}, | |||
2676 | {ISD::SMIN, MVT::v16i8, 2}, | |||
2677 | {ISD::UMIN, MVT::v16i8, 2}, | |||
2678 | {ISD::SMIN, MVT::v4i64, 7}, | |||
2679 | {ISD::UMIN, MVT::v4i64, 7}, | |||
2680 | {ISD::SMIN, MVT::v8i32, 2}, | |||
2681 | {ISD::UMIN, MVT::v8i32, 2}, | |||
2682 | {ISD::SMIN, MVT::v16i16, 2}, | |||
2683 | {ISD::UMIN, MVT::v16i16, 2}, | |||
2684 | {ISD::SMIN, MVT::v32i8, 2}, | |||
2685 | {ISD::UMIN, MVT::v32i8, 2}, | |||
2686 | }; | |||
2687 | ||||
2688 | static const CostTblEntry AVX2CostTblNoPairWise[] = { | |||
2689 | {ISD::SMIN, MVT::v4i64, 1}, | |||
2690 | {ISD::UMIN, MVT::v4i64, 1}, | |||
2691 | {ISD::SMIN, MVT::v8i32, 1}, | |||
2692 | {ISD::UMIN, MVT::v8i32, 1}, | |||
2693 | {ISD::SMIN, MVT::v16i16, 1}, | |||
2694 | {ISD::UMIN, MVT::v16i16, 1}, | |||
2695 | {ISD::SMIN, MVT::v32i8, 1}, | |||
2696 | {ISD::UMIN, MVT::v32i8, 1}, | |||
2697 | }; | |||
2698 | ||||
2699 | static const CostTblEntry AVX512CostTblNoPairWise[] = { | |||
2700 | {ISD::FMINNUM, MVT::v8f64, 1}, | |||
2701 | {ISD::FMINNUM, MVT::v16f32, 2}, | |||
2702 | {ISD::SMIN, MVT::v8i64, 1}, | |||
2703 | {ISD::UMIN, MVT::v8i64, 1}, | |||
2704 | {ISD::SMIN, MVT::v16i32, 1}, | |||
2705 | {ISD::UMIN, MVT::v16i32, 1}, | |||
2706 | }; | |||
2707 | ||||
2708 | if (IsPairwise) { | |||
2709 | if (ST->hasAVX512()) | |||
2710 | if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) | |||
2711 | return LT.first * Entry->Cost; | |||
2712 | ||||
2713 | if (ST->hasAVX2()) | |||
2714 | if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) | |||
2715 | return LT.first * Entry->Cost; | |||
2716 | ||||
2717 | if (ST->hasAVX()) | |||
2718 | if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) | |||
2719 | return LT.first * Entry->Cost; | |||
2720 | ||||
2721 | if (ST->hasSSE42()) | |||
2722 | if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) | |||
2723 | return LT.first * Entry->Cost; | |||
2724 | ||||
2725 | if (ST->hasSSE41()) | |||
2726 | if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy)) | |||
2727 | return LT.first * Entry->Cost; | |||
2728 | ||||
2729 | if (ST->hasSSE2()) | |||
2730 | if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) | |||
2731 | return LT.first * Entry->Cost; | |||
2732 | ||||
2733 | if (ST->hasSSE1()) | |||
2734 | if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy)) | |||
2735 | return LT.first * Entry->Cost; | |||
2736 | } else { | |||
2737 | if (ST->hasAVX512()) | |||
2738 | if (const auto *Entry = | |||
2739 | CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) | |||
2740 | return LT.first * Entry->Cost; | |||
2741 | ||||
2742 | if (ST->hasAVX2()) | |||
2743 | if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) | |||
2744 | return LT.first * Entry->Cost; | |||
2745 | ||||
2746 | if (ST->hasAVX()) | |||
2747 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
2748 | return LT.first * Entry->Cost; | |||
2749 | ||||
2750 | if (ST->hasSSE42()) | |||
2751 | if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) | |||
2752 | return LT.first * Entry->Cost; | |||
2753 | ||||
2754 | if (ST->hasSSE41()) | |||
2755 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | |||
2756 | return LT.first * Entry->Cost; | |||
2757 | ||||
2758 | if (ST->hasSSE2()) | |||
2759 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
2760 | return LT.first * Entry->Cost; | |||
2761 | ||||
2762 | if (ST->hasSSE1()) | |||
2763 | if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy)) | |||
2764 | return LT.first * Entry->Cost; | |||
2765 | } | |||
2766 | ||||
2767 | return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); | |||
2768 | } | |||
2769 | ||||
2770 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
2771 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
2772 | /// is valid to return a cost of ZERO. | |||
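/// For example: 0 is free, a value such as 42 that fits in a sign-extended
/// 32-bit immediate costs TCC_Basic, and a value needing all 64 bits
/// (typically materialized with movabsq) costs 2 * TCC_Basic.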
2773 | int X86TTIImpl::getIntImmCost(int64_t Val) { | |||
2774 | if (Val == 0) | |||
2775 | return TTI::TCC_Free; | |||
2776 | ||||
2777 | if (isInt<32>(Val)) | |||
2778 | return TTI::TCC_Basic; | |||
2779 | ||||
2780 | return 2 * TTI::TCC_Basic; | |||
2781 | } | |||
2782 | ||||
2783 | int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { | |||
2784 | assert(Ty->isIntegerTy()); | |||
2785 | ||||
2786 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
2787 | if (BitSize == 0) | |||
2788 | return ~0U; | |||
2789 | ||||
2790 | // Never hoist constants larger than 128 bits, because this might lead to | |||
2791 | // incorrect code generation or assertions in codegen. | |||
2792 | // FIXME: Create a cost model for types larger than i128 once the codegen | |||
2793 | // issues have been fixed. | |||
2794 | if (BitSize > 128) | |||
2795 | return TTI::TCC_Free; | |||
2796 | ||||
2797 | if (Imm == 0) | |||
2798 | return TTI::TCC_Free; | |||
2799 | ||||
2800 | // Sign-extend all constants to a multiple of 64 bits. | |||
2801 | APInt ImmVal = Imm; | |||
2802 | if (BitSize % 64 != 0) | |||
2803 | ImmVal = Imm.sext(alignTo(BitSize, 64)); | |||
2804 | ||||
2805 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
2806 | // chunk. | |||
2807 | int Cost = 0; | |||
2808 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
2809 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
2810 | int64_t Val = Tmp.getSExtValue(); | |||
2811 | Cost += getIntImmCost(Val); | |||
2812 | } | |||
2813 | // We need at least one instruction to materialize the constant. | |||
2814 | return std::max(1, Cost); | |||
2815 | } | |||
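// A worked example of the chunking above: a 128-bit immediate whose low
// 64-bit chunk is zero and whose high chunk fits in 32 bits is costed as
// 0 + TCC_Basic, and the total is clamped to a minimum of 1.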
2816 | ||||
2817 | int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, | |||
2818 | Type *Ty) { | |||
2819 | assert(Ty->isIntegerTy()); | |||
2820 | ||||
2821 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
2822 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
2823 | // here, so that constant hoisting will ignore this constant. | |||
2824 | if (BitSize == 0) | |||
2825 | return TTI::TCC_Free; | |||
2826 | ||||
2827 | unsigned ImmIdx = ~0U; | |||
2828 | switch (Opcode) { | |||
2829 | default: | |||
2830 | return TTI::TCC_Free; | |||
2831 | case Instruction::GetElementPtr: | |||
2832 | // Always hoist the base address of a GetElementPtr. This prevents the | |||
2833 | // creation of new constants for every base constant that gets constant | |||
2834 | // folded with the offset. | |||
2835 | if (Idx == 0) | |||
2836 | return 2 * TTI::TCC_Basic; | |||
2837 | return TTI::TCC_Free; | |||
2838 | case Instruction::Store: | |||
2839 | ImmIdx = 0; | |||
2840 | break; | |||
2841 | case Instruction::ICmp: | |||
2842 | // This is an imperfect hack to prevent constant hoisting of | |||
2843 | // compares that might be trying to check if a 64-bit value fits in | |||
2844 | // 32-bits. The backend can optimize these cases using a right shift by 32. | |||
2845 | // Ideally we would check the compare predicate here. There are also other | |||
2846 | // similar immediates the backend can use shifts for. | |||
2847 | if (Idx == 1 && Imm.getBitWidth() == 64) { | |||
2848 | uint64_t ImmVal = Imm.getZExtValue(); | |||
2849 | if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) | |||
2850 | return TTI::TCC_Free; | |||
2851 | } | |||
2852 | ImmIdx = 1; | |||
2853 | break; | |||
2854 | case Instruction::And: | |||
2855 | // We support 64-bit ANDs with immediates that have 32 bits of leading zeroes | |||
2856 | // by using a 32-bit operation with implicit zero extension. Detect such | |||
2857 | // immediates here as the normal path expects bit 31 to be sign extended. | |||
2858 | if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) | |||
2859 | return TTI::TCC_Free; | |||
2860 | ImmIdx = 1; | |||
2861 | break; | |||
2862 | case Instruction::Add: | |||
2863 | case Instruction::Sub: | |||
2864 | // For add/sub, we can use the opposite instruction for INT32_MIN. | |||
2865 | if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) | |||
2866 | return TTI::TCC_Free; | |||
2867 | ImmIdx = 1; | |||
2868 | break; | |||
2869 | case Instruction::UDiv: | |||
2870 | case Instruction::SDiv: | |||
2871 | case Instruction::URem: | |||
2872 | case Instruction::SRem: | |||
2873 | // Division by constant is typically expanded later into a different | |||
2874 | // instruction sequence. This completely changes the constants. | |||
2875 | // Report them as "free" to stop ConstantHoist from marking them as opaque. | |||
2876 | return TTI::TCC_Free; | |||
2877 | case Instruction::Mul: | |||
2878 | case Instruction::Or: | |||
2879 | case Instruction::Xor: | |||
2880 | ImmIdx = 1; | |||
2881 | break; | |||
2882 | // Always return TCC_Free for the shift value of a shift instruction. | |||
2883 | case Instruction::Shl: | |||
2884 | case Instruction::LShr: | |||
2885 | case Instruction::AShr: | |||
2886 | if (Idx == 1) | |||
2887 | return TTI::TCC_Free; | |||
2888 | break; | |||
2889 | case Instruction::Trunc: | |||
2890 | case Instruction::ZExt: | |||
2891 | case Instruction::SExt: | |||
2892 | case Instruction::IntToPtr: | |||
2893 | case Instruction::PtrToInt: | |||
2894 | case Instruction::BitCast: | |||
2895 | case Instruction::PHI: | |||
2896 | case Instruction::Call: | |||
2897 | case Instruction::Select: | |||
2898 | case Instruction::Ret: | |||
2899 | case Instruction::Load: | |||
2900 | break; | |||
2901 | } | |||
2902 | ||||
2903 | if (Idx == ImmIdx) { | |||
2904 | int NumConstants = divideCeil(BitSize, 64); | |||
2905 | int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); | |||
2906 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
2907 | ? static_cast<int>(TTI::TCC_Free) | |||
2908 | : Cost; | |||
2909 | } | |||
2910 | ||||
2911 | return X86TTIImpl::getIntImmCost(Imm, Ty); | |||
2912 | } | |||
2913 | ||||
2914 | int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, | |||
2915 | Type *Ty) { | |||
2916 | assert(Ty->isIntegerTy()); | |||
2917 | ||||
2918 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
2919 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
2920 | // here, so that constant hoisting will ignore this constant. | |||
2921 | if (BitSize == 0) | |||
2922 | return TTI::TCC_Free; | |||
2923 | ||||
2924 | switch (IID) { | |||
2925 | default: | |||
2926 | return TTI::TCC_Free; | |||
2927 | case Intrinsic::sadd_with_overflow: | |||
2928 | case Intrinsic::uadd_with_overflow: | |||
2929 | case Intrinsic::ssub_with_overflow: | |||
2930 | case Intrinsic::usub_with_overflow: | |||
2931 | case Intrinsic::smul_with_overflow: | |||
2932 | case Intrinsic::umul_with_overflow: | |||
2933 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) | |||
2934 | return TTI::TCC_Free; | |||
2935 | break; | |||
2936 | case Intrinsic::experimental_stackmap: | |||
2937 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
2938 | return TTI::TCC_Free; | |||
2939 | break; | |||
2940 | case Intrinsic::experimental_patchpoint_void: | |||
2941 | case Intrinsic::experimental_patchpoint_i64: | |||
2942 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
2943 | return TTI::TCC_Free; | |||
2944 | break; | |||
2945 | } | |||
2946 | return X86TTIImpl::getIntImmCost(Imm, Ty); | |||
2947 | } | |||
2948 | ||||
2949 | unsigned X86TTIImpl::getUserCost(const User *U, | |||
2950 | ArrayRef<const Value *> Operands) { | |||
2951 | if (isa<StoreInst>(U)) { | |||
| ||||
2952 | Value *Ptr = U->getOperand(1); | |||
2953 | // A store instruction with index-and-scale addressing costs 2 uops. | |||
2954 | // Check the preceding GEP to identify non-const indices. | |||
2955 | if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) { | |||
2956 | if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) | |||
2957 | return TTI::TCC_Basic * 2; | |||
2958 | } | |||
2959 | return TTI::TCC_Basic; | |||
2960 | } | |||
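// For example, in the store case above, a store whose address comes from
//   getelementptr inbounds i32, i32* %base, i64 %i
// has a non-constant index and therefore needs index+scale addressing,
// costing 2 * TTI::TCC_Basic; a constant-index GEP store costs TCC_Basic.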
2961 | return BaseT::getUserCost(U, Operands); | |||
2962 | } | |||
2963 | ||||
2964 | // Return an average cost of a Gather / Scatter instruction; this may be improved later. | |||
2965 | int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, | |||
2966 | unsigned Alignment, unsigned AddressSpace) { | |||
2967 | ||||
2968 | assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); | |||
2969 | unsigned VF = SrcVTy->getVectorNumElements(); | |||
2970 | ||||
2971 | // Try to reduce the index size from 64 bits (the default for GEP) to 32 | |||
2972 | // bits. This is essential for VF 16. If the index can't be reduced to 32 | |||
2973 | // bits, the operation will use 16 x 64-bit indices, which do not fit in a | |||
2974 | // zmm register and need to be split. Also check that the base pointer is | |||
2975 | // the same for all lanes, and that there's at most one variable index. | |||
2976 | auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { | |||
2977 | unsigned IndexSize = DL.getPointerSizeInBits(); | |||
2978 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); | |||
2979 | if (IndexSize < 64 || !GEP) | |||
2980 | return IndexSize; | |||
2981 | ||||
2982 | unsigned NumOfVarIndices = 0; | |||
2983 | Value *Ptrs = GEP->getPointerOperand(); | |||
2984 | if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) | |||
2985 | return IndexSize; | |||
2986 | for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { | |||
2987 | if (isa<Constant>(GEP->getOperand(i))) | |||
2988 | continue; | |||
2989 | Type *IndxTy = GEP->getOperand(i)->getType(); | |||
2990 | if (IndxTy->isVectorTy()) | |||
2991 | IndxTy = IndxTy->getVectorElementType(); | |||
2992 | if ((IndxTy->getPrimitiveSizeInBits() == 64 && | |||
2993 | !isa<SExtInst>(GEP->getOperand(i))) || | |||
2994 | ++NumOfVarIndices > 1) | |||
2995 | return IndexSize; // 64 | |||
2996 | } | |||
2997 | return (unsigned)32; | |||
2998 | }; | |||
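// For example, a GEP whose only variable vector index is sign-extended from
// i32 reports an index size of 32, which lets a 16-wide AVX-512 gather keep
// all of its indices in a single zmm register.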
2999 | ||||
3000 | ||||
3001 | // Try to reduce IndexSize to 32 bits for VF >= 16. | |||
3002 | // By default the IndexSize is equal to the pointer size. | |||
3003 | unsigned IndexSize = (ST->hasAVX512() && VF >= 16) | |||
3004 | ? getIndexSizeInBits(Ptr, DL) | |||
3005 | : DL.getPointerSizeInBits(); | |||
3006 | ||||
3007 | Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), | |||
3008 | IndexSize), VF); | |||
3009 | std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); | |||
3010 | std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); | |||
3011 | int SplitFactor = std::max(IdxsLT.first, SrcLT.first); | |||
3012 | if (SplitFactor > 1) { | |||
3013 | // Handle splitting of vector of pointers | |||
3014 | Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); | |||
3015 | return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, | |||
3016 | AddressSpace); | |||
3017 | } | |||
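// For example, a gather of <32 x float> on AVX-512 legalizes to two v16f32
// parts (SplitFactor == 2), so it is costed as twice a <16 x float> gather.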
3018 | ||||
3019 | // The gather / scatter cost is given by Intel architects. It is a rough | |||
3020 | // number since we are looking at one instruction at a time. | |||
3021 | const int GSOverhead = (Opcode == Instruction::Load) | |||
3022 | ? ST->getGatherOverhead() | |||
3023 | : ST->getScatterOverhead(); | |||
3024 | return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
3025 | Alignment, AddressSpace); | |||
3026 | } | |||
3027 | ||||
3028 | /// Return the cost of full scalarization of gather / scatter operation. | |||
3029 | /// | |||
3030 | /// Opcode - Load or Store instruction. | |||
3031 | /// SrcVTy - The type of the data vector that should be gathered or scattered. | |||
3032 | /// VariableMask - The mask is non-constant at compile time. | |||
3033 | /// Alignment - Alignment for one element. | |||
3034 | /// AddressSpace - The address space of the pointer(s). | |||
3035 | /// | |||
3036 | int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, | |||
3037 | bool VariableMask, unsigned Alignment, | |||
3038 | unsigned AddressSpace) { | |||
3039 | unsigned VF = SrcVTy->getVectorNumElements(); | |||
3040 | ||||
3041 | int MaskUnpackCost = 0; | |||
3042 | if (VariableMask) { | |||
3043 | VectorType *MaskTy = | |||
3044 | VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); | |||
3045 | MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); | |||
3046 | int ScalarCompareCost = | |||
3047 | getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), | |||
3048 | nullptr); | |||
3049 | int BranchCost = getCFInstrCost(Instruction::Br); | |||
3050 | MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); | |||
3051 | } | |||
3052 | ||||
3053 | // The cost of the scalar loads/stores. | |||
3054 | int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
3055 | Alignment, AddressSpace); | |||
3056 | ||||
3057 | int InsertExtractCost = 0; | |||
3058 | if (Opcode == Instruction::Load) | |||
3059 | for (unsigned i = 0; i < VF; ++i) | |||
3060 | // Add the cost of inserting each scalar load into the vector | |||
3061 | InsertExtractCost += | |||
3062 | getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); | |||
3063 | else | |||
3064 | for (unsigned i = 0; i < VF; ++i) | |||
3065 | // Add the cost of extracting each element out of the data vector | |||
3066 | InsertExtractCost += | |||
3067 | getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); | |||
3068 | ||||
3069 | return MemoryOpCost + MaskUnpackCost + InsertExtractCost; | |||
3070 | } | |||
3071 | ||||
3072 | /// Calculate the cost of Gather / Scatter operation | |||
3073 | int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, | |||
3074 | Value *Ptr, bool VariableMask, | |||
3075 | unsigned Alignment) { | |||
3076 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); | |||
3077 | unsigned VF = SrcVTy->getVectorNumElements(); | |||
3078 | PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); | |||
3079 | if (!PtrTy && Ptr->getType()->isVectorTy()) | |||
3080 | PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); | |||
3081 | assert(PtrTy && "Unexpected type for Ptr argument")((PtrTy && "Unexpected type for Ptr argument") ? static_cast <void> (0) : __assert_fail ("PtrTy && \"Unexpected type for Ptr argument\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/X86/X86TargetTransformInfo.cpp" , 3081, __PRETTY_FUNCTION__)); | |||
3082 | unsigned AddressSpace = PtrTy->getAddressSpace(); | |||
3083 | ||||
3084 | bool Scalarize = false; | |||
3085 | if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || | |||
3086 | (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) | |||
3087 | Scalarize = true; | |||
3088 | // A gather/scatter on a 2-element vector is not profitable on KNL / SKX. | |||
3089 | // A 4-element gather/scatter instruction does not exist on KNL. | |||
3090 | // We can extend it to 8 elements, but zeroing the upper bits of | |||
3091 | // the mask vector will add more instructions. Right now we give the scalar | |||
3092 | // cost of vector-4 for KNL. TODO: Check whether the gather/scatter | |||
3093 | // instruction is better in the VariableMask case. | |||
3094 | if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) | |||
3095 | Scalarize = true; | |||
3096 | ||||
3097 | if (Scalarize) | |||
3098 | return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, | |||
3099 | AddressSpace); | |||
3100 | ||||
3101 | return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); | |||
3102 | } | |||
3103 | ||||
3104 | bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, | |||
3105 | TargetTransformInfo::LSRCost &C2) { | |||
3106 | // The X86-specific ordering here is "instruction count has first priority". | |||
3107 | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, | |||
3108 | C1.NumIVMuls, C1.NumBaseAdds, | |||
3109 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | |||
3110 | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, | |||
3111 | C2.NumIVMuls, C2.NumBaseAdds, | |||
3112 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | |||
3113 | } | |||
3114 | ||||
3115 | bool X86TTIImpl::canMacroFuseCmp() { | |||
3116 | return ST->hasMacroFusion() || ST->hasBranchFusion(); | |||
3117 | } | |||
3118 | ||||
3119 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { | |||
3120 | if (!ST->hasAVX()) | |||
3121 | return false; | |||
3122 | ||||
3123 | // The backend can't handle a single element vector. | |||
3124 | if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1) | |||
3125 | return false; | |||
3126 | Type *ScalarTy = DataTy->getScalarType(); | |||
3127 | ||||
3128 | if (ScalarTy->isPointerTy()) | |||
3129 | return true; | |||
3130 | ||||
3131 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
3132 | return true; | |||
3133 | ||||
3134 | if (!ScalarTy->isIntegerTy()) | |||
3135 | return false; | |||
3136 | ||||
3137 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
3138 | return IntWidth == 32 || IntWidth == 64 || | |||
3139 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); | |||
3140 | } | |||
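// For example, under the rules above a masked load of <8 x i32> is legal
// given AVX, while legality for <32 x i8> (an i8 element type) additionally
// requires AVX-512 BWI.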
3141 | ||||
3142 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { | |||
3143 | return isLegalMaskedLoad(DataType); | |||
3144 | } | |||
3145 | ||||
3146 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { | |||
3147 | if (!isa<VectorType>(DataTy)) | |||
3148 | return false; | |||
3149 | ||||
3150 | if (!ST->hasAVX512()) | |||
3151 | return false; | |||
3152 | ||||
3153 | // The backend can't handle a single element vector. | |||
3154 | if (DataTy->getVectorNumElements() == 1) | |||
3155 | return false; | |||
3156 | ||||
3157 | Type *ScalarTy = DataTy->getVectorElementType(); | |||
3158 | ||||
3159 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
3160 | return true; | |||
3161 | ||||
3162 | if (!ScalarTy->isIntegerTy()) | |||
3163 | return false; | |||
3164 | ||||
3165 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
3166 | return IntWidth == 32 || IntWidth == 64 || | |||
3167 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); | |||
3168 | } | |||
3169 | ||||
3170 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { | |||
3171 | return isLegalMaskedExpandLoad(DataTy); | |||
3172 | } | |||
3173 | ||||
3174 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { | |||
3175 | // Some CPUs have better gather performance than others. | |||
3176 | // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only | |||
3177 | // enable gather with a -march. | |||
3178 | if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()))) | |||
3179 | return false; | |||
3180 | ||||
3181 | // This function is currently called in two cases: from the Loop Vectorizer | |||
3182 | // and from the Scalarizer. | |||
3183 | // When the Loop Vectorizer asks about legality of the feature, | |||
3184 | // the vectorization factor is not calculated yet. The Loop Vectorizer | |||
3185 | // sends a scalar type and the decision is based on the width of the | |||
3186 | // scalar element. | |||
3187 | // Later on, the cost model will estimate usage of this intrinsic based on | |||
3188 | // the vector type. | |||
3189 | // The Scalarizer asks again about legality. It sends a vector type. | |||
3190 | // In this case we can reject non-power-of-2 vectors. | |||
3191 | // We also reject single element vectors as the type legalizer can't | |||
3192 | // scalarize it. | |||
3193 | if (isa<VectorType>(DataTy)) { | |||
3194 | unsigned NumElts = DataTy->getVectorNumElements(); | |||
3195 | if (NumElts == 1 || !isPowerOf2_32(NumElts)) | |||
3196 | return false; | |||
3197 | } | |||
3198 | Type *ScalarTy = DataTy->getScalarType(); | |||
3199 | if (ScalarTy->isPointerTy()) | |||
3200 | return true; | |||
3201 | ||||
3202 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
3203 | return true; | |||
3204 | ||||
3205 | if (!ScalarTy->isIntegerTy()) | |||
3206 | return false; | |||
3207 | ||||
3208 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
3209 | return IntWidth == 32 || IntWidth == 64; | |||
3210 | } | |||
3211 | ||||
3212 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { | |||
3213 | // AVX2 doesn't support scatter | |||
3214 | if (!ST->hasAVX512()) | |||
3215 | return false; | |||
3216 | return isLegalMaskedGather(DataType); | |||
3217 | } | |||
3218 | ||||
3219 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { | |||
3220 | EVT VT = TLI->getValueType(DL, DataType); | |||
3221 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); | |||
3222 | } | |||
3223 | ||||
3224 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | |||
3225 | return false; | |||
3226 | } | |||
3227 | ||||
3228 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, | |||
3229 | const Function *Callee) const { | |||
3230 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
3231 | ||||
3232 | // Work this as a subsetting of subtarget features. | |||
3233 | const FeatureBitset &CallerBits = | |||
3234 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
3235 | const FeatureBitset &CalleeBits = | |||
3236 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
3237 | ||||
3238 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; | |||
3239 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; | |||
3240 | return (RealCallerBits & RealCalleeBits) == RealCalleeBits; | |||
3241 | } | |||
3242 | ||||
3243 | bool X86TTIImpl::areFunctionArgsABICompatible( | |||
3244 | const Function *Caller, const Function *Callee, | |||
3245 | SmallPtrSetImpl<Argument *> &Args) const { | |||
3246 | if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) | |||
3247 | return false; | |||
3248 | ||||
3249 | // If we get here, we know the target features match. If one function | |||
3250 | // considers 512-bit vectors legal and the other does not, consider them | |||
3251 | // incompatible. | |||
3252 | // FIXME Look at the arguments and only consider 512 bit or larger vectors? | |||
3253 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
3254 | ||||
3255 | return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == | |||
3256 | TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs(); | |||
3257 | } | |||
3258 | ||||
3259 | const X86TTIImpl::TTI::MemCmpExpansionOptions * | |||
3260 | X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { | |||
3261 | // Only enable vector loads for equality comparison. | |||
3262 | // Right now the vector version is not as fast, see #33329. | |||
3263 | static const auto ThreeWayOptions = [this]() { | |||
3264 | TTI::MemCmpExpansionOptions Options; | |||
3265 | if (ST->is64Bit()) { | |||
3266 | Options.LoadSizes.push_back(8); | |||
3267 | } | |||
3268 | Options.LoadSizes.push_back(4); | |||
3269 | Options.LoadSizes.push_back(2); | |||
3270 | Options.LoadSizes.push_back(1); | |||
3271 | return Options; | |||
3272 | }(); | |||
3273 | static const auto EqZeroOptions = [this]() { | |||
3274 | TTI::MemCmpExpansionOptions Options; | |||
3275 | // TODO: enable AVX512 when the DAG is ready. | |||
3276 | // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); | |||
3277 | if (ST->hasAVX2()) Options.LoadSizes.push_back(32); | |||
3278 | if (ST->hasSSE2()) Options.LoadSizes.push_back(16); | |||
3279 | if (ST->is64Bit()) { | |||
3280 | Options.LoadSizes.push_back(8); | |||
3281 | } | |||
3282 | Options.LoadSizes.push_back(4); | |||
3283 | Options.LoadSizes.push_back(2); | |||
3284 | Options.LoadSizes.push_back(1); | |||
3285 | // All GPR and vector loads can be unaligned. SIMD compare requires integer | |||
3286 | // vectors (SSE2/AVX2). | |||
3287 | Options.AllowOverlappingLoads = true; | |||
3288 | return Options; | |||
3289 | }(); | |||
3290 | return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; | |||
3291 | } | |||
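// For example, on a 64-bit SSE2 target an equality ("is zero") memcmp of 16
// bytes can use a single 16-byte vector load per operand, while the
// three-way expansion is limited to GPR-sized loads of 8/4/2/1 bytes.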
3292 | ||||
3293 | bool X86TTIImpl::enableInterleavedAccessVectorization() { | |||
3294 | // TODO: We expect this to be beneficial regardless of arch, | |||
3295 | // but there are currently some unexplained performance artifacts on Atom. | |||
3296 | // As a temporary solution, disable on Atom. | |||
3297 | return !(ST->isAtom()); | |||
3298 | } | |||
3299 | ||||
3300 | // Get estimation for interleaved load/store operations for AVX2. | |||
3301 | // \p Factor is the interleaved-access factor (stride) - number of | |||
3302 | // (interleaved) elements in the group. | |||
3303 | // \p Indices contains the indices for a strided load: when the | |||
3304 | // interleaved load has gaps they indicate which elements are used. | |||
3305 | // If Indices is empty (or if the number of indices is equal to the size | |||
3306 | // of the interleaved-access as given in \p Factor) the access has no gaps. | |||
3307 | // | |||
3308 | // As opposed to AVX-512, AVX2 does not have generic shuffles that allow | |||
3309 | // computing the cost using a generic formula as a function of generic | |||
3310 | // shuffles. We therefore use a lookup table instead, filled according to | |||
3311 | // the instruction sequences that codegen currently generates. | |||
3312 | int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, | |||
3313 | unsigned Factor, | |||
3314 | ArrayRef<unsigned> Indices, | |||
3315 | unsigned Alignment, | |||
3316 | unsigned AddressSpace, | |||
3317 | bool UseMaskForCond, | |||
3318 | bool UseMaskForGaps) { | |||
3319 | ||||
3320 | if (UseMaskForCond || UseMaskForGaps) | |||
3321 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
3322 | Alignment, AddressSpace, | |||
3323 | UseMaskForCond, UseMaskForGaps); | |||
3324 | ||||
3325 | // We currently support only fully-interleaved groups, with no gaps. | |||
3326 | // TODO: Support also strided loads (interleaved-groups with gaps). | |||
3327 | if (Indices.size() && Indices.size() != Factor) | |||
3328 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
3329 | Alignment, AddressSpace); | |||
3330 | ||||
3331 | // VecTy for interleave memop is <VF*Factor x Elt>. | |||
3332 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | |||
3333 | // VecTy = <12 x i32>. | |||
3334 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | |||
3335 | ||||
3336 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case | |||
3337 | // the VF=2, while v2i128 is an unsupported MVT vector type | |||
3338 | // (see MachineValueType.h::getVectorVT()). | |||
3339 | if (!LegalVT.isVector()) | |||
3340 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
3341 | Alignment, AddressSpace); | |||
3342 | ||||
3343 | unsigned VF = VecTy->getVectorNumElements() / Factor; | |||
3344 | Type *ScalarTy = VecTy->getVectorElementType(); | |||
3345 | ||||
3346 | // Calculate the number of memory operations (NumOfMemOps), required | |||
3347 | // for load/store the VecTy. | |||
3348 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); | |||
3349 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
3350 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
3351 | ||||
3352 | // Get the cost of one memory operation. | |||
3353 | Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), | |||
3354 | LegalVT.getVectorNumElements()); | |||
3355 | unsigned MemOpCost = | |||
3356 | getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); | |||
3357 | ||||
3358 | VectorType *VT = VectorType::get(ScalarTy, VF); | |||
3359 | EVT ETy = TLI->getValueType(DL, VT); | |||
3360 | if (!ETy.isSimple()) | |||
3361 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
3362 | Alignment, AddressSpace); | |||
3363 | ||||
3364 | // TODO: Complete for other data-types and strides. | |||
3365 | // Each combination of Stride, ElementTy and VF results in a different | |||
3366 | // sequence; The cost tables are therefore accessed with: | |||
3367 | // Factor (stride) and VectorType=VFxElemType. | |||
3368 | // The Cost accounts only for the shuffle sequence; | |||
3369 | // The cost of the loads/stores is accounted for separately. | |||
3370 | // | |||
3371 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { | |||
3372 | { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64 | |||
3373 | { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64 | |||
3374 | ||||
3375 | { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 | |||
3376 | { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 | |||
3377 | { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 | |||
3378 | { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8 | |||
3379 | { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8 | |||
3380 | { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32 | |||
3381 | ||||
3382 | { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 | |||
3383 | { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 | |||
3384 | { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 | |||
3385 | { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 | |||
3386 | { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8 | |||
3387 | ||||
3388 | { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32 | |||
3389 | }; | |||
3390 | ||||
3391 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { | |||
3392 | { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store) | |||
3393 | { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store) | |||
3394 | ||||
3395 | { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) | |||
3396 | { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) | |||
3397 | { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) | |||
3398 | { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store) | |||
3399 | { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store) | |||
3400 | ||||
3401 | { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) | |||
3402 | { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) | |||
3403 | { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store) | |||
3404 | { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store) | |||
3405 | { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store) | |||
3406 | }; | |||
3407 | ||||
3408 | if (Opcode == Instruction::Load) { | |||
3409 | if (const auto *Entry = | |||
3410 | CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) | |||
3411 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
3412 | } else { | |||
3413 | assert(Opcode == Instruction::Store && | |||
3414 | "Expected Store Instruction at this point"); | |||
3415 | if (const auto *Entry = | |||
3416 | CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) | |||
3417 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
3418 | } | |||
3419 | ||||
3420 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
3421 | Alignment, AddressSpace); | |||
3422 | } | |||
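// Editor note (illustrative, not in the original source): with the tables
// above, a stride-3 deinterleaved load of <16 x i8> groups (Factor = 3,
// ETy = v16i8) on AVX2 is costed as NumOfMemOps * MemOpCost + 11.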
3423 | ||||
3424 | // Get a cost estimate for interleaved load/store operations and strided loads. | |||
3425 | // \p Indices contains the indices for a strided load. | |||
3426 | // \p Factor - the factor of interleaving. | |||
3427 | // AVX-512 provides 3-src shuffles that significantly reduce the cost. | |||
3428 | int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, | |||
3429 | unsigned Factor, | |||
3430 | ArrayRef<unsigned> Indices, | |||
3431 | unsigned Alignment, | |||
3432 | unsigned AddressSpace, | |||
3433 | bool UseMaskForCond, | |||
3434 | bool UseMaskForGaps) { | |||
3435 | ||||
3436 | if (UseMaskForCond || UseMaskForGaps) | |||
3437 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
3438 | Alignment, AddressSpace, | |||
3439 | UseMaskForCond, UseMaskForGaps); | |||
3440 | ||||
3441 | // VecTy for interleave memop is <VF*Factor x Elt>. | |||
3442 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | |||
3443 | // VecTy = <12 x i32>. | |||
3444 | ||||
3445 |   // Calculate the number of memory operations (NumOfMemOps) required | |||
3446 |   // to load/store the VecTy. | |||
3447 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | |||
3448 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); | |||
3449 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
3450 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
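  // Editor note (illustrative, not in the original source): this is a
  // ceiling division. For the example above, VecTy = <12 x i32> stores
  // 48 bytes; if the legal type is v4i32 (16 bytes), then
  // NumOfMemOps = (48 + 16 - 1) / 16 = 3 memory operations.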
3451 | ||||
3452 | // Get the cost of one memory operation. | |||
3453 | Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), | |||
3454 | LegalVT.getVectorNumElements()); | |||
3455 | unsigned MemOpCost = | |||
3456 | getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); | |||
3457 | ||||
3458 | unsigned VF = VecTy->getVectorNumElements() / Factor; | |||
3459 | MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); | |||
3460 | ||||
3461 | if (Opcode == Instruction::Load) { | |||
3462 | // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) | |||
3463 | // contain the cost of the optimized shuffle sequence that the | |||
3464 | // X86InterleavedAccess pass will generate. | |||
3465 |     // The cost of loads and stores is computed separately from the table. | |||
3466 | ||||
3467 |     // X86InterleavedAccess supports only the following interleaved-access groups. | |||
3468 | static const CostTblEntry AVX512InterleavedLoadTbl[] = { | |||
3469 | {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 | |||
3470 | {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 | |||
3471 |       {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8 | |||
3472 | }; | |||
3473 | ||||
3474 | if (const auto *Entry = | |||
3475 | CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) | |||
3476 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
3477 |     // If no entry exists, fall back to the default implementation. | |||
3478 | ||||
3479 |     // The kind of shuffle depends on the number of loaded values. | |||
3480 |     // If we load all of the data in one register, we can use a 1-src shuffle. | |||
3481 |     // Otherwise, we merge 2 sources in each operation. | |||
3482 | TTI::ShuffleKind ShuffleKind = | |||
3483 | (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; | |||
3484 | ||||
3485 | unsigned ShuffleCost = | |||
3486 | getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); | |||
3487 | ||||
3488 | unsigned NumOfLoadsInInterleaveGrp = | |||
3489 | Indices.size() ? Indices.size() : Factor; | |||
3490 | Type *ResultTy = VectorType::get(VecTy->getVectorElementType(), | |||
3491 | VecTy->getVectorNumElements() / Factor); | |||
3492 | unsigned NumOfResults = | |||
3493 | getTLI()->getTypeLegalizationCost(DL, ResultTy).first * | |||
3494 | NumOfLoadsInInterleaveGrp; | |||
3495 | ||||
3496 |     // About half of the loads may be folded into shuffles when we have only | |||
3497 |     // one result. If we have more than one result, we do not fold loads at all. | |||
3498 | unsigned NumOfUnfoldedLoads = | |||
3499 | NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; | |||
3500 | ||||
3501 |     // Get the number of shuffle operations per result. | |||
3502 | unsigned NumOfShufflesPerResult = | |||
3503 | std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); | |||
3504 | ||||
3505 |     // The SK_PermuteTwoSrc shuffle clobbers one of its source operands. | |||
3506 |     // When we have more than one destination, we need additional instructions | |||
3507 |     // to preserve the sources. | |||
3508 | unsigned NumOfMoves = 0; | |||
3509 | if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) | |||
3510 | NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; | |||
3511 | ||||
3512 | int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + | |||
3513 | NumOfUnfoldedLoads * MemOpCost + NumOfMoves; | |||
3514 | ||||
3515 | return Cost; | |||
3516 | } | |||
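  // Editor note (worked example, values assumed): deinterleaving
  // VecTy = <32 x i32> with Factor = 2 and both indices requested on
  // AVX-512 gives LegalVT = v16i32, so NumOfMemOps = 2 and
  // ShuffleKind = SK_PermuteTwoSrc. ResultTy = <16 x i32> legalizes in one
  // step, so NumOfResults = 1 * 2 = 2, NumOfUnfoldedLoads = 2,
  // NumOfShufflesPerResult = max(1, 2 - 1) = 1, NumOfMoves = 2 * 1 / 2 = 1,
  // and Cost = 2 * ShuffleCost + 2 * MemOpCost + 1.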
3517 | ||||
3518 | // Store. | |||
3519 |   assert(Opcode == Instruction::Store && | |||
3520 |          "Expected Store Instruction at this point"); | |||
3521 |   // X86InterleavedAccess supports only the following interleaved-access groups. | |||
3522 | static const CostTblEntry AVX512InterleavedStoreTbl[] = { | |||
3523 | {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) | |||
3524 | {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) | |||
3525 |       {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store) | |||
3526 | ||||
3527 | {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) | |||
3528 | {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) | |||
3529 | {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) | |||
3530 |       {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store) | |||
3531 | }; | |||
3532 | ||||
3533 | if (const auto *Entry = | |||
3534 | CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) | |||
3535 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
3536 |   // If no entry exists, fall back to the default implementation. | |||
3537 | ||||
3538 |   // There are no strided stores at the moment, and a store cannot be | |||
3539 |   // folded into a shuffle. | |||
3540 | unsigned NumOfSources = Factor; // The number of values to be merged. | |||
3541 | unsigned ShuffleCost = | |||
3542 | getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); | |||
3543 | unsigned NumOfShufflesPerStore = NumOfSources - 1; | |||
3544 | ||||
3545 |   // The SK_PermuteTwoSrc shuffle clobbers one of its source operands. | |||
3546 |   // We need additional instructions to preserve the sources. | |||
3547 | unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; | |||
3548 | int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + | |||
3549 | NumOfMoves; | |||
3550 | return Cost; | |||
3551 | } | |||
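// Editor note (worked example, values assumed): for an interleaved store
// with Factor = 3 and NumOfMemOps = 2, each store needs
// NumOfShufflesPerStore = 3 - 1 = 2 two-source shuffles,
// NumOfMoves = 2 * 2 / 2 = 2, and
// Cost = 2 * (MemOpCost + 2 * ShuffleCost) + 2.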
3552 | ||||
3553 | int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, | |||
3554 | unsigned Factor, | |||
3555 | ArrayRef<unsigned> Indices, | |||
3556 | unsigned Alignment, | |||
3557 | unsigned AddressSpace, | |||
3558 | bool UseMaskForCond, | |||
3559 | bool UseMaskForGaps) { | |||
3560 | auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { | |||
3561 | Type *EltTy = VecTy->getVectorElementType(); | |||
3562 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || | |||
3563 | EltTy->isIntegerTy(32) || EltTy->isPointerTy()) | |||
3564 | return true; | |||
3565 | if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) | |||
3566 | return HasBW; | |||
3567 | return false; | |||
3568 | }; | |||
3569 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) | |||
3570 | return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, | |||
3571 | Alignment, AddressSpace, | |||
3572 | UseMaskForCond, UseMaskForGaps); | |||
3573 | if (ST->hasAVX2()) | |||
3574 | return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, | |||
3575 | Alignment, AddressSpace, | |||
3576 | UseMaskForCond, UseMaskForGaps); | |||
3577 | ||||
3578 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
3579 | Alignment, AddressSpace, | |||
3580 | UseMaskForCond, UseMaskForGaps); | |||
3581 | } |
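// Editor note (illustrative): in the dispatch above, an interleaved group
// with i16 or i8 elements takes the AVX-512 path only when BWI is also
// available; on an AVX-512F-only target such a group falls through to the
// AVX2 path, and targets without AVX2 use the base implementation.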
1 | //===- TargetTransformInfoImpl.h --------------------------------*- C++ -*-===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | /// \file | |||
9 | /// This file provides helpers for the implementation of | |||
10 | /// a TargetTransformInfo-conforming class. | |||
11 | /// | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H | |||
15 | #define LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H | |||
16 | ||||
17 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" | |||
18 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
19 | #include "llvm/Analysis/VectorUtils.h" | |||
20 | #include "llvm/IR/CallSite.h" | |||
21 | #include "llvm/IR/DataLayout.h" | |||
22 | #include "llvm/IR/Function.h" | |||
23 | #include "llvm/IR/GetElementPtrTypeIterator.h" | |||
24 | #include "llvm/IR/Operator.h" | |||
25 | #include "llvm/IR/Type.h" | |||
26 | ||||
27 | namespace llvm { | |||
28 | ||||
29 | /// Base class for use as a mix-in that aids implementing | |||
30 | /// a TargetTransformInfo-compatible class. | |||
31 | class TargetTransformInfoImplBase { | |||
32 | protected: | |||
33 | typedef TargetTransformInfo TTI; | |||
34 | ||||
35 | const DataLayout &DL; | |||
36 | ||||
37 | explicit TargetTransformInfoImplBase(const DataLayout &DL) : DL(DL) {} | |||
38 | ||||
39 | public: | |||
40 | // Provide value semantics. MSVC requires that we spell all of these out. | |||
41 | TargetTransformInfoImplBase(const TargetTransformInfoImplBase &Arg) | |||
42 | : DL(Arg.DL) {} | |||
43 | TargetTransformInfoImplBase(TargetTransformInfoImplBase &&Arg) : DL(Arg.DL) {} | |||
44 | ||||
45 | const DataLayout &getDataLayout() const { return DL; } | |||
46 | ||||
47 | unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) { | |||
48 | switch (Opcode) { | |||
49 | default: | |||
50 | // By default, just classify everything as 'basic'. | |||
51 | return TTI::TCC_Basic; | |||
52 | ||||
53 | case Instruction::GetElementPtr: | |||
54 |       llvm_unreachable("Use getGEPCost for GEP operations!"); | |||
55 | ||||
56 | case Instruction::BitCast: | |||
57 |       assert(OpTy && "Cast instructions must provide the operand type"); | |||
58 | if (Ty == OpTy || (Ty->isPointerTy() && OpTy->isPointerTy())) | |||
59 | // Identity and pointer-to-pointer casts are free. | |||
60 | return TTI::TCC_Free; | |||
61 | ||||
62 | // Otherwise, the default basic cost is used. | |||
63 | return TTI::TCC_Basic; | |||
64 | ||||
65 | case Instruction::FDiv: | |||
66 | case Instruction::FRem: | |||
67 | case Instruction::SDiv: | |||
68 | case Instruction::SRem: | |||
69 | case Instruction::UDiv: | |||
70 | case Instruction::URem: | |||
71 | return TTI::TCC_Expensive; | |||
72 | ||||
73 | case Instruction::IntToPtr: { | |||
74 | // An inttoptr cast is free so long as the input is a legal integer type | |||
75 | // which doesn't contain values outside the range of a pointer. | |||
76 | unsigned OpSize = OpTy->getScalarSizeInBits(); | |||
77 | if (DL.isLegalInteger(OpSize) && | |||
78 | OpSize <= DL.getPointerTypeSizeInBits(Ty)) | |||
79 | return TTI::TCC_Free; | |||
80 | ||||
81 | // Otherwise it's not a no-op. | |||
82 | return TTI::TCC_Basic; | |||
83 | } | |||
84 | case Instruction::PtrToInt: { | |||
85 | // A ptrtoint cast is free so long as the result is large enough to store | |||
86 | // the pointer, and a legal integer type. | |||
87 | unsigned DestSize = Ty->getScalarSizeInBits(); | |||
88 | if (DL.isLegalInteger(DestSize) && | |||
89 | DestSize >= DL.getPointerTypeSizeInBits(OpTy)) | |||
90 | return TTI::TCC_Free; | |||
91 | ||||
92 | // Otherwise it's not a no-op. | |||
93 | return TTI::TCC_Basic; | |||
94 | } | |||
95 | case Instruction::Trunc: | |||
96 | // trunc to a native type is free (assuming the target has compare and | |||
97 | // shift-right of the same width). | |||
98 | if (DL.isLegalInteger(DL.getTypeSizeInBits(Ty))) | |||
99 | return TTI::TCC_Free; | |||
100 | ||||
101 | return TTI::TCC_Basic; | |||
102 | } | |||
103 | } | |||
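  // Editor note (illustrative, assumes a 64-bit target where i64 is legal):
  // under this model, inttoptr i64 -> i8* and ptrtoint i8* -> i64 are
  // TCC_Free, an inttoptr from i128 is TCC_Basic, and any integer or FP
  // division/remainder is TCC_Expensive.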
104 | ||||
105 | int getGEPCost(Type *PointeeType, const Value *Ptr, | |||
106 | ArrayRef<const Value *> Operands) { | |||
107 | // In the basic model, we just assume that all-constant GEPs will be folded | |||
108 | // into their uses via addressing modes. | |||
109 | for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx) | |||
110 | if (!isa<Constant>(Operands[Idx])) | |||
111 | return TTI::TCC_Basic; | |||
112 | ||||
113 | return TTI::TCC_Free; | |||
114 | } | |||
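  // Editor note (illustrative): gep [4 x i32], [4 x i32]* %p, i64 0, i64 2
  // has all-constant indices and is modeled as TCC_Free (assumed folded into
  // an addressing mode), while replacing the last index with a variable %i
  // makes it TCC_Basic.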
115 | ||||
116 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, | |||
117 | unsigned &JTSize) { | |||
118 | JTSize = 0; | |||
119 | return SI.getNumCases(); | |||
120 | } | |||
121 | ||||
122 | int getExtCost(const Instruction *I, const Value *Src) { | |||
123 | return TTI::TCC_Basic; | |||
124 | } | |||
125 | ||||
126 | unsigned getCallCost(FunctionType *FTy, int NumArgs, const User *U) { | |||
127 |     assert(FTy && "FunctionType must be provided to this routine."); | |||
128 | ||||
129 | // The target-independent implementation just measures the size of the | |||
130 | // function by approximating that each argument will take on average one | |||
131 | // instruction to prepare. | |||
132 | ||||
133 | if (NumArgs < 0) | |||
134 | // Set the argument number to the number of explicit arguments in the | |||
135 | // function. | |||
136 | NumArgs = FTy->getNumParams(); | |||
137 | ||||
138 | return TTI::TCC_Basic * (NumArgs + 1); | |||
139 | } | |||
140 | ||||
141 | unsigned getInliningThresholdMultiplier() { return 1; } | |||
142 | ||||
143 | unsigned getMemcpyCost(const Instruction *I) { | |||
144 | return TTI::TCC_Expensive; | |||
145 | } | |||
146 | ||||
147 | bool hasBranchDivergence() { return false; } | |||
148 | ||||
149 | bool isSourceOfDivergence(const Value *V) { return false; } | |||
150 | ||||
151 | bool isAlwaysUniform(const Value *V) { return false; } | |||
152 | ||||
153 | unsigned getFlatAddressSpace () { | |||
154 | return -1; | |||
155 | } | |||
156 | ||||
157 | bool isLoweredToCall(const Function *F) { | |||
158 |     assert(F && "A concrete function must be provided to this routine."); | |||
159 | ||||
160 | // FIXME: These should almost certainly not be handled here, and instead | |||
161 | // handled with the help of TLI or the target itself. This was largely | |||
162 | // ported from existing analysis heuristics here so that such refactorings | |||
163 | // can take place in the future. | |||
164 | ||||
165 | if (F->isIntrinsic()) | |||
166 | return false; | |||
167 | ||||
168 | if (F->hasLocalLinkage() || !F->hasName()) | |||
169 | return true; | |||
170 | ||||
171 | StringRef Name = F->getName(); | |||
172 | ||||
173 | // These will all likely lower to a single selection DAG node. | |||
174 | if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" || | |||
175 | Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" || | |||
176 | Name == "fmin" || Name == "fminf" || Name == "fminl" || | |||
177 | Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" || | |||
178 | Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" || | |||
179 | Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl") | |||
180 | return false; | |||
181 | ||||
182 | // These are all likely to be optimized into something smaller. | |||
183 | if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" || | |||
184 | Name == "exp2l" || Name == "exp2f" || Name == "floor" || | |||
185 | Name == "floorf" || Name == "ceil" || Name == "round" || | |||
186 | Name == "ffs" || Name == "ffsl" || Name == "abs" || Name == "labs" || | |||
187 | Name == "llabs") | |||
188 | return false; | |||
189 | ||||
190 | return true; | |||
191 | } | |||
192 | ||||
193 | void getUnrollingPreferences(Loop *, ScalarEvolution &, | |||
194 | TTI::UnrollingPreferences &) {} | |||
195 | ||||
196 | bool isLegalAddImmediate(int64_t Imm) { return false; } | |||
197 | ||||
198 | bool isLegalICmpImmediate(int64_t Imm) { return false; } | |||
199 | ||||
200 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, | |||
201 | bool HasBaseReg, int64_t Scale, | |||
202 | unsigned AddrSpace, Instruction *I = nullptr) { | |||
203 | // Guess that only reg and reg+reg addressing is allowed. This heuristic is | |||
204 | // taken from the implementation of LSR. | |||
205 | return !BaseGV && BaseOffset == 0 && (Scale == 0 || Scale == 1); | |||
206 | } | |||
207 | ||||
208 | bool isLSRCostLess(TTI::LSRCost &C1, TTI::LSRCost &C2) { | |||
209 | return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds, | |||
210 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | |||
211 | std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds, | |||
212 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | |||
213 | } | |||
214 | ||||
215 | bool canMacroFuseCmp() { return false; } | |||
216 | ||||
217 | bool shouldFavorPostInc() const { return false; } | |||
218 | ||||
219 | bool shouldFavorBackedgeIndex(const Loop *L) const { return false; } | |||
220 | ||||
221 | bool isLegalMaskedStore(Type *DataType) { return false; } | |||
222 | ||||
223 | bool isLegalMaskedLoad(Type *DataType) { return false; } | |||
224 | ||||
225 | bool isLegalMaskedScatter(Type *DataType) { return false; } | |||
226 | ||||
227 | bool isLegalMaskedGather(Type *DataType) { return false; } | |||
228 | ||||
229 | bool isLegalMaskedCompressStore(Type *DataType) { return false; } | |||
230 | ||||
231 | bool isLegalMaskedExpandLoad(Type *DataType) { return false; } | |||
232 | ||||
233 | bool hasDivRemOp(Type *DataType, bool IsSigned) { return false; } | |||
234 | ||||
235 | bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { return false; } | |||
236 | ||||
237 | bool prefersVectorizedAddressing() { return true; } | |||
238 | ||||
239 | int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, | |||
240 | bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { | |||
241 | // Guess that all legal addressing mode are free. | |||
242 | if (isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, | |||
243 | Scale, AddrSpace)) | |||
244 | return 0; | |||
245 | return -1; | |||
246 | } | |||
247 | ||||
248 | bool LSRWithInstrQueries() { return false; } | |||
249 | ||||
250 | bool isTruncateFree(Type *Ty1, Type *Ty2) { return false; } | |||
251 | ||||
252 | bool isProfitableToHoist(Instruction *I) { return true; } | |||
253 | ||||
254 | bool useAA() { return false; } | |||
255 | ||||
256 | bool isTypeLegal(Type *Ty) { return false; } | |||
257 | ||||
258 | unsigned getJumpBufAlignment() { return 0; } | |||
259 | ||||
260 | unsigned getJumpBufSize() { return 0; } | |||
261 | ||||
262 | bool shouldBuildLookupTables() { return true; } | |||
263 | bool shouldBuildLookupTablesForConstant(Constant *C) { return true; } | |||
264 | ||||
265 | bool useColdCCForColdCall(Function &F) { return false; } | |||
266 | ||||
267 | unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { | |||
268 | return 0; | |||
269 | } | |||
270 | ||||
271 | unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, | |||
272 | unsigned VF) { return 0; } | |||
273 | ||||
274 | bool supportsEfficientVectorElementLoadStore() { return false; } | |||
275 | ||||
276 | bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } | |||
277 | ||||
278 | const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( | |||
279 | bool IsZeroCmp) const { | |||
280 | return nullptr; | |||
281 | } | |||
282 | ||||
283 | bool enableInterleavedAccessVectorization() { return false; } | |||
284 | ||||
285 | bool enableMaskedInterleavedAccessVectorization() { return false; } | |||
286 | ||||
287 | bool isFPVectorizationPotentiallyUnsafe() { return false; } | |||
288 | ||||
289 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, | |||
290 | unsigned BitWidth, | |||
291 | unsigned AddressSpace, | |||
292 | unsigned Alignment, | |||
293 | bool *Fast) { return false; } | |||
294 | ||||
295 | TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) { | |||
296 | return TTI::PSK_Software; | |||
297 | } | |||
298 | ||||
299 | bool haveFastSqrt(Type *Ty) { return false; } | |||
300 | ||||
301 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { return true; } | |||
302 | ||||
303 | unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; } | |||
304 | ||||
305 | int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, | |||
306 | Type *Ty) { | |||
307 | return 0; | |||
308 | } | |||
309 | ||||
310 | unsigned getIntImmCost(const APInt &Imm, Type *Ty) { return TTI::TCC_Basic; } | |||
311 | ||||
312 | unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, | |||
313 | Type *Ty) { | |||
314 | return TTI::TCC_Free; | |||
315 | } | |||
316 | ||||
317 | unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, | |||
318 | Type *Ty) { | |||
319 | return TTI::TCC_Free; | |||
320 | } | |||
321 | ||||
322 | unsigned getNumberOfRegisters(bool Vector) { return 8; } | |||
323 | ||||
324 | unsigned getRegisterBitWidth(bool Vector) const { return 32; } | |||
325 | ||||
326 | unsigned getMinVectorRegisterBitWidth() { return 128; } | |||
327 | ||||
328 | bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } | |||
329 | ||||
330 | unsigned getMinimumVF(unsigned ElemWidth) const { return 0; } | |||
331 | ||||
332 | bool | |||
333 | shouldConsiderAddressTypePromotion(const Instruction &I, | |||
334 | bool &AllowPromotionWithoutCommonHeader) { | |||
335 | AllowPromotionWithoutCommonHeader = false; | |||
336 | return false; | |||
337 | } | |||
338 | ||||
339 | unsigned getCacheLineSize() { return 0; } | |||
340 | ||||
341 | llvm::Optional<unsigned> getCacheSize(TargetTransformInfo::CacheLevel Level) { | |||
342 | switch (Level) { | |||
343 | case TargetTransformInfo::CacheLevel::L1D: | |||
344 |       LLVM_FALLTHROUGH; | |||
345 | case TargetTransformInfo::CacheLevel::L2D: | |||
346 | return llvm::Optional<unsigned>(); | |||
347 | } | |||
348 | ||||
349 |     llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); | |||
350 | } | |||
351 | ||||
352 | llvm::Optional<unsigned> getCacheAssociativity( | |||
353 | TargetTransformInfo::CacheLevel Level) { | |||
354 | switch (Level) { | |||
355 | case TargetTransformInfo::CacheLevel::L1D: | |||
356 |       LLVM_FALLTHROUGH; | |||
357 | case TargetTransformInfo::CacheLevel::L2D: | |||
358 | return llvm::Optional<unsigned>(); | |||
359 | } | |||
360 | ||||
361 |     llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); | |||
362 | } | |||
363 | ||||
364 | unsigned getPrefetchDistance() { return 0; } | |||
365 | ||||
366 | unsigned getMinPrefetchStride() { return 1; } | |||
367 | ||||
368 |   unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; } | |||
369 | ||||
370 | unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } | |||
371 | ||||
372 | unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, | |||
373 | TTI::OperandValueKind Opd1Info, | |||
374 | TTI::OperandValueKind Opd2Info, | |||
375 | TTI::OperandValueProperties Opd1PropInfo, | |||
376 | TTI::OperandValueProperties Opd2PropInfo, | |||
377 | ArrayRef<const Value *> Args) { | |||
378 | return 1; | |||
379 | } | |||
380 | ||||
381 | unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Ty, int Index, | |||
382 | Type *SubTp) { | |||
383 | return 1; | |||
384 | } | |||
385 | ||||
386 | unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, | |||
387 | const Instruction *I) { return 1; } | |||
388 | ||||
389 | unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst, | |||
390 | VectorType *VecTy, unsigned Index) { | |||
391 | return 1; | |||
392 | } | |||
393 | ||||
394 | unsigned getCFInstrCost(unsigned Opcode) { return 1; } | |||
395 | ||||
396 | unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, | |||
397 | const Instruction *I) { | |||
398 | return 1; | |||
399 | } | |||
400 | ||||
401 | unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { | |||
402 | return 1; | |||
403 | } | |||
404 | ||||
405 | unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, | |||
406 | unsigned AddressSpace, const Instruction *I) { | |||
407 | return 1; | |||
408 | } | |||
409 | ||||
410 | unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, | |||
411 | unsigned AddressSpace) { | |||
412 | return 1; | |||
413 | } | |||
414 | ||||
415 | unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, | |||
416 | bool VariableMask, | |||
417 | unsigned Alignment) { | |||
418 | return 1; | |||
419 | } | |||
420 | ||||
421 | unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, | |||
422 | unsigned Factor, | |||
423 | ArrayRef<unsigned> Indices, | |||
424 | unsigned Alignment, unsigned AddressSpace, | |||
425 | bool UseMaskForCond = false, | |||
426 | bool UseMaskForGaps = false) { | |||
427 | return 1; | |||
428 | } | |||
429 | ||||
430 | unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, | |||
431 | ArrayRef<Type *> Tys, FastMathFlags FMF, | |||
432 | unsigned ScalarizationCostPassed) { | |||
433 | return 1; | |||
434 | } | |||
435 | unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, | |||
436 | ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) { | |||
437 | return 1; | |||
438 | } | |||
439 | ||||
440 | unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) { | |||
441 | return 1; | |||
442 | } | |||
443 | ||||
444 | unsigned getNumberOfParts(Type *Tp) { return 0; } | |||
445 | ||||
446 | unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *, | |||
447 | const SCEV *) { | |||
448 | return 0; | |||
449 | } | |||
450 | ||||
451 | unsigned getArithmeticReductionCost(unsigned, Type *, bool) { return 1; } | |||
452 | ||||
453 | unsigned getMinMaxReductionCost(Type *, Type *, bool, bool) { return 1; } | |||
454 | ||||
455 | unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; } | |||
456 | ||||
457 | bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) { | |||
458 | return false; | |||
459 | } | |||
460 | ||||
461 | unsigned getAtomicMemIntrinsicMaxElementSize() const { | |||
462 | // Note for overrides: You must ensure for all element unordered-atomic | |||
463 | // memory intrinsics that all power-of-2 element sizes up to, and | |||
464 | // including, the return value of this method have a corresponding | |||
465 | // runtime lib call. These runtime lib call definitions can be found | |||
466 | // in RuntimeLibcalls.h | |||
467 | return 0; | |||
468 | } | |||
469 | ||||
470 | Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, | |||
471 | Type *ExpectedType) { | |||
472 | return nullptr; | |||
473 | } | |||
474 | ||||
475 | Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, | |||
476 | unsigned SrcAlign, unsigned DestAlign) const { | |||
477 | return Type::getInt8Ty(Context); | |||
478 | } | |||
479 | ||||
480 | void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut, | |||
481 | LLVMContext &Context, | |||
482 | unsigned RemainingBytes, | |||
483 | unsigned SrcAlign, | |||
484 | unsigned DestAlign) const { | |||
485 | for (unsigned i = 0; i != RemainingBytes; ++i) | |||
486 | OpsOut.push_back(Type::getInt8Ty(Context)); | |||
487 | } | |||
488 | ||||
489 | bool areInlineCompatible(const Function *Caller, | |||
490 | const Function *Callee) const { | |||
491 | return (Caller->getFnAttribute("target-cpu") == | |||
492 | Callee->getFnAttribute("target-cpu")) && | |||
493 | (Caller->getFnAttribute("target-features") == | |||
494 | Callee->getFnAttribute("target-features")); | |||
495 | } | |||
496 | ||||
497 | bool areFunctionArgsABICompatible(const Function *Caller, const Function *Callee, | |||
498 | SmallPtrSetImpl<Argument *> &Args) const { | |||
499 | return (Caller->getFnAttribute("target-cpu") == | |||
500 | Callee->getFnAttribute("target-cpu")) && | |||
501 | (Caller->getFnAttribute("target-features") == | |||
502 | Callee->getFnAttribute("target-features")); | |||
503 | } | |||
504 | ||||
505 | bool isIndexedLoadLegal(TTI::MemIndexedMode Mode, Type *Ty, | |||
506 | const DataLayout &DL) const { | |||
507 | return false; | |||
508 | } | |||
509 | ||||
510 | bool isIndexedStoreLegal(TTI::MemIndexedMode Mode, Type *Ty, | |||
511 | const DataLayout &DL) const { | |||
512 | return false; | |||
513 | } | |||
514 | ||||
515 | unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { return 128; } | |||
516 | ||||
517 | bool isLegalToVectorizeLoad(LoadInst *LI) const { return true; } | |||
518 | ||||
519 | bool isLegalToVectorizeStore(StoreInst *SI) const { return true; } | |||
520 | ||||
521 | bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, | |||
522 | unsigned Alignment, | |||
523 | unsigned AddrSpace) const { | |||
524 | return true; | |||
525 | } | |||
526 | ||||
527 | bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, | |||
528 | unsigned Alignment, | |||
529 | unsigned AddrSpace) const { | |||
530 | return true; | |||
531 | } | |||
532 | ||||
533 | unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, | |||
534 | unsigned ChainSizeInBytes, | |||
535 | VectorType *VecTy) const { | |||
536 | return VF; | |||
537 | } | |||
538 | ||||
539 | unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, | |||
540 | unsigned ChainSizeInBytes, | |||
541 | VectorType *VecTy) const { | |||
542 | return VF; | |||
543 | } | |||
544 | ||||
545 | bool useReductionIntrinsic(unsigned Opcode, Type *Ty, | |||
546 | TTI::ReductionFlags Flags) const { | |||
547 | return false; | |||
548 | } | |||
549 | ||||
550 | bool shouldExpandReduction(const IntrinsicInst *II) const { | |||
551 | return true; | |||
552 | } | |||
553 | ||||
554 | protected: | |||
555 |   // Obtain the minimum required size to hold the value (without the sign). | |||
556 |   // In case of a vector it returns the min required size of one element. | |||
557 | unsigned minRequiredElementSize(const Value* Val, bool &isSigned) { | |||
558 | if (isa<ConstantDataVector>(Val) || isa<ConstantVector>(Val)) { | |||
559 | const auto* VectorValue = cast<Constant>(Val); | |||
560 | ||||
561 |       // For a vector we need to take the max of the per-element | |||
562 |       // minimum required sizes. | |||
563 | auto *VT = cast<VectorType>(Val->getType()); | |||
564 | ||||
565 | // Assume unsigned elements | |||
566 | isSigned = false; | |||
567 | ||||
568 | // The max required size is the total vector width divided by num | |||
569 | // of elements in the vector | |||
570 | unsigned MaxRequiredSize = VT->getBitWidth() / VT->getNumElements(); | |||
571 | ||||
572 | unsigned MinRequiredSize = 0; | |||
573 | for(unsigned i = 0, e = VT->getNumElements(); i < e; ++i) { | |||
574 | if (auto* IntElement = | |||
575 | dyn_cast<ConstantInt>(VectorValue->getAggregateElement(i))) { | |||
576 | bool signedElement = IntElement->getValue().isNegative(); | |||
577 | // Get the element min required size. | |||
578 | unsigned ElementMinRequiredSize = | |||
579 | IntElement->getValue().getMinSignedBits() - 1; | |||
580 |         // If one element is signed then the whole vector is signed. | |||
581 | isSigned |= signedElement; | |||
582 | // Save the max required bit size between all the elements. | |||
583 | MinRequiredSize = std::max(MinRequiredSize, ElementMinRequiredSize); | |||
584 | } | |||
585 | else { | |||
586 | // not an int constant element | |||
587 | return MaxRequiredSize; | |||
588 | } | |||
589 | } | |||
590 | return MinRequiredSize; | |||
591 | } | |||
592 | ||||
593 | if (const auto* CI = dyn_cast<ConstantInt>(Val)) { | |||
594 | isSigned = CI->getValue().isNegative(); | |||
595 | return CI->getValue().getMinSignedBits() - 1; | |||
596 | } | |||
597 | ||||
598 | if (const auto* Cast = dyn_cast<SExtInst>(Val)) { | |||
599 | isSigned = true; | |||
600 | return Cast->getSrcTy()->getScalarSizeInBits() - 1; | |||
601 | } | |||
602 | ||||
603 | if (const auto* Cast = dyn_cast<ZExtInst>(Val)) { | |||
604 | isSigned = false; | |||
605 | return Cast->getSrcTy()->getScalarSizeInBits(); | |||
606 | } | |||
607 | ||||
608 | isSigned = false; | |||
609 | return Val->getType()->getScalarSizeInBits(); | |||
610 | } | |||
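  // Editor note (worked example, not in the original source): for the
  // constant vector <4 x i32> <i32 1, i32 2, i32 -3, i32 4>, element -3 sets
  // isSigned and needs getMinSignedBits() - 1 = 2 bits, while element 4
  // needs 3, so this returns 3 with isSigned == true.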
611 | ||||
612 | bool isStridedAccess(const SCEV *Ptr) { | |||
613 | return Ptr && isa<SCEVAddRecExpr>(Ptr); | |||
614 | } | |||
615 | ||||
616 | const SCEVConstant *getConstantStrideStep(ScalarEvolution *SE, | |||
617 | const SCEV *Ptr) { | |||
618 | if (!isStridedAccess(Ptr)) | |||
619 | return nullptr; | |||
620 | const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ptr); | |||
621 | return dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(*SE)); | |||
622 | } | |||
623 | ||||
624 | bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, | |||
625 | int64_t MergeDistance) { | |||
626 | const SCEVConstant *Step = getConstantStrideStep(SE, Ptr); | |||
627 | if (!Step) | |||
628 | return false; | |||
629 | APInt StrideVal = Step->getAPInt(); | |||
630 | if (StrideVal.getBitWidth() > 64) | |||
631 | return false; | |||
632 | // FIXME: Need to take absolute value for negative stride case. | |||
633 | return StrideVal.getSExtValue() < MergeDistance; | |||
634 | } | |||
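  // Editor note (illustrative): for a pointer SCEV of the form {%base,+,4},
  // the step recurrence is the constant 4, so with MergeDistance = 16 the
  // access is reported as a small-stride access. Per the FIXME above, a
  // negative stride such as -4 also compares as "less than" here.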
635 | }; | |||
636 | ||||
637 | /// CRTP base class for use as a mix-in that aids implementing | |||
638 | /// a TargetTransformInfo-compatible class. | |||
639 | template <typename T> | |||
640 | class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { | |||
641 | private: | |||
642 | typedef TargetTransformInfoImplBase BaseT; | |||
643 | ||||
644 | protected: | |||
645 | explicit TargetTransformInfoImplCRTPBase(const DataLayout &DL) : BaseT(DL) {} | |||
646 | ||||
647 | public: | |||
648 | using BaseT::getCallCost; | |||
649 | ||||
650 | unsigned getCallCost(const Function *F, int NumArgs, const User *U) { | |||
651 |     assert(F && "A concrete function must be provided to this routine."); | |||
652 | ||||
653 | if (NumArgs < 0) | |||
654 | // Set the argument number to the number of explicit arguments in the | |||
655 | // function. | |||
656 | NumArgs = F->arg_size(); | |||
657 | ||||
658 | if (Intrinsic::ID IID = F->getIntrinsicID()) { | |||
659 | FunctionType *FTy = F->getFunctionType(); | |||
660 | SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end()); | |||
661 | return static_cast<T *>(this) | |||
662 | ->getIntrinsicCost(IID, FTy->getReturnType(), ParamTys, U); | |||
663 | } | |||
664 | ||||
665 | if (!static_cast<T *>(this)->isLoweredToCall(F)) | |||
666 | return TTI::TCC_Basic; // Give a basic cost if it will be lowered | |||
667 | // directly. | |||
668 | ||||
669 | return static_cast<T *>(this)->getCallCost(F->getFunctionType(), NumArgs, U); | |||
670 | } | |||
671 | ||||
672 | unsigned getCallCost(const Function *F, ArrayRef<const Value *> Arguments, | |||
673 | const User *U) { | |||
674 | // Simply delegate to generic handling of the call. | |||
675 | // FIXME: We should use instsimplify or something else to catch calls which | |||
676 | // will constant fold with these arguments. | |||
677 | return static_cast<T *>(this)->getCallCost(F, Arguments.size(), U); | |||
678 | } | |||
679 | ||||
680 | using BaseT::getGEPCost; | |||
681 | ||||
682 | int getGEPCost(Type *PointeeType, const Value *Ptr, | |||
683 | ArrayRef<const Value *> Operands) { | |||
684 | const GlobalValue *BaseGV = nullptr; | |||
685 | if (Ptr != nullptr) { | |||
686 | // TODO: will remove this when pointers have an opaque type. | |||
687 |       assert(Ptr->getType()->getScalarType()->getPointerElementType() == | |||
688 |                  PointeeType && | |||
689 |              "explicit pointee type doesn't match operand's pointee type"); | |||
690 | BaseGV = dyn_cast<GlobalValue>(Ptr->stripPointerCasts()); | |||
691 | } | |||
692 | bool HasBaseReg = (BaseGV == nullptr); | |||
693 | ||||
694 |     auto PtrSizeBits = Ptr ? DL.getPointerTypeSizeInBits(Ptr->getType()) : DL.getPointerSizeInBits(0); | |||
    // Editor note: a guarded sketch of a fix for the warning reported at this
    // line ("Called C++ object pointer is null"): Ptr may be null here, as the
    // explicit null checks above and below show. Assuming address space 0 when
    // Ptr is absent matches the AS fallback computed later in this function.
695 | APInt BaseOffset(PtrSizeBits, 0); | |||
696 | int64_t Scale = 0; | |||
697 | ||||
698 | auto GTI = gep_type_begin(PointeeType, Operands); | |||
699 | Type *TargetType = nullptr; | |||
700 | ||||
701 |     // Handle the case where the GEP has no index operands (only the base | |||
702 |     // pointer); in that case TargetType stays null. | |||
703 | if (Operands.empty()) | |||
704 | return !BaseGV ? TTI::TCC_Free : TTI::TCC_Basic; | |||
705 | ||||
706 | for (auto I = Operands.begin(); I != Operands.end(); ++I, ++GTI) { | |||
707 | TargetType = GTI.getIndexedType(); | |||
708 | // We assume that the cost of Scalar GEP with constant index and the | |||
709 | // cost of Vector GEP with splat constant index are the same. | |||
710 | const ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I); | |||
711 | if (!ConstIdx) | |||
712 | if (auto Splat = getSplatValue(*I)) | |||
713 | ConstIdx = dyn_cast<ConstantInt>(Splat); | |||
714 | if (StructType *STy = GTI.getStructTypeOrNull()) { | |||
715 | // For structures the index is always splat or scalar constant | |||
716 |         assert(ConstIdx && "Unexpected GEP index"); | |||
717 | uint64_t Field = ConstIdx->getZExtValue(); | |||
718 | BaseOffset += DL.getStructLayout(STy)->getElementOffset(Field); | |||
719 | } else { | |||
720 | int64_t ElementSize = DL.getTypeAllocSize(GTI.getIndexedType()); | |||
721 | if (ConstIdx) { | |||
722 | BaseOffset += | |||
723 | ConstIdx->getValue().sextOrTrunc(PtrSizeBits) * ElementSize; | |||
724 | } else { | |||
725 | // Needs scale register. | |||
726 | if (Scale != 0) | |||
727 | // No addressing mode takes two scale registers. | |||
728 | return TTI::TCC_Basic; | |||
729 | Scale = ElementSize; | |||
730 | } | |||
731 | } | |||
732 | } | |||
733 | ||||
734 | // Assumes the address space is 0 when Ptr is nullptr. | |||
735 | unsigned AS = | |||
736 | (Ptr == nullptr ? 0 : Ptr->getType()->getPointerAddressSpace()); | |||
737 | ||||
738 | if (static_cast<T *>(this)->isLegalAddressingMode( | |||
739 | TargetType, const_cast<GlobalValue *>(BaseGV), | |||
740 | BaseOffset.sextOrTrunc(64).getSExtValue(), HasBaseReg, Scale, AS)) | |||
741 | return TTI::TCC_Free; | |||
742 | return TTI::TCC_Basic; | |||
743 | } | |||
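  // Editor note (worked example, values assumed): for
  // gep { i32, i32 }, { i32, i32 }* @g, i64 1, i32 1 the base is the global
  // @g, the i64 index contributes 1 * 8 and the field index adds offset 4,
  // so BaseOffset = 12 with Scale = 0; the GEP is TCC_Free exactly when
  // "@g + 12" is a legal addressing mode for the target.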
744 | ||||
745 | unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, | |||
746 | ArrayRef<Type *> ParamTys, const User *U) { | |||
747 | switch (IID) { | |||
748 | default: | |||
749 | // Intrinsics rarely (if ever) have normal argument setup constraints. | |||
750 | // Model them as having a basic instruction cost. | |||
751 | return TTI::TCC_Basic; | |||
752 | ||||
753 | // TODO: other libc intrinsics. | |||
754 | case Intrinsic::memcpy: | |||
755 | return static_cast<T *>(this)->getMemcpyCost(dyn_cast<Instruction>(U)); | |||
756 | ||||
757 | case Intrinsic::annotation: | |||
758 | case Intrinsic::assume: | |||
759 | case Intrinsic::sideeffect: | |||
760 | case Intrinsic::dbg_declare: | |||
761 | case Intrinsic::dbg_value: | |||
762 | case Intrinsic::dbg_label: | |||
763 | case Intrinsic::invariant_start: | |||
764 | case Intrinsic::invariant_end: | |||
765 | case Intrinsic::launder_invariant_group: | |||
766 | case Intrinsic::strip_invariant_group: | |||
767 | case Intrinsic::is_constant: | |||
768 | case Intrinsic::lifetime_start: | |||
769 | case Intrinsic::lifetime_end: | |||
770 | case Intrinsic::objectsize: | |||
771 | case Intrinsic::ptr_annotation: | |||
772 | case Intrinsic::var_annotation: | |||
773 | case Intrinsic::experimental_gc_result: | |||
774 | case Intrinsic::experimental_gc_relocate: | |||
775 | case Intrinsic::coro_alloc: | |||
776 | case Intrinsic::coro_begin: | |||
777 | case Intrinsic::coro_free: | |||
778 | case Intrinsic::coro_end: | |||
779 | case Intrinsic::coro_frame: | |||
780 | case Intrinsic::coro_size: | |||
781 | case Intrinsic::coro_suspend: | |||
782 | case Intrinsic::coro_param: | |||
783 | case Intrinsic::coro_subfn_addr: | |||
784 | // These intrinsics don't actually represent code after lowering. | |||
785 | return TTI::TCC_Free; | |||
786 | } | |||
787 | } | |||
788 | ||||
789 | unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, | |||
790 | ArrayRef<const Value *> Arguments, const User *U) { | |||
791 | // Delegate to the generic intrinsic handling code. This mostly provides an | |||
792 | // opportunity for targets to (for example) special case the cost of | |||
793 | // certain intrinsics based on constants used as arguments. | |||
794 | SmallVector<Type *, 8> ParamTys; | |||
795 | ParamTys.reserve(Arguments.size()); | |||
796 | for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx) | |||
797 | ParamTys.push_back(Arguments[Idx]->getType()); | |||
798 | return static_cast<T *>(this)->getIntrinsicCost(IID, RetTy, ParamTys, U); | |||
799 | } | |||
800 | ||||
801 | unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands) { | |||
802 | if (isa<PHINode>(U)) | |||
803 | return TTI::TCC_Free; // Model all PHI nodes as free. | |||
804 | ||||
805 | // Static alloca doesn't generate target instructions. | |||
806 | if (auto *A = dyn_cast<AllocaInst>(U)) | |||
807 | if (A->isStaticAlloca()) | |||
808 | return TTI::TCC_Free; | |||
809 | ||||
810 | if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U)) { | |||
811 | return static_cast<T *>(this)->getGEPCost(GEP->getSourceElementType(), | |||
812 | GEP->getPointerOperand(), | |||
813 | Operands.drop_front()); | |||
814 | } | |||
815 | ||||
816 | if (auto CS = ImmutableCallSite(U)) { | |||
817 | const Function *F = CS.getCalledFunction(); | |||
818 | if (!F) { | |||
819 | // Just use the called value type. | |||
820 | Type *FTy = CS.getCalledValue()->getType()->getPointerElementType(); | |||
821 | return static_cast<T *>(this) | |||
822 | ->getCallCost(cast<FunctionType>(FTy), CS.arg_size(), U); | |||
823 | } | |||
824 | ||||
825 | SmallVector<const Value *, 8> Arguments(CS.arg_begin(), CS.arg_end()); | |||
826 | return static_cast<T *>(this)->getCallCost(F, Arguments, U); | |||
827 | } | |||
828 | ||||
829 | if (isa<SExtInst>(U) || isa<ZExtInst>(U) || isa<FPExtInst>(U)) | |||
830 | // The old behaviour of generally treating extensions of icmp to be free | |||
831 | // has been removed. A target that needs it should override getUserCost(). | |||
832 | return static_cast<T *>(this)->getExtCost(cast<Instruction>(U), | |||
833 | Operands.back()); | |||
834 | ||||
835 | return static_cast<T *>(this)->getOperationCost( | |||
836 | Operator::getOpcode(U), U->getType(), | |||
837 | U->getNumOperands() == 1 ? U->getOperand(0)->getType() : nullptr); | |||
838 | } | |||
839 | ||||
840 | int getInstructionLatency(const Instruction *I) { | |||
841 | SmallVector<const Value *, 4> Operands(I->value_op_begin(), | |||
842 | I->value_op_end()); | |||
843 | if (getUserCost(I, Operands) == TTI::TCC_Free) | |||
844 | return 0; | |||
845 | ||||
846 | if (isa<LoadInst>(I)) | |||
847 | return 4; | |||
848 | ||||
849 | Type *DstTy = I->getType(); | |||
850 | ||||
851 | // Usually an intrinsic is a simple instruction. | |||
852 | // A real function call is much slower. | |||
853 | if (auto *CI = dyn_cast<CallInst>(I)) { | |||
854 | const Function *F = CI->getCalledFunction(); | |||
855 | if (!F || static_cast<T *>(this)->isLoweredToCall(F)) | |||
856 | return 40; | |||
857 | // Some intrinsics return a value and a flag, we use the value type | |||
858 | // to decide its latency. | |||
859 | if (StructType* StructTy = dyn_cast<StructType>(DstTy)) | |||
860 | DstTy = StructTy->getElementType(0); | |||
861 | // Fall through to simple instructions. | |||
862 | } | |||
863 | ||||
864 | if (VectorType *VectorTy = dyn_cast<VectorType>(DstTy)) | |||
865 | DstTy = VectorTy->getElementType(); | |||
866 | if (DstTy->isFloatingPointTy()) | |||
867 | return 3; | |||
868 | ||||
869 | return 1; | |||
870 | } | |||
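  // Editor note (illustrative): under this rough latency model a load
  // returns 4, a real call returns 40, a floating-point-typed result
  // (scalar or vector element) returns 3, and everything else that is not
  // TCC_Free defaults to 1.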
871 | }; | |||
872 | } | |||
873 | ||||
874 | #endif |
1 | //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file provides a helper that implements much of the TTI interface in |
11 | /// terms of the target-independent code generator and TargetLowering |
12 | /// interfaces. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #ifndef LLVM_CODEGEN_BASICTTIIMPL_H |
17 | #define LLVM_CODEGEN_BASICTTIIMPL_H |
18 | |
19 | #include "llvm/ADT/APInt.h" |
20 | #include "llvm/ADT/ArrayRef.h" |
21 | #include "llvm/ADT/BitVector.h" |
22 | #include "llvm/ADT/SmallPtrSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/TargetTransformInfo.h" |
26 | #include "llvm/Analysis/TargetTransformInfoImpl.h" |
27 | #include "llvm/CodeGen/ISDOpcodes.h" |
28 | #include "llvm/CodeGen/TargetLowering.h" |
29 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
30 | #include "llvm/CodeGen/ValueTypes.h" |
31 | #include "llvm/IR/BasicBlock.h" |
32 | #include "llvm/IR/CallSite.h" |
33 | #include "llvm/IR/Constant.h" |
34 | #include "llvm/IR/Constants.h" |
35 | #include "llvm/IR/DataLayout.h" |
36 | #include "llvm/IR/DerivedTypes.h" |
37 | #include "llvm/IR/InstrTypes.h" |
38 | #include "llvm/IR/Instruction.h" |
39 | #include "llvm/IR/Instructions.h" |
40 | #include "llvm/IR/Intrinsics.h" |
41 | #include "llvm/IR/Operator.h" |
42 | #include "llvm/IR/Type.h" |
43 | #include "llvm/IR/Value.h" |
44 | #include "llvm/MC/MCSchedule.h" |
45 | #include "llvm/Support/Casting.h" |
46 | #include "llvm/Support/CommandLine.h" |
47 | #include "llvm/Support/ErrorHandling.h" |
48 | #include "llvm/Support/MachineValueType.h" |
49 | #include "llvm/Support/MathExtras.h" |
50 | #include <algorithm> |
51 | #include <cassert> |
52 | #include <cstdint> |
53 | #include <limits> |
54 | #include <utility> |
55 | |
56 | namespace llvm { |
57 | |
58 | class Function; |
59 | class GlobalValue; |
60 | class LLVMContext; |
61 | class ScalarEvolution; |
62 | class SCEV; |
63 | class TargetMachine; |
64 | |
65 | extern cl::opt<unsigned> PartialUnrollingThreshold; |
66 | |
67 | /// Base class which can be used to help build a TTI implementation. |
68 | /// |
69 | /// This class provides as much implementation of the TTI interface as is |
70 | /// possible using the target independent parts of the code generator. |
71 | /// |
72 | /// In order to subclass it, your class must implement a getST() method to |
73 | /// return the subtarget, and a getTLI() method to return the target lowering. |
74 | /// We need these methods implemented in the derived class so that this class |
75 | /// doesn't have to duplicate storage for them. |
76 | template <typename T> |
77 | class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { |
78 | private: |
79 | using BaseT = TargetTransformInfoImplCRTPBase<T>; |
80 | using TTI = TargetTransformInfo; |
81 | |
82 | /// Estimate a cost of Broadcast as an extract and sequence of insert |
83 | /// operations. |
84 | unsigned getBroadcastShuffleOverhead(Type *Ty) { |
85 |     assert(Ty->isVectorTy() && "Can only shuffle vectors");
86 | unsigned Cost = 0; |
87 | // Broadcast cost is equal to the cost of extracting the zero'th element |
88 | // plus the cost of inserting it into every element of the result vector. |
89 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
90 | Instruction::ExtractElement, Ty, 0); |
91 | |
92 | for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { |
93 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
94 | Instruction::InsertElement, Ty, i); |
95 | } |
96 | return Cost; |
97 | } |
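  // Editor note (worked example): broadcasting a <4 x float> is modeled as
  // one extract of element 0 plus four inserts; with unit costs from
  // getVectorInstrCost this gives an overhead of 5.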
98 | |
99 | /// Estimate a cost of shuffle as a sequence of extract and insert |
100 | /// operations. |
101 | unsigned getPermuteShuffleOverhead(Type *Ty) { |
102 |     assert(Ty->isVectorTy() && "Can only shuffle vectors");
103 | unsigned Cost = 0; |
104 |     // Shuffle cost is equal to the cost of extracting each element from its
105 |     // source argument plus the cost of inserting them into the result vector.
106 | |
107 |     // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
108 |     // index 0 of the first vector, index 1 of the second vector, index 2 of
109 |     // the first vector and finally index 3 of the second vector, and insert
110 |     // them at indices <0,1,2,3> of the result vector.
111 | for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { |
112 | Cost += static_cast<T *>(this) |
113 | ->getVectorInstrCost(Instruction::InsertElement, Ty, i); |
114 | Cost += static_cast<T *>(this) |
115 | ->getVectorInstrCost(Instruction::ExtractElement, Ty, i); |
116 | } |
117 | return Cost; |
118 | } |
119 | |
120 | /// Estimate a cost of subvector extraction as a sequence of extract and |
121 | /// insert operations. |
122 | unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) { |
123 |     assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
124 |            "Can only extract subvectors from vectors");
125 | int NumSubElts = SubTy->getVectorNumElements(); |
126 |     assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
127 |            "SK_ExtractSubvector index out of range");
128 | |
129 | unsigned Cost = 0; |
130 |     // Subvector extraction cost is equal to the cost of extracting each
131 |     // element from the source type plus the cost of inserting them into the
132 |     // result vector type.
133 | for (int i = 0; i != NumSubElts; ++i) { |
134 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
135 | Instruction::ExtractElement, Ty, i + Index); |
136 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
137 | Instruction::InsertElement, SubTy, i); |
138 | } |
139 | return Cost; |
140 | } |
141 | |
142 | /// Estimate a cost of subvector insertion as a sequence of extract and |
143 | /// insert operations. |
144 | unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) { |
145 |     assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
146 |            "Can only insert subvectors into vectors");
147 | int NumSubElts = SubTy->getVectorNumElements(); |
148 | assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
149 | "SK_InsertSubvector index out of range");
150 | |
151 | unsigned Cost = 0; |
152 | // Subvector insertion cost is equal to the cost of extracting the
153 | // elements from the subvector type plus the cost of inserting them
154 | // into the result vector type.
155 | for (int i = 0; i != NumSubElts; ++i) { |
156 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
157 | Instruction::ExtractElement, SubTy, i); |
158 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
159 | Instruction::InsertElement, Ty, i + Index); |
160 | } |
161 | return Cost; |
162 | } |
163 | |
164 | /// Local query method delegates up to T which *must* implement this! |
165 | const TargetSubtargetInfo *getST() const { |
166 | return static_cast<const T *>(this)->getST(); |
167 | } |
168 | |
169 | /// Local query method delegates up to T which *must* implement this! |
170 | const TargetLoweringBase *getTLI() const { |
171 | return static_cast<const T *>(this)->getTLI(); |
172 | } |
173 | |
174 | static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { |
175 | switch (M) { |
176 | case TTI::MIM_Unindexed: |
177 | return ISD::UNINDEXED; |
178 | case TTI::MIM_PreInc: |
179 | return ISD::PRE_INC; |
180 | case TTI::MIM_PreDec: |
181 | return ISD::PRE_DEC; |
182 | case TTI::MIM_PostInc: |
183 | return ISD::POST_INC; |
184 | case TTI::MIM_PostDec: |
185 | return ISD::POST_DEC; |
186 | } |
187 | llvm_unreachable("Unexpected MemIndexedMode")::llvm::llvm_unreachable_internal("Unexpected MemIndexedMode" , "/build/llvm-toolchain-snapshot-9~svn362543/include/llvm/CodeGen/BasicTTIImpl.h" , 187); |
188 | } |
189 | |
190 | protected: |
191 | explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) |
192 | : BaseT(DL) {} |
193 | |
194 | using TargetTransformInfoImplBase::DL; |
195 | |
196 | public: |
197 | /// \name Scalar TTI Implementations |
198 | /// @{ |
199 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, |
200 | unsigned BitWidth, unsigned AddressSpace, |
201 | unsigned Alignment, bool *Fast) const { |
202 | EVT E = EVT::getIntegerVT(Context, BitWidth); |
203 | return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment, Fast); |
204 | } |
205 | |
206 | bool hasBranchDivergence() { return false; } |
207 | |
208 | bool isSourceOfDivergence(const Value *V) { return false; } |
209 | |
210 | bool isAlwaysUniform(const Value *V) { return false; } |
211 | |
212 | unsigned getFlatAddressSpace() { |
213 | // Return an invalid address space. |
214 | return -1; |
215 | } |
216 | |
217 | bool isLegalAddImmediate(int64_t imm) { |
218 | return getTLI()->isLegalAddImmediate(imm); |
219 | } |
220 | |
221 | bool isLegalICmpImmediate(int64_t imm) { |
222 | return getTLI()->isLegalICmpImmediate(imm); |
223 | } |
224 | |
225 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
226 | bool HasBaseReg, int64_t Scale, |
227 | unsigned AddrSpace, Instruction *I = nullptr) { |
228 | TargetLoweringBase::AddrMode AM; |
229 | AM.BaseGV = BaseGV; |
230 | AM.BaseOffs = BaseOffset; |
231 | AM.HasBaseReg = HasBaseReg; |
232 | AM.Scale = Scale; |
233 | return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); |
234 | } |
235 | |
236 | bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, |
237 | const DataLayout &DL) const { |
238 | EVT VT = getTLI()->getValueType(DL, Ty); |
239 | return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); |
240 | } |
241 | |
242 | bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, |
243 | const DataLayout &DL) const { |
244 | EVT VT = getTLI()->getValueType(DL, Ty); |
245 | return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); |
246 | } |
247 | |
248 | bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { |
249 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
250 | } |
251 | |
252 | int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
253 | bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { |
254 | TargetLoweringBase::AddrMode AM; |
255 | AM.BaseGV = BaseGV; |
256 | AM.BaseOffs = BaseOffset; |
257 | AM.HasBaseReg = HasBaseReg; |
258 | AM.Scale = Scale; |
259 | return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); |
260 | } |
261 | |
262 | bool isTruncateFree(Type *Ty1, Type *Ty2) { |
263 | return getTLI()->isTruncateFree(Ty1, Ty2); |
264 | } |
265 | |
266 | bool isProfitableToHoist(Instruction *I) { |
267 | return getTLI()->isProfitableToHoist(I); |
268 | } |
269 | |
270 | bool useAA() const { return getST()->useAA(); } |
271 | |
272 | bool isTypeLegal(Type *Ty) { |
273 | EVT VT = getTLI()->getValueType(DL, Ty); |
274 | return getTLI()->isTypeLegal(VT); |
275 | } |
276 | |
277 | int getGEPCost(Type *PointeeType, const Value *Ptr, |
278 | ArrayRef<const Value *> Operands) { |
279 | return BaseT::getGEPCost(PointeeType, Ptr, Operands); |
280 | } |
281 | |
282 | int getExtCost(const Instruction *I, const Value *Src) { |
283 | if (getTLI()->isExtFree(I)) |
284 | return TargetTransformInfo::TCC_Free; |
285 | |
286 | if (isa<ZExtInst>(I) || isa<SExtInst>(I)) |
287 | if (const LoadInst *LI = dyn_cast<LoadInst>(Src)) |
288 | if (getTLI()->isExtLoad(LI, I, DL)) |
289 | return TargetTransformInfo::TCC_Free; |
290 | |
291 | return TargetTransformInfo::TCC_Basic; |
292 | } |
293 | |
294 | unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, |
295 | ArrayRef<const Value *> Arguments, const User *U) { |
296 | return BaseT::getIntrinsicCost(IID, RetTy, Arguments, U); |
297 | } |
298 | |
299 | unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, |
300 | ArrayRef<Type *> ParamTys, const User *U) { |
301 | if (IID == Intrinsic::cttz) { |
302 | if (getTLI()->isCheapToSpeculateCttz()) |
303 | return TargetTransformInfo::TCC_Basic; |
304 | return TargetTransformInfo::TCC_Expensive; |
305 | } |
306 | |
307 | if (IID == Intrinsic::ctlz) { |
308 | if (getTLI()->isCheapToSpeculateCtlz()) |
309 | return TargetTransformInfo::TCC_Basic; |
310 | return TargetTransformInfo::TCC_Expensive; |
311 | } |
312 | |
313 | return BaseT::getIntrinsicCost(IID, RetTy, ParamTys, U); |
314 | } |
315 | |
316 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, |
317 | unsigned &JumpTableSize) { |
318 | /// Try to find the estimated number of clusters. Note that the number of
319 | /// clusters identified in this function could be different from the actual
320 | /// numbers found in lowering. This function ignores switches that are
321 | /// lowered with a mix of jump table / bit test / BTree. This function was
322 | /// initially intended to be used when estimating the cost of a switch in
323 | /// the inline cost heuristic, but it is a generic cost model to be used in
324 | /// other places (e.g., in loop unrolling).
325 | unsigned N = SI.getNumCases(); |
326 | const TargetLoweringBase *TLI = getTLI(); |
327 | const DataLayout &DL = this->getDataLayout(); |
328 | |
329 | JumpTableSize = 0; |
330 | bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); |
331 | |
332 | // Early exit if neither a jump table nor a bit test is allowed.
333 | if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) |
334 | return N; |
335 | |
336 | APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); |
337 | APInt MinCaseVal = MaxCaseVal; |
338 | for (auto CI : SI.cases()) { |
339 | const APInt &CaseVal = CI.getCaseValue()->getValue(); |
340 | if (CaseVal.sgt(MaxCaseVal)) |
341 | MaxCaseVal = CaseVal; |
342 | if (CaseVal.slt(MinCaseVal)) |
343 | MinCaseVal = CaseVal; |
344 | } |
345 | |
346 | // Check if suitable for a bit test |
347 | if (N <= DL.getIndexSizeInBits(0u)) { |
348 | SmallPtrSet<const BasicBlock *, 4> Dests; |
349 | for (auto I : SI.cases()) |
350 | Dests.insert(I.getCaseSuccessor()); |
351 | |
352 | if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, |
353 | DL)) |
354 | return 1; |
355 | } |
356 | |
357 | // Check if suitable for a jump table. |
358 | if (IsJTAllowed) { |
359 | if (N < 2 || N < TLI->getMinimumJumpTableEntries()) |
360 | return N; |
361 | uint64_t Range = |
362 | (MaxCaseVal - MinCaseVal) |
363 | .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; |
364 | // Check whether a range of clusters is dense enough for a jump table |
365 | if (TLI->isSuitableForJumpTable(&SI, N, Range)) { |
366 | JumpTableSize = Range; |
367 | return 1; |
368 | } |
369 | } |
370 | return N; |
371 | } |
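  | // Worked example (illustrative): a switch over i32 with cases 0..9
  | // jumping to 10 distinct blocks has too many destinations for a bit
  | // test, but Range == 10 with N == 10 is dense, so this returns one
  | // cluster and sets JumpTableSize to 10. Two cases {0, 1 << 20} fail
  | // the minimum-entries check and simply return N == 2.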
372 | |
373 | unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); } |
374 | |
375 | unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); } |
376 | |
377 | bool shouldBuildLookupTables() { |
378 | const TargetLoweringBase *TLI = getTLI(); |
379 | return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || |
380 | TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); |
381 | } |
382 | |
383 | bool haveFastSqrt(Type *Ty) { |
384 | const TargetLoweringBase *TLI = getTLI(); |
385 | EVT VT = TLI->getValueType(DL, Ty); |
386 | return TLI->isTypeLegal(VT) && |
387 | TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); |
388 | } |
389 | |
390 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { |
391 | return true; |
392 | } |
393 | |
394 | unsigned getFPOpCost(Type *Ty) { |
395 | // Check whether FADD is available, as a proxy for floating-point in |
396 | // general. |
397 | const TargetLoweringBase *TLI = getTLI(); |
398 | EVT VT = TLI->getValueType(DL, Ty); |
399 | if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) |
400 | return TargetTransformInfo::TCC_Basic; |
401 | return TargetTransformInfo::TCC_Expensive; |
402 | } |
403 | |
404 | unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) { |
405 | const TargetLoweringBase *TLI = getTLI(); |
406 | switch (Opcode) { |
407 | default: break; |
408 | case Instruction::Trunc: |
409 | if (TLI->isTruncateFree(OpTy, Ty)) |
410 | return TargetTransformInfo::TCC_Free; |
411 | return TargetTransformInfo::TCC_Basic; |
412 | case Instruction::ZExt: |
413 | if (TLI->isZExtFree(OpTy, Ty)) |
414 | return TargetTransformInfo::TCC_Free; |
415 | return TargetTransformInfo::TCC_Basic; |
416 | |
417 | case Instruction::AddrSpaceCast: |
418 | if (TLI->isFreeAddrSpaceCast(OpTy->getPointerAddressSpace(), |
419 | Ty->getPointerAddressSpace())) |
420 | return TargetTransformInfo::TCC_Free; |
421 | return TargetTransformInfo::TCC_Basic; |
422 | } |
423 | |
424 | return BaseT::getOperationCost(Opcode, Ty, OpTy); |
425 | } |
426 | |
427 | unsigned getInliningThresholdMultiplier() { return 1; } |
428 | |
429 | void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
430 | TTI::UnrollingPreferences &UP) { |
431 | // This unrolling functionality is target independent, but to provide some |
432 | // motivation for its intended use, for x86: |
433 | |
434 | // According to the Intel 64 and IA-32 Architectures Optimization Reference |
435 | // Manual, Intel Core models and later have a loop stream detector (and |
436 | // associated uop queue) that can benefit from partial unrolling. |
437 | // The relevant requirements are: |
438 | // - The loop must have no more than 4 (8 for Nehalem and later) branches |
439 | // taken, and none of them may be calls. |
440 | // - The loop can have no more than 18 (28 for Nehalem and later) uops. |
441 | |
442 | // According to the Software Optimization Guide for AMD Family 15h |
443 | // Processors, models 30h-4fh (Steamroller and later) have a loop predictor |
444 | // and loop buffer which can benefit from partial unrolling. |
445 | // The relevant requirements are: |
446 | // - The loop must have fewer than 16 branches |
447 | // - The loop must have fewer than 40 uops in all executed loop branches
448 | |
449 | // The number of taken branches in a loop is hard to estimate here, and |
450 | // benchmarking has revealed that it is better not to be conservative when |
451 | // estimating the branch count. As a result, we'll ignore the branch limits |
452 | // until someone finds a case where it matters in practice. |
453 | |
454 | unsigned MaxOps; |
455 | const TargetSubtargetInfo *ST = getST(); |
456 | if (PartialUnrollingThreshold.getNumOccurrences() > 0) |
457 | MaxOps = PartialUnrollingThreshold; |
458 | else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) |
459 | MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; |
460 | else |
461 | return; |
462 | |
463 | // Scan the loop: don't unroll loops with calls. |
464 | for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; |
465 | ++I) { |
466 | BasicBlock *BB = *I; |
467 | |
468 | for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) |
469 | if (isa<CallInst>(J) || isa<InvokeInst>(J)) { |
470 | ImmutableCallSite CS(&*J); |
471 | if (const Function *F = CS.getCalledFunction()) { |
472 | if (!static_cast<T *>(this)->isLoweredToCall(F)) |
473 | continue; |
474 | } |
475 | |
476 | return; |
477 | } |
478 | } |
479 | |
480 | // Enable runtime and partial unrolling up to the specified size. |
481 | // Enable using trip count upper bound to unroll loops. |
482 | UP.Partial = UP.Runtime = UP.UpperBound = true; |
483 | UP.PartialThreshold = MaxOps; |
484 | |
485 | // Avoid unrolling when optimizing for size. |
486 | UP.OptSizeThreshold = 0; |
487 | UP.PartialOptSizeThreshold = 0; |
488 | |
489 | // Set the number of instructions optimized away when a "back edge"
490 | // becomes a "fall through" to the default value of 2.
491 | UP.BEInsns = 2; |
492 | } |
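  | // Worked example (illustrative): on a subtarget whose scheduling model
  | // reports LoopMicroOpBufferSize == 28 and whose loop contains no call,
  | // this enables partial/runtime unrolling with UP.PartialThreshold == 28,
  | // so unrolling stops once the unrolled body would exceed roughly 28
  | // instructions.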
493 | |
494 | int getInstructionLatency(const Instruction *I) { |
495 | if (isa<LoadInst>(I)) |
496 | return getST()->getSchedModel().DefaultLoadLatency; |
497 | |
498 | return BaseT::getInstructionLatency(I); |
499 | } |
500 | |
501 | /// @} |
502 | |
503 | /// \name Vector TTI Implementations |
504 | /// @{ |
505 | |
506 | unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; } |
507 | |
508 | unsigned getRegisterBitWidth(bool Vector) const { return 32; } |
509 | |
510 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
511 | /// are set if the result needs to be inserted and/or extracted from vectors. |
512 | unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { |
513 | assert(Ty->isVectorTy() && "Can only scalarize vectors");
514 | unsigned Cost = 0; |
515 | |
516 | for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { |
517 | if (Insert) |
518 | Cost += static_cast<T *>(this) |
519 | ->getVectorInstrCost(Instruction::InsertElement, Ty, i); |
520 | if (Extract) |
521 | Cost += static_cast<T *>(this) |
522 | ->getVectorInstrCost(Instruction::ExtractElement, Ty, i); |
523 | } |
524 | |
525 | return Cost; |
526 | } |
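  | // Worked example (illustrative, assuming unit insert/extract costs):
  | // scalarizing <4 x float> with both Insert and Extract set charges four
  | // insertelement and four extractelement operations, i.e. a cost of 8.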
527 | |
528 | /// Estimate the overhead of scalarizing an instruction's unique
529 | /// non-constant operands. The types of the arguments are ordinarily
530 | /// scalar, in which case the costs are multiplied by VF.
531 | unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
532 | unsigned VF) { |
533 | unsigned Cost = 0; |
534 | SmallPtrSet<const Value*, 4> UniqueOperands; |
535 | for (const Value *A : Args) { |
536 | if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { |
537 | Type *VecTy = nullptr; |
538 | if (A->getType()->isVectorTy()) { |
539 | VecTy = A->getType(); |
540 | // If A is a vector operand, VF should be 1 or correspond to A. |
541 | assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
542 | "Vector argument does not match VF");
543 | } |
544 | else |
545 | VecTy = VectorType::get(A->getType(), VF); |
546 | |
547 | Cost += getScalarizationOverhead(VecTy, false, true); |
548 | } |
549 | } |
550 | |
551 | return Cost; |
552 | } |
553 | |
554 | unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) { |
555 | assert(VecTy->isVectorTy());
556 | |
557 | unsigned Cost = 0; |
558 | |
559 | Cost += getScalarizationOverhead(VecTy, true, false); |
560 | if (!Args.empty()) |
561 | Cost += getOperandsScalarizationOverhead(Args, |
562 | VecTy->getVectorNumElements()); |
563 | else |
564 | // When no information on arguments is provided, we add the cost |
565 | // associated with one argument as a heuristic. |
566 | Cost += getScalarizationOverhead(VecTy, false, true); |
567 | |
568 | return Cost; |
569 | } |
570 | |
571 | unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } |
572 | |
573 | unsigned getArithmeticInstrCost( |
574 | unsigned Opcode, Type *Ty, |
575 | TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, |
576 | TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, |
577 | TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, |
578 | TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, |
579 | ArrayRef<const Value *> Args = ArrayRef<const Value *>()) { |
580 | // Check if any of the operands are vector operands. |
581 | const TargetLoweringBase *TLI = getTLI(); |
582 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
583 | assert(ISD && "Invalid opcode")((ISD && "Invalid opcode") ? static_cast<void> ( 0) : __assert_fail ("ISD && \"Invalid opcode\"", "/build/llvm-toolchain-snapshot-9~svn362543/include/llvm/CodeGen/BasicTTIImpl.h" , 583, __PRETTY_FUNCTION__)); |
584 | |
585 | std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
586 | |
587 | bool IsFloat = Ty->isFPOrFPVectorTy(); |
588 | // Assume that floating point arithmetic operations cost twice as much as |
589 | // integer operations. |
590 | unsigned OpCost = (IsFloat ? 2 : 1); |
591 | |
592 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
593 | // The operation is legal. Assume it costs 1. |
594 | // TODO: Once we have extract/insert subvector cost we need to use them. |
595 | return LT.first * OpCost; |
596 | } |
597 | |
598 | if (!TLI->isOperationExpand(ISD, LT.second)) { |
599 | // If the operation is custom lowered, then assume that the code is twice |
600 | // as expensive. |
601 | return LT.first * 2 * OpCost; |
602 | } |
603 | |
604 | // Else, assume that we need to scalarize this op. |
605 | // TODO: If one of the types get legalized by splitting, handle this |
606 | // similarly to what getCastInstrCost() does. |
607 | if (Ty->isVectorTy()) { |
608 | unsigned Num = Ty->getVectorNumElements(); |
609 | unsigned Cost = static_cast<T *>(this) |
610 | ->getArithmeticInstrCost(Opcode, Ty->getScalarType()); |
611 | // Return the cost of multiple scalar invocation plus the cost of |
612 | // inserting and extracting the values. |
613 | return getScalarizationOverhead(Ty, Args) + Num * Cost; |
614 | } |
615 | |
616 | // We don't know anything about this scalar instruction. |
617 | return OpCost; |
618 | } |
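  | // Worked example (illustrative): an add on <8 x i32> that legalizes by
  | // splitting into two <4 x i32> adds has LT.first == 2 and is charged
  | // 2 * OpCost. If the target must expand the operation instead, the
  | // vector branch above charges 8 scalar adds plus the insert/extract
  | // scalarization overhead.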
619 | |
620 | unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, |
621 | Type *SubTp) { |
622 | switch (Kind) { |
623 | case TTI::SK_Broadcast: |
624 | return getBroadcastShuffleOverhead(Tp); |
625 | case TTI::SK_Select: |
626 | case TTI::SK_Reverse: |
627 | case TTI::SK_Transpose: |
628 | case TTI::SK_PermuteSingleSrc: |
629 | case TTI::SK_PermuteTwoSrc: |
630 | return getPermuteShuffleOverhead(Tp); |
631 | case TTI::SK_ExtractSubvector: |
632 | return getExtractSubvectorOverhead(Tp, Index, SubTp); |
633 | case TTI::SK_InsertSubvector: |
634 | return getInsertSubvectorOverhead(Tp, Index, SubTp); |
635 | } |
636 | llvm_unreachable("Unknown TTI::ShuffleKind")::llvm::llvm_unreachable_internal("Unknown TTI::ShuffleKind", "/build/llvm-toolchain-snapshot-9~svn362543/include/llvm/CodeGen/BasicTTIImpl.h" , 636); |
637 | } |
638 | |
639 | unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
640 | const Instruction *I = nullptr) { |
641 | const TargetLoweringBase *TLI = getTLI(); |
642 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
643 | assert(ISD && "Invalid opcode");
644 | std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src); |
645 | std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst); |
646 | |
647 | // Check for NOOP conversions. |
648 | if (SrcLT.first == DstLT.first && |
649 | SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { |
650 | |
651 | // Bitcasts between types that are legalized to the same type are free.
652 | if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc) |
653 | return 0; |
654 | } |
655 | |
656 | if (Opcode == Instruction::Trunc && |
657 | TLI->isTruncateFree(SrcLT.second, DstLT.second)) |
658 | return 0; |
659 | |
660 | if (Opcode == Instruction::ZExt && |
661 | TLI->isZExtFree(SrcLT.second, DstLT.second)) |
662 | return 0; |
663 | |
664 | if (Opcode == Instruction::AddrSpaceCast && |
665 | TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), |
666 | Dst->getPointerAddressSpace())) |
667 | return 0; |
668 | |
669 | // If this is a zext/sext of a load, return 0 if the corresponding |
670 | // extending load exists on target. |
671 | if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && |
672 | I && isa<LoadInst>(I->getOperand(0))) { |
673 | EVT ExtVT = EVT::getEVT(Dst); |
674 | EVT LoadVT = EVT::getEVT(Src); |
675 | unsigned LType = |
676 | ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); |
677 | if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) |
678 | return 0; |
679 | } |
680 | |
681 | // If the cast is marked as legal (or promote) then assume low cost. |
682 | if (SrcLT.first == DstLT.first && |
683 | TLI->isOperationLegalOrPromote(ISD, DstLT.second)) |
684 | return 1; |
685 | |
686 | // Handle scalar conversions. |
687 | if (!Src->isVectorTy() && !Dst->isVectorTy()) { |
688 | // Scalar bitcasts are usually free. |
689 | if (Opcode == Instruction::BitCast) |
690 | return 0; |
691 | |
692 | // Just check the op cost. If the operation is legal then assume it costs |
693 | // 1. |
694 | if (!TLI->isOperationExpand(ISD, DstLT.second)) |
695 | return 1; |
696 | |
697 | // Assume that illegal scalar instructions are expensive.
698 | return 4; |
699 | } |
700 | |
701 | // Check vector-to-vector casts. |
702 | if (Dst->isVectorTy() && Src->isVectorTy()) { |
703 | // If the cast is between same-sized registers, then the check is simple. |
704 | if (SrcLT.first == DstLT.first && |
705 | SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { |
706 | |
707 | // Assume that Zext is done using AND. |
708 | if (Opcode == Instruction::ZExt) |
709 | return 1; |
710 | |
711 | // Assume that sext is done using SHL and SRA. |
712 | if (Opcode == Instruction::SExt) |
713 | return 2; |
714 | |
715 | // Just check the op cost. If the operation is legal then assume
716 | // it costs 1 and multiply that by the type-legalization
717 | // overhead.
718 | if (!TLI->isOperationExpand(ISD, DstLT.second)) |
719 | return SrcLT.first * 1; |
720 | } |
721 | |
722 | // If we are legalizing by splitting, query the concrete TTI for the cost |
723 | // of casting the original vector twice. We also need to factor in the |
724 | // cost of the split itself. Count that as 1, to be consistent with |
725 | // TLI->getTypeLegalizationCost(). |
726 | if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == |
727 | TargetLowering::TypeSplitVector) || |
728 | (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == |
729 | TargetLowering::TypeSplitVector)) { |
730 | Type *SplitDst = VectorType::get(Dst->getVectorElementType(), |
731 | Dst->getVectorNumElements() / 2); |
732 | Type *SplitSrc = VectorType::get(Src->getVectorElementType(), |
733 | Src->getVectorNumElements() / 2); |
734 | T *TTI = static_cast<T *>(this); |
735 | return TTI->getVectorSplitCost() + |
736 | (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I)); |
737 | } |
738 | |
739 | // In other cases where the source or destination are illegal, assume |
740 | // the operation will get scalarized. |
741 | unsigned Num = Dst->getVectorNumElements(); |
742 | unsigned Cost = static_cast<T *>(this)->getCastInstrCost( |
743 | Opcode, Dst->getScalarType(), Src->getScalarType(), I); |
744 | |
745 | // Return the cost of multiple scalar invocation plus the cost of |
746 | // inserting and extracting the values. |
747 | return getScalarizationOverhead(Dst, true, true) + Num * Cost; |
748 | } |
749 | |
750 | // We already handled vector-to-vector and scalar-to-scalar conversions.
751 | // This is where we handle bitcasts between vectors and scalars. We
752 | // need to assume that the conversion is scalarized in one way or
753 | // another.
754 | if (Opcode == Instruction::BitCast) |
755 | // Illegal bitcasts are done by storing and loading from a stack slot. |
756 | return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true) |
757 | : 0) + |
758 | (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false) |
759 | : 0); |
760 | |
761 | llvm_unreachable("Unhandled cast")::llvm::llvm_unreachable_internal("Unhandled cast", "/build/llvm-toolchain-snapshot-9~svn362543/include/llvm/CodeGen/BasicTTIImpl.h" , 761); |
762 | } |
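  | // Worked example (illustrative): a sext from <16 x i8> to <16 x i32>
  | // whose types legalize by splitting is charged getVectorSplitCost()
  | // plus twice the cost of sext <8 x i8> to <8 x i32>, recursing until
  | // both halves are legal.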
763 | |
764 | unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst, |
765 | VectorType *VecTy, unsigned Index) { |
766 | return static_cast<T *>(this)->getVectorInstrCost( |
767 | Instruction::ExtractElement, VecTy, Index) + |
768 | static_cast<T *>(this)->getCastInstrCost(Opcode, Dst, |
769 | VecTy->getElementType()); |
770 | } |
771 | |
772 | unsigned getCFInstrCost(unsigned Opcode) { |
773 | // Branches are assumed to be predicted. |
774 | return 0; |
775 | } |
776 | |
777 | unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
778 | const Instruction *I) { |
779 | const TargetLoweringBase *TLI = getTLI(); |
780 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
781 | assert(ISD && "Invalid opcode");
782 | |
783 | // Selects on vectors are actually vector selects. |
784 | if (ISD == ISD::SELECT) { |
785 | assert(CondTy && "CondTy must exist");
786 | if (CondTy->isVectorTy()) |
787 | ISD = ISD::VSELECT; |
788 | } |
789 | std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); |
790 | |
791 | if (!(ValTy->isVectorTy() && !LT.second.isVector()) && |
792 | !TLI->isOperationExpand(ISD, LT.second)) { |
793 | // The operation is legal. Assume it costs 1. Multiply |
794 | // by the type-legalization overhead. |
795 | return LT.first * 1; |
796 | } |
797 | |
798 | // Otherwise, assume that the cast is scalarized. |
799 | // TODO: If one of the types get legalized by splitting, handle this |
800 | // similarly to what getCastInstrCost() does. |
801 | if (ValTy->isVectorTy()) { |
802 | unsigned Num = ValTy->getVectorNumElements(); |
803 | if (CondTy) |
804 | CondTy = CondTy->getScalarType(); |
805 | unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost( |
806 | Opcode, ValTy->getScalarType(), CondTy, I); |
807 | |
808 | // Return the cost of multiple scalar invocation plus the cost of |
809 | // inserting and extracting the values. |
810 | return getScalarizationOverhead(ValTy, true, false) + Num * Cost; |
811 | } |
812 | |
813 | // Unknown scalar opcode. |
814 | return 1; |
815 | } |
816 | |
817 | unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { |
818 | std::pair<unsigned, MVT> LT = |
819 | getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); |
820 | |
821 | return LT.first; |
822 | } |
823 | |
824 | unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, |
825 | unsigned AddressSpace, const Instruction *I = nullptr) { |
826 | assert(!Src->isVoidTy() && "Invalid type");
827 | std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src); |
828 | |
829 | // Assuming that all loads of legal types cost 1. |
830 | unsigned Cost = LT.first; |
831 | |
832 | if (Src->isVectorTy() && |
833 | Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) { |
834 | // This is a vector load that legalizes to a larger type than the vector |
835 | // itself. Unless the corresponding extending load or truncating store is |
836 | // legal, then this will scalarize. |
837 | TargetLowering::LegalizeAction LA = TargetLowering::Expand; |
838 | EVT MemVT = getTLI()->getValueType(DL, Src); |
839 | if (Opcode == Instruction::Store) |
840 | LA = getTLI()->getTruncStoreAction(LT.second, MemVT); |
841 | else |
842 | LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); |
843 | |
844 | if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { |
845 | // This is a vector load/store for some illegal type that is scalarized. |
846 | // We must account for the cost of building or decomposing the vector. |
847 | Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store, |
848 | Opcode == Instruction::Store); |
849 | } |
850 | } |
851 | |
852 | return Cost; |
853 | } |
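  | // Worked example (illustrative): a load of <4 x i8> that legalizes to
  | // a wider register on a target without the matching extending load is
  | // treated as scalarized, so the base cost LT.first is increased by the
  | // overhead of inserting the four loaded elements into a vector.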
854 | |
855 | unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, |
856 | unsigned Factor, |
857 | ArrayRef<unsigned> Indices, |
858 | unsigned Alignment, unsigned AddressSpace, |
859 | bool UseMaskForCond = false, |
860 | bool UseMaskForGaps = false) { |
861 | VectorType *VT = dyn_cast<VectorType>(VecTy); |
862 | assert(VT && "Expect a vector type for interleaved memory op")((VT && "Expect a vector type for interleaved memory op" ) ? static_cast<void> (0) : __assert_fail ("VT && \"Expect a vector type for interleaved memory op\"" , "/build/llvm-toolchain-snapshot-9~svn362543/include/llvm/CodeGen/BasicTTIImpl.h" , 862, __PRETTY_FUNCTION__)); |
863 | |
864 | unsigned NumElts = VT->getNumElements(); |
865 | assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
866 | |
867 | unsigned NumSubElts = NumElts / Factor; |
868 | VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); |
869 | |
870 | // First, the cost of the load/store operation.
871 | unsigned Cost; |
872 | if (UseMaskForCond || UseMaskForGaps) |
873 | Cost = static_cast<T *>(this)->getMaskedMemoryOpCost( |
874 | Opcode, VecTy, Alignment, AddressSpace); |
875 | else |
876 | Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment, |
877 | AddressSpace); |
878 | |
879 | // Legalize the vector type, and get the legalized and unlegalized type |
880 | // sizes. |
881 | MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; |
882 | unsigned VecTySize = |
883 | static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy); |
884 | unsigned VecTyLTSize = VecTyLT.getStoreSize(); |
885 | |
886 | // Return the ceiling of dividing A by B. |
887 | auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; }; |
888 | |
889 | // Scale the cost of the memory operation by the fraction of legalized |
890 | // instructions that will actually be used. We shouldn't account for the |
891 | // cost of dead instructions since they will be removed. |
892 | // |
893 | // E.g., An interleaved load of factor 8: |
894 | // %vec = load <16 x i64>, <16 x i64>* %ptr |
895 | // %v0 = shufflevector %vec, undef, <0, 8> |
896 | // |
897 | // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be |
898 | // used (those corresponding to elements [0:1] and [8:9] of the unlegalized |
899 | // type). The other loads are unused. |
900 | // |
901 | // We only scale the cost of loads since interleaved store groups aren't |
902 | // allowed to have gaps. |
903 | if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) { |
904 | // The number of loads of a legal type it will take to represent a load |
905 | // of the unlegalized vector type. |
906 | unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize); |
907 | |
908 | // The number of elements of the unlegalized type that correspond to a |
909 | // single legal instruction. |
910 | unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts); |
911 | |
912 | // Determine which legal instructions will be used. |
913 | BitVector UsedInsts(NumLegalInsts, false); |
914 | for (unsigned Index : Indices) |
915 | for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) |
916 | UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); |
917 | |
918 | // Scale the cost of the load by the fraction of legal instructions that |
919 | // will be used. |
920 | Cost = ceil(Cost * UsedInsts.count(), NumLegalInsts);
921 | } |
922 | |
923 | // Then add the cost of the interleave operation.
924 | if (Opcode == Instruction::Load) { |
925 | // The interleave cost is similar to extracting the sub vectors'
926 | // elements from the wide vector and inserting them into the sub vectors.
927 | // |
928 | // E.g. An interleaved load of factor 2 (with one member of index 0): |
929 | // %vec = load <8 x i32>, <8 x i32>* %ptr |
930 | // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 |
931 | // The cost is estimated as extract elements at 0, 2, 4, 6 from the |
932 | // <8 x i32> vector and insert them into a <4 x i32> vector. |
933 | |
934 | assert(Indices.size() <= Factor &&
935 | "Interleaved memory op has too many members");
936 | |
937 | for (unsigned Index : Indices) { |
938 | assert(Index < Factor && "Invalid index for interleaved memory op");
939 | |
940 | // Extract elements from loaded vector for each sub vector. |
941 | for (unsigned i = 0; i < NumSubElts; i++) |
942 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
943 | Instruction::ExtractElement, VT, Index + i * Factor); |
944 | } |
945 | |
946 | unsigned InsSubCost = 0; |
947 | for (unsigned i = 0; i < NumSubElts; i++) |
948 | InsSubCost += static_cast<T *>(this)->getVectorInstrCost( |
949 | Instruction::InsertElement, SubVT, i); |
950 | |
951 | Cost += Indices.size() * InsSubCost; |
952 | } else { |
953 | // The interleave cost is that of extracting all elements from the sub
954 | // vectors and inserting them into the wide vector.
955 | // |
956 | // E.g. An interleaved store of factor 2: |
957 | // %interleaved.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
958 | // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr |
959 | // The cost is estimated as extract all elements from both <4 x i32> |
960 | // vectors and insert into the <8 x i32> vector. |
961 | |
962 | unsigned ExtSubCost = 0; |
963 | for (unsigned i = 0; i < NumSubElts; i++) |
964 | ExtSubCost += static_cast<T *>(this)->getVectorInstrCost( |
965 | Instruction::ExtractElement, SubVT, i); |
966 | Cost += ExtSubCost * Factor; |
967 | |
968 | for (unsigned i = 0; i < NumElts; i++) |
969 | Cost += static_cast<T *>(this) |
970 | ->getVectorInstrCost(Instruction::InsertElement, VT, i); |
971 | } |
972 | |
973 | if (!UseMaskForCond) |
974 | return Cost; |
975 | |
976 | Type *I8Type = Type::getInt8Ty(VT->getContext()); |
977 | VectorType *MaskVT = VectorType::get(I8Type, NumElts); |
978 | SubVT = VectorType::get(I8Type, NumSubElts); |
979 | |
980 | // The Mask shuffling cost is that of extracting all the elements of the
981 | // Mask and inserting each of them Factor times into the wide vector:
982 | // |
983 | // E.g. an interleaved group with factor 3: |
984 | // %mask = icmp ult <8 x i32> %vec1, %vec2 |
985 | // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, |
986 | // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> |
987 | // The cost is estimated as extracting all mask elements from the <8xi1>
988 | // mask vector and inserting them Factor times into the <24xi1> shuffled
989 | // mask vector.
990 | for (unsigned i = 0; i < NumSubElts; i++) |
991 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
992 | Instruction::ExtractElement, SubVT, i); |
993 | |
994 | for (unsigned i = 0; i < NumElts; i++) |
995 | Cost += static_cast<T *>(this)->getVectorInstrCost( |
996 | Instruction::InsertElement, MaskVT, i); |
997 | |
998 | // The Gaps mask is invariant and created outside the loop, therefore the |
999 | // cost of creating it is not accounted for here. However if we have both |
1000 | // a MaskForGaps and some other mask that guards the execution of the |
1001 | // memory access, we need to account for the cost of And-ing the two masks |
1002 | // inside the loop. |
1003 | if (UseMaskForGaps) |
1004 | Cost += static_cast<T *>(this)->getArithmeticInstrCost( |
1005 | BinaryOperator::And, MaskVT); |
1006 | |
1007 | return Cost; |
1008 | } |
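  | // Worked example (illustrative, assuming unit insert/extract costs):
  | // a factor-2 interleaved load of <8 x i32> with both members requested
  | // is charged the plain <8 x i32> load cost, plus 4 extracts per member
  | // (8 in total) from the wide vector, plus Indices.size() * 4 == 8
  | // inserts into the <4 x i32> sub vectors.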
1009 | |
1010 | /// Get intrinsic cost based on arguments. |
1011 | unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, |
1012 | ArrayRef<Value *> Args, FastMathFlags FMF, |
1013 | unsigned VF = 1) { |
1014 | unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1); |
1015 | assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
1016 | auto *ConcreteTTI = static_cast<T *>(this); |
1017 | |
1018 | switch (IID) { |
1019 | default: { |
1020 | // Assume that we need to scalarize this intrinsic. |
1021 | SmallVector<Type *, 4> Types; |
1022 | for (Value *Op : Args) { |
1023 | Type *OpTy = Op->getType(); |
1024 | assert(VF == 1 || !OpTy->isVectorTy());
1025 | Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF)); |
1026 | } |
1027 | |
1028 | if (VF > 1 && !RetTy->isVoidTy()) |
1029 | RetTy = VectorType::get(RetTy, VF); |
1030 | |
1031 | // Compute the scalarization overhead based on Args for a vector |
1032 | // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while |
1033 | // CostModel will pass a vector RetTy and VF is 1. |
1034 | unsigned ScalarizationCost = std::numeric_limits<unsigned>::max(); |
1035 | if (RetVF > 1 || VF > 1) { |
1036 | ScalarizationCost = 0; |
1037 | if (!RetTy->isVoidTy()) |
1038 | ScalarizationCost += getScalarizationOverhead(RetTy, true, false); |
1039 | ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); |
1040 | } |
1041 | |
1042 | return ConcreteTTI->getIntrinsicInstrCost(IID, RetTy, Types, FMF, |
1043 | ScalarizationCost); |
1044 | } |
1045 | case Intrinsic::masked_scatter: { |
1046 | assert(VF == 1 && "Can't vectorize types here.");
1047 | Value *Mask = Args[3]; |
1048 | bool VarMask = !isa<Constant>(Mask); |
1049 | unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue(); |
1050 | return ConcreteTTI->getGatherScatterOpCost( |
1051 | Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment); |
1052 | } |
1053 | case Intrinsic::masked_gather: { |
1054 | assert(VF == 1 && "Can't vectorize types here.");
1055 | Value *Mask = Args[2]; |
1056 | bool VarMask = !isa<Constant>(Mask); |
1057 | unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue(); |
1058 | return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy, |
1059 | Args[0], VarMask, Alignment); |
1060 | } |
1061 | case Intrinsic::experimental_vector_reduce_add: |
1062 | case Intrinsic::experimental_vector_reduce_mul: |
1063 | case Intrinsic::experimental_vector_reduce_and: |
1064 | case Intrinsic::experimental_vector_reduce_or: |
1065 | case Intrinsic::experimental_vector_reduce_xor: |
1066 | case Intrinsic::experimental_vector_reduce_fadd: |
1067 | case Intrinsic::experimental_vector_reduce_fmul: |
1068 | case Intrinsic::experimental_vector_reduce_smax: |
1069 | case Intrinsic::experimental_vector_reduce_smin: |
1070 | case Intrinsic::experimental_vector_reduce_fmax: |
1071 | case Intrinsic::experimental_vector_reduce_fmin: |
1072 | case Intrinsic::experimental_vector_reduce_umax: |
1073 | case Intrinsic::experimental_vector_reduce_umin: |
1074 | return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF); |
1075 | case Intrinsic::fshl: |
1076 | case Intrinsic::fshr: { |
1077 | Value *X = Args[0]; |
1078 | Value *Y = Args[1]; |
1079 | Value *Z = Args[2]; |
1080 | TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; |
1081 | TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); |
1082 | TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); |
1083 | TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); |
1084 | TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; |
1085 | OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 |
1086 | : TTI::OP_None; |
1087 | // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) |
1088 | // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) |
1089 | unsigned Cost = 0; |
1090 | Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Or, RetTy); |
1091 | Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Sub, RetTy); |
1092 | Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Shl, RetTy, |
1093 | OpKindX, OpKindZ, OpPropsX); |
1094 | Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::LShr, RetTy, |
1095 | OpKindY, OpKindZ, OpPropsY); |
1096 | // Non-constant shift amounts require a modulo.
1097 | if (OpKindZ != TTI::OK_UniformConstantValue && |
1098 | OpKindZ != TTI::OK_NonUniformConstantValue) |
1099 | Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::URem, RetTy, |
1100 | OpKindZ, OpKindBW, OpPropsZ, |
1101 | OpPropsBW); |
1102 | // For non-rotates (X != Y) we must add shift-by-zero handling costs. |
1103 | if (X != Y) { |
1104 | Type *CondTy = Type::getInt1Ty(RetTy->getContext()); |
1105 | if (RetVF > 1) |
1106 | CondTy = VectorType::get(CondTy, RetVF); |
1107 | Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, |
1108 | CondTy, nullptr); |
1109 | Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy, |
1110 | CondTy, nullptr); |
1111 | } |
1112 | return Cost; |
1113 | } |
1114 | } |
1115 | } |
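  | // Worked example (illustrative): a rotate, i.e. fshl with X == Y and a
  | // constant shift amount, decomposes above into or + sub + shl + lshr
  | // (cost 4 at unit op costs); a true funnel shift (X != Y) with a
  | // variable amount additionally pays for the urem, plus the icmp and
  | // select that guard the shift-by-zero case.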
1116 | |
1117 | /// Get intrinsic cost based on argument types. |
1118 | /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the |
1119 | /// cost of scalarizing the arguments and the return value will be computed |
1120 | /// based on types. |
1121 | unsigned getIntrinsicInstrCost( |
1122 | Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, |
1123 | unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) { |
1124 | unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1); |
1125 | auto *ConcreteTTI = static_cast<T *>(this); |
1126 | |
1127 | SmallVector<unsigned, 2> ISDs; |
1128 | unsigned SingleCallCost = 10; // Library call cost. Make it expensive. |
1129 | switch (IID) { |
1130 | default: { |
1131 | // Assume that we need to scalarize this intrinsic. |
1132 | unsigned ScalarizationCost = ScalarizationCostPassed; |
1133 | unsigned ScalarCalls = 1; |
1134 | Type *ScalarRetTy = RetTy; |
1135 | if (RetTy->isVectorTy()) { |
1136 | if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max()) |
1137 | ScalarizationCost = getScalarizationOverhead(RetTy, true, false); |
1138 | ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements()); |
1139 | ScalarRetTy = RetTy->getScalarType(); |
1140 | } |
1141 | SmallVector<Type *, 4> ScalarTys; |
1142 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1143 | Type *Ty = Tys[i]; |
1144 | if (Ty->isVectorTy()) { |
1145 | if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max()) |
1146 | ScalarizationCost += getScalarizationOverhead(Ty, false, true); |
1147 | ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements()); |
1148 | Ty = Ty->getScalarType(); |
1149 | } |
1150 | ScalarTys.push_back(Ty); |
1151 | } |
1152 | if (ScalarCalls == 1) |
1153 | return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. |
1154 | |
1155 | unsigned ScalarCost = |
1156 | ConcreteTTI->getIntrinsicInstrCost(IID, ScalarRetTy, ScalarTys, FMF); |
1157 | |
1158 | return ScalarCalls * ScalarCost + ScalarizationCost; |
1159 | } |
1160 | // Look for intrinsics that can be lowered directly or turned into a scalar |
1161 | // intrinsic call. |
1162 | case Intrinsic::sqrt: |
1163 | ISDs.push_back(ISD::FSQRT); |
1164 | break; |
1165 | case Intrinsic::sin: |
1166 | ISDs.push_back(ISD::FSIN); |
1167 | break; |
1168 | case Intrinsic::cos: |
1169 | ISDs.push_back(ISD::FCOS); |
1170 | break; |
1171 | case Intrinsic::exp: |
1172 | ISDs.push_back(ISD::FEXP); |
1173 | break; |
1174 | case Intrinsic::exp2: |
1175 | ISDs.push_back(ISD::FEXP2); |
1176 | break; |
1177 | case Intrinsic::log: |
1178 | ISDs.push_back(ISD::FLOG); |
1179 | break; |
1180 | case Intrinsic::log10: |
1181 | ISDs.push_back(ISD::FLOG10); |
1182 | break; |
1183 | case Intrinsic::log2: |
1184 | ISDs.push_back(ISD::FLOG2); |
1185 | break; |
1186 | case Intrinsic::fabs: |
1187 | ISDs.push_back(ISD::FABS); |
1188 | break; |
1189 | case Intrinsic::canonicalize: |
1190 | ISDs.push_back(ISD::FCANONICALIZE); |
1191 | break; |
1192 | case Intrinsic::minnum: |
1193 | ISDs.push_back(ISD::FMINNUM); |
1194 | if (FMF.noNaNs()) |
1195 | ISDs.push_back(ISD::FMINIMUM); |
1196 | break; |
1197 | case Intrinsic::maxnum: |
1198 | ISDs.push_back(ISD::FMAXNUM); |
1199 | if (FMF.noNaNs()) |
1200 | ISDs.push_back(ISD::FMAXIMUM); |
1201 | break; |
1202 | case Intrinsic::copysign: |
1203 | ISDs.push_back(ISD::FCOPYSIGN); |
1204 | break; |
1205 | case Intrinsic::floor: |
1206 | ISDs.push_back(ISD::FFLOOR); |
1207 | break; |
1208 | case Intrinsic::ceil: |
1209 | ISDs.push_back(ISD::FCEIL); |
1210 | break; |
1211 | case Intrinsic::trunc: |
1212 | ISDs.push_back(ISD::FTRUNC); |
1213 | break; |
1214 | case Intrinsic::nearbyint: |
1215 | ISDs.push_back(ISD::FNEARBYINT); |
1216 | break; |
1217 | case Intrinsic::rint: |
1218 | ISDs.push_back(ISD::FRINT); |
1219 | break; |
1220 | case Intrinsic::round: |
1221 | ISDs.push_back(ISD::FROUND); |
1222 | break; |
1223 | case Intrinsic::pow: |
1224 | ISDs.push_back(ISD::FPOW); |
1225 | break; |
1226 | case Intrinsic::fma: |
1227 | ISDs.push_back(ISD::FMA); |
1228 | break; |
1229 | case Intrinsic::fmuladd: |
1230 | ISDs.push_back(ISD::FMA); |
1231 | break; |
1232 | // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. |
1233 | case Intrinsic::lifetime_start: |
1234 | case Intrinsic::lifetime_end: |
1235 | case Intrinsic::sideeffect: |
1236 | return 0; |
1237 | case Intrinsic::masked_store: |
1238 | return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, |
1239 | 0); |
1240 | case Intrinsic::masked_load: |
1241 | return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0); |
1242 | case Intrinsic::experimental_vector_reduce_add: |
1243 | return ConcreteTTI->getArithmeticReductionCost(Instruction::Add, Tys[0], |
1244 | /*IsPairwiseForm=*/false); |
1245 | case Intrinsic::experimental_vector_reduce_mul: |
1246 | return ConcreteTTI->getArithmeticReductionCost(Instruction::Mul, Tys[0], |
1247 | /*IsPairwiseForm=*/false); |
1248 | case Intrinsic::experimental_vector_reduce_and: |
1249 | return ConcreteTTI->getArithmeticReductionCost(Instruction::And, Tys[0], |
1250 | /*IsPairwiseForm=*/false); |
1251 | case Intrinsic::experimental_vector_reduce_or: |
1252 | return ConcreteTTI->getArithmeticReductionCost(Instruction::Or, Tys[0], |
1253 | /*IsPairwiseForm=*/false); |
1254 | case Intrinsic::experimental_vector_reduce_xor: |
1255 | return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0], |
1256 | /*IsPairwiseForm=*/false); |
1257 | case Intrinsic::experimental_vector_reduce_fadd: |
1258 | return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0], |
1259 | /*IsPairwiseForm=*/false); |
1260 | case Intrinsic::experimental_vector_reduce_fmul: |
1261 | return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0], |
1262 | /*IsPairwiseForm=*/false); |
1263 | case Intrinsic::experimental_vector_reduce_smax: |
1264 | case Intrinsic::experimental_vector_reduce_smin: |
1265 | case Intrinsic::experimental_vector_reduce_fmax: |
1266 | case Intrinsic::experimental_vector_reduce_fmin: |
1267 | return ConcreteTTI->getMinMaxReductionCost( |
1268 | Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false, |
1269 | /*IsSigned=*/true); |
1270 | case Intrinsic::experimental_vector_reduce_umax: |
1271 | case Intrinsic::experimental_vector_reduce_umin: |
1272 | return ConcreteTTI->getMinMaxReductionCost( |
1273 | Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false, |
1274 | /*IsSigned=*/false); |
1275 | case Intrinsic::sadd_sat: |
1276 | case Intrinsic::ssub_sat: { |
1277 | Type *CondTy = Type::getInt1Ty(RetTy->getContext()); |
1278 | if (RetVF > 1) |
1279 | CondTy = VectorType::get(CondTy, RetVF); |
1280 | |
1281 | Type *OpTy = StructType::create({RetTy, CondTy}); |
1282 | Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat |
1283 | ? Intrinsic::sadd_with_overflow |
1284 | : Intrinsic::ssub_with_overflow; |
1285 | |
1286 | // SatMax -> Overflow && SumDiff < 0 |
1287 | // SatMin -> Overflow && SumDiff >= 0 |
1288 | unsigned Cost = 0; |
1289 | Cost += ConcreteTTI->getIntrinsicInstrCost( |
1290 | OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed); |
1291 | Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, |
1292 | CondTy, nullptr); |
1293 | Cost += 2 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy, |
1294 | CondTy, nullptr); |
1295 | return Cost; |
1296 | } |
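  | // Worked example (illustrative): for a scalar i32 sadd.sat, the
  | // expansion above is charged as one sadd.with.overflow plus one icmp
  | // and two selects, mirroring the SatMax/SatMin selection logic in the
  | // comment above.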
1297 | case Intrinsic::uadd_sat: |
1298 | case Intrinsic::usub_sat: { |
1299 | Type *CondTy = Type::getInt1Ty(RetTy->getContext()); |
1300 | if (RetVF > 1) |
1301 | CondTy = VectorType::get(CondTy, RetVF); |
1302 | |
1303 | Type *OpTy = StructType::create({RetTy, CondTy}); |
1304 | Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat |
1305 | ? Intrinsic::uadd_with_overflow |
1306 | : Intrinsic::usub_with_overflow; |
1307 | |
1308 | unsigned Cost = 0; |
1309 | Cost += ConcreteTTI->getIntrinsicInstrCost( |
1310 | OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed); |
1311 | Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy, |
1312 | CondTy, nullptr); |
1313 | return Cost; |
1314 | } |
1315 | case Intrinsic::smul_fix: |
1316 | case Intrinsic::umul_fix: { |
1317 | unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; |
1318 | Type *ExtTy = Type::getIntNTy(RetTy->getContext(), ExtSize); |
1319 | if (RetVF > 1) |
1320 | ExtTy = VectorType::get(ExtTy, RetVF); |
1321 | |
1322 | unsigned ExtOp = |
1323 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; |
1324 | |
1325 | unsigned Cost = 0; |
1326 | Cost += 2 * ConcreteTTI->getCastInstrCost(ExtOp, ExtTy, RetTy); |
1327 | Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Mul, ExtTy); |
1328 | Cost += |
1329 | 2 * ConcreteTTI->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy); |
1330 | Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::LShr, RetTy, |
1331 | TTI::OK_AnyValue, |
1332 | TTI::OK_UniformConstantValue); |
1333 | Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Shl, RetTy, |
1334 | TTI::OK_AnyValue, |
1335 | TTI::OK_UniformConstantValue); |
1336 | Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Or, RetTy); |
1337 | return Cost; |
1338 | } |
1339 | case Intrinsic::sadd_with_overflow: |
1340 | case Intrinsic::ssub_with_overflow: { |
1341 | Type *SumTy = RetTy->getContainedType(0); |
1342 | Type *OverflowTy = RetTy->getContainedType(1); |
1343 | unsigned Opcode = IID == Intrinsic::sadd_with_overflow |
1344 | ? BinaryOperator::Add |
1345 | : BinaryOperator::Sub; |
1346 | |
1347 | // LHSSign -> LHS >= 0 |
1348 | // RHSSign -> RHS >= 0 |
1349 | // SumSign -> Sum >= 0 |
1350 | // |
1351 | // Add: |
1352 | // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) |
1353 | // Sub: |
1354 | // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) |
1355 | unsigned Cost = 0; |
1356 | Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy); |
1357 | Cost += 3 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, |
1358 | OverflowTy, nullptr); |
1359 | Cost += 2 * ConcreteTTI->getCmpSelInstrCost( |
1360 | BinaryOperator::ICmp, OverflowTy, OverflowTy, nullptr); |
1361 | Cost += |
1362 | ConcreteTTI->getArithmeticInstrCost(BinaryOperator::And, OverflowTy); |
1363 | return Cost; |
1364 | } |
1365 | case Intrinsic::uadd_with_overflow: |
1366 | case Intrinsic::usub_with_overflow: { |
1367 | Type *SumTy = RetTy->getContainedType(0); |
1368 | Type *OverflowTy = RetTy->getContainedType(1); |
1369 | unsigned Opcode = IID == Intrinsic::uadd_with_overflow |
1370 | ? BinaryOperator::Add |
1371 | : BinaryOperator::Sub; |
1372 | |
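 | // Unsigned overflow needs only the operation itself plus one compare, |
 | // e.g. for uadd.with.overflow (sketch): |
 | //   %sum = add i32 %a, %b |
 | //   %ovf = icmp ult i32 %sum, %a            ; icmp ugt for usub.with.overflow |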
1373 | unsigned Cost = 0; |
1374 | Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy); |
1375 | Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, |
1376 | OverflowTy, nullptr); |
1377 | return Cost; |
1378 | } |
1379 | case Intrinsic::smul_with_overflow: |
1380 | case Intrinsic::umul_with_overflow: { |
1381 | Type *MulTy = RetTy->getContainedType(0); |
1382 | Type *OverflowTy = RetTy->getContainedType(1); |
1383 | unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; |
1384 | Type *ExtTy = Type::getIntNTy(RetTy->getContext(), ExtSize); |
1385 | if (MulTy->isVectorTy()) |
1386 | ExtTy = VectorType::get(ExtTy, MulTy->getVectorNumElements()); |
1387 | |
1388 | unsigned ExtOp = IID == Intrinsic::smul_with_overflow |
1389 | ? Instruction::SExt : Instruction::ZExt; |
1390 | |
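 | // The assumed expansion (sketch) widens, multiplies, and then checks |
 | // whether the high half of the product is significant, e.g. for |
 | // umul.with.overflow.i32: |
 | //   %ax  = zext i32 %a to i64 |
 | //   %bx  = zext i32 %b to i64 |
 | //   %m   = mul i64 %ax, %bx |
 | //   %hi  = trunc i64 <high half of %m> to i32 |
 | //   %ovf = icmp ne i32 %hi, 0 |
 | // For the signed form the high half is instead compared against the |
 | // low half ashr'd by BitWidth-1, hence the extra AShr costed below. |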
1391 | unsigned Cost = 0; |
1392 | Cost += 2 * ConcreteTTI->getCastInstrCost(ExtOp, ExtTy, MulTy); |
1393 | Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Mul, ExtTy); |
1394 | Cost += |
1395 | 2 * ConcreteTTI->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy); |
1396 | Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::LShr, MulTy, |
1397 | TTI::OK_AnyValue, |
1398 | TTI::OK_UniformConstantValue); |
1399 | |
1400 | if (IID == Intrinsic::smul_with_overflow) |
1401 | Cost += ConcreteTTI->getArithmeticInstrCost( |
1402 | Instruction::AShr, MulTy, TTI::OK_AnyValue, |
1403 | TTI::OK_UniformConstantValue); |
1404 | |
1405 | Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, |
1406 | OverflowTy, nullptr); |
1407 | return Cost; |
1408 | } |
1409 | case Intrinsic::ctpop: |
1410 | ISDs.push_back(ISD::CTPOP); |
1411 | // If the operation needs legalization, use TCC_Expensive. This is cheaper |
1412 | // than a library call but still not a cheap instruction. |
1413 | SingleCallCost = TargetTransformInfo::TCC_Expensive; |
1414 | break; |
1415 | // FIXME: ctlz, cttz, ... |
1416 | } |
1417 | |
1418 | const TargetLoweringBase *TLI = getTLI(); |
1419 | std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); |
1420 | |
1421 | SmallVector<unsigned, 2> LegalCost; |
1422 | SmallVector<unsigned, 2> CustomCost; |
1423 | for (unsigned ISD : ISDs) { |
1424 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
1425 | if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && |
1426 | TLI->isFAbsFree(LT.second)) { |
1427 | return 0; |
1428 | } |
1429 | |
1430 | // The operation is legal. Assume it costs 1. |
1431 | // If the type is split to multiple registers, assume that there is some |
1432 | // overhead to this. |
1433 | // TODO: Once we have extract/insert subvector cost we need to use them. |
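 | // E.g. (sketch) a type that legalizes to two registers (LT.first == 2) |
 | // is costed at LT.first * 2 == 4, while a single-register type costs 1. |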
1434 | if (LT.first > 1) |
1435 | LegalCost.push_back(LT.first * 2); |
1436 | else |
1437 | LegalCost.push_back(LT.first * 1); |
1438 | } else if (!TLI->isOperationExpand(ISD, LT.second)) { |
1439 | // If the operation is custom lowered then assume |
1440 | // that the code is twice as expensive. |
1441 | CustomCost.push_back(LT.first * 2); |
1442 | } |
1443 | } |
1444 | |
1445 | auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); |
1446 | if (MinLegalCostI != LegalCost.end()) |
1447 | return *MinLegalCostI; |
1448 | |
1449 | auto MinCustomCostI = |
1450 | std::min_element(CustomCost.begin(), CustomCost.end()); |
1451 | if (MinCustomCostI != CustomCost.end()) |
1452 | return *MinCustomCostI; |
1453 | |
1454 | // If we can't lower fmuladd into an FMA, estimate the cost as a |
1455 | // floating-point mul followed by an add. |
1456 | if (IID == Intrinsic::fmuladd) |
1457 | return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) + |
1458 | ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy); |
1459 | |
1460 | // Otherwise, assume that we need to scalarize this intrinsic. For math |
1461 | // builtins this will emit a costly libcall, adding call overhead and |
1462 | // spills. Make it very expensive. |
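 | // E.g. (sketch) a <4 x float> @llvm.sin call is costed as four scalar |
 | // sin calls plus the overhead of extracting the lanes of each vector |
 | // argument and inserting the four results. |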
1463 | if (RetTy->isVectorTy()) { |
1464 | unsigned ScalarizationCost = |
1465 | ((ScalarizationCostPassed != std::numeric_limits<unsigned>::max()) |
1466 | ? ScalarizationCostPassed |
1467 | : getScalarizationOverhead(RetTy, true, false)); |
1468 | unsigned ScalarCalls = RetTy->getVectorNumElements(); |
1469 | SmallVector<Type *, 4> ScalarTys; |
1470 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1471 | Type *Ty = Tys[i]; |
1472 | if (Ty->isVectorTy()) |
1473 | Ty = Ty->getScalarType(); |
1474 | ScalarTys.push_back(Ty); |
1475 | } |
1476 | unsigned ScalarCost = ConcreteTTI->getIntrinsicInstrCost( |
1477 | IID, RetTy->getScalarType(), ScalarTys, FMF); |
1478 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1479 | if (Tys[i]->isVectorTy()) { |
1480 | if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max()) |
1481 | ScalarizationCost += getScalarizationOverhead(Tys[i], false, true); |
1482 | ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements()); |
1483 | } |
1484 | } |
1485 | |
1486 | return ScalarCalls * ScalarCost + ScalarizationCost; |
1487 | } |
1488 | |
1489 | // This is going to be turned into a library call, make it expensive. |
1490 | return SingleCallCost; |
1491 | } |
1492 | |
1493 | /// Compute a cost of the given call instruction. |
1494 | /// |
1495 | /// Compute the cost of calling function F with return type RetTy and |
1496 | /// argument types Tys. F might be nullptr, in which case the cost of an |
1497 | /// arbitrary call with the specified signature is returned. This is used, |
1498 | /// for instance, when estimating the cost of calling a vector counterpart |
1499 | /// of the given scalar function. |
1500 | /// \param F Called function; might be nullptr. |
1501 | /// \param RetTy Return value type. |
1502 | /// \param Tys Argument types. |
1503 | /// \returns The cost of the call instruction. |
1504 | unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) { |
1505 | return 10; |
1506 | } |
1507 | |
1508 | unsigned getNumberOfParts(Type *Tp) { |
1509 | std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp); |
1510 | return LT.first; |
1511 | } |
1512 | |
1513 | unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, |
1514 | const SCEV *) { |
1515 | return 0; |
1516 | } |
1517 | |
1518 | /// Try to calculate arithmetic and shuffle op costs for reduction operations. |
1519 | /// We assume that reduction operations are performed in the following way: |
1520 | /// 1. Non-pairwise reduction |
1521 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, |
1522 | /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n-1, i32 undef, ..., i32 undef> |
1523 | /// \----------------v-------------/ \----------v------------/ |
1524 | /// n/2 elements n/2 elements |
1525 | /// %red1 = op <n x t> %val, <n x t> %val1 |
1526 | /// After this operation we have a vector %red1 where only the first n/2 |
1527 | /// elements are meaningful, the second n/2 elements are undefined and can be |
1528 | /// dropped. All other operations are actually working with the vector of |
1529 | /// length n/2, not n, though the real vector length is still n. |
1530 | /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, |
1531 | /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2-1, i32 undef, ..., i32 undef> |
1532 | /// \----------------v-------------/ \----------v------------/ |
1533 | /// n/4 elements 3*n/4 elements |
1534 | /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of |
1535 | /// length n/2, the resulting vector has length n/4 etc. |
1536 | /// 2. Pairwise reduction: |
1537 | /// Everything is the same except for an additional shuffle operation which |
1538 | /// is used to produce operands for pairwise kind of reductions. |
1539 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, |
1540 | /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef> |
1541 | /// \-------------v----------/ \----------v------------/ |
1542 | /// n/2 elements n/2 elements |
1543 | /// %val2 = shufflevector<n x t> %val, <n x t> %undef, |
1544 | /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef> |
1545 | /// \-------------v----------/ \----------v------------/ |
1546 | /// n/2 elements n/2 elements |
1547 | /// %red1 = op <n x t> %val1, <n x t> %val2 |
1548 | /// Again, the operation is performed on <n x t> vector, but the resulting |
1549 | /// vector %red1 is <n/2 x t> vector. |
1550 | /// |
1551 | /// The cost model should take into account that the actual length of the |
1552 | /// vector is reduced on each iteration. |
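 | /// For example (sketch), a non-pairwise fadd reduction of <8 x float> |
 | /// is costed as Log2(8) = 3 shufflevector+fadd levels, logically working |
 | /// on 4, 2 and finally 1 meaningful element, plus a single |
 | /// extractelement of lane 0 at the end. |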
1553 | unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty, |
1554 | bool IsPairwise) { |
1555 | assert(Ty->isVectorTy() && "Expect a vector type"); |
1556 | Type *ScalarTy = Ty->getVectorElementType(); |
1557 | unsigned NumVecElts = Ty->getVectorNumElements(); |
1558 | unsigned NumReduxLevels = Log2_32(NumVecElts); |
1559 | unsigned ArithCost = 0; |
1560 | unsigned ShuffleCost = 0; |
1561 | auto *ConcreteTTI = static_cast<T *>(this); |
1562 | std::pair<unsigned, MVT> LT = |
1563 | ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty); |
1564 | unsigned LongVectorCount = 0; |
1565 | unsigned MVTLen = |
1566 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
1567 | while (NumVecElts > MVTLen) { |
1568 | NumVecElts /= 2; |
1569 | Type *SubTy = VectorType::get(ScalarTy, NumVecElts); |
1570 | // Assume the pairwise shuffles add a cost. |
1571 | ShuffleCost += (IsPairwise + 1) * |
1572 | ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, |
1573 | NumVecElts, SubTy); |
1574 | ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy); |
1575 | Ty = SubTy; |
1576 | ++LongVectorCount; |
1577 | } |
1578 | |
1579 | NumReduxLevels -= LongVectorCount; |
1580 | |
1581 | // The minimal length of the vector is limited by the real length of vector |
1582 | // operations performed on the current platform. That's why several final |
1583 | // reduction operations are performed on vectors of the same |
1584 | // architecture-dependent length. |
1585 | |
1586 | // Non-pairwise reductions need one shuffle per reduction level. Pairwise |
1587 | // reductions need two shuffles on every level but the last; on the last |
1588 | // level one of the shuffles is <0, u, u, ...>, which is an identity. |
1589 | unsigned NumShuffles = NumReduxLevels; |
1590 | if (IsPairwise && NumReduxLevels >= 1) |
1591 | NumShuffles += NumReduxLevels - 1; |
1592 | ShuffleCost += NumShuffles * |
1593 | ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, |
1594 | 0, Ty); |
1595 | ArithCost += NumReduxLevels * |
1596 | ConcreteTTI->getArithmeticInstrCost(Opcode, Ty); |
1597 | return ShuffleCost + ArithCost + |
1598 | ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); |
1599 | } |
1600 | |
1601 | /// Try to calculate op costs for min/max reduction operations. |
1602 | /// \param CondTy Conditional type for the Select instruction. |
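 | /// The reduction tree is the same as for arithmetic reductions, except |
 | /// that each level is costed as an icmp/fcmp plus a select instead of a |
 | /// single binary op, e.g. (sketch): |
 | ///   %c = icmp sgt <k x i32> %a, %b |
 | ///   %m = select <k x i1> %c, <k x i32> %a, <k x i32> %b |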
1603 | unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise, |
1604 | bool) { |
1605 | assert(Ty->isVectorTy() && "Expect a vector type"); |
1606 | Type *ScalarTy = Ty->getVectorElementType(); |
1607 | Type *ScalarCondTy = CondTy->getVectorElementType(); |
1608 | unsigned NumVecElts = Ty->getVectorNumElements(); |
1609 | unsigned NumReduxLevels = Log2_32(NumVecElts); |
1610 | unsigned CmpOpcode; |
1611 | if (Ty->isFPOrFPVectorTy()) { |
1612 | CmpOpcode = Instruction::FCmp; |
1613 | } else { |
1614 | assert(Ty->isIntOrIntVectorTy() && |
1615 | "expecting floating point or integer type for min/max reduction"); |
1616 | CmpOpcode = Instruction::ICmp; |
1617 | } |
1618 | unsigned MinMaxCost = 0; |
1619 | unsigned ShuffleCost = 0; |
1620 | auto *ConcreteTTI = static_cast<T *>(this); |
1621 | std::pair<unsigned, MVT> LT = |
1622 | ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty); |
1623 | unsigned LongVectorCount = 0; |
1624 | unsigned MVTLen = |
1625 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
1626 | while (NumVecElts > MVTLen) { |
1627 | NumVecElts /= 2; |
1628 | Type *SubTy = VectorType::get(ScalarTy, NumVecElts); |
1629 | CondTy = VectorType::get(ScalarCondTy, NumVecElts); |
1630 | |
1631 | // Assume the pairwise shuffles add a cost. |
1632 | ShuffleCost += (IsPairwise + 1) * |
1633 | ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, |
1634 | NumVecElts, SubTy); |
1635 | MinMaxCost += |
1636 | ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) + |
1637 | ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, |
1638 | nullptr); |
1639 | Ty = SubTy; |
1640 | ++LongVectorCount; |
1641 | } |
1642 | |
1643 | NumReduxLevels -= LongVectorCount; |
1644 | |
1645 | // The minimal length of the vector is limited by the real length of vector |
1646 | // operations performed on the current platform. That's why several final |
1647 | // reduction operations are performed on vectors of the same |
1648 | // architecture-dependent length. |
1649 | |
1650 | // Non-pairwise reductions need one shuffle per reduction level. Pairwise |
1651 | // reductions need two shuffles on every level but the last; on the last |
1652 | // level one of the shuffles is <0, u, u, ...>, which is an identity. |
1653 | unsigned NumShuffles = NumReduxLevels; |
1654 | if (IsPairwise && NumReduxLevels >= 1) |
1655 | NumShuffles += NumReduxLevels - 1; |
1656 | ShuffleCost += NumShuffles * |
1657 | ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, |
1658 | 0, Ty); |
1659 | MinMaxCost += |
1660 | NumReduxLevels * |
1661 | (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) + |
1662 | ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, |
1663 | nullptr)); |
1664 | // The last min/max should be in vector registers and we counted it above. |
1665 | // So just need a single extractelement. |
1666 | return ShuffleCost + MinMaxCost + |
1667 | ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); |
1668 | } |
1669 | |
1670 | unsigned getVectorSplitCost() { return 1; } |
1671 | |
1672 | /// @} |
1673 | }; |
1674 | |
1675 | /// Concrete BasicTTIImpl that can be used if no further customization |
1676 | /// is needed. |
1677 | class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { |
1678 | using BaseT = BasicTTIImplBase<BasicTTIImpl>; |
1679 | |
1680 | friend class BasicTTIImplBase<BasicTTIImpl>; |
1681 | |
1682 | const TargetSubtargetInfo *ST; |
1683 | const TargetLoweringBase *TLI; |
1684 | |
1685 | const TargetSubtargetInfo *getST() const { return ST; } |
1686 | const TargetLoweringBase *getTLI() const { return TLI; } |
1687 | |
1688 | public: |
1689 | explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); |
1690 | }; |
1691 | |
1692 | } // end namespace llvm |
1693 | |
1694 | #endif // LLVM_CODEGEN_BASICTTIIMPL_H |