File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2294, column 62: The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
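
The warned pattern is a shift whose count can reach the operand's bit width:
in C++, shifting a 32-bit unsigned int by 32 is undefined behavior. Line 2294
is outside this excerpt, so the snippet below is only an illustrative sketch
of the diagnostic's shape and one conventional guard, not the code the
analyzer flagged:

    unsigned maskLow(unsigned Width) {
      return ~0u >> (32 - Width); // undefined when Width == 0: count is 32
    }
    unsigned maskLowSafe(unsigned Width) {
      return Width == 0 ? 0u : ~0u >> (32 - Width); // handle the boundary
    }
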
//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
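
// Note: 1 << Log2_32_Ceil(X) above belongs to the same shift family the
// analyzer warns about; it would be undefined for X > 2^31. In practice the
// LLT sizes reaching these helpers stay far below that bound.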

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
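
// For example, fewerEltsToSize64Vector maps v5s32 (160 bits) through
// Pieces = (160 + 63) / 64 = 3 and NewNumElts = (5 + 1) / 3 = 2, yielding
// v2s32; the legalizer then re-applies its rules to the split pieces.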

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}
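
// For example, v3s8 (24 bits) has NextMul32 = 1, so
// NewNumElts = (32 + 8 - 1) / 8 = 4 and the result is v4s8 (32 bits).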

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));
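
  // A note on reading these chains: clauses are tried in the order they were
  // added and the first match decides the action, so the trailing
  // legalIf(isPointer(0)) only applies to queries no earlier clause claimed.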

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }
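
  // For example, on a subtarget with neither VOP3P nor 16-bit instructions,
  // a G_ADD of s8 matches no legalFor set; clampScalar(0, S32, S32) widens
  // it to s32, which the legalFor({S32}) clause then accepts.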

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder(G_TRUNC)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };
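
  // For example, a 256-bit store to a GlobalPtr exceeds the 128-bit store
  // limit above, so needToSplitMemOp below will report it for splitting.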

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };
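
  // For example, a 96-bit global load on a subtarget without dwordx3
  // loads/stores: 96 is not a power of two and is under the address space
  // limit, so with 128-bit alignment this reports true and the load is
  // widened to the rounded 128-bit size.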

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
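
  // For example, without 16-bit instructions a G_SHL of s16 by an s64 amount
  // first has the amount clamped to s32, then the shifted type widened to
  // s32, landing on the legal {S32, S32} case.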

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
1271 | |||||||
1272 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||||
1273 | // RegBankSelect. | ||||||
1274 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | ||||||
1275 | .legalFor({{S32}, {S64}}); | ||||||
1276 | |||||||
1277 | if (ST.hasVOP3PInsts()) { | ||||||
1278 | SextInReg.lowerFor({{V2S16}}) | ||||||
1279 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | ||||||
1280 | // get more vector shift opportunities, since we'll get those when | ||||||
1281 | // expanded. | ||||||
1282 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); | ||||||
1283 | } else if (ST.has16BitInsts()) { | ||||||
1284 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | ||||||
1285 | } else { | ||||||
1286 | // Prefer to promote to s32 before lowering if we don't have 16-bit | ||||||
1287 | // shifts. This avoids a lot of intermediate truncate and extend operations.
1288 | SextInReg.lowerFor({{S32}, {S64}}); | ||||||
1289 | } | ||||||
1290 | |||||||
1291 | SextInReg | ||||||
1292 | .scalarize(0) | ||||||
1293 | .clampScalar(0, S32, S64) | ||||||
1294 | .lower(); | ||||||
1295 | |||||||
1296 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | ||||||
1297 | .legalFor({S64}); | ||||||
1298 | |||||||
1299 | getActionDefinitionsBuilder({ | ||||||
1300 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | ||||||
1301 | G_FCOPYSIGN, | ||||||
1302 | |||||||
1303 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | ||||||
1304 | G_READ_REGISTER, | ||||||
1305 | G_WRITE_REGISTER, | ||||||
1306 | |||||||
1307 | G_SADDO, G_SSUBO, | ||||||
1308 | |||||||
1309 | // TODO: Implement | ||||||
1310 | G_FMINIMUM, G_FMAXIMUM | ||||||
1311 | }).lower(); | ||||||
1312 | |||||||
1313 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | ||||||
1314 | G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | ||||||
1315 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | ||||||
1316 | .unsupported(); | ||||||
1317 | |||||||
1318 | computeTables(); | ||||||
1319 | verify(*ST.getInstrInfo()); | ||||||
1320 | } | ||||||
1321 | |||||||
1322 | bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, | ||||||
1323 | MachineRegisterInfo &MRI, | ||||||
1324 | MachineIRBuilder &B, | ||||||
1325 | GISelChangeObserver &Observer) const { | ||||||
1326 | switch (MI.getOpcode()) { | ||||||
1327 | case TargetOpcode::G_ADDRSPACE_CAST: | ||||||
1328 | return legalizeAddrSpaceCast(MI, MRI, B); | ||||||
1329 | case TargetOpcode::G_FRINT: | ||||||
1330 | return legalizeFrint(MI, MRI, B); | ||||||
1331 | case TargetOpcode::G_FCEIL: | ||||||
1332 | return legalizeFceil(MI, MRI, B); | ||||||
1333 | case TargetOpcode::G_INTRINSIC_TRUNC: | ||||||
1334 | return legalizeIntrinsicTrunc(MI, MRI, B); | ||||||
1335 | case TargetOpcode::G_SITOFP: | ||||||
1336 | return legalizeITOFP(MI, MRI, B, true); | ||||||
1337 | case TargetOpcode::G_UITOFP: | ||||||
1338 | return legalizeITOFP(MI, MRI, B, false); | ||||||
1339 | case TargetOpcode::G_FPTOSI: | ||||||
1340 | return legalizeFPTOI(MI, MRI, B, true); | ||||||
1341 | case TargetOpcode::G_FPTOUI: | ||||||
1342 | return legalizeFPTOI(MI, MRI, B, false); | ||||||
1343 | case TargetOpcode::G_FMINNUM: | ||||||
1344 | case TargetOpcode::G_FMAXNUM: | ||||||
1345 | case TargetOpcode::G_FMINNUM_IEEE: | ||||||
1346 | case TargetOpcode::G_FMAXNUM_IEEE: | ||||||
1347 | return legalizeMinNumMaxNum(MI, MRI, B); | ||||||
1348 | case TargetOpcode::G_EXTRACT_VECTOR_ELT: | ||||||
1349 | return legalizeExtractVectorElt(MI, MRI, B); | ||||||
1350 | case TargetOpcode::G_INSERT_VECTOR_ELT: | ||||||
1351 | return legalizeInsertVectorElt(MI, MRI, B); | ||||||
1352 | case TargetOpcode::G_SHUFFLE_VECTOR: | ||||||
1353 | return legalizeShuffleVector(MI, MRI, B); | ||||||
1354 | case TargetOpcode::G_FSIN: | ||||||
1355 | case TargetOpcode::G_FCOS: | ||||||
1356 | return legalizeSinCos(MI, MRI, B); | ||||||
1357 | case TargetOpcode::G_GLOBAL_VALUE: | ||||||
1358 | return legalizeGlobalValue(MI, MRI, B); | ||||||
1359 | case TargetOpcode::G_LOAD: | ||||||
1360 | return legalizeLoad(MI, MRI, B, Observer); | ||||||
1361 | case TargetOpcode::G_FMAD: | ||||||
1362 | return legalizeFMad(MI, MRI, B); | ||||||
1363 | case TargetOpcode::G_FDIV: | ||||||
1364 | return legalizeFDIV(MI, MRI, B); | ||||||
1365 | case TargetOpcode::G_UDIV: | ||||||
1366 | case TargetOpcode::G_UREM: | ||||||
1367 | return legalizeUDIV_UREM(MI, MRI, B); | ||||||
1368 | case TargetOpcode::G_SDIV: | ||||||
1369 | case TargetOpcode::G_SREM: | ||||||
1370 | return legalizeSDIV_SREM(MI, MRI, B); | ||||||
1371 | case TargetOpcode::G_ATOMIC_CMPXCHG: | ||||||
1372 | return legalizeAtomicCmpXChg(MI, MRI, B); | ||||||
1373 | case TargetOpcode::G_FLOG: | ||||||
1374 | return legalizeFlog(MI, B, 1.0f / numbers::log2ef); | ||||||
1375 | case TargetOpcode::G_FLOG10: | ||||||
1376 | return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); | ||||||
1377 | case TargetOpcode::G_FEXP: | ||||||
1378 | return legalizeFExp(MI, B); | ||||||
1379 | case TargetOpcode::G_FPOW: | ||||||
1380 | return legalizeFPow(MI, B); | ||||||
1381 | case TargetOpcode::G_FFLOOR: | ||||||
1382 | return legalizeFFloor(MI, MRI, B); | ||||||
1383 | case TargetOpcode::G_BUILD_VECTOR: | ||||||
1384 | return legalizeBuildVector(MI, MRI, B); | ||||||
1385 | default: | ||||||
1386 | return false; | ||||||
1387 | } | ||||||
1388 | |||||||
1389 | llvm_unreachable("expected switch to return");
1390 | } | ||||||
1391 | |||||||
1392 | Register AMDGPULegalizerInfo::getSegmentAperture( | ||||||
1393 | unsigned AS, | ||||||
1394 | MachineRegisterInfo &MRI, | ||||||
1395 | MachineIRBuilder &B) const { | ||||||
1396 | MachineFunction &MF = B.getMF(); | ||||||
1397 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||||
1398 | const LLT S32 = LLT::scalar(32); | ||||||
1399 | |||||||
1400 | assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1401 | |||||||
1402 | if (ST.hasApertureRegs()) { | ||||||
1403 | // FIXME: Use inline constants (src_{shared, private}_base) instead of | ||||||
1404 | // getreg. | ||||||
1405 | unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1406 | AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : | ||||||
1407 | AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; | ||||||
1408 | unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1409 | AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : | ||||||
1410 | AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; | ||||||
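     | // The s_getreg_b32 immediate packs three fields: the hwreg id, the
     | // starting bit offset, and the field width minus one. Assemble it from
     | // the corresponding AMDGPU::Hwreg shift amounts.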
1411 | unsigned Encoding = | ||||||
1412 | AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | | ||||||
1413 | Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | | ||||||
1414 | WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; | ||||||
1415 | |||||||
1416 | Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | ||||||
1417 | |||||||
1418 | B.buildInstr(AMDGPU::S_GETREG_B32) | ||||||
1419 | .addDef(GetReg) | ||||||
1420 | .addImm(Encoding); | ||||||
1421 | MRI.setType(GetReg, S32); | ||||||
1422 | |||||||
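     | // s_getreg_b32 returns the aperture field in the low bits of the
     | // result; shifting left by the field width (WidthM1 + 1) presumably
     | // re-forms the high 32 bits of the 64-bit aperture base address.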
1423 | auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); | ||||||
1424 | return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); | ||||||
1425 | } | ||||||
1426 | |||||||
1427 | Register QueuePtr = MRI.createGenericVirtualRegister( | ||||||
1428 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||
1429 | |||||||
1430 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1431 | if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) | ||||||
1432 | return Register(); | ||||||
1433 | |||||||
1434 | // Offset into amd_queue_t for group_segment_aperture_base_hi / | ||||||
1435 | // private_segment_aperture_base_hi. | ||||||
1436 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; | ||||||
1437 | |||||||
1438 | // TODO: can we be smarter about machine pointer info? | ||||||
1439 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||||
1440 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
1441 | PtrInfo, | ||||||
1442 | MachineMemOperand::MOLoad | | ||||||
1443 | MachineMemOperand::MODereferenceable | | ||||||
1444 | MachineMemOperand::MOInvariant, | ||||||
1445 | 4, | ||||||
1446 | MinAlign(64, StructOffset)); | ||||||
1447 | |||||||
1448 | Register LoadAddr; | ||||||
1449 | |||||||
1450 | B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); | ||||||
1451 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | ||||||
1452 | } | ||||||
1453 | |||||||
1454 | bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( | ||||||
1455 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1456 | MachineIRBuilder &B) const { | ||||||
1457 | MachineFunction &MF = B.getMF(); | ||||||
1458 | |||||||
1459 | B.setInstr(MI); | ||||||
1460 | |||||||
1461 | const LLT S32 = LLT::scalar(32); | ||||||
1462 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1463 | Register Src = MI.getOperand(1).getReg(); | ||||||
1464 | |||||||
1465 | LLT DstTy = MRI.getType(Dst); | ||||||
1466 | LLT SrcTy = MRI.getType(Src); | ||||||
1467 | unsigned DestAS = DstTy.getAddressSpace(); | ||||||
1468 | unsigned SrcAS = SrcTy.getAddressSpace(); | ||||||
1469 | |||||||
1470 | // TODO: Avoid reloading from the queue ptr for each cast, or at least each | ||||||
1471 | // vector element. | ||||||
1472 | assert(!DstTy.isVector());
1473 | |||||||
1474 | const AMDGPUTargetMachine &TM | ||||||
1475 | = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); | ||||||
1476 | |||||||
1477 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||||
1478 | if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { | ||||||
1479 | MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); | ||||||
1480 | return true; | ||||||
1481 | } | ||||||
1482 | |||||||
1483 | if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
1484 | // Truncate. | ||||||
1485 | B.buildExtract(Dst, Src, 0); | ||||||
1486 | MI.eraseFromParent(); | ||||||
1487 | return true; | ||||||
1488 | } | ||||||
1489 | |||||||
1490 | if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
1491 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1492 | uint32_t AddrHiVal = Info->get32BitAddressHighBits(); | ||||||
1493 | |||||||
1494 | // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1495 | // another pointer. Merge operands are required to be the same type, but
1496 | // creating an extra ptrtoint would be kind of pointless.
1497 | auto HighAddr = B.buildConstant( | ||||||
1498 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); | ||||||
1499 | B.buildMerge(Dst, {Src, HighAddr}); | ||||||
1500 | MI.eraseFromParent(); | ||||||
1501 | return true; | ||||||
1502 | } | ||||||
1503 | |||||||
1504 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { | ||||||
1505 | assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1506 |        DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1507 | unsigned NullVal = TM.getNullPointerValue(DestAS); | ||||||
1508 | |||||||
1509 | auto SegmentNull = B.buildConstant(DstTy, NullVal); | ||||||
1510 | auto FlatNull = B.buildConstant(SrcTy, 0); | ||||||
1511 | |||||||
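     | // The null pointer is not 0 in the segment address spaces (it is -1 for
     | // local and private), so a flat null has to be mapped to the segment
     | // null value explicitly rather than just truncated.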
1512 | // Extract low 32-bits of the pointer. | ||||||
1513 | auto PtrLo32 = B.buildExtract(DstTy, Src, 0); | ||||||
1514 | |||||||
1515 | auto CmpRes = | ||||||
1516 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); | ||||||
1517 | B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); | ||||||
1518 | |||||||
1519 | MI.eraseFromParent(); | ||||||
1520 | return true; | ||||||
1521 | } | ||||||
1522 | |||||||
1523 | if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) | ||||||
1524 | return false; | ||||||
1525 | |||||||
1526 | if (!ST.hasFlatAddressSpace()) | ||||||
1527 | return false; | ||||||
1528 | |||||||
1529 | auto SegmentNull = | ||||||
1530 | B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); | ||||||
1531 | auto FlatNull = | ||||||
1532 | B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); | ||||||
1533 | |||||||
1534 | Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); | ||||||
1535 | if (!ApertureReg.isValid()) | ||||||
1536 | return false; | ||||||
1537 | |||||||
1538 | auto CmpRes = | ||||||
1539 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); | ||||||
1540 | |||||||
1541 | // Coerce the type of the low half of the result so we can use merge_values. | ||||||
1542 | Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); | ||||||
1543 | |||||||
1544 | // TODO: Should we allow mismatched types but matching sizes in merges to | ||||||
1545 | // avoid the ptrtoint? | ||||||
1546 | auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); | ||||||
1547 | B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); | ||||||
1548 | |||||||
1549 | MI.eraseFromParent(); | ||||||
1550 | return true; | ||||||
1551 | } | ||||||
1552 | |||||||
1553 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||||
1554 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1555 | MachineIRBuilder &B) const { | ||||||
1556 | B.setInstr(MI); | ||||||
1557 | |||||||
1558 | Register Src = MI.getOperand(1).getReg(); | ||||||
1559 | LLT Ty = MRI.getType(Src); | ||||||
1560 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1561 | |||||||
1562 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||||
1563 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||||
1564 | |||||||
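     | // Classic round-to-nearest-integer trick: every double with magnitude
     | // >= 2^52 is already integral, so adding and then subtracting
     | // copysign(2^52, src) rounds src to an integer in the current rounding
     | // mode. Inputs with |src| > 0x1.fffffffffffffp+51 are already integers
     | // and are passed through unchanged by the final select.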
1565 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||||
1566 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||||
1567 | |||||||
1568 | // TODO: Should this propagate fast-math-flags? | ||||||
1569 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||||
1570 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||||
1571 | |||||||
1572 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||||
1573 | auto Fabs = B.buildFAbs(Ty, Src); | ||||||
1574 | |||||||
1575 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||||
1576 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||||
1577 | return true; | ||||||
1578 | } | ||||||
1579 | |||||||
1580 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||||
1581 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1582 | MachineIRBuilder &B) const { | ||||||
1583 | B.setInstr(MI); | ||||||
1584 | |||||||
1585 | const LLT S1 = LLT::scalar(1); | ||||||
1586 | const LLT S64 = LLT::scalar(64); | ||||||
1587 | |||||||
1588 | Register Src = MI.getOperand(1).getReg(); | ||||||
1589 | assert(MRI.getType(Src) == S64);
1590 | |||||||
1591 | // result = trunc(src) | ||||||
1592 | // if (src > 0.0 && src != result) | ||||||
1593 | //   result += 1.0
1594 | |||||||
1595 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | ||||||
1596 | |||||||
1597 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||||
1598 | const auto One = B.buildFConstant(S64, 1.0); | ||||||
1599 | auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1600 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1601 | auto And = B.buildAnd(S1, Gt0, NeTrunc);
1602 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||||
1603 | |||||||
1604 | // TODO: Should this propagate fast-math-flags? | ||||||
1605 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||||
1606 | return true; | ||||||
1607 | } | ||||||
1608 | |||||||
1609 | static MachineInstrBuilder extractF64Exponent(unsigned Hi, | ||||||
1610 | MachineIRBuilder &B) { | ||||||
1611 | const unsigned FractBits = 52; | ||||||
1612 | const unsigned ExpBits = 11; | ||||||
1613 | LLT S32 = LLT::scalar(32); | ||||||
1614 | |||||||
1615 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||||
1616 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||||
1617 | |||||||
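     | // In the high word of an IEEE double the biased exponent occupies the
     | // 11 bits starting at bit 52 - 32 = 20; extract it with ubfe and
     | // subtract the bias (1023) to get the unbiased exponent.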
1618 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
     |   .addUse(Register(Hi))
1619 |   .addUse(Const0.getReg(0))
1620 |   .addUse(Const1.getReg(0));
1621 | |||||||
1622 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||||
1623 | } | ||||||
1624 | |||||||
1625 | bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( | ||||||
1626 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1627 | MachineIRBuilder &B) const { | ||||||
1628 | B.setInstr(MI); | ||||||
1629 | |||||||
1630 | const LLT S1 = LLT::scalar(1); | ||||||
1631 | const LLT S32 = LLT::scalar(32); | ||||||
1632 | const LLT S64 = LLT::scalar(64); | ||||||
1633 | |||||||
1634 | Register Src = MI.getOperand(1).getReg(); | ||||||
1635 | assert(MRI.getType(Src) == S64);
1636 | |||||||
1637 | // TODO: Should this use extract since the low half is unused? | ||||||
1638 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||
1639 | Register Hi = Unmerge.getReg(1); | ||||||
1640 | |||||||
1641 | // Extract the upper half, since this is where we will find the sign and | ||||||
1642 | // exponent. | ||||||
1643 | auto Exp = extractF64Exponent(Hi, B); | ||||||
1644 | |||||||
1645 | const unsigned FractBits = 52; | ||||||
1646 | |||||||
1647 | // Extract the sign bit. | ||||||
1648 | const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1649 | auto SignBit = B.buildAnd(S32, Hi, SignBitMask); | ||||||
1650 | |||||||
1651 | const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1652 | |||||||
1653 | const auto Zero32 = B.buildConstant(S32, 0); | ||||||
1654 | |||||||
1655 | // Extend back to 64-bits. | ||||||
1656 | auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); | ||||||
1657 | |||||||
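     | // FractMask >> Exp keeps only the bits below the binary point, so
     | // clearing them in Src truncates the magnitude toward zero. Exp < 0
     | // means |Src| < 1.0 and the result is a signed zero; Exp > 51 means Src
     | // has no fractional bits left and is returned unchanged.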
1658 | auto Shr = B.buildAShr(S64, FractMask, Exp); | ||||||
1659 | auto Not = B.buildNot(S64, Shr); | ||||||
1660 | auto Tmp0 = B.buildAnd(S64, Src, Not); | ||||||
1661 | auto FiftyOne = B.buildConstant(S32, FractBits - 1); | ||||||
1662 | |||||||
1663 | auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); | ||||||
1664 | auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); | ||||||
1665 | |||||||
1666 | auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); | ||||||
1667 | B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); | ||||||
1668 | return true; | ||||||
1669 | } | ||||||
1670 | |||||||
1671 | bool AMDGPULegalizerInfo::legalizeITOFP( | ||||||
1672 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1673 | MachineIRBuilder &B, bool Signed) const { | ||||||
1674 | B.setInstr(MI); | ||||||
1675 | |||||||
1676 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1677 | Register Src = MI.getOperand(1).getReg(); | ||||||
1678 | |||||||
1679 | const LLT S64 = LLT::scalar(64); | ||||||
1680 | const LLT S32 = LLT::scalar(32); | ||||||
1681 | |||||||
1682 | assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1683 | |||||||
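     | // Convert the two 32-bit halves separately: result = cvt(Hi) * 2^32 +
     | // cvt(Lo). Only the high half carries the sign, so it alone uses the
     | // signed conversion; ldexp performs the exact scaling by 2^32.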
1684 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||
1685 | |||||||
1686 | auto CvtHi = Signed ? | ||||||
1687 | B.buildSITOFP(S64, Unmerge.getReg(1)) : | ||||||
1688 | B.buildUITOFP(S64, Unmerge.getReg(1)); | ||||||
1689 | |||||||
1690 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | ||||||
1691 | |||||||
1692 | auto ThirtyTwo = B.buildConstant(S32, 32); | ||||||
1693 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | ||||||
1694 | .addUse(CvtHi.getReg(0)) | ||||||
1695 | .addUse(ThirtyTwo.getReg(0)); | ||||||
1696 | |||||||
1697 | // TODO: Should this propagate fast-math-flags? | ||||||
1698 | B.buildFAdd(Dst, LdExp, CvtLo); | ||||||
1699 | MI.eraseFromParent(); | ||||||
1700 | return true; | ||||||
1701 | } | ||||||
1702 | |||||||
1703 | // TODO: Copied from DAG implementation. Verify logic and document how this | ||||||
1704 | // actually works. | ||||||
1705 | bool AMDGPULegalizerInfo::legalizeFPTOI( | ||||||
1706 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1707 | MachineIRBuilder &B, bool Signed) const { | ||||||
1708 | B.setInstr(MI); | ||||||
1709 | |||||||
1710 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1711 | Register Src = MI.getOperand(1).getReg(); | ||||||
1712 | |||||||
1713 | const LLT S64 = LLT::scalar(64); | ||||||
1714 | const LLT S32 = LLT::scalar(32); | ||||||
1715 | |||||||
1716 | assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1717 | |||||||
1718 | unsigned Flags = MI.getFlags(); | ||||||
1719 | |||||||
1720 | auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); | ||||||
1721 | auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1722 | auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
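     | // K0 is 2^-32 and K1 is -2^32 as doubles: FloorMul recovers the high 32
     | // bits of the truncated value, and the fma multiplies them back out of
     | // Trunc, leaving the low 32 bits as the remainder.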
1723 | |||||||
1724 | auto Mul = B.buildFMul(S64, Trunc, K0, Flags); | ||||||
1725 | auto FloorMul = B.buildFFloor(S64, Mul, Flags); | ||||||
1726 | auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); | ||||||
1727 | |||||||
1728 | auto Hi = Signed ? | ||||||
1729 | B.buildFPTOSI(S32, FloorMul) : | ||||||
1730 | B.buildFPTOUI(S32, FloorMul); | ||||||
1731 | auto Lo = B.buildFPTOUI(S32, Fma); | ||||||
1732 | |||||||
1733 | B.buildMerge(Dst, { Lo, Hi }); | ||||||
1734 | MI.eraseFromParent(); | ||||||
1735 | |||||||
1736 | return true; | ||||||
1737 | } | ||||||
1738 | |||||||
1739 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( | ||||||
1740 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1741 | MachineIRBuilder &B) const { | ||||||
1742 | MachineFunction &MF = B.getMF(); | ||||||
1743 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1744 | |||||||
1745 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||||
1746 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||||
1747 | |||||||
1748 | // With ieee_mode disabled, the instructions have the correct behavior | ||||||
1749 | // already for G_FMINNUM/G_FMAXNUM | ||||||
1750 | if (!MFI->getMode().IEEE) | ||||||
1751 | return !IsIEEEOp; | ||||||
1752 | |||||||
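     | // In IEEE mode the hardware min/max already matches the *_IEEE
     | // operations, so those are fine as-is; the plain G_FMINNUM/G_FMAXNUM
     | // forms go through the helper's snan-quieting expansion instead.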
1753 | if (IsIEEEOp) | ||||||
1754 | return true; | ||||||
1755 | |||||||
1756 | MachineIRBuilder HelperBuilder(MI); | ||||||
1757 | GISelObserverWrapper DummyObserver; | ||||||
1758 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||
1759 | HelperBuilder.setInstr(MI); | ||||||
1760 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||||
1761 | } | ||||||
1762 | |||||||
1763 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||||
1764 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1765 | MachineIRBuilder &B) const { | ||||||
1766 | // TODO: Should move some of this into LegalizerHelper. | ||||||
1767 | |||||||
1768 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
1769 | |||||||
1770 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||
1771 | // constant before this, so we shouldn't need | ||||||
1772 | // getConstantVRegValWithLookThrough. | ||||||
1773 | Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( | ||||||
1774 | MI.getOperand(2).getReg(), MRI); | ||||||
1775 | if (!IdxVal) // Dynamic case will be selected to register indexing. | ||||||
1776 | return true; | ||||||
1777 | |||||||
1778 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1779 | Register Vec = MI.getOperand(1).getReg(); | ||||||
1780 | |||||||
1781 | LLT VecTy = MRI.getType(Vec); | ||||||
1782 | LLT EltTy = VecTy.getElementType(); | ||||||
1783 | assert(EltTy == MRI.getType(Dst));
1784 | |||||||
1785 | B.setInstr(MI); | ||||||
1786 | |||||||
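     | // A constant index that is out of bounds produces an undefined value,
     | // so it can simply be replaced with an implicit_def.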
1787 | if (IdxVal->Value < VecTy.getNumElements()) | ||||||
1788 | B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); | ||||||
1789 | else | ||||||
1790 | B.buildUndef(Dst); | ||||||
1791 | |||||||
1792 | MI.eraseFromParent(); | ||||||
1793 | return true; | ||||||
1794 | } | ||||||
1795 | |||||||
1796 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||||
1797 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1798 | MachineIRBuilder &B) const { | ||||||
1799 | // TODO: Should move some of this into LegalizerHelper. | ||||||
1800 | |||||||
1801 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
1802 | |||||||
1803 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||
1804 | // constant before this, so we shouldn't need | ||||||
1805 | // getConstantVRegValWithLookThrough. | ||||||
1806 | Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( | ||||||
1807 | MI.getOperand(3).getReg(), MRI); | ||||||
1808 | if (!IdxVal) // Dynamic case will be selected to register indexing. | ||||||
1809 | return true; | ||||||
1810 | |||||||
1811 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1812 | Register Vec = MI.getOperand(1).getReg(); | ||||||
1813 | Register Ins = MI.getOperand(2).getReg(); | ||||||
1814 | |||||||
1815 | LLT VecTy = MRI.getType(Vec); | ||||||
1816 | LLT EltTy = VecTy.getElementType(); | ||||||
1817 | assert(EltTy == MRI.getType(Ins));
1818 | |||||||
1819 | B.setInstr(MI); | ||||||
1820 | |||||||
1821 | if (IdxVal->Value < VecTy.getNumElements()) | ||||||
1822 | B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); | ||||||
1823 | else | ||||||
1824 | B.buildUndef(Dst); | ||||||
1825 | |||||||
1826 | MI.eraseFromParent(); | ||||||
1827 | return true; | ||||||
1828 | } | ||||||
1829 | |||||||
1830 | bool AMDGPULegalizerInfo::legalizeShuffleVector( | ||||||
1831 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1832 | MachineIRBuilder &B) const { | ||||||
1833 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
1834 | |||||||
1835 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1836 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
1837 | LLT DstTy = MRI.getType(Dst); | ||||||
1838 | LLT SrcTy = MRI.getType(Src0); | ||||||
1839 | |||||||
1840 | if (SrcTy == V2S16 && DstTy == V2S16 && | ||||||
1841 | AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) | ||||||
1842 | return true; | ||||||
1843 | |||||||
1844 | MachineIRBuilder HelperBuilder(MI); | ||||||
1845 | GISelObserverWrapper DummyObserver; | ||||||
1846 | LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); | ||||||
1847 | HelperBuilder.setInstr(MI); | ||||||
1848 | return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; | ||||||
1849 | } | ||||||
1850 | |||||||
1851 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||||
1852 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1853 | MachineIRBuilder &B) const { | ||||||
1854 | B.setInstr(MI); | ||||||
1855 | |||||||
1856 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
1857 | Register SrcReg = MI.getOperand(1).getReg(); | ||||||
1858 | LLT Ty = MRI.getType(DstReg); | ||||||
1859 | unsigned Flags = MI.getFlags(); | ||||||
1860 | |||||||
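     | // The amdgcn sin/cos intrinsics take their input in units of
     | // revolutions rather than radians, hence the scale by 1/(2*pi).
     | // Subtargets with a reduced valid input range additionally normalize
     | // the operand into [0, 1) with fract first.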
1861 | Register TrigVal; | ||||||
1862 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1863 | if (ST.hasTrigReducedRange()) { | ||||||
1864 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||||
1865 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||||
1866 | .addUse(MulVal.getReg(0)) | ||||||
1867 | .setMIFlags(Flags).getReg(0); | ||||||
1868 | } else | ||||||
1869 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||||
1870 | |||||||
1871 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||||
1872 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||||
1873 | B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) | ||||||
1874 | .addUse(TrigVal) | ||||||
1875 | .setMIFlags(Flags); | ||||||
1876 | MI.eraseFromParent(); | ||||||
1877 | return true; | ||||||
1878 | } | ||||||
1879 | |||||||
1880 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( | ||||||
1881 | Register DstReg, LLT PtrTy, | ||||||
1882 | MachineIRBuilder &B, const GlobalValue *GV, | ||||||
1883 | unsigned Offset, unsigned GAFlags) const { | ||||||
1884 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | ||||||
1885 | // to the following code sequence: | ||||||
1886 | // | ||||||
1887 | // For constant address space: | ||||||
1888 | // s_getpc_b64 s[0:1] | ||||||
1889 | // s_add_u32 s0, s0, $symbol | ||||||
1890 | // s_addc_u32 s1, s1, 0 | ||||||
1891 | // | ||||||
1892 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
1893 | // a fixup or relocation is emitted to replace $symbol with a literal | ||||||
1894 | // constant, which is a pc-relative offset from the encoding of the $symbol | ||||||
1895 | // operand to the global variable. | ||||||
1896 | // | ||||||
1897 | // For global address space: | ||||||
1898 | // s_getpc_b64 s[0:1] | ||||||
1899 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | ||||||
1900 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | ||||||
1901 | // | ||||||
1902 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
1903 | // fixups or relocations are emitted to replace $symbol@*@lo and | ||||||
1904 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | ||||||
1905 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | ||||||
1906 | // operand to the global variable. | ||||||
1907 | // | ||||||
1908 | // What we want here is an offset from the value returned by s_getpc | ||||||
1909 | // (which is the address of the s_add_u32 instruction) to the global | ||||||
1910 | // variable, but since the encoding of $symbol starts 4 bytes after the start | ||||||
1911 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | ||||||
1912 | // small. This requires us to add 4 to the global variable offset in order to | ||||||
1913 | // compute the correct address. | ||||||
1914 | |||||||
1915 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
1916 | |||||||
1917 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | ||||||
1918 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | ||||||
1919 | |||||||
1920 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | ||||||
1921 | .addDef(PCReg); | ||||||
1922 | |||||||
1923 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | ||||||
1924 | if (GAFlags == SIInstrInfo::MO_NONE) | ||||||
1925 | MIB.addImm(0); | ||||||
1926 | else | ||||||
1927 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); | ||||||
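     | // Note: GAFlags + 1 assumes each *_LO target flag is immediately
     | // followed by its *_HI counterpart in the flag enumeration.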
1928 | |||||||
1929 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||||
1930 | |||||||
1931 | if (PtrTy.getSizeInBits() == 32) | ||||||
1932 | B.buildExtract(DstReg, PCReg, 0); | ||||||
1933 | return true; | ||||||
1934 | } | ||||||
1935 | |||||||
1936 | bool AMDGPULegalizerInfo::legalizeGlobalValue( | ||||||
1937 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1938 | MachineIRBuilder &B) const { | ||||||
1939 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
1940 | LLT Ty = MRI.getType(DstReg); | ||||||
1941 | unsigned AS = Ty.getAddressSpace(); | ||||||
1942 | |||||||
1943 | const GlobalValue *GV = MI.getOperand(1).getGlobal(); | ||||||
1944 | MachineFunction &MF = B.getMF(); | ||||||
1945 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1946 | B.setInstr(MI); | ||||||
1947 | |||||||
1948 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { | ||||||
1949 | if (!MFI->isEntryFunction()) { | ||||||
1950 | const Function &Fn = MF.getFunction(); | ||||||
1951 | DiagnosticInfoUnsupported BadLDSDecl( | ||||||
1952 | Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); | ||||||
1953 | Fn.getContext().diagnose(BadLDSDecl); | ||||||
1954 | } | ||||||
1955 | |||||||
1956 | // TODO: We could emit code to handle the initialization somewhere. | ||||||
1957 | if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { | ||||||
1958 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
1959 | if (!TLI->shouldUseLDSConstAddress(GV)) { | ||||||
1960 | MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); | ||||||
1961 | return true; // Leave in place.
1962 | } | ||||||
1963 | |||||||
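     | // allocateLDSGlobal assigns the global a static byte offset within the
     | // workgroup's LDS block, so its "address" folds to a plain constant.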
1964 | B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); | ||||||
1965 | MI.eraseFromParent(); | ||||||
1966 | return true; | ||||||
1967 | } | ||||||
1968 | |||||||
1969 | const Function &Fn = MF.getFunction(); | ||||||
1970 | DiagnosticInfoUnsupported BadInit( | ||||||
1971 | Fn, "unsupported initializer for address space", MI.getDebugLoc()); | ||||||
1972 | Fn.getContext().diagnose(BadInit); | ||||||
1973 | return true; | ||||||
1974 | } | ||||||
1975 | |||||||
1976 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
1977 | |||||||
1978 | if (TLI->shouldEmitFixup(GV)) { | ||||||
1979 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); | ||||||
1980 | MI.eraseFromParent(); | ||||||
1981 | return true; | ||||||
1982 | } | ||||||
1983 | |||||||
1984 | if (TLI->shouldEmitPCReloc(GV)) { | ||||||
1985 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); | ||||||
1986 | MI.eraseFromParent(); | ||||||
1987 | return true; | ||||||
1988 | } | ||||||
1989 | |||||||
1990 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
1991 | Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); | ||||||
1992 | |||||||
1993 | MachineMemOperand *GOTMMO = MF.getMachineMemOperand( | ||||||
1994 | MachinePointerInfo::getGOT(MF), | ||||||
1995 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
1996 | MachineMemOperand::MOInvariant, | ||||||
1997 | 8 /*Size*/, 8 /*Align*/); | ||||||
1998 | |||||||
1999 | buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); | ||||||
2000 | |||||||
2001 | if (Ty.getSizeInBits() == 32) { | ||||||
2002 | // Truncate if this is a 32-bit constant address.
2003 | auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); | ||||||
2004 | B.buildExtract(DstReg, Load, 0); | ||||||
2005 | } else | ||||||
2006 | B.buildLoad(DstReg, GOTAddr, *GOTMMO); | ||||||
2007 | |||||||
2008 | MI.eraseFromParent(); | ||||||
2009 | return true; | ||||||
2010 | } | ||||||
2011 | |||||||
2012 | bool AMDGPULegalizerInfo::legalizeLoad( | ||||||
2013 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2014 | MachineIRBuilder &B, GISelChangeObserver &Observer) const { | ||||||
2015 | B.setInstr(MI); | ||||||
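     | // Presumably reached for loads whose pointer needs widening to the full
     | // 64-bit constant address space; the addrspacecast inserted here is
     | // then legalized on its own.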
2016 | LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
2017 | auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); | ||||||
2018 | Observer.changingInstr(MI); | ||||||
2019 | MI.getOperand(1).setReg(Cast.getReg(0)); | ||||||
2020 | Observer.changedInstr(MI); | ||||||
2021 | return true; | ||||||
2022 | } | ||||||
2023 | |||||||
2024 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||||
2025 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2026 | MachineIRBuilder &B) const { | ||||||
2027 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
2028 | assert(Ty.isScalar());
2029 | |||||||
2030 | MachineFunction &MF = B.getMF(); | ||||||
2031 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2032 | |||||||
2033 | // TODO: Always legal with future ftz flag. | ||||||
2034 | // FIXME: Do we need just output? | ||||||
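     | // The hardware mad only matches G_FMAD semantics when denormals are
     | // flushed for the type, so keep it legal in that case and expand it to
     | // fmul + fadd through the helper otherwise.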
2035 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||||
2036 | return true; | ||||||
2037 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||||
2038 | return true; | ||||||
2039 | |||||||
2040 | MachineIRBuilder HelperBuilder(MI); | ||||||
2041 | GISelObserverWrapper DummyObserver; | ||||||
2042 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||
2043 | HelperBuilder.setMBB(*MI.getParent()); | ||||||
2044 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||||
2045 | } | ||||||
2046 | |||||||
2047 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||||
2048 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2049 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2050 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||
2051 | Register CmpVal = MI.getOperand(2).getReg(); | ||||||
2052 | Register NewVal = MI.getOperand(3).getReg(); | ||||||
2053 | |||||||
2054 | assert(SITargetLowering::isFlatGlobalAddrSpace(
2055 |          MRI.getType(PtrReg).getAddressSpace()) &&
2056 |        "this should not have been custom lowered");
2057 | |||||||
2058 | LLT ValTy = MRI.getType(CmpVal); | ||||||
2059 | LLT VecTy = LLT::vector(2, ValTy); | ||||||
2060 | |||||||
2061 | B.setInstr(MI); | ||||||
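     | // The target cmpxchg pseudo takes the new value and the compare value
     | // packed together in consecutive registers, hence the build_vector.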
2062 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||||
2063 | |||||||
2064 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||||
2065 | .addDef(DstReg) | ||||||
2066 | .addUse(PtrReg) | ||||||
2067 | .addUse(PackedVal) | ||||||
2068 | .setMemRefs(MI.memoperands()); | ||||||
2069 | |||||||
2070 | MI.eraseFromParent(); | ||||||
2071 | return true; | ||||||
2072 | } | ||||||
2073 | |||||||
2074 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||||
2075 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||||
2076 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2077 | Register Src = MI.getOperand(1).getReg(); | ||||||
2078 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2079 | unsigned Flags = MI.getFlags(); | ||||||
2080 | B.setInstr(MI); | ||||||
2081 | |||||||
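     | // log_b(x) = log2(x) / log2(b). The caller passes 1/log2(b) (ln 2 for
     | // G_FLOG, log10(2) for G_FLOG10), so a single multiply finishes the
     | // expansion.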
2082 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||||
2083 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||||
2084 | |||||||
2085 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||||
2086 | MI.eraseFromParent(); | ||||||
2087 | return true; | ||||||
2088 | } | ||||||
2089 | |||||||
2090 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||||
2091 | MachineIRBuilder &B) const { | ||||||
2092 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2093 | Register Src = MI.getOperand(1).getReg(); | ||||||
2094 | unsigned Flags = MI.getFlags(); | ||||||
2095 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2096 | B.setInstr(MI); | ||||||
2097 | |||||||
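     | // exp(x) = exp2(x * log2(e)); scale the operand and defer to G_FEXP2.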
2098 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||||
2099 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||||
2100 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2101 | MI.eraseFromParent(); | ||||||
2102 | return true; | ||||||
2103 | } | ||||||
2104 | |||||||
2105 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||||
2106 | MachineIRBuilder &B) const { | ||||||
2107 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2108 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2109 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2110 | unsigned Flags = MI.getFlags(); | ||||||
2111 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2112 | B.setInstr(MI); | ||||||
2113 | const LLT S16 = LLT::scalar(16); | ||||||
2114 | const LLT S32 = LLT::scalar(32); | ||||||
2115 | |||||||
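     | // pow(x, y) = exp2(y * log2(x)). fmul_legacy treats 0 * anything as 0,
     | // so cases like pow(x, 0) with log2(x) = +/-inf still yield exp2(0) = 1
     | // instead of the NaN an IEEE multiply would produce.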
2116 | if (Ty == S32) { | ||||||
2117 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||||
2118 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2119 | .addUse(Log.getReg(0)) | ||||||
2120 | .addUse(Src1) | ||||||
2121 | .setMIFlags(Flags); | ||||||
2122 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2123 | } else if (Ty == S16) { | ||||||
2124 | // There's no f16 fmul_legacy, so we need to convert for it. | ||||||
2125 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||||
2126 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||||
2127 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||||
2128 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2129 | .addUse(Ext0.getReg(0)) | ||||||
2130 | .addUse(Ext1.getReg(0)) | ||||||
2131 | .setMIFlags(Flags); | ||||||
2132 | |||||||
2133 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||||
2134 | } else | ||||||
2135 | return false; | ||||||
2136 | |||||||
2137 | MI.eraseFromParent(); | ||||||
2138 | return true; | ||||||
2139 | } | ||||||
2140 | |||||||
2141 | // Find a source register, ignoring any possible source modifiers. | ||||||
2142 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||||
2143 | Register ModSrc = OrigSrc; | ||||||
2144 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||||
2145 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||||
2146 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2147 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2148 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2149 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2150 | return ModSrc; | ||||||
2151 | } | ||||||
2152 | |||||||
2153 | bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, | ||||||
2154 | MachineRegisterInfo &MRI, | ||||||
2155 | MachineIRBuilder &B) const { | ||||||
2156 | B.setInstr(MI); | ||||||
2157 | |||||||
2158 | const LLT S1 = LLT::scalar(1); | ||||||
2159 | const LLT S64 = LLT::scalar(64); | ||||||
2160 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2161 | Register OrigSrc = MI.getOperand(1).getReg(); | ||||||
2162 | unsigned Flags = MI.getFlags(); | ||||||
2163 | assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2164 |        "this should not have been custom lowered");
2165 | |||||||
2166 | // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) | ||||||
2167 | // is used instead. However, SI doesn't have V_FLOOR_F64, so the most | ||||||
2168 | // efficient way to implement it is using V_FRACT_F64. The workaround for the | ||||||
2169 | // V_FRACT bug is: | ||||||
2170 | // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) | ||||||
2171 | // | ||||||
2172 | // Convert floor(x) to (x - fract(x)) | ||||||
2173 | |||||||
2174 | auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) | ||||||
2175 | .addUse(OrigSrc) | ||||||
2176 | .setMIFlags(Flags); | ||||||
2177 | |||||||
2178 | // Give source modifier matching some assistance before obscuring a foldable | ||||||
2179 | // pattern. | ||||||
2180 | |||||||
2181 | // TODO: We can avoid the neg on the fract? The input sign to fract | ||||||
2182 | // shouldn't matter? | ||||||
2183 | Register ModSrc = stripAnySourceMods(OrigSrc, MRI); | ||||||
2184 | |||||||
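     | // 0x3fefffffffffffff is the largest double less than 1.0, i.e. the
     | // 0.99999999999999999 clamp value from the workaround described above.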
2185 | auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); | ||||||
2186 | |||||||
2187 | Register Min = MRI.createGenericVirtualRegister(S64); | ||||||
2188 | |||||||
2189 | // We don't need to concern ourselves with the snan handling difference, so | ||||||
2190 | // use the one which will directly select. | ||||||
2191 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2192 | if (MFI->getMode().IEEE) | ||||||
2193 | B.buildFMinNumIEEE(Min, Fract, Const, Flags); | ||||||
2194 | else | ||||||
2195 | B.buildFMinNum(Min, Fract, Const, Flags); | ||||||
2196 | |||||||
2197 | Register CorrectedFract = Min; | ||||||
2198 | if (!MI.getFlag(MachineInstr::FmNoNans)) { | ||||||
2199 | auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2200 | CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); | ||||||
2201 | } | ||||||
2202 | |||||||
2203 | auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); | ||||||
2204 | B.buildFAdd(Dst, OrigSrc, NegFract, Flags); | ||||||
2205 | |||||||
2206 | MI.eraseFromParent(); | ||||||
2207 | return true; | ||||||
2208 | } | ||||||
2209 | |||||||
2210 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||||
2211 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||||
2212 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||||
2213 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2214 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2215 | LLT DstTy = MRI.getType(Dst); | ||||||
2216 | const LLT S32 = LLT::scalar(32); | ||||||
2217 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
2218 | (void)DstTy; | ||||||
2219 | (void)V2S16; | ||||||
2220 | assert(DstTy == V2S16);
2221 | |||||||
2222 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2223 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2224 | assert(MRI.getType(Src0) == LLT::scalar(16));
2225 | |||||||
2226 | B.setInstr(MI); | ||||||
2227 | auto Merge = B.buildMerge(S32, {Src0, Src1}); | ||||||
2228 | B.buildBitcast(Dst, Merge); | ||||||
2229 | |||||||
2230 | MI.eraseFromParent(); | ||||||
2231 | return true; | ||||||
2232 | } | ||||||
2233 | |||||||
2234 | // Return the use branch instruction, or null if the usage is invalid.
2235 | static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, | ||||||
2236 | MachineRegisterInfo &MRI, | ||||||
2237 | MachineInstr *&Br) { | ||||||
2238 | Register CondDef = MI.getOperand(0).getReg(); | ||||||
2239 | if (!MRI.hasOneNonDBGUse(CondDef)) | ||||||
2240 | return nullptr; | ||||||
2241 | |||||||
2242 | MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); | ||||||
2243 | if (UseMI.getParent() != MI.getParent() || | ||||||
2244 | UseMI.getOpcode() != AMDGPU::G_BRCOND) | ||||||
2245 | return nullptr; | ||||||
2246 | |||||||
2247 | // Make sure the cond br is followed by a G_BR | ||||||
2248 | MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); | ||||||
2249 | if (Next != MI.getParent()->end()) { | ||||||
2250 | if (Next->getOpcode() != AMDGPU::G_BR) | ||||||
2251 | return nullptr; | ||||||
2252 | Br = &*Next; | ||||||
2253 | } | ||||||
2254 | |||||||
2255 | return &UseMI; | ||||||
2256 | } | ||||||
2257 | |||||||
2258 | Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, | ||||||
2259 | Register Reg, LLT Ty) const { | ||||||
2260 | Register LiveIn = MRI.getLiveInVirtReg(Reg); | ||||||
2261 | if (LiveIn) | ||||||
2262 | return LiveIn; | ||||||
2263 | |||||||
2264 | Register NewReg = MRI.createGenericVirtualRegister(Ty); | ||||||
2265 | MRI.addLiveIn(Reg, NewReg); | ||||||
2266 | return NewReg; | ||||||
2267 | } | ||||||
2268 | |||||||
2269 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||||
2270 | const ArgDescriptor *Arg) const { | ||||||
2271 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||||
2272 | return false; // TODO: Handle these | ||||||
2273 | |||||||
2274 | assert(Arg->getRegister().isPhysical());
2275 | |||||||
2276 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
2277 | |||||||
2278 | LLT Ty = MRI.getType(DstReg); | ||||||
2279 | Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); | ||||||
2280 | |||||||
2281 | if (Arg->isMasked()) { | ||||||
2282 | // TODO: Should we try to emit this once in the entry block? | ||||||
2283 | const LLT S32 = LLT::scalar(32); | ||||||
2284 | const unsigned Mask = Arg->getMask(); | ||||||
2285 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); | ||||||
2286 | |||||||
2287 | Register AndMaskSrc = LiveIn; | ||||||
2288 | |||||||
2289 | if (Shift != 0) {
2290 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||||
2291 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||||
2292 | } | ||||||
2293 | |||||||
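     | // Note: this assumes Mask is non-zero. If Mask were 0,
     | // countTrailingZeros<unsigned>(0) would return 32 and 'Mask >> Shift'
     | // would shift a 32-bit value by 32, which is undefined behavior.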
2294 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||||
2295 | } else | ||||||
2296 | B.buildCopy(DstReg, LiveIn); | ||||||
2297 | |||||||
2298 | // Insert the argument copy if it doesn't already exist.
2299 | // FIXME: It seems EmitLiveInCopies isn't called anywhere? | ||||||
2300 | if (!MRI.getVRegDef(LiveIn)) { | ||||||
2301 | // FIXME: Should have scoped insert pt | ||||||
2302 | MachineBasicBlock &OrigInsBB = B.getMBB(); | ||||||
2303 | auto OrigInsPt = B.getInsertPt(); | ||||||
2304 | |||||||
2305 | MachineBasicBlock &EntryMBB = B.getMF().front(); | ||||||
2306 | EntryMBB.addLiveIn(Arg->getRegister()); | ||||||
2307 | B.setInsertPt(EntryMBB, EntryMBB.begin()); | ||||||
2308 | B.buildCopy(LiveIn, Arg->getRegister()); | ||||||
2309 | |||||||
2310 | B.setInsertPt(OrigInsBB, OrigInsPt); | ||||||
2311 | } | ||||||
2312 | |||||||
2313 | return true; | ||||||
2314 | } | ||||||
2315 | |||||||
2316 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||||
2317 | MachineInstr &MI, | ||||||
2318 | MachineRegisterInfo &MRI, | ||||||
2319 | MachineIRBuilder &B, | ||||||
2320 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
2321 | B.setInstr(MI); | ||||||
2322 | |||||||
2323 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2324 | |||||||
2325 | const ArgDescriptor *Arg; | ||||||
2326 | const TargetRegisterClass *RC; | ||||||
2327 | std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); | ||||||
2328 | if (!Arg) { | ||||||
2329 | LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2330 | return false; | ||||||
2331 | } | ||||||
2332 | |||||||
2333 | if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { | ||||||
2334 | MI.eraseFromParent(); | ||||||
2335 | return true; | ||||||
2336 | } | ||||||
2337 | |||||||
2338 | return false; | ||||||
2339 | } | ||||||
2340 | |||||||
2341 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||||
2342 | MachineRegisterInfo &MRI, | ||||||
2343 | MachineIRBuilder &B) const { | ||||||
2344 | B.setInstr(MI); | ||||||
2345 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2346 | LLT DstTy = MRI.getType(Dst); | ||||||
2347 | LLT S16 = LLT::scalar(16); | ||||||
2348 | LLT S32 = LLT::scalar(32); | ||||||
2349 | LLT S64 = LLT::scalar(64); | ||||||
2350 | |||||||
2351 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||||
2352 | return true; | ||||||
2353 | |||||||
2354 | if (DstTy == S16) | ||||||
2355 | return legalizeFDIV16(MI, MRI, B); | ||||||
2356 | if (DstTy == S32) | ||||||
2357 | return legalizeFDIV32(MI, MRI, B); | ||||||
2358 | if (DstTy == S64) | ||||||
2359 | return legalizeFDIV64(MI, MRI, B); | ||||||
2360 | |||||||
2361 | return false; | ||||||
2362 | } | ||||||
2363 | |||||||
2364 | static Register buildDivRCP(MachineIRBuilder &B, Register Src) { | ||||||
2365 | const LLT S32 = LLT::scalar(32); | ||||||
2366 | |||||||
2367 | auto Cvt0 = B.buildUITOFP(S32, Src); | ||||||
2368 | auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); | ||||||
2369 | auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); | ||||||
2370 | auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); | ||||||
2371 | return B.buildFPTOUI(S32, Mul).getReg(0); | ||||||
2372 | } | ||||||
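// Worked example (editor's illustration): 0x4f800000 is 2^32 as an f32 bit
// pattern, so for Den = 3 the sequence computes fptoui(rcp(3.0f) * 2^32),
// which is approximately 2^32 / 3 ~= 0x55555555 -- a 0.32 fixed-point
// reciprocal, accurate up to the rounding error e noted below.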
2373 | |||||||
2374 | void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, | ||||||
2375 | Register DstReg, | ||||||
2376 | Register Num, | ||||||
2377 | Register Den, | ||||||
2378 | bool IsRem) const { | ||||||
2379 | const LLT S1 = LLT::scalar(1); | ||||||
2380 | const LLT S32 = LLT::scalar(32); | ||||||
2381 | |||||||
2382 | // RCP = URECIP(Den) = 2^32 / Den + e | ||||||
2383 | // e is rounding error. | ||||||
2384 | auto RCP = buildDivRCP(B, Den); | ||||||
2385 | |||||||
2386 | // RCP_LO = mul(RCP, Den) | ||||||
2387 | auto RCP_LO = B.buildMul(S32, RCP, Den); | ||||||
2388 | |||||||
2389 | // RCP_HI = mulhu(RCP, Den) | ||||||
2390 | auto RCP_HI = B.buildUMulH(S32, RCP, Den); | ||||||
2391 | |||||||
2392 | // NEG_RCP_LO = -RCP_LO | ||||||
2393 | auto Zero = B.buildConstant(S32, 0); | ||||||
2394 | auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); | ||||||
2395 | |||||||
2396 | // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) | ||||||
2397 | auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); | ||||||
2398 | auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); | ||||||
2399 | |||||||
2400 | // Calculate the rounding error from the URECIP instruction | ||||||
2401 | // E = mulhu(ABS_RCP_LO, RCP) | ||||||
2402 | auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); | ||||||
2403 | |||||||
2404 | // RCP_A_E = RCP + E | ||||||
2405 | auto RCP_A_E = B.buildAdd(S32, RCP, E); | ||||||
2406 | |||||||
2407 | // RCP_S_E = RCP - E | ||||||
2408 | auto RCP_S_E = B.buildSub(S32, RCP, E); | ||||||
2409 | |||||||
2410 | // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E) | ||||||
2411 | auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); | ||||||
2412 | |||||||
2413 | // Quotient = mulhu(Tmp0, Num) | ||||||
2414 | auto Quotient = B.buildUMulH(S32, Tmp0, Num); | ||||||
2415 | |||||||
2416 | // Num_S_Remainder = Quotient * Den | ||||||
2417 | auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); | ||||||
2418 | |||||||
2419 | // Remainder = Num - Num_S_Remainder | ||||||
2420 | auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); | ||||||
2421 | |||||||
2422 | // Remainder_GE_Den = Remainder >= Den | ||||||
2423 | auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); | ||||||
2424 | |||||||
2425 | // Remainder_GE_Zero = Num >= Num_S_Remainder | ||||||
2426 | auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, | ||||||
2427 | Num, Num_S_Remainder); | ||||||
2428 | |||||||
2429 | // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero | ||||||
2430 | auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); | ||||||
2431 | |||||||
2432 | // Calculate Division result: | ||||||
2433 | |||||||
2434 | // Quotient_A_One = Quotient + 1 | ||||||
2435 | auto One = B.buildConstant(S32, 1); | ||||||
2436 | auto Quotient_A_One = B.buildAdd(S32, Quotient, One); | ||||||
2437 | |||||||
2438 | // Quotient_S_One = Quotient - 1 | ||||||
2439 | auto Quotient_S_One = B.buildSub(S32, Quotient, One); | ||||||
2440 | |||||||
2441 | // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient) | ||||||
2442 | auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One); | ||||||
2443 | |||||||
2444 | // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) | ||||||
2445 | if (IsRem) { | ||||||
2446 | Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); | ||||||
2447 | |||||||
2448 | // Calculate Rem result: | ||||||
2449 | auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); | ||||||
2450 | |||||||
2451 | // Remainder_A_Den = Remainder + Den | ||||||
2452 | auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); | ||||||
2453 | |||||||
2454 | // Rem = (Tmp1 ? Remainder_S_Den : Remainder) | ||||||
2455 | auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); | ||||||
2456 | |||||||
2457 | // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den) | ||||||
2458 | B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); | ||||||
2459 | } else { | ||||||
2460 | B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); | ||||||
2461 | } | ||||||
2462 | } | ||||||
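// Summary of the expansion above (editor's sketch): q = umulh(Tmp0, Num) is
// an estimate of Num / Den that may be off by one in either direction; the
// Tmp1 / Remainder_GE_Zero selects then nudge the quotient by +/-1 (or the
// remainder by +/-Den) so the final result satisfies Num = q * Den + r with
// 0 <= r < Den.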
2463 | |||||||
2464 | bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, | ||||||
2465 | MachineRegisterInfo &MRI, | ||||||
2466 | MachineIRBuilder &B) const { | ||||||
2467 | B.setInstr(MI); | ||||||
2468 | const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; | ||||||
2469 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2470 | Register Num = MI.getOperand(1).getReg(); | ||||||
2471 | Register Den = MI.getOperand(2).getReg(); | ||||||
2472 | legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); | ||||||
2473 | MI.eraseFromParent(); | ||||||
2474 | return true; | ||||||
2475 | } | ||||||
2476 | |||||||
2477 | bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, | ||||||
2478 | MachineRegisterInfo &MRI, | ||||||
2479 | MachineIRBuilder &B) const { | ||||||
2480 | if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) | ||||||
2481 | return legalizeUDIV_UREM32(MI, MRI, B); | ||||||
2482 | return false; | ||||||
2483 | } | ||||||
2484 | |||||||
2485 | bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, | ||||||
2486 | MachineRegisterInfo &MRI, | ||||||
2487 | MachineIRBuilder &B) const { | ||||||
2488 | B.setInstr(MI); | ||||||
2489 | const LLT S32 = LLT::scalar(32); | ||||||
2490 | |||||||
2491 | const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; | ||||||
2492 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2493 | Register LHS = MI.getOperand(1).getReg(); | ||||||
2494 | Register RHS = MI.getOperand(2).getReg(); | ||||||
2495 | |||||||
2496 | auto ThirtyOne = B.buildConstant(S32, 31); | ||||||
2497 | auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); | ||||||
2498 | auto RHSign = B.buildAShr(S32, RHS, ThirtyOne); | ||||||
2499 | |||||||
2500 | LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); | ||||||
2501 | RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); | ||||||
2502 | |||||||
2503 | LHS = B.buildXor(S32, LHS, LHSign).getReg(0); | ||||||
2504 | RHS = B.buildXor(S32, RHS, RHSign).getReg(0); | ||||||
2505 | |||||||
2506 | Register UDivRem = MRI.createGenericVirtualRegister(S32); | ||||||
2507 | legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem); | ||||||
2508 | |||||||
2509 | if (IsRem) { | ||||||
2510 | auto RSign = LHSign; // Remainder sign is the same as LHS | ||||||
2511 | UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); | ||||||
2512 | B.buildSub(DstReg, UDivRem, RSign); | ||||||
2513 | } else { | ||||||
2514 | auto DSign = B.buildXor(S32, LHSign, RHSign); | ||||||
2515 | UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); | ||||||
2516 | B.buildSub(DstReg, UDivRem, DSign); | ||||||
2517 | } | ||||||
2518 | |||||||
2519 | MI.eraseFromParent(); | ||||||
2520 | return true; | ||||||
2521 | } | ||||||
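// The sign fixups above use the identity abs(x) = (x + s) ^ s with
// s = x >> 31 (arithmetic shift). Worked example: x = -7 gives s = -1, and
// (-7 + -1) ^ -1 = -8 ^ -1 = 7. Applying (u ^ s) - s afterwards restores the
// sign: the quotient takes sign(LHS) ^ sign(RHS), the remainder sign(LHS).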
2522 | |||||||
2523 | bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, | ||||||
2524 | MachineRegisterInfo &MRI, | ||||||
2525 | MachineIRBuilder &B) const { | ||||||
2526 | if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) | ||||||
2527 | return legalizeSDIV_SREM32(MI, MRI, B); | ||||||
2528 | return false; | ||||||
2529 | } | ||||||
2530 | |||||||
2531 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | ||||||
2532 | MachineRegisterInfo &MRI, | ||||||
2533 | MachineIRBuilder &B) const { | ||||||
2534 | Register Res = MI.getOperand(0).getReg(); | ||||||
2535 | Register LHS = MI.getOperand(1).getReg(); | ||||||
2536 | Register RHS = MI.getOperand(2).getReg(); | ||||||
2537 | |||||||
2538 | uint16_t Flags = MI.getFlags(); | ||||||
2539 | |||||||
2540 | LLT ResTy = MRI.getType(Res); | ||||||
2541 | LLT S32 = LLT::scalar(32); | ||||||
2542 | LLT S64 = LLT::scalar(64); | ||||||
2543 | |||||||
2544 | const MachineFunction &MF = B.getMF(); | ||||||
2545 | bool Unsafe = | ||||||
2546 | MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); | ||||||
2547 | |||||||
2548 | if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) | ||||||
2549 | return false; | ||||||
2550 | |||||||
2551 | if (!Unsafe && ResTy == S32 && | ||||||
2552 | MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) | ||||||
2553 | return false; | ||||||
2554 | |||||||
2555 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | ||||||
2556 | // 1 / x -> RCP(x) | ||||||
2557 | if (CLHS->isExactlyValue(1.0)) { | ||||||
2558 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
2559 | .addUse(RHS) | ||||||
2560 | .setMIFlags(Flags); | ||||||
2561 | |||||||
2562 | MI.eraseFromParent(); | ||||||
2563 | return true; | ||||||
2564 | } | ||||||
2565 | |||||||
2566 | // -1 / x -> RCP( FNEG(x) ) | ||||||
2567 | if (CLHS->isExactlyValue(-1.0)) { | ||||||
2568 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | ||||||
2569 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
2570 | .addUse(FNeg.getReg(0)) | ||||||
2571 | .setMIFlags(Flags); | ||||||
2572 | |||||||
2573 | MI.eraseFromParent(); | ||||||
2574 | return true; | ||||||
2575 | } | ||||||
2576 | } | ||||||
2577 | |||||||
2578 | // x / y -> x * (1.0 / y) | ||||||
2579 | if (Unsafe) { | ||||||
2580 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
2581 | .addUse(RHS) | ||||||
2582 | .setMIFlags(Flags); | ||||||
2583 | B.buildFMul(Res, LHS, RCP, Flags); | ||||||
2584 | |||||||
2585 | MI.eraseFromParent(); | ||||||
2586 | return true; | ||||||
2587 | } | ||||||
2588 | |||||||
2589 | return false; | ||||||
2590 | } | ||||||
2591 | |||||||
2592 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | ||||||
2593 | MachineRegisterInfo &MRI, | ||||||
2594 | MachineIRBuilder &B) const { | ||||||
2595 | B.setInstr(MI); | ||||||
2596 | Register Res = MI.getOperand(0).getReg(); | ||||||
2597 | Register LHS = MI.getOperand(1).getReg(); | ||||||
2598 | Register RHS = MI.getOperand(2).getReg(); | ||||||
2599 | |||||||
2600 | uint16_t Flags = MI.getFlags(); | ||||||
2601 | |||||||
2602 | LLT S16 = LLT::scalar(16); | ||||||
2603 | LLT S32 = LLT::scalar(32); | ||||||
2604 | |||||||
2605 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | ||||||
2606 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | ||||||
2607 | |||||||
2608 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
2609 | .addUse(RHSExt.getReg(0)) | ||||||
2610 | .setMIFlags(Flags); | ||||||
2611 | |||||||
2612 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | ||||||
2613 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | ||||||
2614 | |||||||
2615 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
2616 | .addUse(RDst.getReg(0)) | ||||||
2617 | .addUse(RHS) | ||||||
2618 | .addUse(LHS) | ||||||
2619 | .setMIFlags(Flags); | ||||||
2620 | |||||||
2621 | MI.eraseFromParent(); | ||||||
2622 | return true; | ||||||
2623 | } | ||||||
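// Sketch of the f16 path above (editor's note): the division is carried out
// in f32, e.g. 1.0h / 3.0h becomes fptrunc(fpext(1.0h) * rcp(fpext(3.0h)))
// ~= 0.3333h, and amdgcn_div_fixup then patches up the special cases
// (infinities, NaNs, signed zeros) from the original operands.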
2624 | |||||||
2625 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | ||||||
2626 | // to enable denorm mode. When 'Enable' is false, disable denorm mode. | ||||||
2627 | static void toggleSPDenormMode(bool Enable, | ||||||
2628 | MachineIRBuilder &B, | ||||||
2629 | const GCNSubtarget &ST, | ||||||
2630 | AMDGPU::SIModeRegisterDefaults Mode) { | ||||||
2631 | // Set SP denorm mode to this value. | ||||||
2632 | unsigned SPDenormMode = | ||||||
2633 | Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); | ||||||
2634 | |||||||
2635 | if (ST.hasDenormModeInst()) { | ||||||
2636 | // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode. | ||||||
2637 | uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); | ||||||
2638 | |||||||
2639 | uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | ||||||
2640 | B.buildInstr(AMDGPU::S_DENORM_MODE) | ||||||
2641 | .addImm(NewDenormModeValue); | ||||||
2642 | |||||||
2643 | } else { | ||||||
2644 | // Select FP32 bit field in mode register. | ||||||
2645 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | ||||||
2646 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | ||||||
2647 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | ||||||
2648 | |||||||
2649 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | ||||||
2650 | .addImm(SPDenormMode) | ||||||
2651 | .addImm(SPDenormModeBitField); | ||||||
2652 | } | ||||||
2653 | } | ||||||
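// Encoding example for the two paths above (editor's illustration): the
// S_DENORM_MODE immediate packs the SP mode in bits [1:0] and the DP/HP mode
// in bits [3:2], so enabling SP denorms (FP_DENORM_FLUSH_NONE = 3) with a
// default DP mode of 3 yields 3 | (3 << 2) = 0xF. The S_SETREG fallback
// instead writes a 2-bit field at offset 4 of the MODE hardware register
// (the WIDTH_M1 field holds width - 1 = 1).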
2654 | |||||||
2655 | bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, | ||||||
2656 | MachineRegisterInfo &MRI, | ||||||
2657 | MachineIRBuilder &B) const { | ||||||
2658 | B.setInstr(MI); | ||||||
2659 | Register Res = MI.getOperand(0).getReg(); | ||||||
2660 | Register LHS = MI.getOperand(1).getReg(); | ||||||
2661 | Register RHS = MI.getOperand(2).getReg(); | ||||||
2662 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2663 | AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); | ||||||
2664 | |||||||
2665 | uint16_t Flags = MI.getFlags(); | ||||||
2666 | |||||||
2667 | LLT S32 = LLT::scalar(32); | ||||||
2668 | LLT S1 = LLT::scalar(1); | ||||||
2669 | |||||||
2670 | auto One = B.buildFConstant(S32, 1.0f); | ||||||
2671 | |||||||
2672 | auto DenominatorScaled = | ||||||
2673 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
2674 | .addUse(RHS) | ||||||
2675 | .addUse(LHS) | ||||||
2676 | .addImm(1) | ||||||
2677 | .setMIFlags(Flags); | ||||||
2678 | auto NumeratorScaled = | ||||||
2679 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
2680 | .addUse(LHS) | ||||||
2681 | .addUse(RHS) | ||||||
2682 | .addImm(0) | ||||||
2683 | .setMIFlags(Flags); | ||||||
2684 | |||||||
2685 | auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
2686 | .addUse(DenominatorScaled.getReg(0)) | ||||||
2687 | .setMIFlags(Flags); | ||||||
2688 | auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); | ||||||
2689 | |||||||
2690 | // FIXME: Doesn't correctly model the FP mode switch, and the FP operations | ||||||
2691 | // aren't modeled as reading it. | ||||||
2692 | if (!Mode.allFP32Denormals()) | ||||||
2693 | toggleSPDenormMode(true, B, ST, Mode); | ||||||
2694 | |||||||
2695 | auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); | ||||||
2696 | auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); | ||||||
2697 | auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); | ||||||
2698 | auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); | ||||||
2699 | auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); | ||||||
2700 | auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); | ||||||
2701 | |||||||
2702 | if (!Mode.allFP32Denormals()) | ||||||
2703 | toggleSPDenormMode(false, B, ST, Mode); | ||||||
2704 | |||||||
2705 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) | ||||||
2706 | .addUse(Fma4.getReg(0)) | ||||||
2707 | .addUse(Fma1.getReg(0)) | ||||||
2708 | .addUse(Fma3.getReg(0)) | ||||||
2709 | .addUse(NumeratorScaled.getReg(1)) | ||||||
2710 | .setMIFlags(Flags); | ||||||
2711 | |||||||
2712 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
2713 | .addUse(Fmas.getReg(0)) | ||||||
2714 | .addUse(RHS) | ||||||
2715 | .addUse(LHS) | ||||||
2716 | .setMIFlags(Flags); | ||||||
2717 | |||||||
2718 | MI.eraseFromParent(); | ||||||
2719 | return true; | ||||||
2720 | } | ||||||
2721 | |||||||
2722 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | ||||||
2723 | MachineRegisterInfo &MRI, | ||||||
2724 | MachineIRBuilder &B) const { | ||||||
2725 | B.setInstr(MI); | ||||||
2726 | Register Res = MI.getOperand(0).getReg(); | ||||||
2727 | Register LHS = MI.getOperand(1).getReg(); | ||||||
2728 | Register RHS = MI.getOperand(2).getReg(); | ||||||
2729 | |||||||
2730 | uint16_t Flags = MI.getFlags(); | ||||||
2731 | |||||||
2732 | LLT S64 = LLT::scalar(64); | ||||||
2733 | LLT S1 = LLT::scalar(1); | ||||||
2734 | |||||||
2735 | auto One = B.buildFConstant(S64, 1.0); | ||||||
2736 | |||||||
2737 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
2738 | .addUse(LHS) | ||||||
2739 | .addUse(RHS) | ||||||
2740 | .addImm(1) | ||||||
2741 | .setMIFlags(Flags); | ||||||
2742 | |||||||
2743 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | ||||||
2744 | |||||||
2745 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | ||||||
2746 | .addUse(DivScale0.getReg(0)) | ||||||
2747 | .setMIFlags(Flags); | ||||||
2748 | |||||||
2749 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | ||||||
2750 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | ||||||
2751 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | ||||||
2752 | |||||||
2753 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
2754 | .addUse(LHS) | ||||||
2755 | .addUse(RHS) | ||||||
2756 | .addImm(0) | ||||||
2757 | .setMIFlags(Flags); | ||||||
2758 | |||||||
2759 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | ||||||
2760 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | ||||||
2761 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | ||||||
2762 | |||||||
2763 | Register Scale; | ||||||
2764 | if (!ST.hasUsableDivScaleConditionOutput()) { | ||||||
2765 | // Work around a hardware bug on SI where the condition output from div_scale | ||||||
2766 | // is not usable. | ||||||
2767 | |||||||
2768 | LLT S32 = LLT::scalar(32); | ||||||
2769 | |||||||
2770 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | ||||||
2771 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | ||||||
2772 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | ||||||
2773 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | ||||||
2774 | |||||||
2775 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | ||||||
2776 | Scale1Unmerge.getReg(1)); | ||||||
2777 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | ||||||
2778 | Scale0Unmerge.getReg(1)); | ||||||
2779 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | ||||||
2780 | } else { | ||||||
2781 | Scale = DivScale1.getReg(1); | ||||||
2782 | } | ||||||
2783 | |||||||
2784 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | ||||||
2785 | .addUse(Fma4.getReg(0)) | ||||||
2786 | .addUse(Fma3.getReg(0)) | ||||||
2787 | .addUse(Mul.getReg(0)) | ||||||
2788 | .addUse(Scale) | ||||||
2789 | .setMIFlags(Flags); | ||||||
2790 | |||||||
2791 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) | ||||||
2792 | .addUse(Fmas.getReg(0)) | ||||||
2793 | .addUse(RHS) | ||||||
2794 | .addUse(LHS) | ||||||
2795 | .setMIFlags(Flags); | ||||||
2796 | |||||||
2797 | MI.eraseFromParent(); | ||||||
2798 | return true; | ||||||
2799 | } | ||||||
2800 | |||||||
2801 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||||
2802 | MachineRegisterInfo &MRI, | ||||||
2803 | MachineIRBuilder &B) const { | ||||||
2804 | B.setInstr(MI); | ||||||
2805 | Register Res = MI.getOperand(0).getReg(); | ||||||
2806 | Register LHS = MI.getOperand(2).getReg(); | ||||||
2807 | Register RHS = MI.getOperand(3).getReg(); | ||||||
2808 | uint16_t Flags = MI.getFlags(); | ||||||
2809 | |||||||
2810 | LLT S32 = LLT::scalar(32); | ||||||
2811 | LLT S1 = LLT::scalar(1); | ||||||
2812 | |||||||
2813 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||||
2814 | const APFloat C0Val(1.0f); | ||||||
2815 | |||||||
2816 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||||
2817 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||||
2818 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); | ||||||
2819 | |||||||
2820 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||||
2821 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||||
2822 | |||||||
2823 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||||
2824 | |||||||
2825 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
2826 | .addUse(Mul0.getReg(0)) | ||||||
2827 | .setMIFlags(Flags); | ||||||
2828 | |||||||
2829 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||||
2830 | |||||||
2831 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||||
2832 | |||||||
2833 | MI.eraseFromParent(); | ||||||
2834 | return true; | ||||||
2835 | } | ||||||
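// Scaling rationale (editor's note): 0x6f800000 is 2^96 and 0x2f800000 is
// 2^-32 as f32 bit patterns. If |RHS| > 2^96 the denominator is pre-scaled by
// 2^-32 so the rcp input stays in range, and multiplying the quotient by the
// same Sel at the end cancels the scaling:
//   Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS   for either value of Sel.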
2836 | |||||||
2837 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||||
2838 | MachineRegisterInfo &MRI, | ||||||
2839 | MachineIRBuilder &B) const { | ||||||
2840 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2841 | if (!MFI->isEntryFunction()) { | ||||||
2842 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2843 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||||
2844 | } | ||||||
2845 | |||||||
2846 | B.setInstr(MI); | ||||||
2847 | |||||||
2848 | uint64_t Offset = | ||||||
2849 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||||
2850 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||||
2851 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2852 | LLT DstTy = MRI.getType(DstReg); | ||||||
2853 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||||
2854 | |||||||
2855 | const ArgDescriptor *Arg; | ||||||
2856 | const TargetRegisterClass *RC; | ||||||
2857 | std::tie(Arg, RC) | ||||||
2858 | = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||
2859 | if (!Arg) | ||||||
2860 | return false; | ||||||
2861 | |||||||
2862 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||||
2863 | if (!loadInputValue(KernargPtrReg, B, Arg)) | ||||||
2864 | return false; | ||||||
2865 | |||||||
2866 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||||
2867 | MI.eraseFromParent(); | ||||||
2868 | return true; | ||||||
2869 | } | ||||||
2870 | |||||||
2871 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||||
2872 | MachineRegisterInfo &MRI, | ||||||
2873 | MachineIRBuilder &B, | ||||||
2874 | unsigned AddrSpace) const { | ||||||
2875 | B.setInstr(MI); | ||||||
2876 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||||
2877 | auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); | ||||||
2878 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||||
2879 | MI.eraseFromParent(); | ||||||
2880 | return true; | ||||||
2881 | } | ||||||
2882 | |||||||
2883 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||||
2884 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||||
2885 | // split between the instruction's voffset and immoffset fields) and soffset | ||||||
2886 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||||
2887 | // the instruction's soffset field). This function takes the first kind of | ||||||
2888 | // offset and figures out how to split it between voffset and immoffset. | ||||||
2889 | std::tuple<Register, unsigned, unsigned> | ||||||
2890 | AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, | ||||||
2891 | Register OrigOffset) const { | ||||||
2892 | const unsigned MaxImm = 4095; | ||||||
2893 | Register BaseReg; | ||||||
2894 | unsigned TotalConstOffset; | ||||||
2895 | MachineInstr *OffsetDef; | ||||||
2896 | const LLT S32 = LLT::scalar(32); | ||||||
2897 | |||||||
2898 | std::tie(BaseReg, TotalConstOffset, OffsetDef) | ||||||
2899 | = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); | ||||||
2900 | |||||||
2901 | unsigned ImmOffset = TotalConstOffset; | ||||||
2902 | |||||||
2903 | // If the immediate value is too big for the immoffset field, put the value | ||||||
2904 | // and -4096 into the immoffset field so that the value that is copied/added | ||||||
2905 | // for the voffset field is a multiple of 4096, and it stands more chance | ||||||
2906 | // of being CSEd with the copy/add for another similar load/store. | ||||||
2907 | // However, do not do that rounding down to a multiple of 4096 if that is a | ||||||
2908 | // negative number, as it appears to be illegal to have a negative offset | ||||||
2909 | // in the vgpr, even if adding the immediate offset makes it positive. | ||||||
2910 | unsigned Overflow = ImmOffset & ~MaxImm; | ||||||
2911 | ImmOffset -= Overflow; | ||||||
2912 | if ((int32_t)Overflow < 0) { | ||||||
2913 | Overflow += ImmOffset; | ||||||
2914 | ImmOffset = 0; | ||||||
2915 | } | ||||||
2916 | |||||||
2917 | if (Overflow != 0) { | ||||||
2918 | if (!BaseReg) { | ||||||
2919 | BaseReg = B.buildConstant(S32, Overflow).getReg(0); | ||||||
2920 | } else { | ||||||
2921 | auto OverflowVal = B.buildConstant(S32, Overflow); | ||||||
2922 | BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); | ||||||
2923 | } | ||||||
2924 | } | ||||||
2925 | |||||||
2926 | if (!BaseReg) | ||||||
2927 | BaseReg = B.buildConstant(S32, 0).getReg(0); | ||||||
2928 | |||||||
2929 | return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); | ||||||
2930 | } | ||||||
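// Worked example (editor's illustration): a constant offset of 5000 splits as
// Overflow = 5000 & ~4095 = 4096 and ImmOffset = 904, so 4096 goes into the
// voffset add (a multiple of 4096, CSE-friendly) and 904 into the immediate
// field. For an apparently negative offset such as -100, Overflow comes out
// negative, so the whole amount is folded back into voffset and ImmOffset
// becomes 0.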
2931 | |||||||
2932 | /// Handle register layout difference for f16 images for some subtargets. | ||||||
2933 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||||
2934 | MachineRegisterInfo &MRI, | ||||||
2935 | Register Reg) const { | ||||||
2936 | if (!ST.hasUnpackedD16VMem()) | ||||||
2937 | return Reg; | ||||||
2938 | |||||||
2939 | const LLT S16 = LLT::scalar(16); | ||||||
2940 | const LLT S32 = LLT::scalar(32); | ||||||
2941 | LLT StoreVT = MRI.getType(Reg); | ||||||
2942 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16); | ||||||
2943 | |||||||
2944 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
2945 | |||||||
2946 | SmallVector<Register, 4> WideRegs; | ||||||
2947 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
2948 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||||
2949 | |||||||
2950 | int NumElts = StoreVT.getNumElements(); | ||||||
2951 | |||||||
2952 | return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); | ||||||
2953 | } | ||||||
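// Example of the repack above (editor's illustration): on an unpacked-D16
// subtarget a <4 x s16> store value is unmerged into four s16 pieces, each
// G_ANYEXTed to s32, and rebuilt as <4 x s32>; the high halves of the wide
// elements are don't-care bits.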
2954 | |||||||
2955 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||||
2956 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||||
2957 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
2958 | LLT Ty = MRI->getType(VData); | ||||||
2959 | |||||||
2960 | const LLT S16 = LLT::scalar(16); | ||||||
2961 | |||||||
2962 | // Fixup illegal register types for i8 stores. | ||||||
2963 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||||
2964 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||||
2965 | return AnyExt; | ||||||
2966 | } | ||||||
2967 | |||||||
2968 | if (Ty.isVector()) { | ||||||
2969 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||||
2970 | if (IsFormat) | ||||||
2971 | return handleD16VData(B, *MRI, VData); | ||||||
2972 | } | ||||||
2973 | } | ||||||
2974 | |||||||
2975 | return VData; | ||||||
2976 | } | ||||||
2977 | |||||||
2978 | bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, | ||||||
2979 | MachineRegisterInfo &MRI, | ||||||
2980 | MachineIRBuilder &B, | ||||||
2981 | bool IsTyped, | ||||||
2982 | bool IsFormat) const { | ||||||
2983 | B.setInstr(MI); | ||||||
2984 | |||||||
2985 | Register VData = MI.getOperand(1).getReg(); | ||||||
2986 | LLT Ty = MRI.getType(VData); | ||||||
2987 | LLT EltTy = Ty.getScalarType(); | ||||||
2988 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||
2989 | const LLT S32 = LLT::scalar(32); | ||||||
2990 | |||||||
2991 | VData = fixStoreSourceType(B, VData, IsFormat); | ||||||
2992 | Register RSrc = MI.getOperand(2).getReg(); | ||||||
2993 | |||||||
2994 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
2995 | const int MemSize = MMO->getSize(); | ||||||
2996 | |||||||
2997 | unsigned ImmOffset; | ||||||
2998 | unsigned TotalOffset; | ||||||
2999 | |||||||
3000 | // The typed intrinsics add an immediate after the registers. | ||||||
3001 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||
3002 | |||||||
3003 | // The struct intrinsic variants add one additional operand over raw. | ||||||
3004 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3005 | Register VIndex; | ||||||
3006 | int OpOffset = 0; | ||||||
3007 | if (HasVIndex) { | ||||||
3008 | VIndex = MI.getOperand(3).getReg(); | ||||||
3009 | OpOffset = 1; | ||||||
3010 | } | ||||||
3011 | |||||||
3012 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3013 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3014 | |||||||
3015 | unsigned Format = 0; | ||||||
3016 | if (IsTyped) { | ||||||
3017 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3018 | ++OpOffset; | ||||||
3019 | } | ||||||
3020 | |||||||
3021 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3022 | |||||||
3023 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||
3024 | if (TotalOffset != 0) | ||||||
3025 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | ||||||
3026 | |||||||
3027 | unsigned Opc; | ||||||
3028 | if (IsTyped) { | ||||||
3029 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : | ||||||
3030 | AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; | ||||||
3031 | } else if (IsFormat) { | ||||||
3032 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : | ||||||
3033 | AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; | ||||||
3034 | } else { | ||||||
3035 | switch (MemSize) { | ||||||
3036 | case 1: | ||||||
3037 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; | ||||||
3038 | break; | ||||||
3039 | case 2: | ||||||
3040 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; | ||||||
3041 | break; | ||||||
3042 | default: | ||||||
3043 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; | ||||||
3044 | break; | ||||||
3045 | } | ||||||
3046 | } | ||||||
3047 | |||||||
3048 | if (!VIndex) | ||||||
3049 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||
3050 | |||||||
3051 | auto MIB = B.buildInstr(Opc) | ||||||
3052 | .addUse(VData) // vdata | ||||||
3053 | .addUse(RSrc) // rsrc | ||||||
3054 | .addUse(VIndex) // vindex | ||||||
3055 | .addUse(VOffset) // voffset | ||||||
3056 | .addUse(SOffset) // soffset | ||||||
3057 | .addImm(ImmOffset); // offset(imm) | ||||||
3058 | |||||||
3059 | if (IsTyped) | ||||||
3060 | MIB.addImm(Format); | ||||||
3061 | |||||||
3062 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
3063 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
3064 | .addMemOperand(MMO); | ||||||
3065 | |||||||
3066 | MI.eraseFromParent(); | ||||||
3067 | return true; | ||||||
3068 | } | ||||||
3069 | |||||||
3070 | bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, | ||||||
3071 | MachineRegisterInfo &MRI, | ||||||
3072 | MachineIRBuilder &B, | ||||||
3073 | bool IsFormat, | ||||||
3074 | bool IsTyped) const { | ||||||
3075 | B.setInstr(MI); | ||||||
3076 | |||||||
3077 | // FIXME: Verifier should enforce 1 MMO for these intrinsics. | ||||||
3078 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
3079 | const int MemSize = MMO->getSize(); | ||||||
3080 | const LLT S32 = LLT::scalar(32); | ||||||
3081 | |||||||
3082 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3083 | Register RSrc = MI.getOperand(2).getReg(); | ||||||
3084 | |||||||
3085 | // The typed intrinsics add an immediate after the registers. | ||||||
3086 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||
3087 | |||||||
3088 | // The struct intrinsic variants add one additional operand over raw. | ||||||
3089 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3090 | Register VIndex; | ||||||
3091 | int OpOffset = 0; | ||||||
3092 | if (HasVIndex) { | ||||||
3093 | VIndex = MI.getOperand(3).getReg(); | ||||||
3094 | OpOffset = 1; | ||||||
3095 | } | ||||||
3096 | |||||||
3097 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3098 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3099 | |||||||
3100 | unsigned Format = 0; | ||||||
3101 | if (IsTyped) { | ||||||
3102 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3103 | ++OpOffset; | ||||||
3104 | } | ||||||
3105 | |||||||
3106 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3107 | unsigned ImmOffset; | ||||||
3108 | unsigned TotalOffset; | ||||||
3109 | |||||||
3110 | LLT Ty = MRI.getType(Dst); | ||||||
3111 | LLT EltTy = Ty.getScalarType(); | ||||||
3112 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||
3113 | const bool Unpacked = ST.hasUnpackedD16VMem(); | ||||||
3114 | |||||||
3115 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||
3116 | if (TotalOffset != 0) | ||||||
3117 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | ||||||
3118 | |||||||
3119 | unsigned Opc; | ||||||
3120 | |||||||
3121 | if (IsTyped) { | ||||||
3122 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : | ||||||
3123 | AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; | ||||||
3124 | } else if (IsFormat) { | ||||||
3125 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : | ||||||
3126 | AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; | ||||||
3127 | } else { | ||||||
3128 | switch (MemSize) { | ||||||
3129 | case 1: | ||||||
3130 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; | ||||||
3131 | break; | ||||||
3132 | case 2: | ||||||
3133 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; | ||||||
3134 | break; | ||||||
3135 | default: | ||||||
3136 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; | ||||||
3137 | break; | ||||||
3138 | } | ||||||
3139 | } | ||||||
3140 | |||||||
3141 | Register LoadDstReg; | ||||||
3142 | |||||||
3143 | bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); | ||||||
3144 | LLT UnpackedTy = Ty.changeElementSize(32); | ||||||
3145 | |||||||
3146 | if (IsExtLoad) | ||||||
3147 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); | ||||||
3148 | else if (Unpacked && IsD16 && Ty.isVector()) | ||||||
3149 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); | ||||||
3150 | else | ||||||
3151 | LoadDstReg = Dst; | ||||||
3152 | |||||||
3153 | if (!VIndex) | ||||||
3154 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||
3155 | |||||||
3156 | auto MIB = B.buildInstr(Opc) | ||||||
3157 | .addDef(LoadDstReg) // vdata | ||||||
3158 | .addUse(RSrc) // rsrc | ||||||
3159 | .addUse(VIndex) // vindex | ||||||
3160 | .addUse(VOffset) // voffset | ||||||
3161 | .addUse(SOffset) // soffset | ||||||
3162 | .addImm(ImmOffset); // offset(imm) | ||||||
3163 | |||||||
3164 | if (IsTyped) | ||||||
3165 | MIB.addImm(Format); | ||||||
3166 | |||||||
3167 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
3168 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
3169 | .addMemOperand(MMO); | ||||||
3170 | |||||||
3171 | if (LoadDstReg != Dst) { | ||||||
3172 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||||
3173 | |||||||
3174 | // The result was widened for an extending load; truncate it back down. | ||||||
3175 | if (IsExtLoad) | ||||||
3176 | B.buildTrunc(Dst, LoadDstReg); | ||||||
3177 | else { | ||||||
3178 | // Repack to original 16-bit vector result | ||||||
3179 | // FIXME: G_TRUNC should work, but legalization currently fails | ||||||
3180 | auto Unmerge = B.buildUnmerge(S32, LoadDstReg); | ||||||
3181 | SmallVector<Register, 4> Repack; | ||||||
3182 | for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) | ||||||
3183 | Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); | ||||||
3184 | B.buildMerge(Dst, Repack); | ||||||
3185 | } | ||||||
3186 | } | ||||||
3187 | |||||||
3188 | MI.eraseFromParent(); | ||||||
3189 | return true; | ||||||
3190 | } | ||||||
3191 | |||||||
3192 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||||
3193 | MachineIRBuilder &B, | ||||||
3194 | bool IsInc) const { | ||||||
3195 | B.setInstr(MI); | ||||||
3196 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : | ||||||
3197 | AMDGPU::G_AMDGPU_ATOMIC_DEC; | ||||||
3198 | B.buildInstr(Opc) | ||||||
3199 | .addDef(MI.getOperand(0).getReg()) | ||||||
3200 | .addUse(MI.getOperand(2).getReg()) | ||||||
3201 | .addUse(MI.getOperand(3).getReg()) | ||||||
3202 | .cloneMemRefs(MI); | ||||||
3203 | MI.eraseFromParent(); | ||||||
3204 | return true; | ||||||
3205 | } | ||||||
3206 | |||||||
3207 | static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { | ||||||
3208 | switch (IntrID) { | ||||||
3209 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||
3210 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||
3211 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; | ||||||
3212 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||
3213 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||
3214 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; | ||||||
3215 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||
3216 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||
3217 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; | ||||||
3218 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||
3219 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||
3220 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; | ||||||
3221 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||
3222 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||
3223 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; | ||||||
3224 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||
3225 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||
3226 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; | ||||||
3227 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||
3228 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||
3229 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; | ||||||
3230 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||
3231 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||
3232 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; | ||||||
3233 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||
3234 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||
3235 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; | ||||||
3236 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||
3237 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||
3238 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; | ||||||
3239 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||
3240 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||
3241 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; | ||||||
3242 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||
3243 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||
3244 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; | ||||||
3245 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||
3246 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||
3247 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; | ||||||
3248 | default: | ||||||
3249 | llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3249); | ||||||
3250 | } | ||||||
3251 | } | ||||||
3252 | |||||||
3253 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | ||||||
3254 | MachineIRBuilder &B, | ||||||
3255 | Intrinsic::ID IID) const { | ||||||
3256 | B.setInstr(MI); | ||||||
3257 | |||||||
3258 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | ||||||
3259 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | ||||||
3260 | |||||||
3261 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3262 | Register VData = MI.getOperand(2).getReg(); | ||||||
3263 | |||||||
3264 | Register CmpVal; | ||||||
3265 | int OpOffset = 0; | ||||||
3266 | |||||||
3267 | if (IsCmpSwap) { | ||||||
3268 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3269 | ++OpOffset; | ||||||
3270 | } | ||||||
3271 | |||||||
3272 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3273 | const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; | ||||||
3274 | |||||||
3275 | // The struct intrinsic variants add one additional operand over raw. | ||||||
3276 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3277 | Register VIndex; | ||||||
3278 | if (HasVIndex) { | ||||||
3279 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3280 | ++OpOffset; | ||||||
3281 | } | ||||||
3282 | |||||||
3283 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3284 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | ||||||
3285 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | ||||||
3286 | |||||||
3287 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
3288 | |||||||
3289 | unsigned ImmOffset; | ||||||
3290 | unsigned TotalOffset; | ||||||
3291 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||
3292 | if (TotalOffset != 0) | ||||||
3293 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); | ||||||
3294 | |||||||
3295 | if (!VIndex) | ||||||
3296 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | ||||||
3297 | |||||||
3298 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) | ||||||
3299 | .addDef(Dst) | ||||||
3300 | .addUse(VData); // vdata | ||||||
3301 | |||||||
3302 | if (IsCmpSwap) | ||||||
3303 | MIB.addReg(CmpVal); | ||||||
3304 | |||||||
3305 | MIB.addUse(RSrc) // rsrc | ||||||
3306 | .addUse(VIndex) // vindex | ||||||
3307 | .addUse(VOffset) // voffset | ||||||
3308 | .addUse(SOffset) // soffset | ||||||
3309 | .addImm(ImmOffset) // offset(imm) | ||||||
3310 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
3311 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
3312 | .addMemOperand(MMO); | ||||||
3313 | |||||||
3314 | MI.eraseFromParent(); | ||||||
3315 | return true; | ||||||
3316 | } | ||||||
3317 | |||||||
3318 | // Produce a vector of s16 elements from s32 pieces. | ||||||
3319 | static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, | ||||||
3320 | ArrayRef<Register> UnmergeParts) { | ||||||
3321 | const LLT S16 = LLT::scalar(16); | ||||||
3322 | |||||||
3323 | SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); | ||||||
3324 | for (int I = 0, E = UnmergeParts.size(); I != E; ++I) | ||||||
3325 | RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); | ||||||
3326 | |||||||
3327 | B.buildBuildVector(DstReg, RemergeParts); | ||||||
3328 | } | ||||||
3329 | |||||||
3330 | /// Convert a set of s32 registers to a result vector with s16 elements. | ||||||
3331 | static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, | ||||||
3332 | ArrayRef<Register> UnmergeParts) { | ||||||
3333 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
3334 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
3335 | LLT TargetTy = MRI.getType(DstReg); | ||||||
3336 | int NumElts = UnmergeParts.size(); | ||||||
3337 | |||||||
3338 | if (NumElts == 1) { | ||||||
3339 | assert(TargetTy == V2S16); | ||||||
3340 | B.buildBitcast(DstReg, UnmergeParts[0]); | ||||||
3341 | return; | ||||||
3342 | } | ||||||
3343 | |||||||
3344 | SmallVector<Register, 4> RemergeParts(NumElts); | ||||||
3345 | for (int I = 0; I != NumElts; ++I) | ||||||
3346 | RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); | ||||||
3347 | |||||||
3348 | if (TargetTy.getSizeInBits() == 32u * NumElts) { | ||||||
3349 | B.buildConcatVectors(DstReg, RemergeParts); | ||||||
3350 | return; | ||||||
3351 | } | ||||||
3352 | |||||||
3353 | const LLT V3S16 = LLT::vector(3, 16); | ||||||
3354 | const LLT V6S16 = LLT::vector(6, 16); | ||||||
3355 | |||||||
3356 | // Widen to v6s16 and unpack v3 parts. | ||||||
3357 | assert(TargetTy == V3S16); | ||||||
3358 | |||||||
3359 | RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); | ||||||
3360 | auto Concat = B.buildConcatVectors(V6S16, RemergeParts); | ||||||
3361 | B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); | ||||||
3362 | } | ||||||
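// Example (editor's illustration): two s32 parts, each holding a packed pair
// of halves, are bitcast to v2s16 and concatenated into v4s16. A v3s16 target
// is the odd case: pad with an undef v2s16, concatenate to v6s16, then
// unmerge into the v3s16 result plus a dead v3s16.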
3363 | |||||||
3364 | // FIXME: Just a vector trunc should be sufficient, but legalization is | ||||||
3365 | // currently broken. | ||||||
3366 | static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, | ||||||
3367 | Register WideDstReg) { | ||||||
3368 | const LLT S32 = LLT::scalar(32); | ||||||
3369 | const LLT S16 = LLT::scalar(16); | ||||||
3370 | |||||||
3371 | auto Unmerge = B.buildUnmerge(S32, WideDstReg); | ||||||
3372 | |||||||
3373 | int NumOps = Unmerge->getNumOperands() - 1; | ||||||
3374 | SmallVector<Register, 4> RemergeParts(NumOps); | ||||||
3375 | for (int I = 0; I != NumOps; ++I) | ||||||
3376 | RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); | ||||||
3377 | |||||||
3378 | B.buildBuildVector(DstReg, RemergeParts); | ||||||
3379 | } | ||||||
3380 | |||||||
3381 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||||
3382 | MachineInstr &MI, MachineIRBuilder &B, | ||||||
3383 | GISelChangeObserver &Observer, | ||||||
3384 | const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { | ||||||
3385 | bool IsTFE = MI.getNumExplicitDefs() == 2; | ||||||
3386 | |||||||
3387 | // We are only processing the operands of d16 image operations on subtargets | ||||||
3388 | // that use the unpacked register layout, or need to repack the TFE result. | ||||||
3389 | |||||||
3390 | // TODO: Need to handle a16 images too | ||||||
3391 | // TODO: Do we need to guard against already legalized intrinsics? | ||||||
3392 | if (!IsTFE && !ST.hasUnpackedD16VMem()) | ||||||
3393 | return true; | ||||||
3394 | |||||||
3395 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||||
3396 | AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); | ||||||
3397 | |||||||
3398 | if (BaseOpcode->Atomic) // No d16 atomics, or TFE. | ||||||
3399 | return true; | ||||||
3400 | |||||||
3401 | B.setInstr(MI); | ||||||
3402 | |||||||
3403 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
3404 | const LLT S32 = LLT::scalar(32); | ||||||
3405 | const LLT S16 = LLT::scalar(16); | ||||||
3406 | |||||||
3407 | if (BaseOpcode->Store) { // No TFE for stores? | ||||||
3408 | Register VData = MI.getOperand(1).getReg(); | ||||||
3409 | LLT Ty = MRI->getType(VData); | ||||||
3410 | if (!Ty.isVector() || Ty.getElementType() != S16) | ||||||
3411 | return true; | ||||||
3412 | |||||||
3413 | B.setInstr(MI); | ||||||
3414 | |||||||
3415 | Observer.changingInstr(MI); | ||||||
3416 | MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); | ||||||
3417 | Observer.changedInstr(MI); | ||||||
3418 | return true; | ||||||
3419 | } | ||||||
3420 | |||||||
3421 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
3422 | LLT Ty = MRI->getType(DstReg); | ||||||
3423 | const LLT EltTy = Ty.getScalarType(); | ||||||
3424 | const bool IsD16 = Ty.getScalarType() == S16; | ||||||
3425 | const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||||
3426 | |||||||
3427 | if (IsTFE) { | ||||||
3428 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||||
3429 | // type. The instruction really returns these two values in one contiguous | ||||||
3430 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||||
3431 | // return type to use a single register result. | ||||||
3432 | Register Dst1Reg = MI.getOperand(1).getReg(); | ||||||
3433 | if (MRI->getType(Dst1Reg) != S32) | ||||||
3434 | return false; | ||||||
3435 | |||||||
3436 | // TODO: Make sure the TFE operand bit is set. | ||||||
3437 | |||||||
3438 | // The raw dword aligned data component of the load. The only legal cases | ||||||
3439 | // where this matters should be when using the packed D16 format, for | ||||||
3440 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. | ||||||
3441 | LLT RoundedTy; | ||||||
3442 | LLT TFETy; | ||||||
3443 | |||||||
3444 | if (IsD16 && ST.hasUnpackedD16VMem()) { | ||||||
3445 | RoundedTy = LLT::scalarOrVector(NumElts, 32); | ||||||
3446 | TFETy = LLT::vector(NumElts + 1, 32); | ||||||
3447 | } else { | ||||||
3448 | unsigned EltSize = Ty.getScalarSizeInBits(); | ||||||
3449 | unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; | ||||||
3450 | unsigned RoundedSize = 32 * RoundedElts; | ||||||
3451 | RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); | ||||||
3452 | TFETy = LLT::vector(RoundedSize / 32 + 1, S32); | ||||||
3453 | } | ||||||
3454 | |||||||
3455 | Register TFEReg = MRI->createGenericVirtualRegister(TFETy); | ||||||
3456 | Observer.changingInstr(MI); | ||||||
3457 | |||||||
3458 | MI.getOperand(0).setReg(TFEReg); | ||||||
3459 | MI.RemoveOperand(1); | ||||||
3460 | |||||||
3461 | Observer.changedInstr(MI); | ||||||
3462 | |||||||
3463 | // Insert after the instruction. | ||||||
3464 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||||
3465 | |||||||
3466 | // Now figure out how to copy the new result register back into the old | ||||||
3467 | // result. | ||||||
3468 | |||||||
3469 | SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); | ||||||
3470 | int NumDataElts = TFETy.getNumElements() - 1; | ||||||
3471 | |||||||
3472 | if (!Ty.isVector()) { | ||||||
3473 | // Simplest case is a trivial unmerge (plus a truncate for d16). | ||||||
3474 | UnmergeResults[0] = Ty == S32 ? | ||||||
3475 | DstReg : MRI->createGenericVirtualRegister(S32); | ||||||
3476 | |||||||
3477 | B.buildUnmerge(UnmergeResults, TFEReg); | ||||||
3478 | if (Ty != S32) | ||||||
3479 | B.buildTrunc(DstReg, UnmergeResults[0]); | ||||||
3480 | return true; | ||||||
3481 | } | ||||||
3482 | |||||||
3483 | // We have to repack into a new vector of some kind. | ||||||
3484 | for (int I = 0; I != NumDataElts; ++I) | ||||||
3485 | UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); | ||||||
3486 | B.buildUnmerge(UnmergeResults, TFEReg); | ||||||
3487 | |||||||
3488 | // Drop the final TFE element. | ||||||
3489 | ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); | ||||||
3490 | |||||||
3491 | if (EltTy == S32) | ||||||
3492 | B.buildBuildVector(DstReg, DataPart); | ||||||
3493 | else if (ST.hasUnpackedD16VMem()) | ||||||
3494 | truncToS16Vector(B, DstReg, DataPart); | ||||||
3495 | else | ||||||
3496 | bitcastToS16Vector(B, DstReg, DataPart); | ||||||
3497 | |||||||
3498 | return true; | ||||||
3499 | } | ||||||
3500 | |||||||
3501 | // Must be an image load. | ||||||
3502 | if (!Ty.isVector() || Ty.getElementType() != S16) | ||||||
3503 | return true; | ||||||
3504 | |||||||
3505 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||||
3506 | |||||||
3507 | LLT WidenedTy = Ty.changeElementType(S32); | ||||||
3508 | Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); | ||||||
3509 | |||||||
3510 | Observer.changingInstr(MI); | ||||||
3511 | MI.getOperand(0).setReg(WideDstReg); | ||||||
3512 | Observer.changedInstr(MI); | ||||||
3513 | |||||||
3514 | repackUnpackedD16Load(B, DstReg, WideDstReg); | ||||||
3515 | return true; | ||||||
3516 | } | ||||||
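// TFE retyping example (editor's illustration): a packed-D16 <3 x s16> result
// occupies 48 bits, so RoundedElts = (48 + 31) / 32 = 2 dwords, giving
// RoundedTy = <4 x s16> and TFETy = <3 x s32> (two data dwords plus one
// status dword), which the code above then unmerges and repacks.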
3517 | |||||||
3518 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||||
3519 | MachineInstr &MI, MachineIRBuilder &B, | ||||||
3520 | GISelChangeObserver &Observer) const { | ||||||
3521 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3522 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
3523 | unsigned Size = Ty.getSizeInBits(); | ||||||
3524 | MachineFunction &MF = B.getMF(); | ||||||
3525 | |||||||
3526 | Observer.changingInstr(MI); | ||||||
3527 | |||||||
3528 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||||
3529 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||||
3530 | // allowed to add one. | ||||||
3531 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||||
3532 | MI.RemoveOperand(1); // Remove intrinsic ID | ||||||
3533 | |||||||
3534 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||||
3535 | // TODO: Should this use datalayout alignment? | ||||||
3536 | const unsigned MemSize = (Size + 7) / 8; | ||||||
3537 | const unsigned MemAlign = 4; | ||||||
3538 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
3539 | MachinePointerInfo(), | ||||||
3540 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
3541 | MachineMemOperand::MOInvariant, MemSize, MemAlign); | ||||||
3542 | MI.addMemOperand(MF, MMO); | ||||||
3543 | |||||||
3544 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||||
3545 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||||
3546 | // out this needs to be converted to a vector load during RegBankSelect. | ||||||
3547 | if (!isPowerOf2_32(Size)) { | ||||||
3548 | LegalizerHelper Helper(MF, *this, Observer, B); | ||||||
3549 | B.setInstr(MI); | ||||||
3550 | |||||||
3551 | if (Ty.isVector()) | ||||||
3552 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||||
3553 | else | ||||||
3554 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||||
3555 | } | ||||||
3556 | |||||||
3557 | Observer.changedInstr(MI); | ||||||
3558 | return true; | ||||||
3559 | } | ||||||
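// Widening example (editor's illustration): a 96-bit <3 x s32> result is not
// a power-of-2 size, so it is padded to <4 x s32> via moreElementsVectorDst;
// a scalar s96 would be widened to s128 via getPow2ScalarType. The MMO still
// records the original (Size + 7) / 8 = 12-byte access.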
3560 | |||||||
3561 | bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, | ||||||
3562 | MachineIRBuilder &B, | ||||||
3563 | GISelChangeObserver &Observer) const { | ||||||
3564 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
3565 | |||||||
3566 | // Replace the G_BRCOND that uses the intrinsic with the exec-manipulating branch pseudos. | ||||||
3567 | auto IntrID = MI.getIntrinsicID(); | ||||||
3568 | switch (IntrID) { | ||||||
3569 | case Intrinsic::amdgcn_if: | ||||||
3570 | case Intrinsic::amdgcn_else: { | ||||||
3571 | MachineInstr *Br = nullptr; | ||||||
3572 | if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { | ||||||
3573 | const SIRegisterInfo *TRI | ||||||
3574 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
3575 | |||||||
3576 | B.setInstr(*BrCond); | ||||||
3577 | Register Def = MI.getOperand(1).getReg(); | ||||||
3578 | Register Use = MI.getOperand(3).getReg(); | ||||||
3579 | |||||||
3580 | MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); | ||||||
3581 | if (Br) | ||||||
3582 | BrTarget = Br->getOperand(0).getMBB(); | ||||||
3583 | |||||||
3584 | if (IntrID == Intrinsic::amdgcn_if) { | ||||||
3585 | B.buildInstr(AMDGPU::SI_IF) | ||||||
3586 | .addDef(Def) | ||||||
3587 | .addUse(Use) | ||||||
3588 | .addMBB(BrTarget); | ||||||
3589 | } else { | ||||||
3590 | B.buildInstr(AMDGPU::SI_ELSE) | ||||||
3591 | .addDef(Def) | ||||||
3592 | .addUse(Use) | ||||||
3593 | .addMBB(BrTarget) | ||||||
3594 | .addImm(0); | ||||||
3595 | } | ||||||
3596 | |||||||
3597 | if (Br) | ||||||
3598 | Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); | ||||||
3599 | |||||||
3600 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||||
3601 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||||
3602 | MI.eraseFromParent(); | ||||||
3603 | BrCond->eraseFromParent(); | ||||||
3604 | return true; | ||||||
3605 | } | ||||||
3606 | |||||||
3607 | return false; | ||||||
3608 | } | ||||||
3609 | case Intrinsic::amdgcn_loop: { | ||||||
3610 | MachineInstr *Br = nullptr; | ||||||
3611 | if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { | ||||||
3612 | const SIRegisterInfo *TRI | ||||||
3613 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
3614 | |||||||
3615 | B.setInstr(*BrCond); | ||||||
3616 | |||||||
3617 | MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); | ||||||
3618 | if (Br) | ||||||
3619 | BrTarget = Br->getOperand(0).getMBB(); | ||||||
3620 | |||||||
3621 | Register Reg = MI.getOperand(2).getReg(); | ||||||
3622 | B.buildInstr(AMDGPU::SI_LOOP) | ||||||
3623 | .addUse(Reg) | ||||||
3624 | .addMBB(BrTarget); | ||||||
3625 | |||||||
3626 | if (Br) | ||||||
3627 | Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); | ||||||
3628 | |||||||
3629 | MI.eraseFromParent(); | ||||||
3630 | BrCond->eraseFromParent(); | ||||||
3631 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||||
3632 | return true; | ||||||
3633 | } | ||||||
3634 | |||||||
3635 | return false; | ||||||
3636 | } | ||||||
3637 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||||
3638 | return legalizePreloadedArgIntrin( | ||||||
3639 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||
3640 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||||
3641 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||||
3642 | case Intrinsic::amdgcn_workitem_id_x: | ||||||
3643 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3644 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||||
3645 | case Intrinsic::amdgcn_workitem_id_y: | ||||||
3646 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3647 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||||
3648 | case Intrinsic::amdgcn_workitem_id_z: | ||||||
3649 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3650 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||||
3651 | case Intrinsic::amdgcn_workgroup_id_x: | ||||||
3652 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3653 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||||
3654 | case Intrinsic::amdgcn_workgroup_id_y: | ||||||
3655 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3656 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||||
3657 | case Intrinsic::amdgcn_workgroup_id_z: | ||||||
3658 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3659 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||||
3660 | case Intrinsic::amdgcn_dispatch_ptr: | ||||||
3661 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3662 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||||
3663 | case Intrinsic::amdgcn_queue_ptr: | ||||||
3664 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3665 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||||
3666 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||||
3667 | return legalizePreloadedArgIntrin( | ||||||
3668 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||||
3669 | case Intrinsic::amdgcn_dispatch_id: | ||||||
3670 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3671 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||||
3672 | case Intrinsic::amdgcn_fdiv_fast: | ||||||
3673 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||||
3674 | case Intrinsic::amdgcn_is_shared: | ||||||
3675 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||||
3676 | case Intrinsic::amdgcn_is_private: | ||||||
3677 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||||
3678 | case Intrinsic::amdgcn_wavefrontsize: { | ||||||
3679 | B.setInstr(MI); | ||||||
3680 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||||
3681 | MI.eraseFromParent(); | ||||||
3682 | return true; | ||||||
3683 | } | ||||||
3684 | case Intrinsic::amdgcn_s_buffer_load: | ||||||
3685 | return legalizeSBufferLoad(MI, B, Observer); | ||||||
3686 | case Intrinsic::amdgcn_raw_buffer_store: | ||||||
3687 | case Intrinsic::amdgcn_struct_buffer_store: | ||||||
3688 | return legalizeBufferStore(MI, MRI, B, false, false); | ||||||
3689 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||||
3690 | case Intrinsic::amdgcn_struct_buffer_store_format: | ||||||
3691 | return legalizeBufferStore(MI, MRI, B, false, true); | ||||||
3692 | case Intrinsic::amdgcn_raw_tbuffer_store: | ||||||
3693 | case Intrinsic::amdgcn_struct_tbuffer_store: | ||||||
3694 | return legalizeBufferStore(MI, MRI, B, true, true); | ||||||
3695 | case Intrinsic::amdgcn_raw_buffer_load: | ||||||
3696 | case Intrinsic::amdgcn_struct_buffer_load: | ||||||
3697 | return legalizeBufferLoad(MI, MRI, B, false, false); | ||||||
3698 | case Intrinsic::amdgcn_raw_buffer_load_format: | ||||||
3699 | case Intrinsic::amdgcn_struct_buffer_load_format: | ||||||
3700 | return legalizeBufferLoad(MI, MRI, B, true, false); | ||||||
3701 | case Intrinsic::amdgcn_raw_tbuffer_load: | ||||||
3702 | case Intrinsic::amdgcn_struct_tbuffer_load: | ||||||
3703 | return legalizeBufferLoad(MI, MRI, B, true, true); | ||||||
3704 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||
3705 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||
3706 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||
3707 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||
3708 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||
3709 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||
3710 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||
3711 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||
3712 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||
3713 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||
3714 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||
3715 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||
3716 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||
3717 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||
3718 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||
3719 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||
3720 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||
3721 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||
3722 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||
3723 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||
3724 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||
3725 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||
3726 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||
3727 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||
3728 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||
3729 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||
3730 | return legalizeBufferAtomic(MI, B, IntrID); | ||||||
3731 | case Intrinsic::amdgcn_atomic_inc: | ||||||
3732 | return legalizeAtomicIncDec(MI, B, true); | ||||||
3733 | case Intrinsic::amdgcn_atomic_dec: | ||||||
3734 | return legalizeAtomicIncDec(MI, B, false); | ||||||
3735 | default: { | ||||||
3736 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||||||
3737 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | ||||||
3738 | return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); | ||||||
3739 | return true; | ||||||
3740 | } | ||||||
3741 | } | ||||||
3742 | |||||||
3743 | return true; | ||||||
3744 | } |
1 | //==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
10 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
11 | |
12 | #include "llvm/ADT/DenseMap.h" |
13 | #include "llvm/CodeGen/Register.h" |
14 | #include "llvm/IR/Function.h" |
15 | #include "llvm/Pass.h" |
16 | |
17 | namespace llvm { |
18 | |
19 | class Function; |
20 | class raw_ostream; |
21 | class GCNSubtarget; |
22 | class TargetMachine; |
23 | class TargetRegisterClass; |
24 | class TargetRegisterInfo; |
25 | |
26 | struct ArgDescriptor { |
27 | private: |
28 | friend struct AMDGPUFunctionArgInfo; |
29 | friend class AMDGPUArgumentUsageInfo; |
30 | |
31 | union { |
32 | Register Reg; |
33 | unsigned StackOffset; |
34 | }; |
35 | |
36 | // Bitmask to locate argument within the register. |
37 | unsigned Mask; |
38 | |
39 | bool IsStack : 1; |
40 | bool IsSet : 1; |
41 | |
42 | public: |
43 | ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, |
44 | bool IsStack = false, bool IsSet = false) |
45 | : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} |
46 | |
47 | static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { |
48 | return ArgDescriptor(Reg, Mask, false, true); |
49 | } |
50 | |
51 | static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { |
52 | return ArgDescriptor(Offset, Mask, true, true); |
53 | } |
54 | |
55 | static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { |
56 | return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); |
57 | } |
58 | |
59 | bool isSet() const { |
60 | return IsSet; |
61 | } |
62 | |
63 | explicit operator bool() const { |
64 | return isSet(); |
65 | } |
66 | |
67 | bool isRegister() const { |
68 | return !IsStack; |
69 | } |
70 | |
71 | Register getRegister() const { |
72 | assert(!IsStack); |
73 | return Reg; |
74 | } |
75 | |
76 | unsigned getStackOffset() const { |
77 | assert(IsStack); |
78 | return StackOffset; |
79 | } |
80 | |
81 | unsigned getMask() const { |
82 | return Mask; |
83 | } |
84 | |
85 | bool isMasked() const { |
86 | return Mask != ~0u; |
87 | } |
88 | |
89 | void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; |
90 | }; |
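A minimal usage sketch (not part of the header; the register number and the 10-bit field placement are hypothetical) showing how a masked descriptor narrows an argument to a bit range within its register:

  // Illustrative only: Register(1) is an arbitrary physical-register number.
  ArgDescriptor Whole = ArgDescriptor::createRegister(Register(1));
  ArgDescriptor Field =
      ArgDescriptor::createArg(Whole, 0x3FFu << 10); // bits 10..19, hypothetical
  assert(Field.isRegister() && Field.isMasked());
  assert(!Whole.isMasked()); // the default mask ~0u means "the whole register"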
91 | |
92 | inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { |
93 | Arg.print(OS); |
94 | return OS; |
95 | } |
96 | |
97 | struct AMDGPUFunctionArgInfo { |
98 | enum PreloadedValue { |
99 | // SGPRS: |
100 | PRIVATE_SEGMENT_BUFFER = 0, |
101 | DISPATCH_PTR = 1, |
102 | QUEUE_PTR = 2, |
103 | KERNARG_SEGMENT_PTR = 3, |
104 | DISPATCH_ID = 4, |
105 | FLAT_SCRATCH_INIT = 5, |
106 | WORKGROUP_ID_X = 10, |
107 | WORKGROUP_ID_Y = 11, |
108 | WORKGROUP_ID_Z = 12, |
109 | PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, |
110 | IMPLICIT_BUFFER_PTR = 15, |
111 | IMPLICIT_ARG_PTR = 16, |
112 | |
113 | // VGPRS: |
114 | WORKITEM_ID_X = 17, |
115 | WORKITEM_ID_Y = 18, |
116 | WORKITEM_ID_Z = 19, |
117 | FIRST_VGPR_VALUE = WORKITEM_ID_X |
118 | }; |
119 | |
120 | // Kernel input registers setup for the HSA ABI in allocation order. |
121 | |
122 | // User SGPRs in kernels |
123 | // XXX - Can these require argument spills? |
124 | ArgDescriptor PrivateSegmentBuffer; |
125 | ArgDescriptor DispatchPtr; |
126 | ArgDescriptor QueuePtr; |
127 | ArgDescriptor KernargSegmentPtr; |
128 | ArgDescriptor DispatchID; |
129 | ArgDescriptor FlatScratchInit; |
130 | ArgDescriptor PrivateSegmentSize; |
131 | |
132 | // System SGPRs in kernels. |
133 | ArgDescriptor WorkGroupIDX; |
134 | ArgDescriptor WorkGroupIDY; |
135 | ArgDescriptor WorkGroupIDZ; |
136 | ArgDescriptor WorkGroupInfo; |
137 | ArgDescriptor PrivateSegmentWaveByteOffset; |
138 | |
139 | // Pointer with offset from kernargsegmentptr to where special ABI arguments |
140 | // are passed to callable functions. |
141 | ArgDescriptor ImplicitArgPtr; |
142 | |
143 | // Input registers for non-HSA ABI |
144 | ArgDescriptor ImplicitBufferPtr = 0; |
145 | |
146 | // VGPR inputs. These are always v0, v1 and v2 for entry functions. |
147 | ArgDescriptor WorkItemIDX; |
148 | ArgDescriptor WorkItemIDY; |
149 | ArgDescriptor WorkItemIDZ; |
150 | |
151 | std::pair<const ArgDescriptor *, const TargetRegisterClass *> |
152 | getPreloadedValue(PreloadedValue Value) const; |
153 | }; |
154 | |
155 | class AMDGPUArgumentUsageInfo : public ImmutablePass { |
156 | private: |
157 | static const AMDGPUFunctionArgInfo ExternFunctionInfo; |
158 | DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; |
159 | |
160 | public: |
161 | static char ID; |
162 | |
163 | AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } |
164 | |
165 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
166 | AU.setPreservesAll(); |
167 | } |
168 | |
169 | bool doInitialization(Module &M) override; |
170 | bool doFinalization(Module &M) override; |
171 | |
172 | void print(raw_ostream &OS, const Module *M = nullptr) const override; |
173 | |
174 | void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { |
175 | ArgInfoMap[&F] = ArgInfo; |
176 | } |
177 | |
178 | const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const { |
179 | auto I = ArgInfoMap.find(&F); |
180 | if (I == ArgInfoMap.end()) { |
181 | assert(F.isDeclaration()); |
182 | return ExternFunctionInfo; |
183 | } |
184 | |
185 | return I->second; |
186 | } |
187 | }; |
188 | |
189 | } // end namespace llvm |
190 | |
191 | #endif |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file contains some functions that are useful for math stuff. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||||
14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||||
15 | |||||||
16 | #include "llvm/Support/Compiler.h" | ||||||
17 | #include <algorithm> | ||||||
18 | #include <cassert> | ||||||
19 | #include <climits> | ||||||
20 | #include <cmath> | ||||||
21 | #include <cstdint> | ||||||
22 | #include <cstring> | ||||||
23 | #include <limits> | ||||||
24 | #include <type_traits> | ||||||
25 | |||||||
26 | #ifdef __ANDROID_NDK__ | ||||||
27 | #include <android/api-level.h> | ||||||
28 | #endif | ||||||
29 | |||||||
30 | #ifdef _MSC_VER | ||||||
31 | // Declare these intrinsics manually rather than including intrin.h. It's very | ||||||
32 | // expensive, and MathExtras.h is popular. | ||||||
33 | // #include <intrin.h> | ||||||
34 | extern "C" { | ||||||
35 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||||
36 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
37 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||||
38 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
39 | } | ||||||
40 | #endif | ||||||
41 | |||||||
42 | namespace llvm { | ||||||
43 | |||||||
44 | /// The behavior an operation has on an input of 0. | ||||||
45 | enum ZeroBehavior { | ||||||
46 | /// The returned value is undefined. | ||||||
47 | ZB_Undefined, | ||||||
48 | /// The returned value is numeric_limits<T>::max() | ||||||
49 | ZB_Max, | ||||||
50 | /// The returned value is numeric_limits<T>::digits | ||||||
51 | ZB_Width | ||||||
52 | }; | ||||||
53 | |||||||
54 | /// Mathematical constants. | ||||||
55 | namespace numbers { | ||||||
56 | // TODO: Track C++20 std::numbers. | ||||||
57 | // TODO: Favor using the hexadecimal FP constants (requires C++17). | ||||||
58 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 | ||||||
59 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 | ||||||
60 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 | ||||||
61 | ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392 | ||||||
62 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) | ||||||
63 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) | ||||||
64 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 | ||||||
65 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 | ||||||
66 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 | ||||||
67 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 | ||||||
68 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193 | ||||||
69 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) | ||||||
70 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 | ||||||
71 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) | ||||||
72 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 | ||||||
73 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 | ||||||
74 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 | ||||||
75 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 | ||||||
76 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 | ||||||
77 | log2ef = 1.44269504F, // (0x1.715476P+0) | ||||||
78 | log10ef = .434294482F, // (0x1.bcb7b2P-2) | ||||||
79 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 | ||||||
80 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 | ||||||
81 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 | ||||||
82 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 | ||||||
83 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 | ||||||
84 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) | ||||||
85 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 | ||||||
86 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) | ||||||
87 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 | ||||||
88 | } // namespace numbers | ||||||
89 | |||||||
90 | namespace detail { | ||||||
91 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||||
92 | static unsigned count(T Val, ZeroBehavior) { | ||||||
93 | if (!Val) | ||||||
94 | return std::numeric_limits<T>::digits; | ||||||
95 | if (Val & 0x1) | ||||||
96 | return 0; | ||||||
97 | |||||||
98 | // Bisection method. | ||||||
99 | unsigned ZeroBits = 0; | ||||||
100 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||||
101 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||||
102 | while (Shift) { | ||||||
103 | if ((Val & Mask) == 0) { | ||||||
104 | Val >>= Shift; | ||||||
105 | ZeroBits |= Shift; | ||||||
106 | } | ||||||
107 | Shift >>= 1; | ||||||
108 | Mask >>= Shift; | ||||||
109 | } | ||||||
110 | return ZeroBits; | ||||||
111 | } | ||||||
112 | }; | ||||||
113 | |||||||
114 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
115 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||||
116 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
117 | if (ZB != ZB_Undefined && Val == 0) | ||||||
118 | return 32; | ||||||
119 | |||||||
120 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__) | ||||||
121 | return __builtin_ctz(Val); | ||||||
122 | #elif defined(_MSC_VER) | ||||||
123 | unsigned long Index; | ||||||
124 | _BitScanForward(&Index, Val); | ||||||
125 | return Index; | ||||||
126 | #endif | ||||||
127 | } | ||||||
128 | }; | ||||||
129 | |||||||
130 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
131 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||||
132 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
133 | if (ZB != ZB_Undefined && Val == 0) | ||||||
134 | return 64; | ||||||
135 | |||||||
136 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) | ||||||
137 | return __builtin_ctzll(Val); | ||||||
138 | #elif defined(_MSC_VER) | ||||||
139 | unsigned long Index; | ||||||
140 | _BitScanForward64(&Index, Val); | ||||||
141 | return Index; | ||||||
142 | #endif | ||||||
143 | } | ||||||
144 | }; | ||||||
145 | #endif | ||||||
146 | #endif | ||||||
147 | } // namespace detail | ||||||
148 | |||||||
149 | /// Count number of 0's from the least significant bit to the most | ||||||
150 | /// stopping at the first 1. | ||||||
151 | /// | ||||||
152 | /// Only unsigned integral types are allowed. | ||||||
153 | /// | ||||||
154 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
155 | /// valid arguments. | ||||||
156 | template <typename T> | ||||||
157 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
158 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
159 | !std::numeric_limits<T>::is_signed, | ||||||
160 | "Only unsigned integral types are allowed."); | ||||||
161 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
162 | } | ||||||
163 | |||||||
164 | namespace detail { | ||||||
165 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { | ||||||
166 | static unsigned count(T Val, ZeroBehavior) { | ||||||
167 | if (!Val) | ||||||
168 | return std::numeric_limits<T>::digits; | ||||||
169 | |||||||
170 | // Bisection method. | ||||||
171 | unsigned ZeroBits = 0; | ||||||
172 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { | ||||||
173 | T Tmp = Val >> Shift; | ||||||
174 | if (Tmp) | ||||||
175 | Val = Tmp; | ||||||
176 | else | ||||||
177 | ZeroBits |= Shift; | ||||||
178 | } | ||||||
179 | return ZeroBits; | ||||||
180 | } | ||||||
181 | }; | ||||||
182 | |||||||
183 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
184 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||||
185 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
186 | if (ZB != ZB_Undefined && Val == 0) | ||||||
187 | return 32; | ||||||
188 | |||||||
189 | #if __has_builtin(__builtin_clz) || defined(__GNUC__) | ||||||
190 | return __builtin_clz(Val); | ||||||
191 | #elif defined(_MSC_VER) | ||||||
192 | unsigned long Index; | ||||||
193 | _BitScanReverse(&Index, Val); | ||||||
194 | return Index ^ 31; | ||||||
195 | #endif | ||||||
196 | } | ||||||
197 | }; | ||||||
198 | |||||||
199 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
200 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||||
201 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
202 | if (ZB != ZB_Undefined && Val == 0) | ||||||
203 | return 64; | ||||||
204 | |||||||
205 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__) | ||||||
206 | return __builtin_clzll(Val); | ||||||
207 | #elif defined(_MSC_VER) | ||||||
208 | unsigned long Index; | ||||||
209 | _BitScanReverse64(&Index, Val); | ||||||
210 | return Index ^ 63; | ||||||
211 | #endif | ||||||
212 | } | ||||||
213 | }; | ||||||
214 | #endif | ||||||
215 | #endif | ||||||
216 | } // namespace detail | ||||||
217 | |||||||
218 | /// Count number of 0's from the most significant bit to the least | ||||||
219 | /// stopping at the first 1. | ||||||
220 | /// | ||||||
221 | /// Only unsigned integral types are allowed. | ||||||
222 | /// | ||||||
223 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
224 | /// valid arguments. | ||||||
225 | template <typename T> | ||||||
226 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
227 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
228 | !std::numeric_limits<T>::is_signed, | ||||||
229 | "Only unsigned integral types are allowed."); | ||||||
230 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
231 | } | ||||||
232 | |||||||
233 | /// Get the index of the first set bit starting from the least | ||||||
234 | /// significant bit. | ||||||
235 | /// | ||||||
236 | /// Only unsigned integral types are allowed. | ||||||
237 | /// | ||||||
238 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
239 | /// valid arguments. | ||||||
240 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
241 | if (ZB == ZB_Max && Val == 0) | ||||||
242 | return std::numeric_limits<T>::max(); | ||||||
243 | |||||||
244 | return countTrailingZeros(Val, ZB_Undefined); | ||||||
245 | } | ||||||
246 | |||||||
247 | /// Create a bitmask with the N right-most bits set to 1, and all other | ||||||
248 | /// bits set to 0. Only unsigned types are allowed. | ||||||
249 | template <typename T> T maskTrailingOnes(unsigned N) { | ||||||
250 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); | ||||||
251 | const unsigned Bits = CHAR_BIT * sizeof(T); | ||||||
252 | assert(N <= Bits && "Invalid bit index"); | ||||||
253 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); | ||||||
254 | } | ||||||
255 | |||||||
256 | /// Create a bitmask with the N left-most bits set to 1, and all other | ||||||
257 | /// bits set to 0. Only unsigned types are allowed. | ||||||
258 | template <typename T> T maskLeadingOnes(unsigned N) { | ||||||
259 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
260 | } | ||||||
261 | |||||||
262 | /// Create a bitmask with the N right-most bits set to 0, and all other | ||||||
263 | /// bits set to 1. Only unsigned types are allowed. | ||||||
264 | template <typename T> T maskTrailingZeros(unsigned N) { | ||||||
265 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
266 | } | ||||||
267 | |||||||
268 | /// Create a bitmask with the N left-most bits set to 0, and all other | ||||||
269 | /// bits set to 1. Only unsigned types are allowed. | ||||||
270 | template <typename T> T maskLeadingZeros(unsigned N) { | ||||||
271 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
272 | } | ||||||
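A quick sketch of the four mask builders; each value follows directly from the definitions above (sketchMaskBuilders is a hypothetical name, illustrative only):

  inline void sketchMaskBuilders() {
    assert(maskTrailingOnes<uint32_t>(8) == 0x000000FFu);  // low 8 bits set
    assert(maskLeadingOnes<uint32_t>(8) == 0xFF000000u);   // high 8 bits set
    assert(maskTrailingZeros<uint32_t>(8) == 0xFFFFFF00u); // low 8 bits clear
    assert(maskLeadingZeros<uint32_t>(8) == 0x00FFFFFFu);  // high 8 bits clear
  }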
273 | |||||||
274 | /// Get the index of the last set bit starting from the least | ||||||
275 | /// significant bit. | ||||||
276 | /// | ||||||
277 | /// Only unsigned integral types are allowed. | ||||||
278 | /// | ||||||
279 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
280 | /// valid arguments. | ||||||
281 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
282 | if (ZB == ZB_Max && Val == 0) | ||||||
283 | return std::numeric_limits<T>::max(); | ||||||
284 | |||||||
285 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ | ||||||
286 | // in the __builtin_clz intrinsic on x86. | ||||||
287 | return countLeadingZeros(Val, ZB_Undefined) ^ | ||||||
288 | (std::numeric_limits<T>::digits - 1); | ||||||
289 | } | ||||||
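A sketch of the XOR trick above: for uint32_t, countLeadingZeros(0x10) is 27, and 27 ^ 31 == 31 - 27 == 4, the index of the highest set bit (sketchFindSet is a hypothetical name):

  inline void sketchFindSet() {
    assert(findLastSet(0x10u) == 4u);  // highest set bit of 0b10000
    assert(findFirstSet(0x18u) == 3u); // lowest set bit of 0b11000
  }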
290 | |||||||
291 | /// Macro compressed bit reversal table for 256 bits. | ||||||
292 | /// | ||||||
293 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable | ||||||
294 | static const unsigned char BitReverseTable256[256] = { | ||||||
295 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 | ||||||
296 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) | ||||||
297 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) | ||||||
298 | R6(0), R6(2), R6(1), R6(3) | ||||||
299 | #undef R2 | ||||||
300 | #undef R4 | ||||||
301 | #undef R6 | ||||||
302 | }; | ||||||
303 | |||||||
304 | /// Reverse the bits in \p Val. | ||||||
305 | template <typename T> | ||||||
306 | T reverseBits(T Val) { | ||||||
307 | unsigned char in[sizeof(Val)]; | ||||||
308 | unsigned char out[sizeof(Val)]; | ||||||
309 | std::memcpy(in, &Val, sizeof(Val)); | ||||||
310 | for (unsigned i = 0; i < sizeof(Val); ++i) | ||||||
311 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; | ||||||
312 | std::memcpy(&Val, out, sizeof(Val)); | ||||||
313 | return Val; | ||||||
314 | } | ||||||
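A sketch of the table-driven reversal: the table reverses the bits within each byte while the loop reverses byte order, so the result is a full bit reversal independent of host endianness (sketchReverseBits is a hypothetical name):

  inline void sketchReverseBits() {
    assert(reverseBits<uint8_t>(0x01) == 0x80);
    assert(reverseBits<uint16_t>(0x0001) == 0x8000);
  }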
315 | |||||||
316 | // NOTE: The following support functions use the _32/_64 extensions instead of | ||||||
317 | // type overloading so that signed and unsigned integers can be used without | ||||||
318 | // ambiguity. | ||||||
319 | |||||||
320 | /// Return the high 32 bits of a 64 bit value. | ||||||
321 | constexpr inline uint32_t Hi_32(uint64_t Value) { | ||||||
322 | return static_cast<uint32_t>(Value >> 32); | ||||||
323 | } | ||||||
324 | |||||||
325 | /// Return the low 32 bits of a 64 bit value. | ||||||
326 | constexpr inline uint32_t Lo_32(uint64_t Value) { | ||||||
327 | return static_cast<uint32_t>(Value); | ||||||
328 | } | ||||||
329 | |||||||
330 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. | ||||||
331 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { | ||||||
332 | return ((uint64_t)High << 32) | (uint64_t)Low; | ||||||
333 | } | ||||||
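A round-trip sketch of the split/combine helpers (sketchHiLo is a hypothetical name):

  inline void sketchHiLo() {
    const uint64_t V = UINT64_C(0x123456789ABCDEF0);
    assert(Hi_32(V) == 0x12345678u && Lo_32(V) == 0x9ABCDEF0u);
    assert(Make_64(Hi_32(V), Lo_32(V)) == V); // splitting and recombining round-trips
  }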
334 | |||||||
335 | /// Checks if an integer fits into the given bit width. | ||||||
336 | template <unsigned N> constexpr inline bool isInt(int64_t x) { | ||||||
337 | return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); | ||||||
338 | } | ||||||
339 | // Template specializations to get better code for common cases. | ||||||
340 | template <> constexpr inline bool isInt<8>(int64_t x) { | ||||||
341 | return static_cast<int8_t>(x) == x; | ||||||
342 | } | ||||||
343 | template <> constexpr inline bool isInt<16>(int64_t x) { | ||||||
344 | return static_cast<int16_t>(x) == x; | ||||||
345 | } | ||||||
346 | template <> constexpr inline bool isInt<32>(int64_t x) { | ||||||
347 | return static_cast<int32_t>(x) == x; | ||||||
348 | } | ||||||
349 | |||||||
350 | /// Checks if a signed integer is an N bit number shifted left by S. | ||||||
351 | template <unsigned N, unsigned S> | ||||||
352 | constexpr inline bool isShiftedInt(int64_t x) { | ||||||
353 | static_assert( | ||||||
354 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); | ||||||
355 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); | ||||||
356 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
357 | } | ||||||
358 | |||||||
359 | /// Checks if an unsigned integer fits into the given bit width. | ||||||
360 | /// | ||||||
361 | /// This is written as two functions rather than as simply | ||||||
362 | /// | ||||||
363 | /// return N >= 64 || X < (UINT64_C(1) << N); | ||||||
364 | /// | ||||||
365 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting | ||||||
366 | /// left too many places. | ||||||
367 | template <unsigned N> | ||||||
368 | constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { | ||||||
369 | static_assert(N > 0, "isUInt<0> doesn't make sense"); | ||||||
370 | return X < (UINT64_C(1) << (N)); | ||||||
371 | } | ||||||
372 | template <unsigned N> | ||||||
373 | constexpr inline std::enable_if_t<(N >= 64), bool> isUInt(uint64_t X) { | ||||||
374 | return true; | ||||||
375 | } | ||||||
376 | |||||||
377 | // Template specializations to get better code for common cases. | ||||||
378 | template <> constexpr inline bool isUInt<8>(uint64_t x) { | ||||||
379 | return static_cast<uint8_t>(x) == x; | ||||||
380 | } | ||||||
381 | template <> constexpr inline bool isUInt<16>(uint64_t x) { | ||||||
382 | return static_cast<uint16_t>(x) == x; | ||||||
383 | } | ||||||
384 | template <> constexpr inline bool isUInt<32>(uint64_t x) { | ||||||
385 | return static_cast<uint32_t>(x) == x; | ||||||
386 | } | ||||||
387 | |||||||
388 | /// Checks if an unsigned integer is an N bit number shifted left by S. | ||||||
389 | template <unsigned N, unsigned S> | ||||||
390 | constexpr inline bool isShiftedUInt(uint64_t x) { | ||||||
391 | static_assert( | ||||||
392 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); | ||||||
393 | static_assert(N + S <= 64, | ||||||
394 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); | ||||||
395 | // Per the two static_asserts above, S must be strictly less than 64. So | ||||||
396 | // 1 << S is not undefined behavior. | ||||||
397 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
398 | } | ||||||
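A few compile-time checks sketching the range predicates; all of them are constexpr, so static_assert works:

  static_assert(isInt<8>(-128) && !isInt<8>(128), "int8 range is [-128, 127]");
  static_assert(isUInt<8>(255) && !isUInt<8>(256), "uint8 range is [0, 255]");
  static_assert(isShiftedUInt<8, 4>(0xFF0), "an 8-bit value shifted left by 4");
  static_assert(!isShiftedUInt<8, 4>(0xFF1), "the low 4 bits must be zero");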
399 | |||||||
400 | /// Gets the maximum value for a N-bit unsigned integer. | ||||||
401 | inline uint64_t maxUIntN(uint64_t N) { | ||||||
402 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
403 | |||||||
404 | // uint64_t(1) << 64 is undefined behavior, so we can't do | ||||||
405 | // (uint64_t(1) << N) - 1 | ||||||
406 | // without checking first that N != 64. But this works and doesn't have a | ||||||
407 | // branch. | ||||||
408 | return UINT64_MAX >> (64 - N); | ||||||
409 | } | ||||||
410 | |||||||
411 | /// Gets the minimum value for a N-bit signed integer. | ||||||
412 | inline int64_t minIntN(int64_t N) { | ||||||
413 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
414 | |||||||
415 | return -(UINT64_C(1) << (N - 1)); | ||||||
416 | } | ||||||
417 | |||||||
418 | /// Gets the maximum value for a N-bit signed integer. | ||||||
419 | inline int64_t maxIntN(int64_t N) { | ||||||
420 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
421 | |||||||
422 | // This relies on two's complement wraparound when N == 64, so we convert to | ||||||
423 | // int64_t only at the very end to avoid UB. | ||||||
424 | return (UINT64_C(1) << (N - 1)) - 1; | ||||||
425 | } | ||||||
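A sketch of the dynamic-width limits, including the N == 64 edge cases the comments above are careful about (sketchWidthLimits is a hypothetical name):

  inline void sketchWidthLimits() {
    assert(maxUIntN(8) == 255u && maxUIntN(64) == UINT64_MAX);
    assert(minIntN(8) == -128 && maxIntN(8) == 127);
    assert(minIntN(64) == INT64_MIN && maxIntN(64) == INT64_MAX);
  }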
426 | |||||||
427 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. | ||||||
428 | inline bool isUIntN(unsigned N, uint64_t x) { | ||||||
429 | return N >= 64 || x <= maxUIntN(N); | ||||||
430 | } | ||||||
431 | |||||||
432 | /// Checks if a signed integer fits into the given (dynamic) bit width. | ||||||
433 | inline bool isIntN(unsigned N, int64_t x) { | ||||||
434 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); | ||||||
435 | } | ||||||
436 | |||||||
437 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
438 | /// least significant bit with the remainder zero (32 bit version). | ||||||
439 | /// Ex. isMask_32(0x0000FFFFU) == true. | ||||||
440 | constexpr inline bool isMask_32(uint32_t Value) { | ||||||
441 | return Value && ((Value + 1) & Value) == 0; | ||||||
442 | } | ||||||
443 | |||||||
444 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
445 | /// least significant bit with the remainder zero (64 bit version). | ||||||
446 | constexpr inline bool isMask_64(uint64_t Value) { | ||||||
447 | return Value && ((Value + 1) & Value) == 0; | ||||||
448 | } | ||||||
449 | |||||||
450 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
451 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. | ||||||
452 | constexpr inline bool isShiftedMask_32(uint32_t Value) { | ||||||
453 | return Value && isMask_32((Value - 1) | Value); | ||||||
454 | } | ||||||
455 | |||||||
456 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
457 | /// remainder zero (64 bit version.) | ||||||
458 | constexpr inline bool isShiftedMask_64(uint64_t Value) { | ||||||
459 | return Value && isMask_64((Value - 1) | Value); | ||||||
460 | } | ||||||
461 | |||||||
462 | /// Return true if the argument is a power of two > 0. | ||||||
463 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) | ||||||
464 | constexpr inline bool isPowerOf2_32(uint32_t Value) { | ||||||
465 | return Value && !(Value & (Value - 1)); | ||||||
466 | } | ||||||
467 | |||||||
468 | /// Return true if the argument is a power of two > 0 (64 bit edition.) | ||||||
469 | constexpr inline bool isPowerOf2_64(uint64_t Value) { | ||||||
470 | return Value && !(Value & (Value - 1)); | ||||||
471 | } | ||||||
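A sketch of the mask and power-of-two predicates; the values mirror the examples in the comments:

  static_assert(isMask_32(0x0000FFFF), "ones from bit 0 upward");
  static_assert(isShiftedMask_32(0x0000FF00), "a run of ones anywhere in the word");
  static_assert(!isShiftedMask_32(0x0000FF0F), "the run must be contiguous");
  static_assert(isPowerOf2_32(0x00100000) && !isPowerOf2_32(0), "2^20; zero excluded");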
472 | |||||||
473 | /// Count the number of ones from the most significant bit to the first | ||||||
474 | /// zero bit. | ||||||
475 | /// | ||||||
476 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. | ||||||
477 | /// Only unsigned integral types are allowed. | ||||||
478 | /// | ||||||
479 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
480 | /// ZB_Undefined are valid arguments. | ||||||
481 | template <typename T> | ||||||
482 | unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
483 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
484 | !std::numeric_limits<T>::is_signed, | ||||||
485 | "Only unsigned integral types are allowed."); | ||||||
486 | return countLeadingZeros<T>(~Value, ZB); | ||||||
487 | } | ||||||
488 | |||||||
489 | /// Count the number of ones from the least significant bit to the first | ||||||
490 | /// zero bit. | ||||||
491 | /// | ||||||
492 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. | ||||||
493 | /// Only unsigned integral types are allowed. | ||||||
494 | /// | ||||||
495 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
496 | /// ZB_Undefined are valid arguments. | ||||||
497 | template <typename T> | ||||||
498 | unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
499 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
500 | !std::numeric_limits<T>::is_signed, | ||||||
501 | "Only unsigned integral types are allowed."); | ||||||
502 | return countTrailingZeros<T>(~Value, ZB); | ||||||
503 | } | ||||||
504 | |||||||
505 | namespace detail { | ||||||
506 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { | ||||||
507 | static unsigned count(T Value) { | ||||||
508 | // Generic version, forward to 32 bits. | ||||||
509 | static_assert(SizeOfT <= 4, "Not implemented!"); | ||||||
510 | #if defined(__GNUC__) | ||||||
511 | return __builtin_popcount(Value); | ||||||
512 | #else | ||||||
513 | uint32_t v = Value; | ||||||
514 | v = v - ((v >> 1) & 0x55555555); | ||||||
515 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); | ||||||
516 | return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; | ||||||
517 | #endif | ||||||
518 | } | ||||||
519 | }; | ||||||
520 | |||||||
521 | template <typename T> struct PopulationCounter<T, 8> { | ||||||
522 | static unsigned count(T Value) { | ||||||
523 | #if defined(__GNUC__) | ||||||
524 | return __builtin_popcountll(Value); | ||||||
525 | #else | ||||||
526 | uint64_t v = Value; | ||||||
527 | v = v - ((v >> 1) & 0x5555555555555555ULL); | ||||||
528 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); | ||||||
529 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; | ||||||
530 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); | ||||||
531 | #endif | ||||||
532 | } | ||||||
533 | }; | ||||||
534 | } // namespace detail | ||||||
535 | |||||||
536 | /// Count the number of set bits in a value. | ||||||
537 | /// Ex. countPopulation(0xF000F000) = 8 | ||||||
538 | /// Returns 0 if the word is zero. | ||||||
539 | template <typename T> | ||||||
540 | inline unsigned countPopulation(T Value) { | ||||||
541 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
542 | !std::numeric_limits<T>::is_signed, | ||||||
543 | "Only unsigned integral types are allowed."); | ||||||
544 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); | ||||||
545 | } | ||||||
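A sketch tying the three bit-counting helpers to the examples in their doc comments (sketchCounting is a hypothetical name):

  inline void sketchCounting() {
    assert(countPopulation(0xF000F000u) == 8u);
    assert(countLeadingOnes(0xFF0FFF00u) == 8u);
    assert(countTrailingOnes(0x00FF00FFu) == 8u);
  }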
546 | |||||||
547 | /// Compile time Log2. | ||||||
548 | /// Valid only for positive powers of two. | ||||||
549 | template <size_t kValue> constexpr inline size_t CTLog2() { | ||||||
550 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), | ||||||
551 | "Value is not a valid power of 2"); | ||||||
552 | return 1 + CTLog2<kValue / 2>(); | ||||||
553 | } | ||||||
554 | |||||||
555 | template <> constexpr inline size_t CTLog2<1>() { return 0; } | ||||||
556 | |||||||
557 | /// Return the log base 2 of the specified value. | ||||||
558 | inline double Log2(double Value) { | ||||||
559 | #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 | ||||||
560 | return __builtin_log(Value) / __builtin_log(2.0); | ||||||
561 | #else | ||||||
562 | return log2(Value); | ||||||
563 | #endif | ||||||
564 | } | ||||||
565 | |||||||
566 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||||
567 | /// (32 bit edition.) | ||||||
568 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 | ||||||
569 | inline unsigned Log2_32(uint32_t Value) { | ||||||
570 | return 31 - countLeadingZeros(Value); | ||||||
571 | } | ||||||
572 | |||||||
573 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||||
574 | /// (64 bit edition.) | ||||||
575 | inline unsigned Log2_64(uint64_t Value) { | ||||||
576 | return 63 - countLeadingZeros(Value); | ||||||
577 | } | ||||||
578 | |||||||
579 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. | ||||||
580 | /// (32 bit edition). | ||||||
581 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 | ||||||
582 | inline unsigned Log2_32_Ceil(uint32_t Value) { | ||||||
583 | return 32 - countLeadingZeros(Value - 1); | ||||||
584 | } | ||||||
585 | |||||||
586 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. | ||||||
587 | /// (64 bit edition.) | ||||||
588 | inline unsigned Log2_64_Ceil(uint64_t Value) { | ||||||
589 | return 64 - countLeadingZeros(Value - 1); | ||||||
590 | } | ||||||
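A sketch contrasting floor and ceiling log2, matching the examples in the comments (sketchLog2 is a hypothetical name):

  inline void sketchLog2() {
    assert(Log2_32(32) == 5u && Log2_32(6) == 2u);           // floor
    assert(Log2_32_Ceil(32) == 5u && Log2_32_Ceil(6) == 3u); // ceiling
    assert(Log2_32(1) == 0u && Log2_32_Ceil(1) == 0u);
  }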
591 | |||||||
592 | /// Return the greatest common divisor of the values using Euclid's algorithm. | ||||||
593 | template <typename T> | ||||||
594 | inline T greatestCommonDivisor(T A, T B) { | ||||||
595 | while (B) { | ||||||
596 | T Tmp = B; | ||||||
597 | B = A % B; | ||||||
598 | A = Tmp; | ||||||
599 | } | ||||||
600 | return A; | ||||||
601 | } | ||||||
602 | |||||||
603 | inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { | ||||||
604 | return greatestCommonDivisor<uint64_t>(A, B); | ||||||
605 | } | ||||||
606 | |||||||
607 | /// This function takes a 64-bit integer and returns the bit equivalent double. | ||||||
608 | inline double BitsToDouble(uint64_t Bits) { | ||||||
609 | double D; | ||||||
610 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||||
611 | memcpy(&D, &Bits, sizeof(Bits)); | ||||||
612 | return D; | ||||||
613 | } | ||||||
614 | |||||||
615 | /// This function takes a 32-bit integer and returns the bit equivalent float. | ||||||
616 | inline float BitsToFloat(uint32_t Bits) { | ||||||
617 | float F; | ||||||
618 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||||
619 | memcpy(&F, &Bits, sizeof(Bits)); | ||||||
620 | return F; | ||||||
621 | } | ||||||
622 | |||||||
623 | /// This function takes a double and returns the bit equivalent 64-bit integer. | ||||||
624 | /// Note that copying doubles around changes the bits of NaNs on some hosts, | ||||||
625 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||||
626 | inline uint64_t DoubleToBits(double Double) { | ||||||
627 | uint64_t Bits; | ||||||
628 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||||
629 | memcpy(&Bits, &Double, sizeof(Double)); | ||||||
630 | return Bits; | ||||||
631 | } | ||||||
632 | |||||||
633 | /// This function takes a float and returns the bit equivalent 32-bit integer. | ||||||
634 | /// Note that copying floats around changes the bits of NaNs on some hosts, | ||||||
635 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||||
636 | inline uint32_t FloatToBits(float Float) { | ||||||
637 | uint32_t Bits; | ||||||
638 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||||
639 | memcpy(&Bits, &Float, sizeof(Float)); | ||||||
640 | return Bits; | ||||||
641 | } | ||||||
642 | |||||||
643 | /// A and B are either alignments or offsets. Return the minimum alignment that | ||||||
644 | /// may be assumed after adding the two together. | ||||||
645 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { | ||||||
646 | // The largest power of 2 that divides both A and B. | ||||||
647 | // | ||||||
648 | // Replace "-Value" by "1+~Value" in the following commented code to avoid | ||||||
649 | // MSVC warning C4146 | ||||||
650 | // return (A | B) & -(A | B); | ||||||
651 | return (A | B) & (1 + ~(A | B)); | ||||||
652 | } | ||||||
653 | |||||||
654 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. | ||||||
655 | /// Returns zero on overflow. | ||||||
656 | inline uint64_t NextPowerOf2(uint64_t A) { | ||||||
657 | A |= (A >> 1); | ||||||
658 | A |= (A >> 2); | ||||||
659 | A |= (A >> 4); | ||||||
660 | A |= (A >> 8); | ||||||
661 | A |= (A >> 16); | ||||||
662 | A |= (A >> 32); | ||||||
663 | return A + 1; | ||||||
664 | } | ||||||
665 | |||||||
666 | /// Returns the power of two which is less than or equal to the given value. | ||||||
667 | /// Essentially, it is a floor operation across the domain of powers of two. | ||||||
668 | inline uint64_t PowerOf2Floor(uint64_t A) { | ||||||
669 | if (!A) return 0; | ||||||
670 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); | ||||||
671 | } | ||||||
672 | |||||||
673 | /// Returns the power of two which is greater than or equal to the given value. | ||||||
674 | /// Essentially, it is a ceil operation across the domain of powers of two. | ||||||
675 | inline uint64_t PowerOf2Ceil(uint64_t A) { | ||||||
676 | if (!A) | ||||||
677 | return 0; | ||||||
678 | return NextPowerOf2(A - 1); | ||||||
679 | } | ||||||
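A sketch contrasting the rounding helpers: NextPowerOf2 is strictly greater than its input, PowerOf2Ceil/PowerOf2Floor are inclusive, and MinAlign extracts the largest power of two dividing both inputs (sketchPow2Rounding is a hypothetical name):

  inline void sketchPow2Rounding() {
    assert(NextPowerOf2(8) == 16u);                        // strictly greater
    assert(PowerOf2Ceil(8) == 8u && PowerOf2Floor(12) == 8u);
    assert(MinAlign(24, 16) == 8u); // 8 divides both 24 and 16; 16 does not
  }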
680 | |||||||
681 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||||
682 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. | ||||||
683 | /// | ||||||
684 | /// If non-zero \p Skew is specified, the return value will be a minimal | ||||||
685 | /// integer that is greater than or equal to \p Value and equal to | ||||||
686 | /// \p Align * N + \p Skew for some integer N. If \p Skew is larger than | ||||||
687 | /// \p Align, its value is adjusted to '\p Skew mod \p Align'. | ||||||
688 | /// | ||||||
689 | /// Examples: | ||||||
690 | /// \code | ||||||
691 | /// alignTo(5, 8) = 8 | ||||||
692 | /// alignTo(17, 8) = 24 | ||||||
693 | /// alignTo(~0LL, 8) = 0 | ||||||
694 | /// alignTo(321, 255) = 510 | ||||||
695 | /// | ||||||
696 | /// alignTo(5, 8, 7) = 7 | ||||||
697 | /// alignTo(17, 8, 1) = 17 | ||||||
698 | /// alignTo(~0LL, 8, 3) = 3 | ||||||
699 | /// alignTo(321, 255, 42) = 552 | ||||||
700 | /// \endcode | ||||||
701 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||||
702 | assert(Align != 0u && "Align can't be 0."); | ||||||
703 | Skew %= Align; | ||||||
704 | return (Value + Align - 1 - Skew) / Align * Align + Skew; | ||||||
705 | } | ||||||
706 | |||||||
707 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||||
708 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. | ||||||
709 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { | ||||||
710 | static_assert(Align != 0u, "Align must be non-zero"); | ||||||
711 | return (Value + Align - 1) / Align * Align; | ||||||
712 | } | ||||||
713 | |||||||
714 | /// Returns the integer ceil(Numerator / Denominator). | ||||||
715 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { | ||||||
716 | return alignTo(Numerator, Denominator) / Denominator; | ||||||
717 | } | ||||||
718 | |||||||
719 | /// Returns the integer nearest(Numerator / Denominator). | ||||||
720 | inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { | ||||||
721 | return (Numerator + (Denominator / 2)) / Denominator; | ||||||
722 | } | ||||||
723 | |||||||
724 | /// Returns the largest uint64_t less than or equal to \p Value and is | ||||||
725 | /// \p Skew mod \p Align. \p Align must be non-zero | ||||||
726 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||||
727 | assert(Align != 0u && "Align can't be 0."); | ||||||
728 | Skew %= Align; | ||||||
729 | return (Value - Skew) / Align * Align + Skew; | ||||||
730 | } | ||||||
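A sketch of the alignment and division helpers, reusing the examples from the alignTo comment (sketchAlignment is a hypothetical name):

  inline void sketchAlignment() {
    assert(alignTo(17, 8) == 24u && alignDown(17, 8) == 16u);
    assert(alignTo(17, 8, /*Skew=*/1) == 17u); // 17 is already 8 * 2 + 1
    assert(divideCeil(17, 8) == 3u && divideNearest(17, 8) == 2u);
  }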
731 | |||||||
732 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||||
733 | /// Requires 0 < B <= 32. | ||||||
734 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { | ||||||
735 | static_assert(B > 0, "Bit width can't be 0."); | ||||||
736 | static_assert(B <= 32, "Bit width out of range."); | ||||||
737 | return int32_t(X << (32 - B)) >> (32 - B); | ||||||
738 | } | ||||||
739 | |||||||
740 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||||
741 | /// Requires 0 < B <= 32. | ||||||
742 | inline int32_t SignExtend32(uint32_t X, unsigned B) { | ||||||
743 | assert(B > 0 && "Bit width can't be 0."); | ||||||
744 | assert(B <= 32 && "Bit width out of range."); | ||||||
745 | return int32_t(X << (32 - B)) >> (32 - B); | ||||||
746 | } | ||||||
747 | |||||||
748 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||||
749 | /// Requires 0 < B <= 64. | ||||||
750 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { | ||||||
751 | static_assert(B > 0, "Bit width can't be 0."); | ||||||
752 | static_assert(B <= 64, "Bit width out of range."); | ||||||
753 | return int64_t(x << (64 - B)) >> (64 - B); | ||||||
754 | } | ||||||
755 | |||||||
756 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||||
757 | /// Requires 0 < B <= 64. | ||||||
758 | inline int64_t SignExtend64(uint64_t X, unsigned B) { | ||||||
759 | assert(B > 0 && "Bit width can't be 0."); | ||||||
760 | assert(B <= 64 && "Bit width out of range."); | ||||||
761 | return int64_t(X << (64 - B)) >> (64 - B); | ||||||
762 | } | ||||||
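A sketch of the shift-based sign extension; the template forms are constexpr, so the checks are compile-time:

  static_assert(SignExtend32<8>(0xFF) == -1, "0xFF is -1 as an 8-bit value");
  static_assert(SignExtend32<8>(0x7F) == 127, "sign bit clear: value unchanged");
  static_assert(SignExtend64<16>(0x8000) == -32768, "16-bit minimum");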
763 | |||||||
764 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute | ||||||
765 | /// value of the result. | ||||||
766 | template <typename T> | ||||||
767 | std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) { | ||||||
768 | return std::max(X, Y) - std::min(X, Y); | ||||||
769 | } | ||||||
770 | |||||||
771 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the | ||||||
772 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||||
773 | /// the result is larger than the maximum representable value of type T. | ||||||
774 | template <typename T> | ||||||
775 | std::enable_if_t<std::is_unsigned<T>::value, T> | ||||||
776 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||||
777 | bool Dummy; | ||||||
778 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||||
779 | // Hacker's Delight, p. 29 | ||||||
780 | T Z = X + Y; | ||||||
781 | Overflowed = (Z < X || Z < Y); | ||||||
782 | if (Overflowed) | ||||||
783 | return std::numeric_limits<T>::max(); | ||||||
784 | else | ||||||
785 | return Z; | ||||||
786 | } | ||||||
787 | |||||||
788 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the | ||||||
789 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||||
790 | /// the result is larger than the maximum representable value of type T. | ||||||
791 | template <typename T> | ||||||
792 | std::enable_if_t<std::is_unsigned<T>::value, T> | ||||||
793 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||||
794 | bool Dummy; | ||||||
795 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||||
796 | |||||||
797 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that | ||||||
798 | // because it fails for uint16_t (where multiplication can have undefined | ||||||
799 | // behavior due to promotion to int), and requires a division in addition | ||||||
800 | // to the multiplication. | ||||||
801 | |||||||
802 | Overflowed = false; | ||||||
803 | |||||||
804 | // Log2(Z) would be either Log2Z or Log2Z + 1. | ||||||
805 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z | ||||||
806 | // will necessarily be less than Log2Max as desired. | ||||||
807 | int Log2Z = Log2_64(X) + Log2_64(Y); | ||||||
808 | const T Max = std::numeric_limits<T>::max(); | ||||||
809 | int Log2Max = Log2_64(Max); | ||||||
810 | if (Log2Z < Log2Max) { | ||||||
811 | return X * Y; | ||||||
812 | } | ||||||
813 | if (Log2Z > Log2Max) { | ||||||
814 | Overflowed = true; | ||||||
815 | return Max; | ||||||
816 | } | ||||||
817 | |||||||
818 | // We're going to use the top bit, and maybe overflow one | ||||||
819 | // bit past it. Multiply all but the bottom bit then add | ||||||
820 | // that on at the end. | ||||||
821 | T Z = (X >> 1) * Y; | ||||||
822 | if (Z & ~(Max >> 1)) { | ||||||
823 | Overflowed = true; | ||||||
824 | return Max; | ||||||
825 | } | ||||||
826 | Z <<= 1; | ||||||
827 | if (X & 1) | ||||||
828 | return SaturatingAdd(Z, Y, ResultOverflowed); | ||||||
829 | |||||||
830 | return Z; | ||||||
831 | } | ||||||
832 | |||||||
/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
/// the product. Clamp the result to the maximum representable value of T on
/// overflow. ResultOverflowed indicates if the result is larger than the
/// maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  T Product = SaturatingMultiply(X, Y, &Overflowed);
  if (Overflowed)
    return Product;

  return SaturatingAdd(A, Product, &Overflowed);
}

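// Usage sketch (editorial addition, not part of the original header). Note
// that the clamp may be triggered by either stage, the multiply or the add:
//
//   bool Ov;
//   uint8_t R = SaturatingMultiplyAdd<uint8_t>(10, 20, 60, &Ov);
//   // 10 * 20 == 200 fits, but 200 + 60 == 260 does not: R == 255, Ov == true.
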
/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
extern const float huge_valf;

/// Add two signed integers, computing the two's complement truncated result,
/// returning true if overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Perform the unsigned addition.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX + UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Adding two positive numbers should result in a positive number.
  if (X > 0 && Y > 0)
    return Result <= 0;
  // Adding two negatives should result in a negative number.
  if (X < 0 && Y < 0)
    return Result >= 0;
  return false;
#endif
}

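// Worked example (editorial addition, not part of the original header) of the
// sign-based fallback check above, using int8_t:
//
//   int8_t R;
//   bool Ov = AddOverflow<int8_t>(100, 100, R);
//   // 100 + 100 == 200 truncates to -56; both operands are positive but the
//   // result is not, so Ov == true.
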
/// Subtract two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Perform the unsigned subtraction.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX - UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Subtracting a positive number from a non-positive one yields a negative
  // result.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Subtracting a negative number from a non-negative one yields a positive
  // result.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  return false;
#endif
}

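// Worked example (editorial addition, not part of the original header) of the
// fallback check above, with int8_t:
//
//   int8_t R;
//   bool Ov = SubOverflow<int8_t>(-100, 100, R);
//   // -100 - 100 == -200 truncates to 56; subtracting a positive from a
//   // non-positive must stay negative, so Ov == true.
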
/// Multiply two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
  // Perform the unsigned multiplication on absolute values.
  using U = std::make_unsigned_t<T>;
  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U UResult = UX * UY;

  // Convert to signed.
  const bool IsNegative = (X < 0) ^ (Y < 0);
  Result = IsNegative ? (0 - UResult) : UResult;

  // If either argument is 0, the result is 0 and no overflow occurs.
  if (UX == 0 || UY == 0)
    return false;

  // UX and UY are in [1, 2^n], where n is the number of digits.
  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
  // positive) divided by an argument compares to the other.
  if (IsNegative)
    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
  else
    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
}

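// Worked example (editorial addition, not part of the original header) of the
// asymmetric bound above, with int8_t (max == 127, so a negative product may
// reach -128):
//
//   int8_t R;
//   MulOverflow<int8_t>(16, 8, R);  // 16 > 127 / 8 == 15: overflow, R == -128.
//   MulOverflow<int8_t>(-16, 8, R); // 16 > (127 + 1) / 8 == 16 is false:
//                                   // -128 is representable, no overflow.
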
} // End llvm namespace

#endif