File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 1812, column 62: The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
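The flagged line (1812) is beyond this excerpt, so the offending expression is not shown here. As a hedged illustration only, the pattern the checker describes is a right shift of a 32-bit unsigned value by 32, which C++ leaves undefined; the names below are hypothetical and not taken from line 1812:

    unsigned NumBits = 0;                                        // hypothetical input
    unsigned Mask = ~0u >> (32 - NumBits);                       // shift by 32: undefined behavior
    unsigned Safe = NumBits == 0 ? 0u : ~0u >> (32 - NumBits);   // guarded variant avoids the UB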
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | /// \file | ||||||
9 | /// This file implements the targeting of the MachineLegalizer class for | ||||||
10 | /// AMDGPU. | ||||||
11 | /// \todo This should be generated by TableGen. | ||||||
12 | //===----------------------------------------------------------------------===// | ||||||
13 | |||||||
14 | #if defined(_MSC_VER) || defined(__MINGW32__) | ||||||
15 | // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI | ||||||
16 | // from the Visual C++ cmath / math.h headers: | ||||||
17 | // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 | ||||||
18 | #define _USE_MATH_DEFINES | ||||||
19 | #endif | ||||||
20 | |||||||
21 | #include "AMDGPU.h" | ||||||
22 | #include "AMDGPULegalizerInfo.h" | ||||||
23 | #include "AMDGPUTargetMachine.h" | ||||||
24 | #include "SIMachineFunctionInfo.h" | ||||||
25 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | ||||||
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||||||
27 | #include "llvm/CodeGen/TargetOpcodes.h" | ||||||
28 | #include "llvm/CodeGen/ValueTypes.h" | ||||||
29 | #include "llvm/IR/DerivedTypes.h" | ||||||
30 | #include "llvm/IR/DiagnosticInfo.h" | ||||||
31 | #include "llvm/IR/Type.h" | ||||||
32 | #include "llvm/Support/Debug.h" | ||||||
33 | |||||||
34 | #define DEBUG_TYPE "amdgpu-legalinfo" | ||||||
35 | |||||||
36 | using namespace llvm; | ||||||
37 | using namespace LegalizeActions; | ||||||
38 | using namespace LegalizeMutations; | ||||||
39 | using namespace LegalityPredicates; | ||||||
40 | |||||||
41 | |||||||
42 | static LegalityPredicate isMultiple32(unsigned TypeIdx, | ||||||
43 | unsigned MaxSize = 1024) { | ||||||
44 | return [=](const LegalityQuery &Query) { | ||||||
45 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
46 | const LLT EltTy = Ty.getScalarType(); | ||||||
47 | return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; | ||||||
48 | }; | ||||||
49 | } | ||||||
50 | |||||||
51 | static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { | ||||||
52 | return [=](const LegalityQuery &Query) { | ||||||
53 | return Query.Types[TypeIdx].getSizeInBits() == Size; | ||||||
54 | }; | ||||||
55 | } | ||||||
56 | |||||||
57 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | ||||||
58 | return [=](const LegalityQuery &Query) { | ||||||
59 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
60 | return Ty.isVector() && | ||||||
61 | Ty.getNumElements() % 2 != 0 && | ||||||
62 | Ty.getElementType().getSizeInBits() < 32 && | ||||||
63 | Ty.getSizeInBits() % 32 != 0; | ||||||
64 | }; | ||||||
65 | } | ||||||
66 | |||||||
67 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | ||||||
68 | return [=](const LegalityQuery &Query) { | ||||||
69 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
70 | const LLT EltTy = Ty.getScalarType(); | ||||||
71 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | ||||||
72 | }; | ||||||
73 | } | ||||||
74 | |||||||
75 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | ||||||
76 | return [=](const LegalityQuery &Query) { | ||||||
77 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
78 | const LLT EltTy = Ty.getElementType(); | ||||||
79 | return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); | ||||||
80 | }; | ||||||
81 | } | ||||||
82 | |||||||
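// Illustrative example for the mutation below: for <4 x s32> (128 bits),
// Pieces = (128 + 63) / 64 = 2 and NewNumElts = (4 + 1) / 2 = 2, so the
// result type becomes <2 x s32>, i.e. pieces of at most 64 bits each.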
83 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { | ||||||
84 | return [=](const LegalityQuery &Query) { | ||||||
85 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
86 | const LLT EltTy = Ty.getElementType(); | ||||||
87 | unsigned Size = Ty.getSizeInBits(); | ||||||
88 | unsigned Pieces = (Size + 63) / 64; | ||||||
89 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; | ||||||
90 | return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); | ||||||
91 | }; | ||||||
92 | } | ||||||
93 | |||||||
94 | // Increase the number of vector elements so the total size reaches the next | ||||||
95 | // multiple of 32 bits. | ||||||
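// For example, <3 x s8> (24 bits) gives NextMul32 = 1 and
// NewNumElts = (32 * 1 + 8 - 1) / 8 = 4, so it widens to <4 x s8> (32 bits).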
96 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | ||||||
97 | return [=](const LegalityQuery &Query) { | ||||||
98 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
99 | |||||||
100 | const LLT EltTy = Ty.getElementType(); | ||||||
101 | const int Size = Ty.getSizeInBits(); | ||||||
102 | const int EltSize = EltTy.getSizeInBits(); | ||||||
103 | const int NextMul32 = (Size + 31) / 32; | ||||||
104 | |||||||
105 | assert(EltSize < 32); | ||||||
106 | |||||||
107 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | ||||||
108 | return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); | ||||||
109 | }; | ||||||
110 | } | ||||||
111 | |||||||
112 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | ||||||
113 | return [=](const LegalityQuery &Query) { | ||||||
114 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
115 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | ||||||
116 | }; | ||||||
117 | } | ||||||
118 | |||||||
119 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | ||||||
120 | return [=](const LegalityQuery &Query) { | ||||||
121 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
122 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | ||||||
123 | }; | ||||||
124 | } | ||||||
125 | |||||||
126 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | ||||||
127 | return [=](const LegalityQuery &Query) { | ||||||
128 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
129 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | ||||||
130 | }; | ||||||
131 | } | ||||||
132 | |||||||
133 | // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of | ||||||
134 | // v2s16. | ||||||
135 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { | ||||||
136 | return [=](const LegalityQuery &Query) { | ||||||
137 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
138 | if (Ty.isVector()) { | ||||||
139 | const int EltSize = Ty.getElementType().getSizeInBits(); | ||||||
140 | return EltSize == 32 || EltSize == 64 || | ||||||
141 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | ||||||
142 | EltSize == 128 || EltSize == 256; | ||||||
143 | } | ||||||
144 | |||||||
145 | return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; | ||||||
146 | }; | ||||||
147 | } | ||||||
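// For isRegisterType above: s96, <4 x s32>, and <2 x s16> qualify, while
// <3 x s16> (an odd number of 16-bit elements) does not.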
148 | |||||||
149 | static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { | ||||||
150 | return [=](const LegalityQuery &Query) { | ||||||
151 | return Query.Types[TypeIdx].getElementType() == Type; | ||||||
152 | }; | ||||||
153 | } | ||||||
154 | |||||||
155 | static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { | ||||||
156 | return [=](const LegalityQuery &Query) { | ||||||
157 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
158 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | ||||||
159 | Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); | ||||||
160 | }; | ||||||
161 | } | ||||||
162 | |||||||
163 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | ||||||
164 | const GCNTargetMachine &TM) | ||||||
165 | : ST(ST_) { | ||||||
166 | using namespace TargetOpcode; | ||||||
167 | |||||||
168 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | ||||||
169 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | ||||||
170 | }; | ||||||
171 | |||||||
172 | const LLT S1 = LLT::scalar(1); | ||||||
173 | const LLT S8 = LLT::scalar(8); | ||||||
174 | const LLT S16 = LLT::scalar(16); | ||||||
175 | const LLT S32 = LLT::scalar(32); | ||||||
176 | const LLT S64 = LLT::scalar(64); | ||||||
177 | const LLT S96 = LLT::scalar(96); | ||||||
178 | const LLT S128 = LLT::scalar(128); | ||||||
179 | const LLT S256 = LLT::scalar(256); | ||||||
180 | const LLT S1024 = LLT::scalar(1024); | ||||||
181 | |||||||
182 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
183 | const LLT V4S16 = LLT::vector(4, 16); | ||||||
184 | |||||||
185 | const LLT V2S32 = LLT::vector(2, 32); | ||||||
186 | const LLT V3S32 = LLT::vector(3, 32); | ||||||
187 | const LLT V4S32 = LLT::vector(4, 32); | ||||||
188 | const LLT V5S32 = LLT::vector(5, 32); | ||||||
189 | const LLT V6S32 = LLT::vector(6, 32); | ||||||
190 | const LLT V7S32 = LLT::vector(7, 32); | ||||||
191 | const LLT V8S32 = LLT::vector(8, 32); | ||||||
192 | const LLT V9S32 = LLT::vector(9, 32); | ||||||
193 | const LLT V10S32 = LLT::vector(10, 32); | ||||||
194 | const LLT V11S32 = LLT::vector(11, 32); | ||||||
195 | const LLT V12S32 = LLT::vector(12, 32); | ||||||
196 | const LLT V13S32 = LLT::vector(13, 32); | ||||||
197 | const LLT V14S32 = LLT::vector(14, 32); | ||||||
198 | const LLT V15S32 = LLT::vector(15, 32); | ||||||
199 | const LLT V16S32 = LLT::vector(16, 32); | ||||||
200 | const LLT V32S32 = LLT::vector(32, 32); | ||||||
201 | |||||||
202 | const LLT V2S64 = LLT::vector(2, 64); | ||||||
203 | const LLT V3S64 = LLT::vector(3, 64); | ||||||
204 | const LLT V4S64 = LLT::vector(4, 64); | ||||||
205 | const LLT V5S64 = LLT::vector(5, 64); | ||||||
206 | const LLT V6S64 = LLT::vector(6, 64); | ||||||
207 | const LLT V7S64 = LLT::vector(7, 64); | ||||||
208 | const LLT V8S64 = LLT::vector(8, 64); | ||||||
209 | const LLT V16S64 = LLT::vector(16, 64); | ||||||
210 | |||||||
211 | std::initializer_list<LLT> AllS32Vectors = | ||||||
212 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | ||||||
213 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | ||||||
214 | std::initializer_list<LLT> AllS64Vectors = | ||||||
215 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | ||||||
216 | |||||||
217 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | ||||||
218 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | ||||||
219 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | ||||||
220 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | ||||||
221 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | ||||||
222 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | ||||||
223 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | ||||||
224 | |||||||
225 | const LLT CodePtr = FlatPtr; | ||||||
226 | |||||||
227 | const std::initializer_list<LLT> AddrSpaces64 = { | ||||||
228 | GlobalPtr, ConstantPtr, FlatPtr | ||||||
229 | }; | ||||||
230 | |||||||
231 | const std::initializer_list<LLT> AddrSpaces32 = { | ||||||
232 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | ||||||
233 | }; | ||||||
234 | |||||||
235 | const std::initializer_list<LLT> FPTypesBase = { | ||||||
236 | S32, S64 | ||||||
237 | }; | ||||||
238 | |||||||
239 | const std::initializer_list<LLT> FPTypes16 = { | ||||||
240 | S32, S64, S16 | ||||||
241 | }; | ||||||
242 | |||||||
243 | const std::initializer_list<LLT> FPTypesPK16 = { | ||||||
244 | S32, S64, S16, V2S16 | ||||||
245 | }; | ||||||
246 | |||||||
247 | setAction({G_BRCOND, S1}, Legal); | ||||||
248 | |||||||
249 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | ||||||
250 | // elements for v3s16 | ||||||
251 | getActionDefinitionsBuilder(G_PHI) | ||||||
252 | .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) | ||||||
253 | .legalFor(AllS32Vectors) | ||||||
254 | .legalFor(AllS64Vectors) | ||||||
255 | .legalFor(AddrSpaces64) | ||||||
256 | .legalFor(AddrSpaces32) | ||||||
257 | .clampScalar(0, S32, S256) | ||||||
258 | .widenScalarToNextPow2(0, 32) | ||||||
259 | .clampMaxNumElements(0, S32, 16) | ||||||
260 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
261 | .legalIf(isPointer(0)); | ||||||
262 | |||||||
263 | if (ST.has16BitInsts()) { | ||||||
264 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
265 | .legalFor({S32, S16}) | ||||||
266 | .clampScalar(0, S16, S32) | ||||||
267 | .scalarize(0); | ||||||
268 | } else { | ||||||
269 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
270 | .legalFor({S32}) | ||||||
271 | .clampScalar(0, S32, S32) | ||||||
272 | .scalarize(0); | ||||||
273 | } | ||||||
274 | |||||||
275 | getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | ||||||
276 | .legalFor({S32}) | ||||||
277 | .clampScalar(0, S32, S32) | ||||||
278 | .scalarize(0); | ||||||
279 | |||||||
280 | // Report legal for any types we can handle anywhere. For the cases only legal | ||||||
281 | // on the SALU, RegBankSelect will be able to re-legalize. | ||||||
282 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) | ||||||
283 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) | ||||||
284 | .clampScalar(0, S32, S64) | ||||||
285 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
286 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) | ||||||
287 | .widenScalarToNextPow2(0) | ||||||
288 | .scalarize(0); | ||||||
289 | |||||||
290 | getActionDefinitionsBuilder({G_UADDO, G_USUBO, | ||||||
291 | G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) | ||||||
292 | .legalFor({{S32, S1}}) | ||||||
293 | .clampScalar(0, S32, S32) | ||||||
294 | .scalarize(0); // TODO: Implement. | ||||||
295 | |||||||
296 | getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) | ||||||
297 | .lower(); | ||||||
298 | |||||||
299 | getActionDefinitionsBuilder(G_BITCAST) | ||||||
300 | // Don't worry about the size constraint. | ||||||
301 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||
302 | // FIXME: Testing hack | ||||||
303 | .legalForCartesianProduct({S16, LLT::vector(2, 8), }); | ||||||
304 | |||||||
305 | getActionDefinitionsBuilder(G_FCONSTANT) | ||||||
306 | .legalFor({S32, S64, S16}) | ||||||
307 | .clampScalar(0, S16, S64); | ||||||
308 | |||||||
309 | getActionDefinitionsBuilder(G_IMPLICIT_DEF) | ||||||
310 | .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, | ||||||
311 | ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) | ||||||
312 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
313 | .clampScalarOrElt(0, S32, S1024) | ||||||
314 | .legalIf(isMultiple32(0)) | ||||||
315 | .widenScalarToNextPow2(0, 32) | ||||||
316 | .clampMaxNumElements(0, S32, 16); | ||||||
317 | |||||||
318 | |||||||
319 | // FIXME: i1 operands to intrinsics should always be legal, but other i1 | ||||||
320 | // values may not be legal. We need to figure out how to distinguish | ||||||
321 | // between these two scenarios. | ||||||
322 | getActionDefinitionsBuilder(G_CONSTANT) | ||||||
323 | .legalFor({S1, S32, S64, S16, GlobalPtr, | ||||||
324 | LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) | ||||||
325 | .clampScalar(0, S32, S64) | ||||||
326 | .widenScalarToNextPow2(0) | ||||||
327 | .legalIf(isPointer(0)); | ||||||
328 | |||||||
329 | setAction({G_FRAME_INDEX, PrivatePtr}, Legal); | ||||||
330 | getActionDefinitionsBuilder(G_GLOBAL_VALUE) | ||||||
331 | .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); | ||||||
332 | |||||||
333 | |||||||
334 | auto &FPOpActions = getActionDefinitionsBuilder( | ||||||
335 | { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) | ||||||
336 | .legalFor({S32, S64}); | ||||||
337 | auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) | ||||||
338 | .customFor({S32, S64}); | ||||||
339 | auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) | ||||||
340 | .customFor({S32, S64}); | ||||||
341 | |||||||
342 | if (ST.has16BitInsts()) { | ||||||
343 | if (ST.hasVOP3PInsts()) | ||||||
344 | FPOpActions.legalFor({S16, V2S16}); | ||||||
345 | else | ||||||
346 | FPOpActions.legalFor({S16}); | ||||||
347 | |||||||
348 | TrigActions.customFor({S16}); | ||||||
349 | FDIVActions.customFor({S16}); | ||||||
350 | } | ||||||
351 | |||||||
352 | auto &MinNumMaxNum = getActionDefinitionsBuilder({ | ||||||
353 | G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); | ||||||
354 | |||||||
355 | if (ST.hasVOP3PInsts()) { | ||||||
356 | MinNumMaxNum.customFor(FPTypesPK16) | ||||||
357 | .clampMaxNumElements(0, S16, 2) | ||||||
358 | .clampScalar(0, S16, S64) | ||||||
359 | .scalarize(0); | ||||||
360 | } else if (ST.has16BitInsts()) { | ||||||
361 | MinNumMaxNum.customFor(FPTypes16) | ||||||
362 | .clampScalar(0, S16, S64) | ||||||
363 | .scalarize(0); | ||||||
364 | } else { | ||||||
365 | MinNumMaxNum.customFor(FPTypesBase) | ||||||
366 | .clampScalar(0, S32, S64) | ||||||
367 | .scalarize(0); | ||||||
368 | } | ||||||
369 | |||||||
370 | if (ST.hasVOP3PInsts()) | ||||||
371 | FPOpActions.clampMaxNumElements(0, S16, 2); | ||||||
372 | |||||||
373 | FPOpActions | ||||||
374 | .scalarize(0) | ||||||
375 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
376 | |||||||
377 | TrigActions | ||||||
378 | .scalarize(0) | ||||||
379 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
380 | |||||||
381 | FDIVActions | ||||||
382 | .scalarize(0) | ||||||
383 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
384 | |||||||
385 | getActionDefinitionsBuilder({G_FNEG, G_FABS}) | ||||||
386 | .legalFor(FPTypesPK16) | ||||||
387 | .clampMaxNumElements(0, S16, 2) | ||||||
388 | .scalarize(0) | ||||||
389 | .clampScalar(0, S16, S64); | ||||||
390 | |||||||
391 | // TODO: Implement | ||||||
392 | getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); | ||||||
393 | |||||||
394 | if (ST.has16BitInsts()) { | ||||||
395 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) | ||||||
396 | .legalFor({S32, S64, S16}) | ||||||
397 | .scalarize(0) | ||||||
398 | .clampScalar(0, S16, S64); | ||||||
399 | } else { | ||||||
400 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) | ||||||
401 | .legalFor({S32, S64}) | ||||||
402 | .scalarize(0) | ||||||
403 | .clampScalar(0, S32, S64); | ||||||
404 | } | ||||||
405 | |||||||
406 | getActionDefinitionsBuilder(G_FPTRUNC) | ||||||
407 | .legalFor({{S32, S64}, {S16, S32}}) | ||||||
408 | .scalarize(0); | ||||||
409 | |||||||
410 | getActionDefinitionsBuilder(G_FPEXT) | ||||||
411 | .legalFor({{S64, S32}, {S32, S16}}) | ||||||
412 | .lowerFor({{S64, S16}}) // FIXME: Implement | ||||||
413 | .scalarize(0); | ||||||
414 | |||||||
415 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops. | ||||||
416 | getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); | ||||||
417 | |||||||
418 | getActionDefinitionsBuilder(G_FSUB) | ||||||
419 | // Use actual fsub instruction | ||||||
420 | .legalFor({S32}) | ||||||
421 | // Must use fadd + fneg | ||||||
422 | .lowerFor({S64, S16, V2S16}) | ||||||
423 | .scalarize(0) | ||||||
424 | .clampScalar(0, S32, S64); | ||||||
425 | |||||||
426 | // Whether this is legal depends on the floating point mode for the function. | ||||||
427 | auto &FMad = getActionDefinitionsBuilder(G_FMAD); | ||||||
428 | if (ST.hasMadF16()) | ||||||
429 | FMad.customFor({S32, S16}); | ||||||
430 | else | ||||||
431 | FMad.customFor({S32}); | ||||||
432 | FMad.scalarize(0) | ||||||
433 | .lower(); | ||||||
434 | |||||||
435 | getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) | ||||||
436 | .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, | ||||||
437 | {S32, S1}, {S64, S1}, {S16, S1}, | ||||||
438 | {S96, S32}, | ||||||
439 | // FIXME: Hack | ||||||
440 | {S64, LLT::scalar(33)}, | ||||||
441 | {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) | ||||||
442 | .scalarize(0); | ||||||
443 | |||||||
444 | // TODO: Split s1->s64 during regbankselect for VALU. | ||||||
445 | auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) | ||||||
446 | .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) | ||||||
447 | .lowerFor({{S32, S64}}) | ||||||
448 | .customFor({{S64, S64}}); | ||||||
449 | if (ST.has16BitInsts()) | ||||||
450 | IToFP.legalFor({{S16, S16}}); | ||||||
451 | IToFP.clampScalar(1, S32, S64) | ||||||
452 | .scalarize(0); | ||||||
453 | |||||||
454 | auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) | ||||||
455 | .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); | ||||||
456 | if (ST.has16BitInsts()) | ||||||
457 | FPToI.legalFor({{S16, S16}}); | ||||||
458 | else | ||||||
459 | FPToI.minScalar(1, S32); | ||||||
460 | |||||||
461 | FPToI.minScalar(0, S32) | ||||||
462 | .scalarize(0); | ||||||
463 | |||||||
464 | getActionDefinitionsBuilder(G_INTRINSIC_ROUND) | ||||||
465 | .legalFor({S32, S64}) | ||||||
466 | .scalarize(0); | ||||||
467 | |||||||
468 | if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { | ||||||
469 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
470 | .legalFor({S32, S64}) | ||||||
471 | .clampScalar(0, S32, S64) | ||||||
472 | .scalarize(0); | ||||||
473 | } else { | ||||||
474 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
475 | .legalFor({S32}) | ||||||
476 | .customFor({S64}) | ||||||
477 | .clampScalar(0, S32, S64) | ||||||
478 | .scalarize(0); | ||||||
479 | } | ||||||
480 | |||||||
481 | getActionDefinitionsBuilder(G_PTR_ADD) | ||||||
482 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||||
483 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||||
484 | .scalarize(0); | ||||||
485 | |||||||
486 | getActionDefinitionsBuilder(G_PTR_MASK) | ||||||
487 | .scalarize(0) | ||||||
488 | .alwaysLegal(); | ||||||
489 | |||||||
490 | setAction({G_BLOCK_ADDR, CodePtr}, Legal); | ||||||
491 | |||||||
492 | auto &CmpBuilder = | ||||||
493 | getActionDefinitionsBuilder(G_ICMP) | ||||||
494 | .legalForCartesianProduct( | ||||||
495 | {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) | ||||||
496 | .legalFor({{S1, S32}, {S1, S64}}); | ||||||
497 | if (ST.has16BitInsts()) { | ||||||
498 | CmpBuilder.legalFor({{S1, S16}}); | ||||||
499 | } | ||||||
500 | |||||||
501 | CmpBuilder | ||||||
502 | .widenScalarToNextPow2(1) | ||||||
503 | .clampScalar(1, S32, S64) | ||||||
504 | .scalarize(0) | ||||||
505 | .legalIf(all(typeIs(0, S1), isPointer(1))); | ||||||
506 | |||||||
507 | getActionDefinitionsBuilder(G_FCMP) | ||||||
508 | .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) | ||||||
509 | .widenScalarToNextPow2(1) | ||||||
510 | .clampScalar(1, S32, S64) | ||||||
511 | .scalarize(0); | ||||||
512 | |||||||
513 | // FIXME: fexp, flog2, flog10 needs to be custom lowered. | ||||||
514 | getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, | ||||||
515 | G_FLOG, G_FLOG2, G_FLOG10}) | ||||||
516 | .legalFor({S32}) | ||||||
517 | .scalarize(0); | ||||||
518 | |||||||
519 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||||
520 | getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, | ||||||
521 | G_CTTZ, G_CTTZ_ZERO_UNDEF, | ||||||
522 | G_CTPOP}) | ||||||
523 | .legalFor({{S32, S32}, {S32, S64}}) | ||||||
524 | .clampScalar(0, S32, S32) | ||||||
525 | .clampScalar(1, S32, S64) | ||||||
526 | .scalarize(0) | ||||||
527 | .widenScalarToNextPow2(0, 32) | ||||||
528 | .widenScalarToNextPow2(1, 32); | ||||||
529 | |||||||
530 | // TODO: Expand for > s32 | ||||||
531 | getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) | ||||||
532 | .legalFor({S32}) | ||||||
533 | .clampScalar(0, S32, S32) | ||||||
534 | .scalarize(0); | ||||||
535 | |||||||
536 | if (ST.has16BitInsts()) { | ||||||
537 | if (ST.hasVOP3PInsts()) { | ||||||
538 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) | ||||||
539 | .legalFor({S32, S16, V2S16}) | ||||||
540 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
541 | .clampMaxNumElements(0, S16, 2) | ||||||
542 | .clampScalar(0, S16, S32) | ||||||
543 | .widenScalarToNextPow2(0) | ||||||
544 | .scalarize(0); | ||||||
545 | } else { | ||||||
546 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) | ||||||
547 | .legalFor({S32, S16}) | ||||||
548 | .widenScalarToNextPow2(0) | ||||||
549 | .clampScalar(0, S16, S32) | ||||||
550 | .scalarize(0); | ||||||
551 | } | ||||||
552 | } else { | ||||||
553 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) | ||||||
554 | .legalFor({S32}) | ||||||
555 | .clampScalar(0, S32, S32) | ||||||
556 | .widenScalarToNextPow2(0) | ||||||
557 | .scalarize(0); | ||||||
558 | } | ||||||
559 | |||||||
560 | auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { | ||||||
561 | return [=](const LegalityQuery &Query) { | ||||||
562 | return Query.Types[TypeIdx0].getSizeInBits() < | ||||||
563 | Query.Types[TypeIdx1].getSizeInBits(); | ||||||
564 | }; | ||||||
565 | }; | ||||||
566 | |||||||
567 | auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { | ||||||
568 | return [=](const LegalityQuery &Query) { | ||||||
569 | return Query.Types[TypeIdx0].getSizeInBits() > | ||||||
570 | Query.Types[TypeIdx1].getSizeInBits(); | ||||||
571 | }; | ||||||
572 | }; | ||||||
573 | |||||||
574 | getActionDefinitionsBuilder(G_INTTOPTR) | ||||||
575 | // List the common cases | ||||||
576 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||||
577 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||||
578 | .scalarize(0) | ||||||
579 | // Accept any address space as long as the size matches | ||||||
580 | .legalIf(sameSize(0, 1)) | ||||||
581 | .widenScalarIf(smallerThan(1, 0), | ||||||
582 | [](const LegalityQuery &Query) { | ||||||
583 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||||
584 | }) | ||||||
585 | .narrowScalarIf(greaterThan(1, 0), | ||||||
586 | [](const LegalityQuery &Query) { | ||||||
587 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||||
588 | }); | ||||||
589 | |||||||
590 | getActionDefinitionsBuilder(G_PTRTOINT) | ||||||
591 | // List the common cases | ||||||
592 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||||
593 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||||
594 | .scalarize(0) | ||||||
595 | // Accept any address space as long as the size matches | ||||||
596 | .legalIf(sameSize(0, 1)) | ||||||
597 | .widenScalarIf(smallerThan(0, 1), | ||||||
598 | [](const LegalityQuery &Query) { | ||||||
599 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||||
600 | }) | ||||||
601 | .narrowScalarIf( | ||||||
602 | greaterThan(0, 1), | ||||||
603 | [](const LegalityQuery &Query) { | ||||||
604 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||||
605 | }); | ||||||
606 | |||||||
607 | getActionDefinitionsBuilder(G_ADDRSPACE_CAST) | ||||||
608 | .scalarize(0) | ||||||
609 | .custom(); | ||||||
610 | |||||||
611 | // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we | ||||||
612 | // handle some operations by just promoting the register during | ||||||
613 | // selection. There are also d16 loads on GFX9+ which preserve the high bits. | ||||||
614 | auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { | ||||||
615 | switch (AS) { | ||||||
616 | // FIXME: Private element size. | ||||||
617 | case AMDGPUAS::PRIVATE_ADDRESS: | ||||||
618 | return 32; | ||||||
619 | // FIXME: Check subtarget | ||||||
620 | case AMDGPUAS::LOCAL_ADDRESS: | ||||||
621 | return ST.useDS128() ? 128 : 64; | ||||||
622 | |||||||
623 | // Treat constant and global as identical. SMRD loads are sometimes usable | ||||||
624 | // for global loads (ideally constant address space should be eliminated) | ||||||
625 | // depending on the context. Legality cannot be context dependent, but | ||||||
626 | // RegBankSelect can split the load as necessary depending on the pointer | ||||||
627 | // register bank/uniformity and whether the memory is invariant or not written in | ||||||
628 | // a kernel. | ||||||
629 | case AMDGPUAS::CONSTANT_ADDRESS: | ||||||
630 | case AMDGPUAS::GLOBAL_ADDRESS: | ||||||
631 | return 512; | ||||||
632 | default: | ||||||
633 | return 128; | ||||||
634 | } | ||||||
635 | }; | ||||||
636 | |||||||
637 | const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { | ||||||
638 | const LLT DstTy = Query.Types[0]; | ||||||
639 | |||||||
640 | // Split vector extloads. | ||||||
641 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||
642 | if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) | ||||||
643 | return true; | ||||||
644 | |||||||
645 | const LLT PtrTy = Query.Types[1]; | ||||||
646 | unsigned AS = PtrTy.getAddressSpace(); | ||||||
647 | if (MemSize > maxSizeForAddrSpace(AS)) | ||||||
648 | return true; | ||||||
649 | |||||||
650 | // Catch weird sized loads that don't evenly divide into the access sizes | ||||||
651 | // TODO: May be able to widen depending on alignment etc. | ||||||
652 | unsigned NumRegs = MemSize / 32; | ||||||
653 | if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) | ||||||
654 | return true; | ||||||
655 | |||||||
656 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
657 | if (Align < MemSize) { | ||||||
658 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
659 | return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); | ||||||
660 | } | ||||||
661 | |||||||
662 | return false; | ||||||
663 | }; | ||||||
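// In summary, needToSplitLoad fires for vector extloads, for accesses wider
// than maxSizeForAddrSpace, for 3-dword (96-bit) accesses when the subtarget
// lacks dwordx3 load/stores, and for under-aligned accesses that the target
// hook rejects.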
664 | |||||||
665 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; | ||||||
666 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; | ||||||
667 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; | ||||||
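// An alignment value of 0 in the memory descriptors below means no minimum
// alignment is required, which is why it is used when the subtarget supports
// unaligned buffer access.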
668 | |||||||
669 | // TODO: Refine based on subtargets which support unaligned access or 128-bit | ||||||
670 | // LDS | ||||||
671 | // TODO: Unsupported flat for SI. | ||||||
672 | |||||||
673 | for (unsigned Op : {G_LOAD, G_STORE}) { | ||||||
674 | const bool IsStore = Op == G_STORE; | ||||||
675 | |||||||
676 | auto &Actions = getActionDefinitionsBuilder(Op); | ||||||
677 | // Whitelist the common cases. | ||||||
678 | // TODO: Pointer loads | ||||||
679 | // TODO: Wide constant loads | ||||||
680 | // TODO: Only CI+ has 3x loads | ||||||
681 | // TODO: Loads to s16 on gfx9 | ||||||
682 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, | ||||||
683 | {V2S32, GlobalPtr, 64, GlobalAlign32}, | ||||||
684 | {V3S32, GlobalPtr, 96, GlobalAlign32}, | ||||||
685 | {S96, GlobalPtr, 96, GlobalAlign32}, | ||||||
686 | {V4S32, GlobalPtr, 128, GlobalAlign32}, | ||||||
687 | {S128, GlobalPtr, 128, GlobalAlign32}, | ||||||
688 | {S64, GlobalPtr, 64, GlobalAlign32}, | ||||||
689 | {V2S64, GlobalPtr, 128, GlobalAlign32}, | ||||||
690 | {V2S16, GlobalPtr, 32, GlobalAlign32}, | ||||||
691 | {S32, GlobalPtr, 8, GlobalAlign8}, | ||||||
692 | {S32, GlobalPtr, 16, GlobalAlign16}, | ||||||
693 | |||||||
694 | {S32, LocalPtr, 32, 32}, | ||||||
695 | {S64, LocalPtr, 64, 32}, | ||||||
696 | {V2S32, LocalPtr, 64, 32}, | ||||||
697 | {S32, LocalPtr, 8, 8}, | ||||||
698 | {S32, LocalPtr, 16, 16}, | ||||||
699 | {V2S16, LocalPtr, 32, 32}, | ||||||
700 | |||||||
701 | {S32, PrivatePtr, 32, 32}, | ||||||
702 | {S32, PrivatePtr, 8, 8}, | ||||||
703 | {S32, PrivatePtr, 16, 16}, | ||||||
704 | {V2S16, PrivatePtr, 32, 32}, | ||||||
705 | |||||||
706 | {S32, FlatPtr, 32, GlobalAlign32}, | ||||||
707 | {S32, FlatPtr, 16, GlobalAlign16}, | ||||||
708 | {S32, FlatPtr, 8, GlobalAlign8}, | ||||||
709 | {V2S16, FlatPtr, 32, GlobalAlign32}, | ||||||
710 | |||||||
711 | {S32, ConstantPtr, 32, GlobalAlign32}, | ||||||
712 | {V2S32, ConstantPtr, 64, GlobalAlign32}, | ||||||
713 | {V3S32, ConstantPtr, 96, GlobalAlign32}, | ||||||
714 | {V4S32, ConstantPtr, 128, GlobalAlign32}, | ||||||
715 | {S64, ConstantPtr, 64, GlobalAlign32}, | ||||||
716 | {S128, ConstantPtr, 128, GlobalAlign32}, | ||||||
717 | {V2S32, ConstantPtr, 32, GlobalAlign32}}); | ||||||
718 | Actions | ||||||
719 | .customIf(typeIs(1, Constant32Ptr)) | ||||||
720 | .narrowScalarIf( | ||||||
721 | [=](const LegalityQuery &Query) -> bool { | ||||||
722 | return !Query.Types[0].isVector() && needToSplitLoad(Query); | ||||||
723 | }, | ||||||
724 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||
725 | const LLT DstTy = Query.Types[0]; | ||||||
726 | const LLT PtrTy = Query.Types[1]; | ||||||
727 | |||||||
728 | const unsigned DstSize = DstTy.getSizeInBits(); | ||||||
729 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||
730 | |||||||
731 | // Split extloads. | ||||||
732 | if (DstSize > MemSize) | ||||||
733 | return std::make_pair(0, LLT::scalar(MemSize)); | ||||||
734 | |||||||
735 | if (DstSize > 32 && (DstSize % 32 != 0)) { | ||||||
736 | // FIXME: Need a way to specify non-extload of larger size if | ||||||
737 | // suitably aligned. | ||||||
738 | return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); | ||||||
739 | } | ||||||
740 | |||||||
741 | unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); | ||||||
742 | if (MemSize > MaxSize) | ||||||
743 | return std::make_pair(0, LLT::scalar(MaxSize)); | ||||||
744 | |||||||
745 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
746 | return std::make_pair(0, LLT::scalar(Align)); | ||||||
747 | }) | ||||||
748 | .fewerElementsIf( | ||||||
749 | [=](const LegalityQuery &Query) -> bool { | ||||||
750 | return Query.Types[0].isVector() && needToSplitLoad(Query); | ||||||
751 | }, | ||||||
752 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||
753 | const LLT DstTy = Query.Types[0]; | ||||||
754 | const LLT PtrTy = Query.Types[1]; | ||||||
755 | |||||||
756 | LLT EltTy = DstTy.getElementType(); | ||||||
757 | unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); | ||||||
758 | |||||||
759 | // Split if it's too large for the address space. | ||||||
760 | if (Query.MMODescrs[0].SizeInBits > MaxSize) { | ||||||
761 | unsigned NumElts = DstTy.getNumElements(); | ||||||
762 | unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; | ||||||
763 | |||||||
764 | // FIXME: Refine when odd breakdowns handled | ||||||
765 | // The scalars will need to be re-legalized. | ||||||
766 | if (NumPieces == 1 || NumPieces >= NumElts || | ||||||
767 | NumElts % NumPieces != 0) | ||||||
768 | return std::make_pair(0, EltTy); | ||||||
769 | |||||||
770 | return std::make_pair(0, | ||||||
771 | LLT::vector(NumElts / NumPieces, EltTy)); | ||||||
772 | } | ||||||
773 | |||||||
774 | // Need to split because of alignment. | ||||||
775 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
776 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
777 | if (EltSize > Align && | ||||||
778 | (EltSize / Align < DstTy.getNumElements())) { | ||||||
779 | return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); | ||||||
780 | } | ||||||
781 | |||||||
782 | // May need relegalization for the scalars. | ||||||
783 | return std::make_pair(0, EltTy); | ||||||
784 | }) | ||||||
785 | .minScalar(0, S32); | ||||||
786 | |||||||
787 | if (IsStore) | ||||||
788 | Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); | ||||||
789 | |||||||
790 | // TODO: Need a bitcast lower option? | ||||||
791 | Actions | ||||||
792 | .legalIf([=](const LegalityQuery &Query) { | ||||||
793 | const LLT Ty0 = Query.Types[0]; | ||||||
794 | unsigned Size = Ty0.getSizeInBits(); | ||||||
795 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||
796 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
797 | |||||||
798 | // No extending vector loads. | ||||||
799 | if (Size > MemSize && Ty0.isVector()) | ||||||
800 | return false; | ||||||
801 | |||||||
802 | // FIXME: Widening store from alignment not valid. | ||||||
803 | if (MemSize < Size) | ||||||
804 | MemSize = std::max(MemSize, Align); | ||||||
805 | |||||||
806 | switch (MemSize) { | ||||||
807 | case 8: | ||||||
808 | case 16: | ||||||
809 | return Size == 32; | ||||||
810 | case 32: | ||||||
811 | case 64: | ||||||
812 | case 128: | ||||||
813 | return true; | ||||||
814 | case 96: | ||||||
815 | return ST.hasDwordx3LoadStores(); | ||||||
816 | case 256: | ||||||
817 | case 512: | ||||||
818 | return true; | ||||||
819 | default: | ||||||
820 | return false; | ||||||
821 | } | ||||||
822 | }) | ||||||
823 | .widenScalarToNextPow2(0) | ||||||
824 | // TODO: v3s32->v4s32 with alignment | ||||||
825 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); | ||||||
826 | } | ||||||
827 | |||||||
828 | auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) | ||||||
829 | .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, | ||||||
830 | {S32, GlobalPtr, 16, 2 * 8}, | ||||||
831 | {S32, LocalPtr, 8, 8}, | ||||||
832 | {S32, LocalPtr, 16, 16}, | ||||||
833 | {S32, PrivatePtr, 8, 8}, | ||||||
834 | {S32, PrivatePtr, 16, 16}, | ||||||
835 | {S32, ConstantPtr, 8, 8}, | ||||||
836 | {S32, ConstantPtr, 16, 2 * 8}}); | ||||||
837 | if (ST.hasFlatAddressSpace()) { | ||||||
838 | ExtLoads.legalForTypesWithMemDesc( | ||||||
839 | {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); | ||||||
840 | } | ||||||
841 | |||||||
842 | ExtLoads.clampScalar(0, S32, S32) | ||||||
843 | .widenScalarToNextPow2(0) | ||||||
844 | .unsupportedIfMemSizeNotPow2() | ||||||
845 | .lower(); | ||||||
846 | |||||||
847 | auto &Atomics = getActionDefinitionsBuilder( | ||||||
848 | {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, | ||||||
849 | G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, | ||||||
850 | G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, | ||||||
851 | G_ATOMICRMW_UMIN}) | ||||||
852 | .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, | ||||||
853 | {S64, GlobalPtr}, {S64, LocalPtr}}); | ||||||
854 | if (ST.hasFlatAddressSpace()) { | ||||||
855 | Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); | ||||||
856 | } | ||||||
857 | |||||||
858 | getActionDefinitionsBuilder(G_ATOMICRMW_FADD) | ||||||
859 | .legalFor({{S32, LocalPtr}}); | ||||||
860 | |||||||
861 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and | ||||||
862 | // output demarshalling. | ||||||
863 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) | ||||||
864 | .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, | ||||||
865 | {S32, FlatPtr}, {S64, FlatPtr}}) | ||||||
866 | .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, | ||||||
867 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||||
868 | |||||||
869 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) | ||||||
870 | .lower(); | ||||||
871 | |||||||
872 | // TODO: Pointer types, any 32-bit or 64-bit vector | ||||||
873 | getActionDefinitionsBuilder(G_SELECT) | ||||||
874 | .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, | ||||||
875 | GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, | ||||||
876 | LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) | ||||||
877 | .clampScalar(0, S16, S64) | ||||||
878 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
879 | .fewerElementsIf(numElementsNotEven(0), scalarize(0)) | ||||||
880 | .scalarize(1) | ||||||
881 | .clampMaxNumElements(0, S32, 2) | ||||||
882 | .clampMaxNumElements(0, LocalPtr, 2) | ||||||
883 | .clampMaxNumElements(0, PrivatePtr, 2) | ||||||
884 | .scalarize(0) | ||||||
885 | .widenScalarToNextPow2(0) | ||||||
886 | .legalIf(all(isPointer(0), typeIs(1, S1))); | ||||||
887 | |||||||
888 | // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can | ||||||
889 | // be more flexible with the shift amount type. | ||||||
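// (Concretely: 16-bit shifts use the low 4 bits of the amount, 32-bit shifts
// the low 5, and 64-bit shifts the low 6.)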
890 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) | ||||||
891 | .legalFor({{S32, S32}, {S64, S32}}); | ||||||
892 | if (ST.has16BitInsts()) { | ||||||
893 | if (ST.hasVOP3PInsts()) { | ||||||
894 | Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) | ||||||
895 | .clampMaxNumElements(0, S16, 2); | ||||||
896 | } else | ||||||
897 | Shifts.legalFor({{S16, S32}, {S16, S16}}); | ||||||
898 | |||||||
899 | Shifts.clampScalar(1, S16, S32); | ||||||
900 | Shifts.clampScalar(0, S16, S64); | ||||||
901 | Shifts.widenScalarToNextPow2(0, 16); | ||||||
902 | } else { | ||||||
903 | // Make sure we legalize the shift amount type first, as the general | ||||||
904 | // expansion for the shifted type will produce much worse code if it hasn't | ||||||
905 | // been truncated already. | ||||||
906 | Shifts.clampScalar(1, S32, S32); | ||||||
907 | Shifts.clampScalar(0, S32, S64); | ||||||
908 | Shifts.widenScalarToNextPow2(0, 32); | ||||||
909 | } | ||||||
910 | Shifts.scalarize(0); | ||||||
911 | |||||||
912 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | ||||||
913 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | ||||||
914 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | ||||||
915 | unsigned IdxTypeIdx = 2; | ||||||
916 | |||||||
917 | getActionDefinitionsBuilder(Op) | ||||||
918 | .customIf([=](const LegalityQuery &Query) { | ||||||
919 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||||
920 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||||
921 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | ||||||
922 | return (EltTy.getSizeInBits() == 16 || | ||||||
923 | EltTy.getSizeInBits() % 32 == 0) && | ||||||
924 | VecTy.getSizeInBits() % 32 == 0 && | ||||||
925 | VecTy.getSizeInBits() <= 1024 && | ||||||
926 | IdxTy.getSizeInBits() == 32; | ||||||
927 | }) | ||||||
928 | .clampScalar(EltTypeIdx, S32, S64) | ||||||
929 | .clampScalar(VecTypeIdx, S32, S64) | ||||||
930 | .clampScalar(IdxTypeIdx, S32, S32); | ||||||
931 | } | ||||||
932 | |||||||
933 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | ||||||
934 | .unsupportedIf([=](const LegalityQuery &Query) { | ||||||
935 | const LLT &EltTy = Query.Types[1].getElementType(); | ||||||
936 | return Query.Types[0] != EltTy; | ||||||
937 | }); | ||||||
938 | |||||||
939 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | ||||||
940 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | ||||||
941 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | ||||||
942 | |||||||
943 | // FIXME: Doesn't handle extract of illegal sizes. | ||||||
944 | getActionDefinitionsBuilder(Op) | ||||||
945 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | ||||||
946 | // FIXME: Multiples of 16 should not be legal. | ||||||
947 | .legalIf([=](const LegalityQuery &Query) { | ||||||
948 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
949 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||
950 | return (BigTy.getSizeInBits() % 32 == 0) && | ||||||
951 | (LitTy.getSizeInBits() % 16 == 0); | ||||||
952 | }) | ||||||
953 | .widenScalarIf( | ||||||
954 | [=](const LegalityQuery &Query) { | ||||||
955 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
956 | return (BigTy.getScalarSizeInBits() < 16); | ||||||
957 | }, | ||||||
958 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | ||||||
959 | .widenScalarIf( | ||||||
960 | [=](const LegalityQuery &Query) { | ||||||
961 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||
962 | return (LitTy.getScalarSizeInBits() < 16); | ||||||
963 | }, | ||||||
964 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | ||||||
965 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||
966 | .widenScalarToNextPow2(BigTyIdx, 32); | ||||||
967 | |||||||
968 | } | ||||||
969 | |||||||
970 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||||||
971 | .legalForCartesianProduct(AllS32Vectors, {S32}) | ||||||
972 | .legalForCartesianProduct(AllS64Vectors, {S64}) | ||||||
973 | .clampNumElements(0, V16S32, V32S32) | ||||||
974 | .clampNumElements(0, V2S64, V16S64) | ||||||
975 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); | ||||||
976 | |||||||
977 | if (ST.hasScalarPackInsts()) | ||||||
978 | BuildVector.legalFor({V2S16, S32}); | ||||||
979 | |||||||
980 | BuildVector | ||||||
981 | .minScalarSameAs(1, 0) | ||||||
982 | .legalIf(isRegisterType(0)) | ||||||
983 | .minScalarOrElt(0, S32); | ||||||
984 | |||||||
985 | if (ST.hasScalarPackInsts()) { | ||||||
986 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||
987 | .legalFor({V2S16, S32}) | ||||||
988 | .lower(); | ||||||
989 | } else { | ||||||
990 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||
991 | .lower(); | ||||||
992 | } | ||||||
993 | |||||||
994 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | ||||||
995 | .legalIf(isRegisterType(0)); | ||||||
996 | |||||||
997 | // TODO: Don't fully scalarize v2s16 pieces | ||||||
998 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | ||||||
999 | |||||||
1000 | // Merge/Unmerge | ||||||
1001 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | ||||||
1002 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | ||||||
1003 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | ||||||
1004 | |||||||
1005 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | ||||||
1006 | const LLT &Ty = Query.Types[TypeIdx]; | ||||||
1007 | if (Ty.isVector()) { | ||||||
1008 | const LLT &EltTy = Ty.getElementType(); | ||||||
1009 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) | ||||||
1010 | return true; | ||||||
1011 | if (!isPowerOf2_32(EltTy.getSizeInBits())) | ||||||
1012 | return true; | ||||||
1013 | } | ||||||
1014 | return false; | ||||||
1015 | }; | ||||||
1016 | |||||||
1017 | auto &Builder = getActionDefinitionsBuilder(Op) | ||||||
1018 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | ||||||
1019 | // Clamp the little scalar to s16-s256 and make it a power of 2. It's not | ||||||
1020 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | ||||||
1021 | // valid. | ||||||
1022 | .clampScalar(LitTyIdx, S16, S256) | ||||||
1023 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | ||||||
1024 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||
1025 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | ||||||
1026 | elementTypeIs(1, S16)), | ||||||
1027 | changeTo(1, V2S16)) | ||||||
1028 | // Break up vectors with weird elements into scalars | ||||||
1029 | .fewerElementsIf( | ||||||
1030 | [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, | ||||||
1031 | scalarize(0)) | ||||||
1032 | .fewerElementsIf( | ||||||
1033 | [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, | ||||||
1034 | scalarize(1)) | ||||||
1035 | .clampScalar(BigTyIdx, S32, S1024) | ||||||
1036 | .lowerFor({{S16, V2S16}}); | ||||||
1037 | |||||||
1038 | if (Op == G_MERGE_VALUES) { | ||||||
1039 | Builder.widenScalarIf( | ||||||
1040 | // TODO: Use 16-bit shifts if legal for 8-bit values? | ||||||
1041 | [=](const LegalityQuery &Query) { | ||||||
1042 | const LLT Ty = Query.Types[LitTyIdx]; | ||||||
1043 | return Ty.getSizeInBits() < 32; | ||||||
1044 | }, | ||||||
1045 | changeTo(LitTyIdx, S32)); | ||||||
1046 | } | ||||||
1047 | |||||||
1048 | Builder.widenScalarIf( | ||||||
1049 | [=](const LegalityQuery &Query) { | ||||||
1050 | const LLT Ty = Query.Types[BigTyIdx]; | ||||||
1051 | return !isPowerOf2_32(Ty.getSizeInBits()) && | ||||||
1052 | Ty.getSizeInBits() % 16 != 0; | ||||||
1053 | }, | ||||||
1054 | [=](const LegalityQuery &Query) { | ||||||
1055 | // Pick the next power of 2, or a multiple of 64 over 128. | ||||||
1056 | // Whichever is smaller. | ||||||
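// For example, for a 300-bit type the next power of 2 is 512; since that is
// >= 256, the multiple-of-64 rounding (320) is smaller and is used instead.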
1057 | const LLT &Ty = Query.Types[BigTyIdx]; | ||||||
1058 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | ||||||
1059 | if (NewSizeInBits >= 256) { | ||||||
1060 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | ||||||
1061 | if (RoundedTo < NewSizeInBits) | ||||||
1062 | NewSizeInBits = RoundedTo; | ||||||
1063 | } | ||||||
1064 | return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | ||||||
1065 | }) | ||||||
1066 | .legalIf([=](const LegalityQuery &Query) { | ||||||
1067 | const LLT &BigTy = Query.Types[BigTyIdx]; | ||||||
1068 | const LLT &LitTy = Query.Types[LitTyIdx]; | ||||||
1069 | |||||||
1070 | if (BigTy.isVector() && BigTy.getSizeInBits() < 32) | ||||||
1071 | return false; | ||||||
1072 | if (LitTy.isVector() && LitTy.getSizeInBits() < 32) | ||||||
1073 | return false; | ||||||
1074 | |||||||
1075 | return BigTy.getSizeInBits() % 16 == 0 && | ||||||
1076 | LitTy.getSizeInBits() % 16 == 0 && | ||||||
1077 | BigTy.getSizeInBits() <= 1024; | ||||||
1078 | }) | ||||||
1079 | // Any vectors left are the wrong size. Scalarize them. | ||||||
1080 | .scalarize(0) | ||||||
1081 | .scalarize(1); | ||||||
1082 | } | ||||||
1083 | |||||||
1084 | getActionDefinitionsBuilder(G_SEXT_INREG).lower(); | ||||||
1085 | |||||||
1086 | computeTables(); | ||||||
1087 | verify(*ST.getInstrInfo()); | ||||||
1088 | } | ||||||
1089 | |||||||
1090 | bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, | ||||||
1091 | MachineRegisterInfo &MRI, | ||||||
1092 | MachineIRBuilder &B, | ||||||
1093 | GISelChangeObserver &Observer) const { | ||||||
1094 | switch (MI.getOpcode()) { | ||||||
1095 | case TargetOpcode::G_ADDRSPACE_CAST: | ||||||
1096 | return legalizeAddrSpaceCast(MI, MRI, B); | ||||||
1097 | case TargetOpcode::G_FRINT: | ||||||
1098 | return legalizeFrint(MI, MRI, B); | ||||||
1099 | case TargetOpcode::G_FCEIL: | ||||||
1100 | return legalizeFceil(MI, MRI, B); | ||||||
1101 | case TargetOpcode::G_INTRINSIC_TRUNC: | ||||||
1102 | return legalizeIntrinsicTrunc(MI, MRI, B); | ||||||
1103 | case TargetOpcode::G_SITOFP: | ||||||
1104 | return legalizeITOFP(MI, MRI, B, true); | ||||||
1105 | case TargetOpcode::G_UITOFP: | ||||||
1106 | return legalizeITOFP(MI, MRI, B, false); | ||||||
1107 | case TargetOpcode::G_FMINNUM: | ||||||
1108 | case TargetOpcode::G_FMAXNUM: | ||||||
1109 | case TargetOpcode::G_FMINNUM_IEEE: | ||||||
1110 | case TargetOpcode::G_FMAXNUM_IEEE: | ||||||
1111 | return legalizeMinNumMaxNum(MI, MRI, B); | ||||||
1112 | case TargetOpcode::G_EXTRACT_VECTOR_ELT: | ||||||
1113 | return legalizeExtractVectorElt(MI, MRI, B); | ||||||
1114 | case TargetOpcode::G_INSERT_VECTOR_ELT: | ||||||
1115 | return legalizeInsertVectorElt(MI, MRI, B); | ||||||
1116 | case TargetOpcode::G_FSIN: | ||||||
1117 | case TargetOpcode::G_FCOS: | ||||||
1118 | return legalizeSinCos(MI, MRI, B); | ||||||
1119 | case TargetOpcode::G_GLOBAL_VALUE: | ||||||
1120 | return legalizeGlobalValue(MI, MRI, B); | ||||||
1121 | case TargetOpcode::G_LOAD: | ||||||
1122 | return legalizeLoad(MI, MRI, B, Observer); | ||||||
1123 | case TargetOpcode::G_FMAD: | ||||||
1124 | return legalizeFMad(MI, MRI, B); | ||||||
1125 | case TargetOpcode::G_FDIV: | ||||||
1126 | return legalizeFDIV(MI, MRI, B); | ||||||
1127 | case TargetOpcode::G_ATOMIC_CMPXCHG: | ||||||
1128 | return legalizeAtomicCmpXChg(MI, MRI, B); | ||||||
1129 | default: | ||||||
1130 | return false; | ||||||
1131 | } | ||||||
1132 | |||||||
1133 | llvm_unreachable("expected switch to return"); | ||||||
1134 | } | ||||||
1135 | |||||||
1136 | Register AMDGPULegalizerInfo::getSegmentAperture( | ||||||
1137 | unsigned AS, | ||||||
1138 | MachineRegisterInfo &MRI, | ||||||
1139 | MachineIRBuilder &B) const { | ||||||
1140 | MachineFunction &MF = B.getMF(); | ||||||
1141 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||||
1142 | const LLT S32 = LLT::scalar(32); | ||||||
1143 | |||||||
1144 | assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); | ||||||
1145 | |||||||
1146 | if (ST.hasApertureRegs()) { | ||||||
1147 | // FIXME: Use inline constants (src_{shared, private}_base) instead of | ||||||
1148 | // getreg. | ||||||
1149 | unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1150 | AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : | ||||||
1151 | AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; | ||||||
1152 | unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1153 | AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : | ||||||
1154 | AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; | ||||||
1155 | unsigned Encoding = | ||||||
1156 | AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | | ||||||
1157 | Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | | ||||||
1158 | WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; | ||||||
1159 | |||||||
1160 | Register ApertureReg = MRI.createGenericVirtualRegister(S32); | ||||||
1161 | Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | ||||||
1162 | |||||||
1163 | B.buildInstr(AMDGPU::S_GETREG_B32) | ||||||
1164 | .addDef(GetReg) | ||||||
1165 | .addImm(Encoding); | ||||||
1166 | MRI.setType(GetReg, S32); | ||||||
1167 | |||||||
1168 | auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); | ||||||
1169 | B.buildInstr(TargetOpcode::G_SHL) | ||||||
1170 | .addDef(ApertureReg) | ||||||
1171 | .addUse(GetReg) | ||||||
1172 | .addUse(ShiftAmt.getReg(0)); | ||||||
1173 | |||||||
1174 | return ApertureReg; | ||||||
1175 | } | ||||||
1176 | |||||||
1177 | Register QueuePtr = MRI.createGenericVirtualRegister( | ||||||
1178 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||
1179 | |||||||
1180 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1181 | if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) | ||||||
1182 | return Register(); | ||||||
1183 | |||||||
1184 | // Offset into amd_queue_t for group_segment_aperture_base_hi / | ||||||
1185 | // private_segment_aperture_base_hi. | ||||||
1186 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; | ||||||
1187 | |||||||
1188 | // FIXME: Don't use undef | ||||||
1189 | Value *V = UndefValue::get(PointerType::get( | ||||||
1190 | Type::getInt8Ty(MF.getFunction().getContext()), | ||||||
1191 | AMDGPUAS::CONSTANT_ADDRESS)); | ||||||
1192 | |||||||
1193 | MachinePointerInfo PtrInfo(V, StructOffset); | ||||||
1194 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
1195 | PtrInfo, | ||||||
1196 | MachineMemOperand::MOLoad | | ||||||
1197 | MachineMemOperand::MODereferenceable | | ||||||
1198 | MachineMemOperand::MOInvariant, | ||||||
1199 | 4, | ||||||
1200 | MinAlign(64, StructOffset)); | ||||||
1201 | |||||||
1202 | Register LoadResult = MRI.createGenericVirtualRegister(S32); | ||||||
1203 | Register LoadAddr; | ||||||
1204 | |||||||
1205 | B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); | ||||||
1206 | B.buildLoad(LoadResult, LoadAddr, *MMO); | ||||||
1207 | return LoadResult; | ||||||
1208 | } | ||||||
1209 | |||||||
1210 | bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( | ||||||
1211 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1212 | MachineIRBuilder &B) const { | ||||||
1213 | MachineFunction &MF = B.getMF(); | ||||||
1214 | |||||||
1215 | B.setInstr(MI); | ||||||
1216 | |||||||
1217 | const LLT S32 = LLT::scalar(32); | ||||||
1218 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1219 | Register Src = MI.getOperand(1).getReg(); | ||||||
1220 | |||||||
1221 | LLT DstTy = MRI.getType(Dst); | ||||||
1222 | LLT SrcTy = MRI.getType(Src); | ||||||
1223 | unsigned DestAS = DstTy.getAddressSpace(); | ||||||
1224 | unsigned SrcAS = SrcTy.getAddressSpace(); | ||||||
1225 | |||||||
1226 | // TODO: Avoid reloading from the queue ptr for each cast, or at least each | ||||||
1227 | // vector element. | ||||||
1228 | assert(!DstTy.isVector()); | ||||||
1229 | |||||||
1230 | const AMDGPUTargetMachine &TM | ||||||
1231 | = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); | ||||||
1232 | |||||||
1233 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||||
1234 | if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { | ||||||
1235 | MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); | ||||||
1236 | return true; | ||||||
1237 | } | ||||||
1238 | |||||||
1239 | if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
1240 | // Truncate. | ||||||
1241 | B.buildExtract(Dst, Src, 0); | ||||||
1242 | MI.eraseFromParent(); | ||||||
1243 | return true; | ||||||
1244 | } | ||||||
1245 | |||||||
1246 | if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
1247 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1248 | uint32_t AddrHiVal = Info->get32BitAddressHighBits(); | ||||||
1249 | |||||||
1250 | // FIXME: This is a bit ugly due to creating a merge of 2 pointers to | ||||||
1251 | // another. Merge operands are required to be the same type, but creating an | ||||||
1252 | // extra ptrtoint would be kind of pointless. | ||||||
1253 | auto HighAddr = B.buildConstant( | ||||||
1254 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); | ||||||
1255 | B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); | ||||||
1256 | MI.eraseFromParent(); | ||||||
1257 | return true; | ||||||
1258 | } | ||||||
1259 | |||||||
1260 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { | ||||||
1261 | assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || | ||||||
1262 | DestAS == AMDGPUAS::PRIVATE_ADDRESS); | ||||||
1263 | unsigned NullVal = TM.getNullPointerValue(DestAS); | ||||||
1264 | |||||||
1265 | auto SegmentNull = B.buildConstant(DstTy, NullVal); | ||||||
1266 | auto FlatNull = B.buildConstant(SrcTy, 0); | ||||||
1267 | |||||||
1268 | Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); | ||||||
1269 | |||||||
1270 | // Extract low 32-bits of the pointer. | ||||||
1271 | B.buildExtract(PtrLo32, Src, 0); | ||||||
1272 | |||||||
1273 | Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); | ||||||
1274 | B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); | ||||||
1275 | B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); | ||||||
1276 | |||||||
1277 | MI.eraseFromParent(); | ||||||
1278 | return true; | ||||||
1279 | } | ||||||
1280 | |||||||
1281 | if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) | ||||||
1282 | return false; | ||||||
1283 | |||||||
1284 | if (!ST.hasFlatAddressSpace()) | ||||||
1285 | return false; | ||||||
1286 | |||||||
1287 | auto SegmentNull = | ||||||
1288 | B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); | ||||||
1289 | auto FlatNull = | ||||||
1290 | B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); | ||||||
1291 | |||||||
1292 | Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); | ||||||
1293 | if (!ApertureReg.isValid()) | ||||||
1294 | return false; | ||||||
1295 | |||||||
1296 | Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); | ||||||
1297 | B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); | ||||||
1298 | |||||||
1299 | Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); | ||||||
1300 | |||||||
1301 | // Coerce the type of the low half of the result so we can use merge_values. | ||||||
1302 | Register SrcAsInt = MRI.createGenericVirtualRegister(S32); | ||||||
1303 | B.buildInstr(TargetOpcode::G_PTRTOINT) | ||||||
1304 | .addDef(SrcAsInt) | ||||||
1305 | .addUse(Src); | ||||||
1306 | |||||||
1307 | // TODO: Should we allow mismatched types but matching sizes in merges to | ||||||
1308 | // avoid the ptrtoint? | ||||||
1309 | B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); | ||||||
1310 | B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); | ||||||
1311 | |||||||
1312 | MI.eraseFromParent(); | ||||||
1313 | return true; | ||||||
1314 | } | ||||||
1315 | |||||||
1316 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||||
1317 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1318 | MachineIRBuilder &B) const { | ||||||
1319 | B.setInstr(MI); | ||||||
1320 | |||||||
1321 | Register Src = MI.getOperand(1).getReg(); | ||||||
1322 | LLT Ty = MRI.getType(Src); | ||||||
1323 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64); | ||||||
1324 | |||||||
1325 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||||
1326 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||||
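 | // Adding and then subtracting a copysigned 2^52 (C1) forces the fractional
 | // bits to be rounded away in double precision. C2 is the largest double
 | // below 2^52; any value with larger magnitude is already an integer, so the
 | // select at the end returns the source unchanged in that case.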
1327 | |||||||
1328 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||||
1329 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||||
1330 | |||||||
1331 | // TODO: Should this propagate fast-math-flags? | ||||||
1332 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||||
1333 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||||
1334 | |||||||
1335 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||||
1336 | auto Fabs = B.buildFAbs(Ty, Src); | ||||||
1337 | |||||||
1338 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||||
1339 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||||
1340 | return true; | ||||||
1341 | } | ||||||
1342 | |||||||
1343 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||||
1344 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1345 | MachineIRBuilder &B) const { | ||||||
1346 | B.setInstr(MI); | ||||||
1347 | |||||||
1348 | const LLT S1 = LLT::scalar(1); | ||||||
1349 | const LLT S64 = LLT::scalar(64); | ||||||
1350 | |||||||
1351 | Register Src = MI.getOperand(1).getReg(); | ||||||
1352 | assert(MRI.getType(Src) == S64); | ||||||
1353 | |||||||
1354 | // result = trunc(src) | ||||||
1355 | // if (src > 0.0 && src != result) | ||||||
1356 | // result += 1.0 | ||||||
1357 | |||||||
1358 | auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); | ||||||
1359 | |||||||
1360 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||||
1361 | const auto One = B.buildFConstant(S64, 1.0); | ||||||
1362 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | ||||||
1363 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | ||||||
1364 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | ||||||
1365 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||||
1366 | |||||||
1367 | // TODO: Should this propagate fast-math-flags? | ||||||
1368 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||||
1369 | return true; | ||||||
1370 | } | ||||||
1371 | |||||||
1372 | static MachineInstrBuilder extractF64Exponent(unsigned Hi, | ||||||
1373 | MachineIRBuilder &B) { | ||||||
1374 | const unsigned FractBits = 52; | ||||||
1375 | const unsigned ExpBits = 11; | ||||||
1376 | LLT S32 = LLT::scalar(32); | ||||||
1377 | |||||||
1378 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||||
1379 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||||
1380 | |||||||
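 | // amdgcn_ubfe extracts ExpBits (11) bits starting at bit FractBits - 32
 | // (20) of the 32-bit high word, i.e. the biased f64 exponent field; the
 | // subtraction below removes the IEEE-754 bias of 1023.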
1381 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | ||||||
1382 | .addUse(Hi) | ||||||
1383 | .addUse(Const0.getReg(0)).addUse(Const1.getReg(0)); | ||||||
1384 | |||||||
1385 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||||
1386 | } | ||||||
1387 | |||||||
1388 | bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( | ||||||
1389 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1390 | MachineIRBuilder &B) const { | ||||||
1391 | B.setInstr(MI); | ||||||
1392 | |||||||
1393 | const LLT S1 = LLT::scalar(1); | ||||||
1394 | const LLT S32 = LLT::scalar(32); | ||||||
1395 | const LLT S64 = LLT::scalar(64); | ||||||
1396 | |||||||
1397 | Register Src = MI.getOperand(1).getReg(); | ||||||
1398 | assert(MRI.getType(Src) == S64); | ||||||
1399 | |||||||
1400 | // TODO: Should this use extract since the low half is unused? | ||||||
1401 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||
1402 | Register Hi = Unmerge.getReg(1); | ||||||
1403 | |||||||
1404 | // Extract the upper half, since this is where we will find the sign and | ||||||
1405 | // exponent. | ||||||
1406 | auto Exp = extractF64Exponent(Hi, B); | ||||||
1407 | |||||||
1408 | const unsigned FractBits = 52; | ||||||
1409 | |||||||
1410 | // Extract the sign bit. | ||||||
1411 | const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); | ||||||
1412 | auto SignBit = B.buildAnd(S32, Hi, SignBitMask); | ||||||
1413 | |||||||
1414 | const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); | ||||||
1415 | |||||||
1416 | const auto Zero32 = B.buildConstant(S32, 0); | ||||||
1417 | |||||||
1418 | // Extend back to 64-bits. | ||||||
1419 | auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); | ||||||
1420 | |||||||
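 | // Shifting the all-ones fraction mask right by Exp leaves exactly the
 | // fraction bits that must be cleared for this exponent; the Not/And below
 | // clears them, truncating toward zero. A negative exponent means |Src| < 1,
 | // so only the sign survives (+/-0.0); an exponent above 51 means Src is
 | // already integral and is passed through.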
1421 | auto Shr = B.buildAShr(S64, FractMask, Exp); | ||||||
1422 | auto Not = B.buildNot(S64, Shr); | ||||||
1423 | auto Tmp0 = B.buildAnd(S64, Src, Not); | ||||||
1424 | auto FiftyOne = B.buildConstant(S32, FractBits - 1); | ||||||
1425 | |||||||
1426 | auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); | ||||||
1427 | auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); | ||||||
1428 | |||||||
1429 | auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); | ||||||
1430 | B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); | ||||||
1431 | return true; | ||||||
1432 | } | ||||||
1433 | |||||||
1434 | bool AMDGPULegalizerInfo::legalizeITOFP( | ||||||
1435 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1436 | MachineIRBuilder &B, bool Signed) const { | ||||||
1437 | B.setInstr(MI); | ||||||
1438 | |||||||
1439 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1440 | Register Src = MI.getOperand(1).getReg(); | ||||||
1441 | |||||||
1442 | const LLT S64 = LLT::scalar(64); | ||||||
1443 | const LLT S32 = LLT::scalar(32); | ||||||
1444 | |||||||
1445 | assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); | ||||||
1446 | |||||||
1447 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||
1448 | |||||||
1449 | auto CvtHi = Signed ? | ||||||
1450 | B.buildSITOFP(S64, Unmerge.getReg(1)) : | ||||||
1451 | B.buildUITOFP(S64, Unmerge.getReg(1)); | ||||||
1452 | |||||||
1453 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | ||||||
1454 | |||||||
1455 | auto ThirtyTwo = B.buildConstant(S32, 32); | ||||||
1456 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | ||||||
1457 | .addUse(CvtHi.getReg(0)) | ||||||
1458 | .addUse(ThirtyTwo.getReg(0)); | ||||||
1459 | |||||||
1460 | // TODO: Should this propagate fast-math-flags? | ||||||
1461 | B.buildFAdd(Dst, LdExp, CvtLo); | ||||||
1462 | MI.eraseFromParent(); | ||||||
1463 | return true; | ||||||
1464 | } | ||||||
1465 | |||||||
1466 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( | ||||||
1467 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1468 | MachineIRBuilder &B) const { | ||||||
1469 | MachineFunction &MF = B.getMF(); | ||||||
1470 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1471 | |||||||
1472 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||||
1473 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||||
1474 | |||||||
1475 | // With ieee_mode disabled, the instructions have the correct behavior | ||||||
1476 | // already for G_FMINNUM/G_FMAXNUM | ||||||
1477 | if (!MFI->getMode().IEEE) | ||||||
1478 | return !IsIEEEOp; | ||||||
1479 | |||||||
1480 | if (IsIEEEOp) | ||||||
1481 | return true; | ||||||
1482 | |||||||
1483 | MachineIRBuilder HelperBuilder(MI); | ||||||
1484 | GISelObserverWrapper DummyObserver; | ||||||
1485 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||
1486 | HelperBuilder.setInstr(MI); | ||||||
1487 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||||
1488 | } | ||||||
1489 | |||||||
1490 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||||
1491 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1492 | MachineIRBuilder &B) const { | ||||||
1493 | // TODO: Should move some of this into LegalizerHelper. | ||||||
1494 | |||||||
1495 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
1496 | // TODO: Dynamic s64 indexing is only legal for SGPR. | ||||||
1497 | Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); | ||||||
1498 | if (!IdxVal) // Dynamic case will be selected to register indexing. | ||||||
1499 | return true; | ||||||
1500 | |||||||
1501 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1502 | Register Vec = MI.getOperand(1).getReg(); | ||||||
1503 | |||||||
1504 | LLT VecTy = MRI.getType(Vec); | ||||||
1505 | LLT EltTy = VecTy.getElementType(); | ||||||
1506 | assert(EltTy == MRI.getType(Dst)); | ||||||
1507 | |||||||
1508 | B.setInstr(MI); | ||||||
1509 | |||||||
1510 | if (IdxVal.getValue() < VecTy.getNumElements()) | ||||||
1511 | B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); | ||||||
1512 | else | ||||||
1513 | B.buildUndef(Dst); | ||||||
1514 | |||||||
1515 | MI.eraseFromParent(); | ||||||
1516 | return true; | ||||||
1517 | } | ||||||
1518 | |||||||
1519 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||||
1520 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1521 | MachineIRBuilder &B) const { | ||||||
1522 | // TODO: Should move some of this into LegalizerHelper. | ||||||
1523 | |||||||
1524 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
1525 | // TODO: Dynamic s64 indexing is only legal for SGPR. | ||||||
1526 | Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); | ||||||
1527 | if (!IdxVal) // Dynamic case will be selected to register indexing. | ||||||
1528 | return true; | ||||||
1529 | |||||||
1530 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1531 | Register Vec = MI.getOperand(1).getReg(); | ||||||
1532 | Register Ins = MI.getOperand(2).getReg(); | ||||||
1533 | |||||||
1534 | LLT VecTy = MRI.getType(Vec); | ||||||
1535 | LLT EltTy = VecTy.getElementType(); | ||||||
1536 | assert(EltTy == MRI.getType(Ins)); | ||||||
1537 | |||||||
1538 | B.setInstr(MI); | ||||||
1539 | |||||||
1540 | if (IdxVal.getValue() < VecTy.getNumElements()) | ||||||
1541 | B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); | ||||||
1542 | else | ||||||
1543 | B.buildUndef(Dst); | ||||||
1544 | |||||||
1545 | MI.eraseFromParent(); | ||||||
1546 | return true; | ||||||
1547 | } | ||||||
1548 | |||||||
1549 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||||
1550 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1551 | MachineIRBuilder &B) const { | ||||||
1552 | B.setInstr(MI); | ||||||
1553 | |||||||
1554 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
1555 | Register SrcReg = MI.getOperand(1).getReg(); | ||||||
1556 | LLT Ty = MRI.getType(DstReg); | ||||||
1557 | unsigned Flags = MI.getFlags(); | ||||||
1558 | |||||||
1559 | Register TrigVal; | ||||||
1560 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); | ||||||
1561 | if (ST.hasTrigReducedRange()) { | ||||||
1562 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||||
1563 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||||
1564 | .addUse(MulVal.getReg(0)) | ||||||
1565 | .setMIFlags(Flags).getReg(0); | ||||||
1566 | } else | ||||||
1567 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||||
1568 | |||||||
1569 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||||
1570 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||||
1571 | B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) | ||||||
1572 | .addUse(TrigVal) | ||||||
1573 | .setMIFlags(Flags); | ||||||
1574 | MI.eraseFromParent(); | ||||||
1575 | return true; | ||||||
1576 | } | ||||||
1577 | |||||||
1578 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( | ||||||
1579 | Register DstReg, LLT PtrTy, | ||||||
1580 | MachineIRBuilder &B, const GlobalValue *GV, | ||||||
1581 | unsigned Offset, unsigned GAFlags) const { | ||||||
1582 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | ||||||
1583 | // to the following code sequence: | ||||||
1584 | // | ||||||
1585 | // For constant address space: | ||||||
1586 | // s_getpc_b64 s[0:1] | ||||||
1587 | // s_add_u32 s0, s0, $symbol | ||||||
1588 | // s_addc_u32 s1, s1, 0 | ||||||
1589 | // | ||||||
1590 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
1591 | // a fixup or relocation is emitted to replace $symbol with a literal | ||||||
1592 | // constant, which is a pc-relative offset from the encoding of the $symbol | ||||||
1593 | // operand to the global variable. | ||||||
1594 | // | ||||||
1595 | // For global address space: | ||||||
1596 | // s_getpc_b64 s[0:1] | ||||||
1597 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | ||||||
1598 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | ||||||
1599 | // | ||||||
1600 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
1601 | // fixups or relocations are emitted to replace $symbol@*@lo and | ||||||
1602 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | ||||||
1603 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | ||||||
1604 | // operand to the global variable. | ||||||
1605 | // | ||||||
1606 | // What we want here is an offset from the value returned by s_getpc | ||||||
1607 | // (which is the address of the s_add_u32 instruction) to the global | ||||||
1608 | // variable, but since the encoding of $symbol starts 4 bytes after the start | ||||||
1609 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | ||||||
1610 | // small. This requires us to add 4 to the global variable offset in order to | ||||||
1611 | // compute the correct address. | ||||||
1612 | |||||||
1613 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
1614 | |||||||
1615 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | ||||||
1616 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | ||||||
1617 | |||||||
1618 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | ||||||
1619 | .addDef(PCReg); | ||||||
1620 | |||||||
1621 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | ||||||
1622 | if (GAFlags == SIInstrInfo::MO_NONE) | ||||||
1623 | MIB.addImm(0); | ||||||
1624 | else | ||||||
1625 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); | ||||||
1626 | |||||||
1627 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||||
1628 | |||||||
1629 | if (PtrTy.getSizeInBits() == 32) | ||||||
1630 | B.buildExtract(DstReg, PCReg, 0); | ||||||
1631 | return true; | ||||||
1632 | } | ||||||
1633 | |||||||
1634 | bool AMDGPULegalizerInfo::legalizeGlobalValue( | ||||||
1635 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1636 | MachineIRBuilder &B) const { | ||||||
1637 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
1638 | LLT Ty = MRI.getType(DstReg); | ||||||
1639 | unsigned AS = Ty.getAddressSpace(); | ||||||
1640 | |||||||
1641 | const GlobalValue *GV = MI.getOperand(1).getGlobal(); | ||||||
1642 | MachineFunction &MF = B.getMF(); | ||||||
1643 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1644 | B.setInstr(MI); | ||||||
1645 | |||||||
1646 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { | ||||||
1647 | if (!MFI->isEntryFunction()) { | ||||||
1648 | const Function &Fn = MF.getFunction(); | ||||||
1649 | DiagnosticInfoUnsupported BadLDSDecl( | ||||||
1650 | Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); | ||||||
1651 | Fn.getContext().diagnose(BadLDSDecl); | ||||||
1652 | } | ||||||
1653 | |||||||
1654 | // TODO: We could emit code to handle the initialization somewhere. | ||||||
1655 | if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { | ||||||
1656 | B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); | ||||||
1657 | MI.eraseFromParent(); | ||||||
1658 | return true; | ||||||
1659 | } | ||||||
1660 | |||||||
1661 | const Function &Fn = MF.getFunction(); | ||||||
1662 | DiagnosticInfoUnsupported BadInit( | ||||||
1663 | Fn, "unsupported initializer for address space", MI.getDebugLoc()); | ||||||
1664 | Fn.getContext().diagnose(BadInit); | ||||||
1665 | return true; | ||||||
1666 | } | ||||||
1667 | |||||||
1668 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
1669 | |||||||
1670 | if (TLI->shouldEmitFixup(GV)) { | ||||||
1671 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); | ||||||
1672 | MI.eraseFromParent(); | ||||||
1673 | return true; | ||||||
1674 | } | ||||||
1675 | |||||||
1676 | if (TLI->shouldEmitPCReloc(GV)) { | ||||||
1677 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); | ||||||
1678 | MI.eraseFromParent(); | ||||||
1679 | return true; | ||||||
1680 | } | ||||||
1681 | |||||||
1682 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
1683 | Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); | ||||||
1684 | |||||||
1685 | MachineMemOperand *GOTMMO = MF.getMachineMemOperand( | ||||||
1686 | MachinePointerInfo::getGOT(MF), | ||||||
1687 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
1688 | MachineMemOperand::MOInvariant, | ||||||
1689 | 8 /*Size*/, 8 /*Align*/); | ||||||
1690 | |||||||
1691 | buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); | ||||||
1692 | |||||||
1693 | if (Ty.getSizeInBits() == 32) { | ||||||
1694 | // Truncate if this is a 32-bit constant address. | ||||||
1695 | auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); | ||||||
1696 | B.buildExtract(DstReg, Load, 0); | ||||||
1697 | } else | ||||||
1698 | B.buildLoad(DstReg, GOTAddr, *GOTMMO); | ||||||
1699 | |||||||
1700 | MI.eraseFromParent(); | ||||||
1701 | return true; | ||||||
1702 | } | ||||||
1703 | |||||||
1704 | bool AMDGPULegalizerInfo::legalizeLoad( | ||||||
1705 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1706 | MachineIRBuilder &B, GISelChangeObserver &Observer) const { | ||||||
1707 | B.setInstr(MI); | ||||||
1708 | LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
1709 | auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); | ||||||
1710 | Observer.changingInstr(MI); | ||||||
1711 | MI.getOperand(1).setReg(Cast.getReg(0)); | ||||||
1712 | Observer.changedInstr(MI); | ||||||
1713 | return true; | ||||||
1714 | } | ||||||
1715 | |||||||
1716 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||||
1717 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1718 | MachineIRBuilder &B) const { | ||||||
1719 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
1720 | assert(Ty.isScalar()); | ||||||
1721 | |||||||
1722 | // TODO: Always legal with future ftz flag. | ||||||
1723 | if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals()) | ||||||
1724 | return true; | ||||||
1725 | if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals()) | ||||||
1726 | return true; | ||||||
1727 | |||||||
1728 | MachineFunction &MF = B.getMF(); | ||||||
1729 | |||||||
1730 | MachineIRBuilder HelperBuilder(MI); | ||||||
1731 | GISelObserverWrapper DummyObserver; | ||||||
1732 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||
1733 | HelperBuilder.setMBB(*MI.getParent()); | ||||||
1734 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||||
1735 | } | ||||||
1736 | |||||||
1737 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||||
1738 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
1739 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
1740 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||
1741 | Register CmpVal = MI.getOperand(2).getReg(); | ||||||
1742 | Register NewVal = MI.getOperand(3).getReg(); | ||||||
1743 | |||||||
1744 | assert(SITargetLowering::isFlatGlobalAddrSpace( | ||||||
1745 | MRI.getType(PtrReg).getAddressSpace()) && | ||||||
1746 | "this should not have been custom lowered"); | ||||||
1747 | |||||||
1748 | LLT ValTy = MRI.getType(CmpVal); | ||||||
1749 | LLT VecTy = LLT::vector(2, ValTy); | ||||||
1750 | |||||||
1751 | B.setInstr(MI); | ||||||
1752 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||||
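 | // The target pseudo expects the swap value and the compare value packed
 | // into a two-element vector, new value first, mirroring the data operand
 | // layout of the hardware cmpswap instructions.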
1753 | |||||||
1754 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||||
1755 | .addDef(DstReg) | ||||||
1756 | .addUse(PtrReg) | ||||||
1757 | .addUse(PackedVal) | ||||||
1758 | .setMemRefs(MI.memoperands()); | ||||||
1759 | |||||||
1760 | MI.eraseFromParent(); | ||||||
1761 | return true; | ||||||
1762 | } | ||||||
1763 | |||||||
1764 | // Return the branch instruction that uses the condition, or null if the usage is invalid. | ||||||
1765 | static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, | ||||||
1766 | MachineRegisterInfo &MRI) { | ||||||
1767 | Register CondDef = MI.getOperand(0).getReg(); | ||||||
1768 | if (!MRI.hasOneNonDBGUse(CondDef)) | ||||||
1769 | return nullptr; | ||||||
1770 | |||||||
1771 | MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); | ||||||
1772 | return UseMI.getParent() == MI.getParent() && | ||||||
1773 | UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr; | ||||||
1774 | } | ||||||
1775 | |||||||
1776 | Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, | ||||||
1777 | Register Reg, LLT Ty) const { | ||||||
1778 | Register LiveIn = MRI.getLiveInVirtReg(Reg); | ||||||
1779 | if (LiveIn) | ||||||
1780 | return LiveIn; | ||||||
1781 | |||||||
1782 | Register NewReg = MRI.createGenericVirtualRegister(Ty); | ||||||
1783 | MRI.addLiveIn(Reg, NewReg); | ||||||
1784 | return NewReg; | ||||||
1785 | } | ||||||
1786 | |||||||
1787 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||||
1788 | const ArgDescriptor *Arg) const { | ||||||
1789 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||||
1790 | return false; // TODO: Handle these | ||||||
1791 | |||||||
1792 | assert(Arg->getRegister().isPhysical()); | ||||||
1793 | |||||||
1794 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
1795 | |||||||
1796 | LLT Ty = MRI.getType(DstReg); | ||||||
1797 | Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); | ||||||
1798 | |||||||
1799 | if (Arg->isMasked()) { | ||||||
1800 | // TODO: Should we try to emit this once in the entry block? | ||||||
1801 | const LLT S32 = LLT::scalar(32); | ||||||
1802 | const unsigned Mask = Arg->getMask(); | ||||||
1803 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); | ||||||
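 | // Note: this relies on the mask having at least one bit set;
 | // countTrailingZeros<unsigned>(0) would return 32 here.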
1804 | |||||||
1805 | Register AndMaskSrc = LiveIn; | ||||||
1806 | |||||||
1807 | if (Shift != 0) { | ||||||
1808 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||||
1809 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||||
1810 | } | ||||||
1811 | |||||||
1812 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||||
1813 | } else | ||||||
1814 | B.buildCopy(DstReg, LiveIn); | ||||||
1815 | |||||||
1816 | // Insert the argument copy if it doesn't already exist. | ||||||
1817 | // FIXME: It seems EmitLiveInCopies isn't called anywhere? | ||||||
1818 | if (!MRI.getVRegDef(LiveIn)) { | ||||||
1819 | // FIXME: Should have scoped insert pt | ||||||
1820 | MachineBasicBlock &OrigInsBB = B.getMBB(); | ||||||
1821 | auto OrigInsPt = B.getInsertPt(); | ||||||
1822 | |||||||
1823 | MachineBasicBlock &EntryMBB = B.getMF().front(); | ||||||
1824 | EntryMBB.addLiveIn(Arg->getRegister()); | ||||||
1825 | B.setInsertPt(EntryMBB, EntryMBB.begin()); | ||||||
1826 | B.buildCopy(LiveIn, Arg->getRegister()); | ||||||
1827 | |||||||
1828 | B.setInsertPt(OrigInsBB, OrigInsPt); | ||||||
1829 | } | ||||||
1830 | |||||||
1831 | return true; | ||||||
1832 | } | ||||||
1833 | |||||||
1834 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||||
1835 | MachineInstr &MI, | ||||||
1836 | MachineRegisterInfo &MRI, | ||||||
1837 | MachineIRBuilder &B, | ||||||
1838 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
1839 | B.setInstr(MI); | ||||||
1840 | |||||||
1841 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
1842 | |||||||
1843 | const ArgDescriptor *Arg; | ||||||
1844 | const TargetRegisterClass *RC; | ||||||
1845 | std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); | ||||||
1846 | if (!Arg) { | ||||||
1847 | LLVM_DEBUG(dbgs() << "Required arg register missing\n"); | ||||||
1848 | return false; | ||||||
1849 | } | ||||||
1850 | |||||||
1851 | if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { | ||||||
1852 | MI.eraseFromParent(); | ||||||
1853 | return true; | ||||||
1854 | } | ||||||
1855 | |||||||
1856 | return false; | ||||||
1857 | } | ||||||
1858 | |||||||
1859 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||||
1860 | MachineRegisterInfo &MRI, | ||||||
1861 | MachineIRBuilder &B) const { | ||||||
1862 | B.setInstr(MI); | ||||||
1863 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1864 | LLT DstTy = MRI.getType(Dst); | ||||||
1865 | LLT S16 = LLT::scalar(16); | ||||||
1866 | LLT S32 = LLT::scalar(32); | ||||||
1867 | |||||||
1868 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||||
1869 | return true; | ||||||
1870 | |||||||
1871 | if (DstTy == S16) | ||||||
1872 | return legalizeFDIV16(MI, MRI, B); | ||||||
1873 | if (DstTy == S32) | ||||||
1874 | return legalizeFDIV32(MI, MRI, B); | ||||||
1875 | |||||||
1876 | return false; | ||||||
1877 | } | ||||||
1878 | |||||||
1879 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | ||||||
1880 | MachineRegisterInfo &MRI, | ||||||
1881 | MachineIRBuilder &B) const { | ||||||
1882 | Register Res = MI.getOperand(0).getReg(); | ||||||
1883 | Register LHS = MI.getOperand(1).getReg(); | ||||||
1884 | Register RHS = MI.getOperand(2).getReg(); | ||||||
1885 | |||||||
1886 | uint16_t Flags = MI.getFlags(); | ||||||
1887 | |||||||
1888 | LLT ResTy = MRI.getType(Res); | ||||||
1889 | LLT S32 = LLT::scalar(32); | ||||||
1890 | LLT S64 = LLT::scalar(64); | ||||||
1891 | |||||||
1892 | const MachineFunction &MF = B.getMF(); | ||||||
1893 | bool Unsafe = | ||||||
1894 | MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); | ||||||
1895 | |||||||
1896 | if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) | ||||||
1897 | return false; | ||||||
1898 | |||||||
1899 | if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals()) | ||||||
1900 | return false; | ||||||
1901 | |||||||
1902 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | ||||||
1903 | // 1 / x -> RCP(x) | ||||||
1904 | if (CLHS->isExactlyValue(1.0)) { | ||||||
1905 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
1906 | .addUse(RHS) | ||||||
1907 | .setMIFlags(Flags); | ||||||
1908 | |||||||
1909 | MI.eraseFromParent(); | ||||||
1910 | return true; | ||||||
1911 | } | ||||||
1912 | |||||||
1913 | // -1 / x -> RCP( FNEG(x) ) | ||||||
1914 | if (CLHS->isExactlyValue(-1.0)) { | ||||||
1915 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | ||||||
1916 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
1917 | .addUse(FNeg.getReg(0)) | ||||||
1918 | .setMIFlags(Flags); | ||||||
1919 | |||||||
1920 | MI.eraseFromParent(); | ||||||
1921 | return true; | ||||||
1922 | } | ||||||
1923 | } | ||||||
1924 | |||||||
1925 | // x / y -> x * (1.0 / y) | ||||||
1926 | if (Unsafe) { | ||||||
1927 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
1928 | .addUse(RHS) | ||||||
1929 | .setMIFlags(Flags); | ||||||
1930 | B.buildFMul(Res, LHS, RCP, Flags); | ||||||
1931 | |||||||
1932 | MI.eraseFromParent(); | ||||||
1933 | return true; | ||||||
1934 | } | ||||||
1935 | |||||||
1936 | return false; | ||||||
1937 | } | ||||||
1938 | |||||||
1939 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | ||||||
1940 | MachineRegisterInfo &MRI, | ||||||
1941 | MachineIRBuilder &B) const { | ||||||
1942 | B.setInstr(MI); | ||||||
1943 | Register Res = MI.getOperand(0).getReg(); | ||||||
1944 | Register LHS = MI.getOperand(1).getReg(); | ||||||
1945 | Register RHS = MI.getOperand(2).getReg(); | ||||||
1946 | |||||||
1947 | uint16_t Flags = MI.getFlags(); | ||||||
1948 | |||||||
1949 | LLT S16 = LLT::scalar(16); | ||||||
1950 | LLT S32 = LLT::scalar(32); | ||||||
1951 | |||||||
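 | // f16 division is lowered by promoting both operands to f32, multiplying
 | // by the f32 reciprocal, truncating the quotient back to f16, and letting
 | // div_fixup handle the special cases.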
1952 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | ||||||
1953 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | ||||||
1954 | |||||||
1955 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
1956 | .addUse(RHSExt.getReg(0)) | ||||||
1957 | .setMIFlags(Flags); | ||||||
1958 | |||||||
1959 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | ||||||
1960 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | ||||||
1961 | |||||||
1962 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
1963 | .addUse(RDst.getReg(0)) | ||||||
1964 | .addUse(RHS) | ||||||
1965 | .addUse(LHS) | ||||||
1966 | .setMIFlags(Flags); | ||||||
1967 | |||||||
1968 | MI.eraseFromParent(); | ||||||
1969 | return true; | ||||||
1970 | } | ||||||
1971 | |||||||
1972 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | ||||||
1973 | // to enable denorm mode. When 'Enable' is false, disable denorm mode. | ||||||
1974 | static void toggleSPDenormMode(bool Enable, | ||||||
1975 | const GCNSubtarget &ST, | ||||||
1976 | MachineIRBuilder &B) { | ||||||
1977 | // Set SP denorm mode to this value. | ||||||
1978 | unsigned SPDenormMode = | ||||||
1979 | Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; | ||||||
1980 | |||||||
1981 | if (ST.hasDenormModeInst()) { | ||||||
1982 | // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode. | ||||||
1983 | unsigned DPDenormModeDefault = ST.hasFP64Denormals() | ||||||
1984 | ? FP_DENORM_FLUSH_NONE | ||||||
1985 | : FP_DENORM_FLUSH_IN_FLUSH_OUT; | ||||||
1986 | |||||||
1987 | unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | ||||||
1988 | B.buildInstr(AMDGPU::S_DENORM_MODE) | ||||||
1989 | .addImm(NewDenormModeValue); | ||||||
1990 | |||||||
1991 | } else { | ||||||
1992 | // Select FP32 bit field in mode register. | ||||||
1993 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | ||||||
1994 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | ||||||
1995 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | ||||||
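 | // This encodes hwreg(HW_REG_MODE, 4, 2): a two-bit field (WIDTH_M1 = 1)
 | // starting at bit 4 of the MODE register, which holds the FP32 denormal
 | // controls.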
1996 | |||||||
1997 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | ||||||
1998 | .addImm(SPDenormMode) | ||||||
1999 | .addImm(SPDenormModeBitField); | ||||||
2000 | } | ||||||
2001 | } | ||||||
2002 | |||||||
2003 | bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, | ||||||
2004 | MachineRegisterInfo &MRI, | ||||||
2005 | MachineIRBuilder &B) const { | ||||||
2006 | B.setInstr(MI); | ||||||
2007 | Register Res = MI.getOperand(0).getReg(); | ||||||
2008 | Register LHS = MI.getOperand(1).getReg(); | ||||||
2009 | Register RHS = MI.getOperand(2).getReg(); | ||||||
2010 | |||||||
2011 | uint16_t Flags = MI.getFlags(); | ||||||
2012 | |||||||
2013 | LLT S32 = LLT::scalar(32); | ||||||
2014 | LLT S1 = LLT::scalar(1); | ||||||
2015 | |||||||
2016 | auto One = B.buildFConstant(S32, 1.0f); | ||||||
2017 | |||||||
2018 | auto DenominatorScaled = | ||||||
2019 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
2020 | .addUse(RHS) | ||||||
2021 | .addUse(RHS) | ||||||
2022 | .addUse(LHS) | ||||||
2023 | .setMIFlags(Flags); | ||||||
2024 | auto NumeratorScaled = | ||||||
2025 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
2026 | .addUse(LHS) | ||||||
2027 | .addUse(RHS) | ||||||
2028 | .addUse(LHS) | ||||||
2029 | .setMIFlags(Flags); | ||||||
2030 | |||||||
2031 | auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
2032 | .addUse(DenominatorScaled.getReg(0)) | ||||||
2033 | .setMIFlags(Flags); | ||||||
2034 | auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); | ||||||
2035 | |||||||
2036 | // FIXME: Doesn't correctly model the FP mode switch, and the FP operations | ||||||
2037 | // aren't modeled as reading it. | ||||||
2038 | if (!ST.hasFP32Denormals()) | ||||||
2039 | toggleSPDenormMode(true, ST, B); | ||||||
2040 | |||||||
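 | // One Newton-Raphson step (Fma0/Fma1) refines the reciprocal estimate,
 | // Mul forms the scaled quotient estimate, and Fma2-Fma4 compute the
 | // remainder-based corrections that div_fmas folds together below.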
2041 | auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); | ||||||
2042 | auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); | ||||||
2043 | auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); | ||||||
2044 | auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); | ||||||
2045 | auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); | ||||||
2046 | auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); | ||||||
2047 | |||||||
2048 | if (!ST.hasFP32Denormals()) | ||||||
2049 | toggleSPDenormMode(false, ST, B); | ||||||
2050 | |||||||
2051 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) | ||||||
2052 | .addUse(Fma4.getReg(0)) | ||||||
2053 | .addUse(Fma1.getReg(0)) | ||||||
2054 | .addUse(Fma3.getReg(0)) | ||||||
2055 | .addUse(NumeratorScaled.getReg(1)) | ||||||
2056 | .setMIFlags(Flags); | ||||||
2057 | |||||||
2058 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
2059 | .addUse(Fmas.getReg(0)) | ||||||
2060 | .addUse(RHS) | ||||||
2061 | .addUse(LHS) | ||||||
2062 | .setMIFlags(Flags); | ||||||
2063 | |||||||
2064 | MI.eraseFromParent(); | ||||||
2065 | return true; | ||||||
2066 | } | ||||||
2067 | |||||||
2068 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||||
2069 | MachineRegisterInfo &MRI, | ||||||
2070 | MachineIRBuilder &B) const { | ||||||
2071 | B.setInstr(MI); | ||||||
2072 | Register Res = MI.getOperand(0).getReg(); | ||||||
2073 | Register LHS = MI.getOperand(2).getReg(); | ||||||
2074 | Register RHS = MI.getOperand(3).getReg(); | ||||||
2075 | uint16_t Flags = MI.getFlags(); | ||||||
2076 | |||||||
2077 | LLT S32 = LLT::scalar(32); | ||||||
2078 | LLT S1 = LLT::scalar(1); | ||||||
2079 | |||||||
2080 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||||
2081 | const APFloat C0Val(1.0f); | ||||||
2082 | |||||||
2083 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||||
2084 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||||
2085 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); | ||||||
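 | // C0 is 0x1.0p+96f and C1 is 0x1.0p-32f: if |RHS| exceeds 2^96, the
 | // denominator is pre-scaled by 2^-32 so the reciprocal stays in range,
 | // and the same factor rescales the final product below; otherwise the
 | // scale is 1.0 (C2).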
2086 | |||||||
2087 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||||
2088 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||||
2089 | |||||||
2090 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||||
2091 | |||||||
2092 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
2093 | .addUse(Mul0.getReg(0)) | ||||||
2094 | .setMIFlags(Flags); | ||||||
2095 | |||||||
2096 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||||
2097 | |||||||
2098 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||||
2099 | |||||||
2100 | MI.eraseFromParent(); | ||||||
2101 | return true; | ||||||
2102 | } | ||||||
2103 | |||||||
2104 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||||
2105 | MachineRegisterInfo &MRI, | ||||||
2106 | MachineIRBuilder &B) const { | ||||||
2107 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2108 | if (!MFI->isEntryFunction()) { | ||||||
2109 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2110 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||||
2111 | } | ||||||
2112 | |||||||
2113 | B.setInstr(MI); | ||||||
2114 | |||||||
2115 | uint64_t Offset = | ||||||
2116 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||||
2117 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||||
2118 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2119 | LLT DstTy = MRI.getType(DstReg); | ||||||
2120 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||||
2121 | |||||||
2122 | const ArgDescriptor *Arg; | ||||||
2123 | const TargetRegisterClass *RC; | ||||||
2124 | std::tie(Arg, RC) | ||||||
2125 | = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||
2126 | if (!Arg) | ||||||
2127 | return false; | ||||||
2128 | |||||||
2129 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||||
2130 | if (!loadInputValue(KernargPtrReg, B, Arg)) | ||||||
2131 | return false; | ||||||
2132 | |||||||
2133 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||||
2134 | MI.eraseFromParent(); | ||||||
2135 | return true; | ||||||
2136 | } | ||||||
2137 | |||||||
2138 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||||
2139 | MachineRegisterInfo &MRI, | ||||||
2140 | MachineIRBuilder &B, | ||||||
2141 | unsigned AddrSpace) const { | ||||||
2142 | B.setInstr(MI); | ||||||
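 | // A flat pointer belongs to the queried segment exactly when its high
 | // 32 bits match that segment's aperture base, so compare the extracted
 | // high half against the aperture register.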
2143 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||||
2144 | auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); | ||||||
2145 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||||
2146 | MI.eraseFromParent(); | ||||||
2147 | return true; | ||||||
2148 | } | ||||||
2149 | |||||||
2150 | /// Handle register layout difference for f16 images for some subtargets. | ||||||
2151 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||||
2152 | MachineRegisterInfo &MRI, | ||||||
2153 | Register Reg) const { | ||||||
2154 | if (!ST.hasUnpackedD16VMem()) | ||||||
2155 | return Reg; | ||||||
2156 | |||||||
2157 | const LLT S16 = LLT::scalar(16); | ||||||
2158 | const LLT S32 = LLT::scalar(32); | ||||||
2159 | LLT StoreVT = MRI.getType(Reg); | ||||||
2160 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16); | ||||||
2161 | |||||||
2162 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
2163 | |||||||
2164 | SmallVector<Register, 4> WideRegs; | ||||||
2165 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
2166 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||||
2167 | |||||||
2168 | int NumElts = StoreVT.getNumElements(); | ||||||
2169 | |||||||
2170 | return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); | ||||||
2171 | } | ||||||
2172 | |||||||
2173 | bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, | ||||||
2174 | MachineRegisterInfo &MRI, | ||||||
2175 | MachineIRBuilder &B, | ||||||
2176 | bool IsFormat) const { | ||||||
2177 | // TODO: Reject f16 format on targets where unsupported. | ||||||
2178 | Register VData = MI.getOperand(1).getReg(); | ||||||
2179 | LLT Ty = MRI.getType(VData); | ||||||
2180 | |||||||
2181 | B.setInstr(MI); | ||||||
2182 | |||||||
2183 | const LLT S32 = LLT::scalar(32); | ||||||
2184 | const LLT S16 = LLT::scalar(16); | ||||||
2185 | |||||||
2186 | // Fixup illegal register types for i8 stores. | ||||||
2187 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||||
2188 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||||
2189 | MI.getOperand(1).setReg(AnyExt); | ||||||
2190 | return true; | ||||||
2191 | } | ||||||
2192 | |||||||
2193 | if (Ty.isVector()) { | ||||||
2194 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||||
2195 | if (IsFormat) | ||||||
2196 | MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); | ||||||
2197 | return true; | ||||||
2198 | } | ||||||
2199 | |||||||
2200 | return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; | ||||||
2201 | } | ||||||
2202 | |||||||
2203 | return Ty == S32; | ||||||
2204 | } | ||||||
2205 | |||||||
2206 | bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, | ||||||
2207 | MachineRegisterInfo &MRI, | ||||||
2208 | MachineIRBuilder &B) const { | ||||||
2209 | // Replace the use G_BRCOND with the exec manipulate and branch pseudos. | ||||||
2210 | switch (MI.getIntrinsicID()) { | ||||||
2211 | case Intrinsic::amdgcn_if: { | ||||||
2212 | if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { | ||||||
2213 | const SIRegisterInfo *TRI | ||||||
2214 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
2215 | |||||||
2216 | B.setInstr(*BrCond); | ||||||
2217 | Register Def = MI.getOperand(1).getReg(); | ||||||
2218 | Register Use = MI.getOperand(3).getReg(); | ||||||
2219 | B.buildInstr(AMDGPU::SI_IF) | ||||||
2220 | .addDef(Def) | ||||||
2221 | .addUse(Use) | ||||||
2222 | .addMBB(BrCond->getOperand(1).getMBB()); | ||||||
2223 | |||||||
2224 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||||
2225 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||||
2226 | MI.eraseFromParent(); | ||||||
2227 | BrCond->eraseFromParent(); | ||||||
2228 | return true; | ||||||
2229 | } | ||||||
2230 | |||||||
2231 | return false; | ||||||
2232 | } | ||||||
2233 | case Intrinsic::amdgcn_loop: { | ||||||
2234 | if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { | ||||||
2235 | const SIRegisterInfo *TRI | ||||||
2236 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
2237 | |||||||
2238 | B.setInstr(*BrCond); | ||||||
2239 | Register Reg = MI.getOperand(2).getReg(); | ||||||
2240 | B.buildInstr(AMDGPU::SI_LOOP) | ||||||
2241 | .addUse(Reg) | ||||||
2242 | .addMBB(BrCond->getOperand(1).getMBB()); | ||||||
2243 | MI.eraseFromParent(); | ||||||
2244 | BrCond->eraseFromParent(); | ||||||
2245 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||||
2246 | return true; | ||||||
2247 | } | ||||||
2248 | |||||||
2249 | return false; | ||||||
2250 | } | ||||||
2251 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||||
2252 | return legalizePreloadedArgIntrin( | ||||||
2253 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||
2254 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||||
2255 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||||
2256 | case Intrinsic::amdgcn_workitem_id_x: | ||||||
2257 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2258 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||||
2259 | case Intrinsic::amdgcn_workitem_id_y: | ||||||
2260 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2261 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||||
2262 | case Intrinsic::amdgcn_workitem_id_z: | ||||||
2263 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2264 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||||
2265 | case Intrinsic::amdgcn_workgroup_id_x: | ||||||
2266 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2267 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||||
2268 | case Intrinsic::amdgcn_workgroup_id_y: | ||||||
2269 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2270 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||||
2271 | case Intrinsic::amdgcn_workgroup_id_z: | ||||||
2272 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2273 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||||
2274 | case Intrinsic::amdgcn_dispatch_ptr: | ||||||
2275 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2276 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||||
2277 | case Intrinsic::amdgcn_queue_ptr: | ||||||
2278 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2279 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||||
2280 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||||
2281 | return legalizePreloadedArgIntrin( | ||||||
2282 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||||
2283 | case Intrinsic::amdgcn_dispatch_id: | ||||||
2284 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
2285 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||||
2286 | case Intrinsic::amdgcn_fdiv_fast: | ||||||
2287 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||||
2288 | case Intrinsic::amdgcn_is_shared: | ||||||
2289 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||||
2290 | case Intrinsic::amdgcn_is_private: | ||||||
2291 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||||
2292 | case Intrinsic::amdgcn_wavefrontsize: { | ||||||
2293 | B.setInstr(MI); | ||||||
2294 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||||
2295 | MI.eraseFromParent(); | ||||||
2296 | return true; | ||||||
2297 | } | ||||||
2298 | case Intrinsic::amdgcn_raw_buffer_store: | ||||||
2299 | return legalizeRawBufferStore(MI, MRI, B, false); | ||||||
2300 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||||
2301 | return legalizeRawBufferStore(MI, MRI, B, true); | ||||||
2302 | default: | ||||||
2303 | return true; | ||||||
2304 | } | ||||||
2305 | |||||||
2306 | return true; | ||||||
2307 | } |
1 | //==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==//
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
10 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
11 | |
12 | #include "llvm/ADT/DenseMap.h" |
13 | #include "llvm/CodeGen/Register.h" |
14 | #include "llvm/IR/Function.h" |
15 | #include "llvm/Pass.h" |
16 | |
17 | namespace llvm { |
18 | |
19 | class Function; |
20 | class raw_ostream; |
21 | class GCNSubtarget; |
22 | class TargetMachine; |
23 | class TargetRegisterClass; |
24 | class TargetRegisterInfo; |
25 | |
26 | struct ArgDescriptor { |
27 | private: |
28 | friend struct AMDGPUFunctionArgInfo; |
29 | friend class AMDGPUArgumentUsageInfo; |
30 | |
31 | union { |
32 | Register Reg; |
33 | unsigned StackOffset; |
34 | }; |
35 | |
36 | // Bitmask to locate argument within the register. |
37 | unsigned Mask; |
38 | |
39 | bool IsStack : 1; |
40 | bool IsSet : 1; |
41 | |
42 | public: |
43 | ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, |
44 | bool IsStack = false, bool IsSet = false) |
45 | : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} |
46 | |
47 | static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { |
48 | return ArgDescriptor(Reg, Mask, false, true); |
49 | } |
50 | |
51 | static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { |
52 | return ArgDescriptor(Offset, Mask, true, true); |
53 | } |
54 | |
55 | static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { |
56 | return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); |
57 | } |
58 | |
59 | bool isSet() const { |
60 | return IsSet; |
61 | } |
62 | |
63 | explicit operator bool() const { |
64 | return isSet(); |
65 | } |
66 | |
67 | bool isRegister() const { |
68 | return !IsStack; |
69 | } |
70 | |
71 | Register getRegister() const { |
72 | assert(!IsStack);
73 | return Reg; |
74 | } |
75 | |
76 | unsigned getStackOffset() const { |
77 | assert(IsStack);
78 | return StackOffset; |
79 | } |
80 | |
81 | unsigned getMask() const { |
82 | return Mask; |
83 | } |
84 | |
85 | bool isMasked() const { |
86 | return Mask != ~0u; |
87 | } |
88 | |
89 | void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; |
90 | }; |
91 | |
92 | inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { |
93 | Arg.print(OS); |
94 | return OS; |
95 | } |
96 | |
97 | struct AMDGPUFunctionArgInfo { |
98 | enum PreloadedValue { |
99 | // SGPRS: |
100 | PRIVATE_SEGMENT_BUFFER = 0, |
101 | DISPATCH_PTR = 1, |
102 | QUEUE_PTR = 2, |
103 | KERNARG_SEGMENT_PTR = 3, |
104 | DISPATCH_ID = 4, |
105 | FLAT_SCRATCH_INIT = 5, |
106 | WORKGROUP_ID_X = 10, |
107 | WORKGROUP_ID_Y = 11, |
108 | WORKGROUP_ID_Z = 12, |
109 | PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, |
110 | IMPLICIT_BUFFER_PTR = 15, |
111 | IMPLICIT_ARG_PTR = 16, |
112 | |
113 | // VGPRS: |
114 | WORKITEM_ID_X = 17, |
115 | WORKITEM_ID_Y = 18, |
116 | WORKITEM_ID_Z = 19, |
117 | FIRST_VGPR_VALUE = WORKITEM_ID_X |
118 | }; |
119 | |
120 | // Kernel input registers setup for the HSA ABI in allocation order. |
121 | |
122 | // User SGPRs in kernels |
123 | // XXX - Can these require argument spills? |
124 | ArgDescriptor PrivateSegmentBuffer; |
125 | ArgDescriptor DispatchPtr; |
126 | ArgDescriptor QueuePtr; |
127 | ArgDescriptor KernargSegmentPtr; |
128 | ArgDescriptor DispatchID; |
129 | ArgDescriptor FlatScratchInit; |
130 | ArgDescriptor PrivateSegmentSize; |
131 | |
132 | // System SGPRs in kernels. |
133 | ArgDescriptor WorkGroupIDX; |
134 | ArgDescriptor WorkGroupIDY; |
135 | ArgDescriptor WorkGroupIDZ; |
136 | ArgDescriptor WorkGroupInfo; |
137 | ArgDescriptor PrivateSegmentWaveByteOffset; |
138 | |
139 | // Pointer with offset from kernargsegmentptr to where special ABI arguments |
140 | // are passed to callable functions. |
141 | ArgDescriptor ImplicitArgPtr; |
142 | |
143 | // Input registers for non-HSA ABI |
144 | ArgDescriptor ImplicitBufferPtr = 0; |
145 | |
146 | // VGPRs inputs. These are always v0, v1 and v2 for entry functions. |
147 | ArgDescriptor WorkItemIDX; |
148 | ArgDescriptor WorkItemIDY; |
149 | ArgDescriptor WorkItemIDZ; |
150 | |
151 | std::pair<const ArgDescriptor *, const TargetRegisterClass *> |
152 | getPreloadedValue(PreloadedValue Value) const; |
153 | }; |
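A hedged sketch of how getPreloadedValue might be queried for a work-item ID (not part of the header; the helper name is hypothetical and <utility> is assumed for std::pair):

static llvm::Register
workItemIDXSketch(const llvm::AMDGPUFunctionArgInfo &ArgInfo) {
  std::pair<const llvm::ArgDescriptor *, const llvm::TargetRegisterClass *> P =
      ArgInfo.getPreloadedValue(llvm::AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  const llvm::ArgDescriptor *Arg = P.first;
  if (Arg && Arg->isRegister())
    return Arg->getRegister();
  return llvm::Register(); // Not preloaded in a register.
}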
154 | |
155 | class AMDGPUArgumentUsageInfo : public ImmutablePass { |
156 | private: |
157 | static const AMDGPUFunctionArgInfo ExternFunctionInfo; |
158 | DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; |
159 | |
160 | public: |
161 | static char ID; |
162 | |
163 | AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } |
164 | |
165 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
166 | AU.setPreservesAll(); |
167 | } |
168 | |
169 | bool doInitialization(Module &M) override; |
170 | bool doFinalization(Module &M) override; |
171 | |
172 | void print(raw_ostream &OS, const Module *M = nullptr) const override; |
173 | |
174 | void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { |
175 | ArgInfoMap[&F] = ArgInfo; |
176 | } |
177 | |
178 | const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const { |
179 | auto I = ArgInfoMap.find(&F); |
180 | if (I == ArgInfoMap.end()) { |
181 | assert(F.isDeclaration()); |
182 | return ExternFunctionInfo; |
183 | } |
184 | |
185 | return I->second; |
186 | } |
187 | }; |
188 | |
189 | } // end namespace llvm |
190 | |
191 | #endif |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file contains some functions that are useful for math stuff. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||||
14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||||
15 | |||||||
16 | #include "llvm/Support/Compiler.h" | ||||||
17 | #include "llvm/Support/SwapByteOrder.h" | ||||||
18 | #include <algorithm> | ||||||
19 | #include <cassert> | ||||||
20 | #include <climits> | ||||||
21 | #include <cstring> | ||||||
22 | #include <limits> | ||||||
23 | #include <type_traits> | ||||||
24 | |||||||
25 | #ifdef __ANDROID_NDK__ | ||||||
26 | #include <android/api-level.h> | ||||||
27 | #endif | ||||||
28 | |||||||
29 | #ifdef _MSC_VER | ||||||
30 | // Declare these intrinsics manually rather including intrin.h. It's very | ||||||
31 | // expensive, and MathExtras.h is popular. | ||||||
32 | // #include <intrin.h> | ||||||
33 | extern "C" { | ||||||
34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||||
35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||||
37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
38 | } | ||||||
39 | #endif | ||||||
40 | |||||||
41 | namespace llvm { | ||||||
42 | |||||||
43 | /// The behavior an operation has on an input of 0. | ||||||
44 | enum ZeroBehavior { | ||||||
45 | /// The returned value is undefined. | ||||||
46 | ZB_Undefined, | ||||||
47 | /// The returned value is numeric_limits<T>::max() | ||||||
48 | ZB_Max, | ||||||
49 | /// The returned value is numeric_limits<T>::digits | ||||||
50 | ZB_Width | ||||||
51 | }; | ||||||
52 | |||||||
53 | /// Mathematical constants. | ||||||
54 | namespace numbers { | ||||||
55 | // TODO: Track C++20 std::numbers. | ||||||
56 | // TODO: Favor using the hexadecimal FP constants (requires C++17). | ||||||
57 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 | ||||||
58 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 | ||||||
59 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 | ||||||
60 | ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 | ||||||
61 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) | ||||||
62 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) | ||||||
63 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 | ||||||
64 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 | ||||||
65 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 | ||||||
66 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 | ||||||
67 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193 | ||||||
68 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) | ||||||
69 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 | ||||||
70 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) | ||||||
71 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 | ||||||
72 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 | ||||||
73 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 | ||||||
74 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 | ||||||
75 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 | ||||||
76 | log2ef = 1.44269504F, // (0x1.715476P+0) | ||||||
77 | log10ef = .434294482F, // (0x1.bcb7b2P-2) | ||||||
78 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 | ||||||
79 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 | ||||||
80 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 | ||||||
81 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 | ||||||
82 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 | ||||||
83 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) | ||||||
84 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 | ||||||
85 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) | ||||||
86 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 | ||||||
87 | } // namespace numbers | ||||||
88 | |||||||
89 | namespace detail { | ||||||
90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||||
91 | static unsigned count(T Val, ZeroBehavior) { | ||||||
92 | if (!Val) | ||||||
93 | return std::numeric_limits<T>::digits; | ||||||
94 | if (Val & 0x1) | ||||||
95 | return 0; | ||||||
96 | |||||||
97 | // Bisection method. | ||||||
98 | unsigned ZeroBits = 0; | ||||||
99 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||||
100 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||||
101 | while (Shift) { | ||||||
102 | if ((Val & Mask) == 0) { | ||||||
103 | Val >>= Shift; | ||||||
104 | ZeroBits |= Shift; | ||||||
105 | } | ||||||
106 | Shift >>= 1; | ||||||
107 | Mask >>= Shift; | ||||||
108 | } | ||||||
109 | return ZeroBits; | ||||||
110 | } | ||||||
111 | }; | ||||||
112 | |||||||
113 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
114 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||||
115 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
116 | if (ZB != ZB_Undefined && Val == 0) | ||||||
117 | return 32; | ||||||
118 | |||||||
119 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__) | ||||||
120 | return __builtin_ctz(Val); | ||||||
121 | #elif defined(_MSC_VER) | ||||||
122 | unsigned long Index; | ||||||
123 | _BitScanForward(&Index, Val); | ||||||
124 | return Index; | ||||||
125 | #endif | ||||||
126 | } | ||||||
127 | }; | ||||||
128 | |||||||
129 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
130 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||||
131 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
132 | if (ZB != ZB_Undefined && Val == 0) | ||||||
133 | return 64; | ||||||
134 | |||||||
135 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) | ||||||
136 | return __builtin_ctzll(Val); | ||||||
137 | #elif defined(_MSC_VER) | ||||||
138 | unsigned long Index; | ||||||
139 | _BitScanForward64(&Index, Val); | ||||||
140 | return Index; | ||||||
141 | #endif | ||||||
142 | } | ||||||
143 | }; | ||||||
144 | #endif | ||||||
145 | #endif | ||||||
146 | } // namespace detail | ||||||
147 | |||||||
148 | /// Count number of 0's from the least significant bit to the most | ||||||
149 | /// stopping at the first 1. | ||||||
150 | /// | ||||||
151 | /// Only unsigned integral types are allowed. | ||||||
152 | /// | ||||||
153 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
154 | /// valid arguments. | ||||||
155 | template <typename T> | ||||||
156 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
157 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
158 | !std::numeric_limits<T>::is_signed, | ||||||
159 | "Only unsigned integral types are allowed."); | ||||||
160 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
161 | } | ||||||
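Illustrative expected values for countTrailingZeros (a sketch, not part of the header; assumes <cassert>, and the sketch function name is hypothetical):

static void countTrailingZerosSketch() {
  assert(llvm::countTrailingZeros(0x8u) == 3); // 0b1000 has three trailing 0s
  assert(llvm::countTrailingZeros(0x1u) == 0);
  assert(llvm::countTrailingZeros(0u) == 32);  // default ZB_Width on zero input
}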
162 | |||||||
163 | namespace detail { | ||||||
164 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { | ||||||
165 | static unsigned count(T Val, ZeroBehavior) { | ||||||
166 | if (!Val) | ||||||
167 | return std::numeric_limits<T>::digits; | ||||||
168 | |||||||
169 | // Bisection method. | ||||||
170 | unsigned ZeroBits = 0; | ||||||
171 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { | ||||||
172 | T Tmp = Val >> Shift; | ||||||
173 | if (Tmp) | ||||||
174 | Val = Tmp; | ||||||
175 | else | ||||||
176 | ZeroBits |= Shift; | ||||||
177 | } | ||||||
178 | return ZeroBits; | ||||||
179 | } | ||||||
180 | }; | ||||||
181 | |||||||
182 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
183 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||||
184 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
185 | if (ZB != ZB_Undefined && Val == 0) | ||||||
186 | return 32; | ||||||
187 | |||||||
188 | #if __has_builtin(__builtin_clz) || defined(__GNUC__) | ||||||
189 | return __builtin_clz(Val); | ||||||
190 | #elif defined(_MSC_VER) | ||||||
191 | unsigned long Index; | ||||||
192 | _BitScanReverse(&Index, Val); | ||||||
193 | return Index ^ 31; | ||||||
194 | #endif | ||||||
195 | } | ||||||
196 | }; | ||||||
197 | |||||||
198 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
199 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||||
200 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
201 | if (ZB != ZB_Undefined && Val == 0) | ||||||
202 | return 64; | ||||||
203 | |||||||
204 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__) | ||||||
205 | return __builtin_clzll(Val); | ||||||
206 | #elif defined(_MSC_VER) | ||||||
207 | unsigned long Index; | ||||||
208 | _BitScanReverse64(&Index, Val); | ||||||
209 | return Index ^ 63; | ||||||
210 | #endif | ||||||
211 | } | ||||||
212 | }; | ||||||
213 | #endif | ||||||
214 | #endif | ||||||
215 | } // namespace detail | ||||||
216 | |||||||
217 | /// Count number of 0's from the most significant bit to the least | ||||||
218 | /// stopping at the first 1. | ||||||
219 | /// | ||||||
220 | /// Only unsigned integral types are allowed. | ||||||
221 | /// | ||||||
222 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
223 | /// valid arguments. | ||||||
224 | template <typename T> | ||||||
225 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
226 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
227 | !std::numeric_limits<T>::is_signed, | ||||||
228 | "Only unsigned integral types are allowed."); | ||||||
229 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
230 | } | ||||||
231 | |||||||
232 | /// Get the index of the first set bit starting from the least | ||||||
233 | /// significant bit. | ||||||
234 | /// | ||||||
235 | /// Only unsigned integral types are allowed. | ||||||
236 | /// | ||||||
237 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
238 | /// valid arguments. | ||||||
239 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
240 | if (ZB == ZB_Max && Val == 0) | ||||||
241 | return std::numeric_limits<T>::max(); | ||||||
242 | |||||||
243 | return countTrailingZeros(Val, ZB_Undefined); | ||||||
244 | } | ||||||
245 | |||||||
246 | /// Create a bitmask with the N right-most bits set to 1, and all other | ||||||
247 | /// bits set to 0. Only unsigned types are allowed. | ||||||
248 | template <typename T> T maskTrailingOnes(unsigned N) { | ||||||
249 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); | ||||||
250 | const unsigned Bits = CHAR_BIT * sizeof(T); | ||||||
251 | assert(N <= Bits && "Invalid bit index"); | ||||||
252 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); | ||||||
253 | } | ||||||
254 | |||||||
255 | /// Create a bitmask with the N left-most bits set to 1, and all other | ||||||
256 | /// bits set to 0. Only unsigned types are allowed. | ||||||
257 | template <typename T> T maskLeadingOnes(unsigned N) { | ||||||
258 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
259 | } | ||||||
260 | |||||||
261 | /// Create a bitmask with the N right-most bits set to 0, and all other | ||||||
262 | /// bits set to 1. Only unsigned types are allowed. | ||||||
263 | template <typename T> T maskTrailingZeros(unsigned N) { | ||||||
264 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
265 | } | ||||||
266 | |||||||
267 | /// Create a bitmask with the N left-most bits set to 0, and all other | ||||||
268 | /// bits set to 1. Only unsigned types are allowed. | ||||||
269 | template <typename T> T maskLeadingZeros(unsigned N) { | ||||||
270 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
271 | } | ||||||
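Expected results for the four mask helpers above, as a hedged sketch (not part of the header; assumes <cassert> and <cstdint>):

static void maskSketch() {
  assert(llvm::maskTrailingOnes<uint32_t>(4) == 0x0000000Fu);
  assert(llvm::maskLeadingOnes<uint32_t>(8) == 0xFF000000u);
  assert(llvm::maskTrailingZeros<uint32_t>(4) == 0xFFFFFFF0u);
  assert(llvm::maskLeadingZeros<uint32_t>(8) == 0x00FFFFFFu);
}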
272 | |||||||
273 | /// Get the index of the last set bit starting from the least | ||||||
274 | /// significant bit. | ||||||
275 | /// | ||||||
276 | /// Only unsigned integral types are allowed. | ||||||
277 | /// | ||||||
278 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
279 | /// valid arguments. | ||||||
280 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
281 | if (ZB == ZB_Max && Val == 0) | ||||||
282 | return std::numeric_limits<T>::max(); | ||||||
283 | |||||||
284 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ | ||||||
285 | // in the __builtin_clz intrinsic on x86. | ||||||
286 | return countLeadingZeros(Val, ZB_Undefined) ^ | ||||||
287 | (std::numeric_limits<T>::digits - 1); | ||||||
288 | } | ||||||
289 | |||||||
290 | /// Macro compressed bit reversal table for 256 bits. | ||||||
291 | /// | ||||||
292 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable | ||||||
293 | static const unsigned char BitReverseTable256[256] = { | ||||||
294 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 | ||||||
295 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) | ||||||
296 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) | ||||||
297 | R6(0), R6(2), R6(1), R6(3) | ||||||
298 | #undef R2 | ||||||
299 | #undef R4 | ||||||
300 | #undef R6 | ||||||
301 | }; | ||||||
302 | |||||||
303 | /// Reverse the bits in \p Val. | ||||||
304 | template <typename T> | ||||||
305 | T reverseBits(T Val) { | ||||||
306 | unsigned char in[sizeof(Val)]; | ||||||
307 | unsigned char out[sizeof(Val)]; | ||||||
308 | std::memcpy(in, &Val, sizeof(Val)); | ||||||
309 | for (unsigned i = 0; i < sizeof(Val); ++i) | ||||||
310 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; | ||||||
311 | std::memcpy(&Val, out, sizeof(Val)); | ||||||
312 | return Val; | ||||||
313 | } | ||||||
314 | |||||||
315 | // NOTE: The following support functions use the _32/_64 extensions instead of | ||||||
316 | // type overloading so that signed and unsigned integers can be used without | ||||||
317 | // ambiguity. | ||||||
318 | |||||||
319 | /// Return the high 32 bits of a 64 bit value. | ||||||
320 | constexpr inline uint32_t Hi_32(uint64_t Value) { | ||||||
321 | return static_cast<uint32_t>(Value >> 32); | ||||||
322 | } | ||||||
323 | |||||||
324 | /// Return the low 32 bits of a 64 bit value. | ||||||
325 | constexpr inline uint32_t Lo_32(uint64_t Value) { | ||||||
326 | return static_cast<uint32_t>(Value); | ||||||
327 | } | ||||||
328 | |||||||
329 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. | ||||||
330 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { | ||||||
331 | return ((uint64_t)High << 32) | (uint64_t)Low; | ||||||
332 | } | ||||||
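Because Hi_32, Lo_32 and Make_64 are constexpr, the split/rebuild round trip can be checked at compile time; a small sketch (not part of the header):

static_assert(llvm::Hi_32(0x1122334455667788ULL) == 0x11223344u, "");
static_assert(llvm::Lo_32(0x1122334455667788ULL) == 0x55667788u, "");
static_assert(llvm::Make_64(0x11223344u, 0x55667788u) == 0x1122334455667788ULL,
              "");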
333 | |||||||
334 | /// Checks if an integer fits into the given bit width. | ||||||
335 | template <unsigned N> constexpr inline bool isInt(int64_t x) { | ||||||
336 | return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); | ||||||
337 | } | ||||||
338 | // Template specializations to get better code for common cases. | ||||||
339 | template <> constexpr inline bool isInt<8>(int64_t x) { | ||||||
340 | return static_cast<int8_t>(x) == x; | ||||||
341 | } | ||||||
342 | template <> constexpr inline bool isInt<16>(int64_t x) { | ||||||
343 | return static_cast<int16_t>(x) == x; | ||||||
344 | } | ||||||
345 | template <> constexpr inline bool isInt<32>(int64_t x) { | ||||||
346 | return static_cast<int32_t>(x) == x; | ||||||
347 | } | ||||||
348 | |||||||
349 | /// Checks if a signed integer is an N bit number shifted left by S. | ||||||
350 | template <unsigned N, unsigned S> | ||||||
351 | constexpr inline bool isShiftedInt(int64_t x) { | ||||||
352 | static_assert( | ||||||
353 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); | ||||||
354 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); | ||||||
355 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
356 | } | ||||||
357 | |||||||
358 | /// Checks if an unsigned integer fits into the given bit width. | ||||||
359 | /// | ||||||
360 | /// This is written as two functions rather than as simply | ||||||
361 | /// | ||||||
362 | /// return N >= 64 || X < (UINT64_C(1) << N); | ||||||
363 | /// | ||||||
364 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting | ||||||
365 | /// left too many places. | ||||||
366 | template <unsigned N> | ||||||
367 | constexpr inline typename std::enable_if<(N < 64), bool>::type | ||||||
368 | isUInt(uint64_t X) { | ||||||
369 | static_assert(N > 0, "isUInt<0> doesn't make sense"); | ||||||
370 | return X < (UINT64_C(1) << (N)); | ||||||
371 | } | ||||||
372 | template <unsigned N> | ||||||
373 | constexpr inline typename std::enable_if<N >= 64, bool>::type | ||||||
374 | isUInt(uint64_t X) { | ||||||
375 | return true; | ||||||
376 | } | ||||||
377 | |||||||
378 | // Template specializations to get better code for common cases. | ||||||
379 | template <> constexpr inline bool isUInt<8>(uint64_t x) { | ||||||
380 | return static_cast<uint8_t>(x) == x; | ||||||
381 | } | ||||||
382 | template <> constexpr inline bool isUInt<16>(uint64_t x) { | ||||||
383 | return static_cast<uint16_t>(x) == x; | ||||||
384 | } | ||||||
385 | template <> constexpr inline bool isUInt<32>(uint64_t x) { | ||||||
386 | return static_cast<uint32_t>(x) == x; | ||||||
387 | } | ||||||
388 | |||||||
389 | /// Checks if an unsigned integer is an N bit number shifted left by S. | ||||||
390 | template <unsigned N, unsigned S> | ||||||
391 | constexpr inline bool isShiftedUInt(uint64_t x) { | ||||||
392 | static_assert( | ||||||
393 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); | ||||||
394 | static_assert(N + S <= 64, | ||||||
395 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); | ||||||
396 | // Per the two static_asserts above, S must be strictly less than 64. So | ||||||
397 | // 1 << S is not undefined behavior. | ||||||
398 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
399 | } | ||||||
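A compile-time sketch of the fit checks above (not part of the header); the shifted values 508 and 0xFF0 are 127 << 2 and 255 << 4 respectively:

static_assert(llvm::isInt<8>(127) && !llvm::isInt<8>(128) &&
              llvm::isInt<8>(-128), "");
static_assert(llvm::isUInt<8>(255) && !llvm::isUInt<8>(256), "");
static_assert(llvm::isShiftedInt<8, 2>(508), "");
static_assert(llvm::isShiftedUInt<8, 4>(0xFF0), "");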
400 | |||||||
401 | /// Gets the maximum value for a N-bit unsigned integer. | ||||||
402 | inline uint64_t maxUIntN(uint64_t N) { | ||||||
403 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
404 | |||||||
405 | // uint64_t(1) << 64 is undefined behavior, so we can't do | ||||||
406 | // (uint64_t(1) << N) - 1 | ||||||
407 | // without checking first that N != 64. But this works and doesn't have a | ||||||
408 | // branch. | ||||||
409 | return UINT64_MAX >> (64 - N); | ||||||
410 | } | ||||||
411 | |||||||
412 | /// Gets the minimum value for a N-bit signed integer. | ||||||
413 | inline int64_t minIntN(int64_t N) { | ||||||
414 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
415 | |||||||
416 | return -(UINT64_C(1)<<(N-1)); | ||||||
417 | } | ||||||
418 | |||||||
419 | /// Gets the maximum value for a N-bit signed integer. | ||||||
420 | inline int64_t maxIntN(int64_t N) { | ||||||
421 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
422 | |||||||
423 | // This relies on two's complement wraparound when N == 64, so we convert to | ||||||
424 | // int64_t only at the very end to avoid UB. | ||||||
425 | return (UINT64_C(1) << (N - 1)) - 1; | ||||||
426 | } | ||||||
427 | |||||||
428 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. | ||||||
429 | inline bool isUIntN(unsigned N, uint64_t x) { | ||||||
430 | return N >= 64 || x <= maxUIntN(N); | ||||||
431 | } | ||||||
432 | |||||||
433 | /// Checks if a signed integer fits into the given (dynamic) bit width. | ||||||
434 | inline bool isIntN(unsigned N, int64_t x) { | ||||||
435 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); | ||||||
436 | } | ||||||
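A runtime sketch of the dynamic-width limits and checks above (not part of the header; assumes <cassert>, and the sketch function name is hypothetical):

static void bitWidthSketch() {
  assert(llvm::maxUIntN(8) == 255u);
  assert(llvm::minIntN(8) == -128 && llvm::maxIntN(8) == 127);
  assert(llvm::isUIntN(8, 255) && !llvm::isUIntN(8, 256));
  assert(llvm::isIntN(8, -128) && !llvm::isIntN(8, -129));
}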
437 | |||||||
438 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
439 | /// least significant bit with the remainder zero (32 bit version). | ||||||
440 | /// Ex. isMask_32(0x0000FFFFU) == true. | ||||||
441 | constexpr inline bool isMask_32(uint32_t Value) { | ||||||
442 | return Value && ((Value + 1) & Value) == 0; | ||||||
443 | } | ||||||
444 | |||||||
445 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
446 | /// least significant bit with the remainder zero (64 bit version). | ||||||
447 | constexpr inline bool isMask_64(uint64_t Value) { | ||||||
448 | return Value && ((Value + 1) & Value) == 0; | ||||||
449 | } | ||||||
450 | |||||||
451 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
452 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. | ||||||
453 | constexpr inline bool isShiftedMask_32(uint32_t Value) { | ||||||
454 | return Value && isMask_32((Value - 1) | Value); | ||||||
455 | } | ||||||
456 | |||||||
457 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
458 | /// remainder zero (64 bit version.) | ||||||
459 | constexpr inline bool isShiftedMask_64(uint64_t Value) { | ||||||
460 | return Value && isMask_64((Value - 1) | Value); | ||||||
461 | } | ||||||
462 | |||||||
463 | /// Return true if the argument is a power of two > 0. | ||||||
464 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) | ||||||
465 | constexpr inline bool isPowerOf2_32(uint32_t Value) { | ||||||
466 | return Value && !(Value & (Value - 1)); | ||||||
467 | } | ||||||
468 | |||||||
469 | /// Return true if the argument is a power of two > 0 (64 bit edition.) | ||||||
470 | constexpr inline bool isPowerOf2_64(uint64_t Value) { | ||||||
471 | return Value && !(Value & (Value - 1)); | ||||||
472 | } | ||||||
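Since the mask and power-of-two predicates are constexpr, the documented examples can be restated as compile-time checks; a sketch (not part of the header):

static_assert(llvm::isMask_32(0x0000FFFFu) && !llvm::isMask_32(0x0000FF01u), "");
static_assert(llvm::isShiftedMask_32(0x0000FF00u), "");
static_assert(llvm::isPowerOf2_32(0x00100000u) && !llvm::isPowerOf2_32(0u), "");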
473 | |||||||
474 | /// Return a byte-swapped representation of the 16-bit argument. | ||||||
475 | inline uint16_t ByteSwap_16(uint16_t Value) { | ||||||
476 | return sys::SwapByteOrder_16(Value); | ||||||
477 | } | ||||||
478 | |||||||
479 | /// Return a byte-swapped representation of the 32-bit argument. | ||||||
480 | inline uint32_t ByteSwap_32(uint32_t Value) { | ||||||
481 | return sys::SwapByteOrder_32(Value); | ||||||
482 | } | ||||||
483 | |||||||
484 | /// Return a byte-swapped representation of the 64-bit argument. | ||||||
485 | inline uint64_t ByteSwap_64(uint64_t Value) { | ||||||
486 | return sys::SwapByteOrder_64(Value); | ||||||
487 | } | ||||||
488 | |||||||
489 | /// Count the number of ones from the most significant bit to the first | ||||||
490 | /// zero bit. | ||||||
491 | /// | ||||||
492 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. | ||||||
493 | /// Only unsigned integral types are allowed. | ||||||
494 | /// | ||||||
495 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
496 | /// ZB_Undefined are valid arguments. | ||||||
497 | template <typename T> | ||||||
498 | unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
499 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
500 | !std::numeric_limits<T>::is_signed, | ||||||
501 | "Only unsigned integral types are allowed."); | ||||||
502 | return countLeadingZeros<T>(~Value, ZB); | ||||||
503 | } | ||||||
504 | |||||||
505 | /// Count the number of ones from the least significant bit to the first | ||||||
506 | /// zero bit. | ||||||
507 | /// | ||||||
508 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. | ||||||
509 | /// Only unsigned integral types are allowed. | ||||||
510 | /// | ||||||
511 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
512 | /// ZB_Undefined are valid arguments. | ||||||
513 | template <typename T> | ||||||
514 | unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
515 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
516 | !std::numeric_limits<T>::is_signed, | ||||||
517 | "Only unsigned integral types are allowed."); | ||||||
518 | return countTrailingZeros<T>(~Value, ZB); | ||||||
519 | } | ||||||
520 | |||||||
521 | namespace detail { | ||||||
522 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { | ||||||
523 | static unsigned count(T Value) { | ||||||
524 | // Generic version, forward to 32 bits. | ||||||
525 | static_assert(SizeOfT <= 4, "Not implemented!"); | ||||||
526 | #if defined(__GNUC__) | ||||||
527 | return __builtin_popcount(Value); | ||||||
528 | #else | ||||||
529 | uint32_t v = Value; | ||||||
530 | v = v - ((v >> 1) & 0x55555555); | ||||||
531 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); | ||||||
532 | return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; | ||||||
533 | #endif | ||||||
534 | } | ||||||
535 | }; | ||||||
536 | |||||||
537 | template <typename T> struct PopulationCounter<T, 8> { | ||||||
538 | static unsigned count(T Value) { | ||||||
539 | #if defined(__GNUC__) | ||||||
540 | return __builtin_popcountll(Value); | ||||||
541 | #else | ||||||
542 | uint64_t v = Value; | ||||||
543 | v = v - ((v >> 1) & 0x5555555555555555ULL); | ||||||
544 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); | ||||||
545 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; | ||||||
546 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); | ||||||
547 | #endif | ||||||
548 | } | ||||||
549 | }; | ||||||
550 | } // namespace detail | ||||||
551 | |||||||
552 | /// Count the number of set bits in a value. | ||||||
553 | /// Ex. countPopulation(0xF000F000) = 8 | ||||||
554 | /// Returns 0 if the word is zero. | ||||||
555 | template <typename T> | ||||||
556 | inline unsigned countPopulation(T Value) { | ||||||
557 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
558 | !std::numeric_limits<T>::is_signed, | ||||||
559 | "Only unsigned integral types are allowed."); | ||||||
560 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); | ||||||
561 | } | ||||||
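Illustrative population-count values (a sketch, not part of the header; assumes <cassert>):

static void popcountSketch() {
  assert(llvm::countPopulation(0xF000F000u) == 8);
  assert(llvm::countPopulation(0u) == 0);
  assert(llvm::countPopulation(~0ull) == 64);
}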
562 | |||||||
563 | /// Compile time Log2. | ||||||
564 | /// Valid only for positive powers of two. | ||||||
565 | template <size_t kValue> constexpr inline size_t CTLog2() { | ||||||
566 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), | ||||||
567 | "Value is not a valid power of 2"); | ||||||
568 | return 1 + CTLog2<kValue / 2>(); | ||||||
569 | } | ||||||
570 | |||||||
571 | template <> constexpr inline size_t CTLog2<1>() { return 0; } | ||||||
572 | |||||||
573 | /// Return the log base 2 of the specified value. | ||||||
574 | inline double Log2(double Value) { | ||||||
575 | #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 | ||||||
576 | return __builtin_log(Value) / __builtin_log(2.0); | ||||||
577 | #else | ||||||
578 | return log2(Value); | ||||||
579 | #endif | ||||||
580 | } | ||||||
581 | |||||||
582 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||||
583 | /// (32 bit edition.) | ||||||
584 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 | ||||||
585 | inline unsigned Log2_32(uint32_t Value) { | ||||||
586 | return 31 - countLeadingZeros(Value); | ||||||
587 | } | ||||||
588 | |||||||
589 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||||
590 | /// (64 bit edition.) | ||||||
591 | inline unsigned Log2_64(uint64_t Value) { | ||||||
592 | return 63 - countLeadingZeros(Value); | ||||||
593 | } | ||||||
594 | |||||||
595 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. | ||||||
596 | /// (32 bit edition). | ||||||
597 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 | ||||||
598 | inline unsigned Log2_32_Ceil(uint32_t Value) { | ||||||
599 | return 32 - countLeadingZeros(Value - 1); | ||||||
600 | } | ||||||
601 | |||||||
602 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. | ||||||
603 | /// (64 bit edition.) | ||||||
604 | inline unsigned Log2_64_Ceil(uint64_t Value) { | ||||||
605 | return 64 - countLeadingZeros(Value - 1); | ||||||
606 | } | ||||||
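A sketch restating the documented floor/ceil log2 examples as runtime checks (not part of the header; assumes <cassert>):

static void log2Sketch() {
  assert(llvm::Log2_32(32) == 5 && llvm::Log2_32(6) == 2);
  assert(llvm::Log2_32_Ceil(6) == 3 && llvm::Log2_32_Ceil(1) == 0);
  assert(llvm::Log2_64(1ull << 40) == 40);
}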
607 | |||||||
608 | /// Return the greatest common divisor of the values using Euclid's algorithm. | ||||||
609 | template <typename T> | ||||||
610 | inline T greatestCommonDivisor(T A, T B) { | ||||||
611 | while (B) { | ||||||
612 | T Tmp = B; | ||||||
613 | B = A % B; | ||||||
614 | A = Tmp; | ||||||
615 | } | ||||||
616 | return A; | ||||||
617 | } | ||||||
618 | |||||||
619 | inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { | ||||||
620 | return greatestCommonDivisor<uint64_t>(A, B); | ||||||
621 | } | ||||||
622 | |||||||
623 | /// This function takes a 64-bit integer and returns the bit equivalent double. | ||||||
624 | inline double BitsToDouble(uint64_t Bits) { | ||||||
625 | double D; | ||||||
626 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||||
627 | memcpy(&D, &Bits, sizeof(Bits)); | ||||||
628 | return D; | ||||||
629 | } | ||||||
630 | |||||||
631 | /// This function takes a 32-bit integer and returns the bit equivalent float. | ||||||
632 | inline float BitsToFloat(uint32_t Bits) { | ||||||
633 | float F; | ||||||
634 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||||
635 | memcpy(&F, &Bits, sizeof(Bits)); | ||||||
636 | return F; | ||||||
637 | } | ||||||
638 | |||||||
639 | /// This function takes a double and returns the bit equivalent 64-bit integer. | ||||||
640 | /// Note that copying doubles around changes the bits of NaNs on some hosts, | ||||||
641 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||||
642 | inline uint64_t DoubleToBits(double Double) { | ||||||
643 | uint64_t Bits; | ||||||
644 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||||
645 | memcpy(&Bits, &Double, sizeof(Double)); | ||||||
646 | return Bits; | ||||||
647 | } | ||||||
648 | |||||||
649 | /// This function takes a float and returns the bit equivalent 32-bit integer. | ||||||
650 | /// Note that copying floats around changes the bits of NaNs on some hosts, | ||||||
651 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||||
652 | inline uint32_t FloatToBits(float Float) { | ||||||
653 | uint32_t Bits; | ||||||
654 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||||
655 | memcpy(&Bits, &Float, sizeof(Float)); | ||||||
656 | return Bits; | ||||||
657 | } | ||||||
658 | |||||||
659 | /// A and B are either alignments or offsets. Return the minimum alignment that | ||||||
660 | /// may be assumed after adding the two together. | ||||||
661 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { | ||||||
662 | // The largest power of 2 that divides both A and B. | ||||||
663 | // | ||||||
664 | // Replace "-Value" by "1+~Value" in the following commented code to avoid | ||||||
665 | // MSVC warning C4146 | ||||||
666 | // return (A | B) & -(A | B); | ||||||
667 | return (A | B) & (1 + ~(A | B)); | ||||||
668 | } | ||||||
669 | |||||||
670 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. | ||||||
671 | /// Returns zero on overflow. | ||||||
672 | inline uint64_t NextPowerOf2(uint64_t A) { | ||||||
673 | A |= (A >> 1); | ||||||
674 | A |= (A >> 2); | ||||||
675 | A |= (A >> 4); | ||||||
676 | A |= (A >> 8); | ||||||
677 | A |= (A >> 16); | ||||||
678 | A |= (A >> 32); | ||||||
679 | return A + 1; | ||||||
680 | } | ||||||
681 | |||||||
682 | /// Returns the power of two which is less than or equal to the given value. | ||||||
683 | /// Essentially, it is a floor operation across the domain of powers of two. | ||||||
684 | inline uint64_t PowerOf2Floor(uint64_t A) { | ||||||
685 | if (!A) return 0; | ||||||
686 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); | ||||||
687 | } | ||||||
688 | |||||||
689 | /// Returns the power of two which is greater than or equal to the given value. | ||||||
690 | /// Essentially, it is a ceil operation across the domain of powers of two. | ||||||
691 | inline uint64_t PowerOf2Ceil(uint64_t A) { | ||||||
692 | if (!A) | ||||||
693 | return 0; | ||||||
694 | return NextPowerOf2(A - 1); | ||||||
695 | } | ||||||
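Expected values for the power-of-two rounding helpers above, as a hedged sketch (not part of the header; assumes <cassert>):

static void powerOf2Sketch() {
  assert(llvm::NextPowerOf2(5) == 8);   // strictly greater than the input
  assert(llvm::NextPowerOf2(8) == 16);
  assert(llvm::PowerOf2Floor(5) == 4);
  assert(llvm::PowerOf2Ceil(5) == 8 && llvm::PowerOf2Ceil(8) == 8);
}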
696 | |||||||
697 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||||
698 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. | ||||||
699 | /// | ||||||
700 | /// If non-zero \p Skew is specified, the return value will be a minimal | ||||||
701 | /// integer that is greater than or equal to \p Value and equal to | ||||||
702 | /// \p Align * N + \p Skew for some integer N. If \p Skew is larger than | ||||||
703 | /// \p Align, its value is adjusted to '\p Skew mod \p Align'. | ||||||
704 | /// | ||||||
705 | /// Examples: | ||||||
706 | /// \code | ||||||
707 | /// alignTo(5, 8) = 8 | ||||||
708 | /// alignTo(17, 8) = 24 | ||||||
709 | /// alignTo(~0LL, 8) = 0 | ||||||
710 | /// alignTo(321, 255) = 510 | ||||||
711 | /// | ||||||
712 | /// alignTo(5, 8, 7) = 7 | ||||||
713 | /// alignTo(17, 8, 1) = 17 | ||||||
714 | /// alignTo(~0LL, 8, 3) = 3 | ||||||
715 | /// alignTo(321, 255, 42) = 552 | ||||||
716 | /// \endcode | ||||||
717 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||||
718 | assert(Align != 0u && "Align can't be 0."); | ||||||
719 | Skew %= Align; | ||||||
720 | return (Value + Align - 1 - Skew) / Align * Align + Skew; | ||||||
721 | } | ||||||
722 | |||||||
723 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||||
724 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. | ||||||
725 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { | ||||||
726 | static_assert(Align != 0u, "Align must be non-zero"); | ||||||
727 | return (Value + Align - 1) / Align * Align; | ||||||
728 | } | ||||||
729 | |||||||
730 | /// Returns the integer ceil(Numerator / Denominator). | ||||||
731 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { | ||||||
732 | return alignTo(Numerator, Denominator) / Denominator; | ||||||
733 | } | ||||||
734 | |||||||
735 | /// Returns the largest uint64_t less than or equal to \p Value and is | ||||||
736 | /// \p Skew mod \p Align. \p Align must be non-zero | ||||||
737 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||||
738 | assert(Align != 0u && "Align can't be 0."); | ||||||
739 | Skew %= Align; | ||||||
740 | return (Value - Skew) / Align * Align + Skew; | ||||||
741 | } | ||||||
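A sketch of divideCeil and the skewed alignDown (not part of the header; assumes <cassert>):

static void alignSketch() {
  assert(llvm::divideCeil(7, 2) == 4);
  assert(llvm::alignDown(17, 8) == 16);
  assert(llvm::alignDown(17, 8, /*Skew=*/1) == 17);
}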
742 | |||||||
743 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||||
744 | /// Requires 0 < B <= 32. | ||||||
745 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { | ||||||
746 | static_assert(B > 0, "Bit width can't be 0."); | ||||||
747 | static_assert(B <= 32, "Bit width out of range."); | ||||||
748 | return int32_t(X << (32 - B)) >> (32 - B); | ||||||
749 | } | ||||||
750 | |||||||
751 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||||
752 | /// Requires 0 < B <= 32. | ||||||
753 | inline int32_t SignExtend32(uint32_t X, unsigned B) { | ||||||
754 | assert(B > 0 && "Bit width can't be 0."); | ||||||
755 | assert(B <= 32 && "Bit width out of range."); | ||||||
756 | return int32_t(X << (32 - B)) >> (32 - B); | ||||||
757 | } | ||||||
758 | |||||||
759 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||||
760 | /// Requires 0 < B <= 64. | ||||||
761 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { | ||||||
762 | static_assert(B > 0, "Bit width can't be 0."); | ||||||
763 | static_assert(B <= 64, "Bit width out of range."); | ||||||
764 | return int64_t(x << (64 - B)) >> (64 - B); | ||||||
765 | } | ||||||
766 | |||||||
767 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||||
768 | /// Requires 0 < B <= 64. | ||||||
769 | inline int64_t SignExtend64(uint64_t X, unsigned B) { | ||||||
770 | assert(B > 0 && "Bit width can't be 0."); | ||||||
771 | assert(B <= 64 && "Bit width out of range."); | ||||||
772 | return int64_t(X << (64 - B)) >> (64 - B); | ||||||
773 | } | ||||||
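A sketch of the sign-extension helpers (not part of the header; assumes <cassert> and the usual arithmetic right shift of negative values on the host):

static void signExtendSketch() {
  assert(llvm::SignExtend32<4>(0xFu) == -1);
  assert(llvm::SignExtend32<4>(0x7u) == 7);
  assert(llvm::SignExtend64(0xFFFFull, 16) == -1);
  assert(llvm::SignExtend64(0x7FFFull, 16) == 0x7FFF);
}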
774 | |||||||
775 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute | ||||||
776 | /// value of the result. | ||||||
777 | template <typename T> | ||||||
778 | typename std::enable_if<std::is_unsigned<T>::value, T>::type | ||||||
779 | AbsoluteDifference(T X, T Y) { | ||||||
780 | return std::max(X, Y) - std::min(X, Y); | ||||||
781 | } | ||||||
782 | |||||||
783 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the | ||||||
784 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||||
785 | /// the result is larger than the maximum representable value of type T. | ||||||
786 | template <typename T> | ||||||
787 | typename std::enable_if<std::is_unsigned<T>::value, T>::type | ||||||
788 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||||
789 | bool Dummy; | ||||||
790 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||||
791 | // Hacker's Delight, p. 29 | ||||||
792 | T Z = X + Y; | ||||||
793 | Overflowed = (Z < X || Z < Y); | ||||||
794 | if (Overflowed) | ||||||
795 | return std::numeric_limits<T>::max(); | ||||||
796 | else | ||||||
797 | return Z; | ||||||
798 | } | ||||||
799 | |||||||
800 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the | ||||||
801 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||||
802 | /// the result is larger than the maximum representable value of type T. | ||||||
803 | template <typename T> | ||||||
804 | typename std::enable_if<std::is_unsigned<T>::value, T>::type | ||||||
805 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||||
806 | bool Dummy; | ||||||
807 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||||
808 | |||||||
809 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that | ||||||
810 | // because it fails for uint16_t (where multiplication can have undefined | ||||||
811 | // behavior due to promotion to int), and requires a division in addition | ||||||
812 | // to the multiplication. | ||||||
813 | |||||||
814 | Overflowed = false; | ||||||
815 | |||||||
816 | // Log2(Z) would be either Log2Z or Log2Z + 1. | ||||||
817 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z | ||||||
818 | // will necessarily be less than Log2Max as desired. | ||||||
819 | int Log2Z = Log2_64(X) + Log2_64(Y); | ||||||
820 | const T Max = std::numeric_limits<T>::max(); | ||||||
821 | int Log2Max = Log2_64(Max); | ||||||
822 | if (Log2Z < Log2Max) { | ||||||
823 | return X * Y; | ||||||
824 | } | ||||||
825 | if (Log2Z > Log2Max) { | ||||||
826 | Overflowed = true; | ||||||
827 | return Max; | ||||||
828 | } | ||||||
829 | |||||||
830 | // We're going to use the top bit, and maybe overflow one | ||||||
831 | // bit past it. Multiply all but the bottom bit then add | ||||||
832 | // that on at the end. | ||||||
833 | T Z = (X >> 1) * Y; | ||||||
834 | if (Z & ~(Max >> 1)) { | ||||||
835 | Overflowed = true; | ||||||
836 | return Max; | ||||||
837 | } | ||||||
838 | Z <<= 1; | ||||||
839 | if (X & 1) | ||||||
840 | return SaturatingAdd(Z, Y, ResultOverflowed); | ||||||
841 | |||||||
842 | return Z; | ||||||
843 | } | ||||||
844 | |||||||
845 | /// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to | ||||||
846 | /// the product. Clamp the result to the maximum representable value of T on | ||||||
847 | /// overflow. ResultOverflowed indicates if the result is larger than the | ||||||
848 | /// maximum representable value of type T. | ||||||
849 | template <typename T> | ||||||
850 | typename std::enable_if<std::is_unsigned<T>::value, T>::type | ||||||
851 | SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { | ||||||
852 | bool Dummy; | ||||||
853 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||||
854 | |||||||
855 | T Product = SaturatingMultiply(X, Y, &Overflowed); | ||||||
856 | if (Overflowed) | ||||||
857 | return Product; | ||||||
858 | |||||||
859 | return SaturatingAdd(A, Product, &Overflowed); | ||||||
860 | } | ||||||
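A hedged sketch of the saturating arithmetic on a narrow type (not part of the header; assumes <cassert> and <cstdint>):

static void saturatingSketch() {
  bool Overflowed = false;
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 && Overflowed);
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 && Overflowed);
  assert(llvm::SaturatingMultiply<uint8_t>(15, 15, &Overflowed) == 225 && !Overflowed);
  assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 20, 50) == 250);
}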
861 | |||||||
862 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. | ||||||
863 | extern const float huge_valf; | ||||||
864 | |||||||
865 | |||||||
866 | /// Add two signed integers, computing the two's complement truncated result, | ||||||
867 | /// returning true if overflow occurred. | ||||||
868 | template <typename T> | ||||||
869 | typename std::enable_if<std::is_signed<T>::value, T>::type | ||||||
870 | AddOverflow(T X, T Y, T &Result) { | ||||||
871 | #if __has_builtin(__builtin_add_overflow) | ||||||
872 | return __builtin_add_overflow(X, Y, &Result); | ||||||
873 | #else | ||||||
874 | // Perform the unsigned addition. | ||||||
875 | using U = typename std::make_unsigned<T>::type; | ||||||
876 | const U UX = static_cast<U>(X); | ||||||
877 | const U UY = static_cast<U>(Y); | ||||||
878 | const U UResult = UX + UY; | ||||||
879 | |||||||
880 | // Convert to signed. | ||||||
881 | Result = static_cast<T>(UResult); | ||||||
882 | |||||||
883 | // Adding two positive numbers should result in a positive number. | ||||||
884 | if (X > 0 && Y > 0) | ||||||
885 | return Result <= 0; | ||||||
886 | // Adding two negatives should result in a negative number. | ||||||
887 | if (X < 0 && Y < 0) | ||||||
888 | return Result >= 0; | ||||||
889 | return false; | ||||||
890 | #endif | ||||||
891 | } | ||||||
892 | |||||||
893 | /// Subtract two signed integers, computing the two's complement truncated | ||||||
894 | /// result, returning true if an overflow occurred. | ||||||
895 | template <typename T> | ||||||
896 | typename std::enable_if<std::is_signed<T>::value, T>::type | ||||||
897 | SubOverflow(T X, T Y, T &Result) { | ||||||
898 | #if __has_builtin(__builtin_sub_overflow) | ||||||
899 | return __builtin_sub_overflow(X, Y, &Result); | ||||||
900 | #else | ||||||
901 | // Perform the unsigned addition. | ||||||
902 | using U = typename std::make_unsigned<T>::type; | ||||||
903 | const U UX = static_cast<U>(X); | ||||||
904 | const U UY = static_cast<U>(Y); | ||||||
905 | const U UResult = UX - UY; | ||||||
906 | |||||||
907 | // Convert to signed. | ||||||
908 | Result = static_cast<T>(UResult); | ||||||
909 | |||||||
910 | // Subtracting a positive number from a negative results in a negative number. | ||||||
911 | if (X <= 0 && Y > 0) | ||||||
912 | return Result >= 0; | ||||||
913 | // Subtracting a negative number from a positive results in a positive number. | ||||||
914 | if (X >= 0 && Y < 0) | ||||||
915 | return Result <= 0; | ||||||
916 | return false; | ||||||
917 | #endif | ||||||
918 | } | ||||||
919 | |||||||
920 | |||||||
921 | /// Multiply two signed integers, computing the two's complement truncated | ||||||
922 | /// result, returning true if an overflow occurred. | ||||||
923 | template <typename T> | ||||||
924 | typename std::enable_if<std::is_signed<T>::value, T>::type | ||||||
925 | MulOverflow(T X, T Y, T &Result) { | ||||||
926 | // Perform the unsigned multiplication on absolute values. | ||||||
927 | using U = typename std::make_unsigned<T>::type; | ||||||
928 | const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X); | ||||||
929 | const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y); | ||||||
930 | const U UResult = UX * UY; | ||||||
931 | |||||||
932 | // Convert to signed. | ||||||
933 | const bool IsNegative = (X < 0) ^ (Y < 0); | ||||||
934 | Result = IsNegative ? (0 - UResult) : UResult; | ||||||
935 | |||||||
936 | // If any of the args was 0, result is 0 and no overflow occurs. | ||||||
937 | if (UX == 0 || UY == 0) | ||||||
938 | return false; | ||||||
939 | |||||||
940 | // UX and UY are in [1, 2^n], where n is the number of digits. | ||||||
941 | // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for | ||||||
942 | // positive) divided by an argument compares to the other. | ||||||
943 | if (IsNegative) | ||||||
944 | return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY; | ||||||
945 | else | ||||||
946 | return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY; | ||||||
947 | } | ||||||
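A sketch of the checked signed arithmetic above (not part of the header; assumes <cassert>, <cstdint> and <limits>):

static void overflowSketch() {
  int32_t R;
  assert(llvm::AddOverflow(std::numeric_limits<int32_t>::max(), 1, R));
  assert(llvm::SubOverflow(std::numeric_limits<int32_t>::min(), 1, R));
  assert(llvm::MulOverflow(int32_t(65536), int32_t(65536), R)); // 2^32 overflows
  assert(!llvm::AddOverflow(40, 2, R) && R == 42);
}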
948 | |||||||
949 | } // End llvm namespace | ||||||
950 | |||||||
951 | #endif |