File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2895, column 62: The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
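  // e.g. 3 elements rounds up to 4.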
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
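  // Note: the shift below is only well defined for shift amounts under 32,
  // i.e. while Bits stays below 2^31; the types that reach legalization are
  // far smaller than that (MaxRegisterSize is 1024).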
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
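    // e.g. <5 x s16> (80 bits) splits into 2 pieces of <3 x s16>.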
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;
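    // NextMul32 is the number of 32-bit pieces, so 32 * NextMul32 is the size
    // rounded up to the next multiple of 32 bits (e.g. <3 x s16> ->
    // <4 x s16>).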

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

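  // Wider types use a vector of 32-bit elements, e.g. <4 x s16> -> <2 x s32>.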
  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if
    // they may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
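  // For extending loads / truncating stores (register size != memory size),
  // only bitcast small (<= 32-bit) vector register types.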
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
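  // e.g. widen a 96-bit load to 128 bits when the alignment is at least
  // 128 bits.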
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite
    // loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
                    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
                    .lowerIf(typeIs(1, S1))
                    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1},
                              ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps =
      getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
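    // e.g. a 96-bit access spans 3 dwords, which only stays whole with
    // dwordx3 load/stores.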
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

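  // Minimum alignments (in bits) used by the memory legality rules below; 0
  // effectively drops the alignment requirement when unaligned buffer access
  // is enabled.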
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

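              // Otherwise split at the known alignment; the resulting pieces
              // may need further legalization.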
1175 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
1176 | return std::make_pair(0, LLT::scalar(Align)); | ||||||
1177 | }) | ||||||
1178 | .fewerElementsIf( | ||||||
1179 | [=](const LegalityQuery &Query) -> bool { | ||||||
1180 | return Query.Types[0].isVector() && | ||||||
1181 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||
1182 | }, | ||||||
1183 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||
1184 | const LLT DstTy = Query.Types[0]; | ||||||
1185 | const LLT PtrTy = Query.Types[1]; | ||||||
1186 | |||||||
1187 | LLT EltTy = DstTy.getElementType(); | ||||||
1188 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||
1189 | PtrTy.getAddressSpace(), | ||||||
1190 | Op == G_LOAD); | ||||||
1191 | |||||||
1192 | // FIXME: Handle widened to power of 2 results better. This ends | ||||||
1193 | // up scalarizing. | ||||||
1194 | // FIXME: 3 element stores scalarized on SI | ||||||
1195 | |||||||
1196 | // Split if it's too large for the address space. | ||||||
1197 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||||
1198 | if (MemSize > MaxSize) { | ||||||
1199 | unsigned NumElts = DstTy.getNumElements(); | ||||||
1200 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1201 | |||||||
1202 | if (MaxSize % EltSize == 0) { | ||||||
1203 | return std::make_pair( | ||||||
1204 | 0, LLT::scalarOrVector( | ||||||
1205 | ElementCount::getFixed(MaxSize / EltSize), EltTy)); | ||||||
1206 | } | ||||||
1207 | |||||||
1208 | unsigned NumPieces = MemSize / MaxSize; | ||||||
1209 | |||||||
1210 | // FIXME: Refine when odd breakdowns handled | ||||||
1211 | // The scalars will need to be re-legalized. | ||||||
1212 | if (NumPieces == 1 || NumPieces >= NumElts || | ||||||
1213 | NumElts % NumPieces != 0) | ||||||
1214 | return std::make_pair(0, EltTy); | ||||||
1215 | |||||||
1216 | return std::make_pair( | ||||||
1217 | 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); | ||||||
1218 | } | ||||||
1219 | |||||||
1220 | // FIXME: We could probably handle weird extending loads better. | ||||||
1221 | if (DstTy.getSizeInBits() > MemSize) | ||||||
1222 | return std::make_pair(0, EltTy); | ||||||
1223 | |||||||
1224 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1225 | unsigned DstSize = DstTy.getSizeInBits(); | ||||||
1226 | if (!isPowerOf2_32(DstSize)) { | ||||||
1227 | // We're probably decomposing an odd sized store. Try to split | ||||||
1228 | // to the widest type. TODO: Account for alignment. As-is it | ||||||
1229 | // should be OK, since the new parts will be further legalized. | ||||||
1230 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||
1231 | return std::make_pair( | ||||||
1232 | 0, LLT::scalarOrVector( | ||||||
1233 | ElementCount::getFixed(FloorSize / EltSize), EltTy)); | ||||||
1234 | } | ||||||
1235 | |||||||
1236 | // May need relegalization for the scalars. | ||||||
1237 | return std::make_pair(0, EltTy); | ||||||
1238 | }) | ||||||
1239 | .minScalar(0, S32) | ||||||
1240 | .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) | ||||||
1241 | .widenScalarToNextPow2(0) | ||||||
1242 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) | ||||||
1243 | .lower(); | ||||||
1244 | } | ||||||

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 16},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 16}})
                       .legalIf(
                           [=](const LegalityQuery &Query) -> bool {
                             return isLoadStoreLegal(ST, Query);
                           });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .lower();

  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
                                 LLT::fixed_vector(2, LocalPtr),
                                 LLT::fixed_vector(2, PrivatePtr)},
                                {S1, S32})
      .clampScalar(0, S16, S64)
      .scalarize(1)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(numElementsNotEven(0), scalarize(0))
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0)
      .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
          .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
        [=](const LegalityQuery &Query) {
          // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want
          // a 32-bit amount.
          const LLT ValTy = Query.Types[0];
          const LLT AmountTy = Query.Types[1];
          return ValTy.getSizeInBits() <= 16 &&
                 AmountTy.getSizeInBits() < 16;
        },
        changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
        .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
        .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                       scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                   bitcastToVectorElement32(VecTypeIdx))
        //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
        .bitcastIf(
            all(sizeIsMultipleOf32(VecTypeIdx),
                scalarOrEltWiderThan(VecTypeIdx, 64)),
            [=](const LegalityQuery &Query) {
              // For > 64-bit element types, try to turn this into a 64-bit
              // element vector since we may be able to do better indexing
              // if this is scalar. If not, fall back to 32.
              const LLT EltTy = Query.Types[EltTypeIdx];
              const LLT VecTy = Query.Types[VecTypeIdx];
              const unsigned DstEltSize = EltTy.getSizeInBits();
              const unsigned VecSize = VecTy.getSizeInBits();

              const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
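              // e.g. a <2 x s128> vector (256 bits) becomes <4 x s64>, while
              // <2 x s96> (192 bits) falls back to <6 x s32>.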
              return std::make_pair(
                  VecTypeIdx,
                  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
            })
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)
        // TODO: Clamp elements for 64-bit vectors?
        // It should only be necessary with variable indexes.
        // As a last resort, lower to the stack
        .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
        .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
        .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
        // FIXME: Multiples of 16 should not be legal.
        .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
        .widenScalarIf(
            [=](const LegalityQuery &Query) {
              const LLT BigTy = Query.Types[BigTyIdx];
              return (BigTy.getScalarSizeInBits() < 16);
            },
            LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
        .widenScalarIf(
            [=](const LegalityQuery &Query) {
              const LLT LitTy = Query.Types[LitTyIdx];
              return (LitTy.getScalarSizeInBits() < 16);
            },
            LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
        .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
        .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector =
      getActionDefinitionsBuilder(G_BUILD_VECTOR)
          .legalForCartesianProduct(AllS32Vectors, {S32})
          .legalForCartesianProduct(AllS64Vectors, {S64})
          .clampNumElements(0, V16S32, V32S32)
          .clampNumElements(0, V2S64, V16S64)
          .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
        // FIXME: Should probably widen s1 vectors straight to s32
        .minScalarOrElt(0, S16)
        // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
        .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
        .legalFor({V2S16, S32})
        .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
        .customFor({V2S16, S32})
        .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
      .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
        .customFor({V2S16, V2S16})
        .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        return BigTy.getSizeInBits() == 32;
      })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            return notValidElt(Query, LitTyIdx);
          },
          scalarize(0))
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            return notValidElt(Query, BigTyIdx);
          },
          scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
          // TODO: Use 16-bit shifts if legal for 8-bit values?
          [=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;
          },
          changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128,
          // whichever is smaller.
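          // e.g. s65 widens to the next power of 2, s128, while s257 widens
          // to s320 (alignTo<64>(258)) rather than all the way to s512.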
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
        // Any vectors left are the wrong size. Scalarize them.
        .scalarize(0)
        .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        // Prefer to reduce vector widths for 16-bit vectors before lowering,
        // to get more vector shift opportunities, since we'll get those when
        // expanded.
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
      .scalarize(0)
      .clampScalar(0, S32, S64)
      .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
      .scalarize(0)
      .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
      .legalFor({{S32, S32}})
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .lowerFor({{V2S16, V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
        .scalarize(0)
        .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
      .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
      .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .scalarize(0)
      .minScalar(0, S32)
      .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .legalFor({{S32, S32}, {S64, S32}})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
      .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
    unsigned AS,
    MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
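    // The shift amounts below come from the s_getreg_b32 immediate encoding,
    // which packs the hwreg id, bit offset, and (width - 1) into disjoint
    // bit fields (see the AMDGPU::Hwreg definitions in SIDefines.h).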
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
        .addDef(GetReg)
        .addImm(Encoding);
    MRI.setType(GetReg, S32);

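    // The field read back by getreg holds the aperture base already shifted
    // right by the field width, so shift it left by WidthM1 + 1 to
    // reconstruct the 32-bit aperture base value.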
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

/// Return true if the value is a known valid address, such that a null check
/// is not necessary.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
  MachineInstr *Def = MRI.getVRegDef(Val);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
    return true;
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  }
  default:
    return false;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
      = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);

    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);
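    // An addrspacecast of the flat null pointer must produce the segment's
    // null value (which is not bit-identical to flat null), hence the
    // compare-and-select below.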

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});

  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
    B.buildCopy(Dst, BuildPtr);
    MI.eraseFromParent();
    return true;
  }

  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
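  // Rounding trick: an f64 mantissa holds 52 bits, so adding a copysigned
  // 2^52 (C1Val) and subtracting it again leaves the value rounded to an
  // integer. Inputs with |src| > C2Val (the largest f64 below 2^52) have no
  // fractional bits to begin with and are passed through by the select below.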

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

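  // frem(x, y) == x - trunc(x / y) * y, with the final multiply-and-subtract
  // folded into fma(-trunc(x / y), y, x).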
  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

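  // The biased exponent occupies bits [62:52] of an f64, i.e. bits [30:20]
  // of its high 32-bit word: extract it with ubfe, then subtract the
  // exponent bias (1023).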
  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask =
      B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

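  // For exponents in [0, 51], the low (52 - exp) mantissa bits are
  // fractional, and FractMask >> exp selects exactly those bits; clearing
  // them in the source truncates toward zero. The exp < 0 case (|src| < 1,
  // giving +/-0) and the exp > 51 case (already integral) are handled by the
  // selects below.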
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                     .addUse(CvtHi.getReg(0))
                     .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

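  // For the s32 result, normalize the input so its significant bits end up
  // in the high word: shift left past the high word's leading zero (or
  // redundant sign) bits, convert the high 32 bits with a sticky bit OR'd in
  // from the low word for rounding, then rescale with ldexp by (32 - shift).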
  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of
  // 32-bit integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //   hif := floor(tf * 2^-32);
  //   lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val
    // is negative. To avoid the loss of precision, we need to take the
    // absolute value after truncating and flip the result back based on the
    // original signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        S64, BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  if (IdxVal < VecTy.getNumElements()) {
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));
  (void)Ins;

  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMerge(Dst, SrcRegs);
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeShuffleVector(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

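  // The amdgcn_sin/amdgcn_cos intrinsics take an argument pre-scaled by
  // 1/(2*pi); on subtargets with a reduced trig range, the scaled value is
  // additionally passed through fract() to bring it into [0, 1).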
  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
                  .addUse(MulVal.getReg(0))
                  .setMIFlags(Flags)
                  .getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the
  //   $symbol operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal
  //   constant, which is a 64-bit pc-relative offset from the encoding of
  //   the $symbol operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc (which
  // is the address of the s_add_u32 instruction) to the global variable, but
  // since the encoding of $symbol starts 4 bytes after the start of the
  // s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order
  // to compute the correct address. Similarly for the s_addc_u32 instruction,
  // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
                                .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
2410 | |||||||
2411 | bool AMDGPULegalizerInfo::legalizeGlobalValue( | ||||||
2412 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2413 | MachineIRBuilder &B) const { | ||||||
2414 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2415 | LLT Ty = MRI.getType(DstReg); | ||||||
2416 | unsigned AS = Ty.getAddressSpace(); | ||||||
2417 | |||||||
2418 | const GlobalValue *GV = MI.getOperand(1).getGlobal(); | ||||||
2419 | MachineFunction &MF = B.getMF(); | ||||||
2420 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2421 | |||||||
2422 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { | ||||||
2423 | if (!MFI->isModuleEntryFunction() && | ||||||
2424 | !GV->getName().equals("llvm.amdgcn.module.lds")) { | ||||||
2425 | const Function &Fn = MF.getFunction(); | ||||||
2426 | DiagnosticInfoUnsupported BadLDSDecl( | ||||||
2427 | Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), | ||||||
2428 | DS_Warning); | ||||||
2429 | Fn.getContext().diagnose(BadLDSDecl); | ||||||
2430 | |||||||
2431 | // We currently don't have a way to correctly allocate LDS objects that | ||||||
2432 | // aren't directly associated with a kernel. We do force inlining of | ||||||
2433 | // functions that use local objects. However, if these dead functions are | ||||||
2434 | // not eliminated, we don't want a compile time error. Just emit a warning | ||||||
2435 | // and a trap, since there should be no callable path here. | ||||||
2436 | B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); | ||||||
2437 | B.buildUndef(DstReg); | ||||||
2438 | MI.eraseFromParent(); | ||||||
2439 | return true; | ||||||
2440 | } | ||||||
2441 | |||||||
2442 | // TODO: We could emit code to handle the initialization somewhere. | ||||||
2443 | // We ignore the initializer for now and legalize it to allow selection. | ||||||
2444 | // The initializer will be diagnosed as an error during assembly emission anyway. | ||||||
2445 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
2446 | if (!TLI->shouldUseLDSConstAddress(GV)) { | ||||||
2447 | MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); | ||||||
2448 | return true; // Leave in place. | ||||||
2449 | } | ||||||
2450 | |||||||
2451 | if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { | ||||||
2452 | Type *Ty = GV->getValueType(); | ||||||
2453 | // HIP uses an unsized array `extern __shared__ T s[]` (or a similar | ||||||
2454 | // zero-sized type in other languages) to declare dynamic shared memory | ||||||
2455 | // whose size is not known at compile time. Such variables are allocated | ||||||
2456 | // by the runtime and placed directly after the statically allocated | ||||||
2457 | // ones, so they all share the same offset. | ||||||
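| // The address of such a variable is just the total static LDS size, so the | ||||||
| // code below folds it to (roughly, in MIR terms): | ||||||
| //   %sz:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) | ||||||
| //   %ptr:_(p3) = G_INTTOPTR %sz:_(s32) | ||||||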
2458 | if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { | ||||||
2459 | // Adjust alignment for that dynamic shared memory array. | ||||||
2460 | MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); | ||||||
2461 | LLT S32 = LLT::scalar(32); | ||||||
2462 | auto Sz = | ||||||
2463 | B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); | ||||||
2464 | B.buildIntToPtr(DstReg, Sz); | ||||||
2465 | MI.eraseFromParent(); | ||||||
2466 | return true; | ||||||
2467 | } | ||||||
2468 | } | ||||||
2469 | |||||||
2470 | B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), | ||||||
2471 | *cast<GlobalVariable>(GV))); | ||||||
2472 | MI.eraseFromParent(); | ||||||
2473 | return true; | ||||||
2474 | } | ||||||
2475 | |||||||
2476 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
2477 | |||||||
2478 | if (TLI->shouldEmitFixup(GV)) { | ||||||
2479 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); | ||||||
2480 | MI.eraseFromParent(); | ||||||
2481 | return true; | ||||||
2482 | } | ||||||
2483 | |||||||
2484 | if (TLI->shouldEmitPCReloc(GV)) { | ||||||
2485 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); | ||||||
2486 | MI.eraseFromParent(); | ||||||
2487 | return true; | ||||||
2488 | } | ||||||
2489 | |||||||
2490 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
2491 | Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); | ||||||
2492 | |||||||
2493 | LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; | ||||||
2494 | MachineMemOperand *GOTMMO = MF.getMachineMemOperand( | ||||||
2495 | MachinePointerInfo::getGOT(MF), | ||||||
2496 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
2497 | MachineMemOperand::MOInvariant, | ||||||
2498 | LoadTy, Align(8)); | ||||||
2499 | |||||||
2500 | buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); | ||||||
2501 | |||||||
2502 | if (Ty.getSizeInBits() == 32) { | ||||||
2503 | // Truncate if this is a 32-bit constant address. | ||||||
2504 | auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); | ||||||
2505 | B.buildExtract(DstReg, Load, 0); | ||||||
2506 | } else | ||||||
2507 | B.buildLoad(DstReg, GOTAddr, *GOTMMO); | ||||||
2508 | |||||||
2509 | MI.eraseFromParent(); | ||||||
2510 | return true; | ||||||
2511 | } | ||||||
2512 | |||||||
2513 | static LLT widenToNextPowerOf2(LLT Ty) { | ||||||
2514 | if (Ty.isVector()) | ||||||
2515 | return Ty.changeElementCount( | ||||||
2516 | ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); | ||||||
2517 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | ||||||
2518 | } | ||||||
2519 | |||||||
2520 | bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, | ||||||
2521 | MachineInstr &MI) const { | ||||||
2522 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
2523 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
2524 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
2525 | |||||||
2526 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||
2527 | LLT PtrTy = MRI.getType(PtrReg); | ||||||
2528 | unsigned AddrSpace = PtrTy.getAddressSpace(); | ||||||
2529 | |||||||
2530 | if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
2531 | LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
2532 | auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); | ||||||
2533 | Observer.changingInstr(MI); | ||||||
2534 | MI.getOperand(1).setReg(Cast.getReg(0)); | ||||||
2535 | Observer.changedInstr(MI); | ||||||
2536 | return true; | ||||||
2537 | } | ||||||
2538 | |||||||
2539 | if (MI.getOpcode() != AMDGPU::G_LOAD) | ||||||
2540 | return false; | ||||||
2541 | |||||||
2542 | Register ValReg = MI.getOperand(0).getReg(); | ||||||
2543 | LLT ValTy = MRI.getType(ValReg); | ||||||
2544 | |||||||
2545 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
2546 | const unsigned ValSize = ValTy.getSizeInBits(); | ||||||
2547 | const LLT MemTy = MMO->getMemoryType(); | ||||||
2548 | const Align MemAlign = MMO->getAlign(); | ||||||
2549 | const unsigned MemSize = MemTy.getSizeInBits(); | ||||||
2550 | const unsigned AlignInBits = 8 * MemAlign.value(); | ||||||
2551 | |||||||
2552 | // Widen non-power-of-2 loads to the alignment if needed | ||||||
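| // E.g. a 96-bit load from a 128-bit-aligned pointer is widened to a 128-bit | ||||||
| // load, and the extra bits or elements are stripped off again afterwards. | ||||||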
2553 | if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { | ||||||
2554 | const unsigned WideMemSize = PowerOf2Ceil(MemSize); | ||||||
2555 | |||||||
2556 | // This was already the correct extending load result type, so just adjust | ||||||
2557 | // the memory type. | ||||||
2558 | if (WideMemSize == ValSize) { | ||||||
2559 | MachineFunction &MF = B.getMF(); | ||||||
2560 | |||||||
2561 | MachineMemOperand *WideMMO = | ||||||
2562 | MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); | ||||||
2563 | Observer.changingInstr(MI); | ||||||
2564 | MI.setMemRefs(MF, {WideMMO}); | ||||||
2565 | Observer.changedInstr(MI); | ||||||
2566 | return true; | ||||||
2567 | } | ||||||
2568 | |||||||
2569 | // Don't bother handling an edge case that should probably never be produced. | ||||||
2570 | if (ValSize > WideMemSize) | ||||||
2571 | return false; | ||||||
2572 | |||||||
2573 | LLT WideTy = widenToNextPowerOf2(ValTy); | ||||||
2574 | |||||||
2575 | Register WideLoad; | ||||||
2576 | if (!WideTy.isVector()) { | ||||||
2577 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||||
2578 | B.buildTrunc(ValReg, WideLoad).getReg(0); | ||||||
2579 | } else { | ||||||
2580 | // Extract the subvector. | ||||||
2581 | |||||||
2582 | if (isRegisterType(ValTy)) { | ||||||
2583 | // If this is a case where G_EXTRACT is legal, use it. | ||||||
2584 | // (e.g. <3 x s32> -> <4 x s32>) | ||||||
2585 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||||
2586 | B.buildExtract(ValReg, WideLoad, 0); | ||||||
2587 | } else { | ||||||
2588 | // For cases where the widened type isn't a nice register value, widen the | ||||||
2589 | // load and drop the trailing elements (e.g. <3 x s16> -> <4 x s16>). | ||||||
2590 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||||
2591 | B.buildDeleteTrailingVectorElements(ValReg, WideLoad); | ||||||
2592 | } | ||||||
2593 | } | ||||||
2594 | |||||||
2595 | MI.eraseFromParent(); | ||||||
2596 | return true; | ||||||
2597 | } | ||||||
2598 | |||||||
2599 | return false; | ||||||
2600 | } | ||||||
2601 | |||||||
2602 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||||
2603 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2604 | MachineIRBuilder &B) const { | ||||||
2605 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
2606 | assert(Ty.isScalar()); | ||||||
2607 | |||||||
2608 | MachineFunction &MF = B.getMF(); | ||||||
2609 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2610 | |||||||
2611 | // TODO: Always legal with future ftz flag. | ||||||
2612 | // FIXME: Do we need just output? | ||||||
2613 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||||
2614 | return true; | ||||||
2615 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||||
2616 | return true; | ||||||
2617 | |||||||
2618 | MachineIRBuilder HelperBuilder(MI); | ||||||
2619 | GISelObserverWrapper DummyObserver; | ||||||
2620 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||
2621 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||||
2622 | } | ||||||
2623 | |||||||
2624 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||||
2625 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2626 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2627 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||
2628 | Register CmpVal = MI.getOperand(2).getReg(); | ||||||
2629 | Register NewVal = MI.getOperand(3).getReg(); | ||||||
2630 | |||||||
2631 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && | ||||||
2632 | "this should not have been custom lowered"); | ||||||
2633 | |||||||
2634 | LLT ValTy = MRI.getType(CmpVal); | ||||||
2635 | LLT VecTy = LLT::fixed_vector(2, ValTy); | ||||||
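| // G_AMDGPU_ATOMIC_CMPXCHG takes the two values packed into a single vector | ||||||
| // operand, new value first (e.g. s32 operands become <2 x s32>). | ||||||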
2636 | |||||||
2637 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||||
2638 | |||||||
2639 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||||
2640 | .addDef(DstReg) | ||||||
2641 | .addUse(PtrReg) | ||||||
2642 | .addUse(PackedVal) | ||||||
2643 | .setMemRefs(MI.memoperands()); | ||||||
2644 | |||||||
2645 | MI.eraseFromParent(); | ||||||
2646 | return true; | ||||||
2647 | } | ||||||
2648 | |||||||
2649 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||||
2650 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||||
2651 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2652 | Register Src = MI.getOperand(1).getReg(); | ||||||
2653 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2654 | unsigned Flags = MI.getFlags(); | ||||||
2655 | |||||||
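| // log_b(x) == log2(x) * (1 / log2(b)); callers pass 1 / log2(b) as | ||||||
| // Log2BaseInverted (e.g. ln(2) for G_FLOG, since 1 / log2(e) == ln(2)). | ||||||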
2656 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||||
2657 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||||
2658 | |||||||
2659 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||||
2660 | MI.eraseFromParent(); | ||||||
2661 | return true; | ||||||
2662 | } | ||||||
2663 | |||||||
2664 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||||
2665 | MachineIRBuilder &B) const { | ||||||
2666 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2667 | Register Src = MI.getOperand(1).getReg(); | ||||||
2668 | unsigned Flags = MI.getFlags(); | ||||||
2669 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2670 | |||||||
2671 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||||
2672 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||||
2673 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2674 | MI.eraseFromParent(); | ||||||
2675 | return true; | ||||||
2676 | } | ||||||
2677 | |||||||
2678 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||||
2679 | MachineIRBuilder &B) const { | ||||||
2680 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2681 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2682 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2683 | unsigned Flags = MI.getFlags(); | ||||||
2684 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2685 | const LLT S16 = LLT::scalar(16); | ||||||
2686 | const LLT S32 = LLT::scalar(32); | ||||||
2687 | |||||||
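| // pow(x, y) == exp2(log2(x) * y). The legacy multiply's 0 * x == 0 rule | ||||||
| // presumably covers the y == 0 corner: log2(x) * 0 folds to 0 even when | ||||||
| // log2(x) is infinite, and exp2(0) == 1 == pow(x, 0). | ||||||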
2688 | if (Ty == S32) { | ||||||
2689 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||||
2690 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2691 | .addUse(Log.getReg(0)) | ||||||
2692 | .addUse(Src1) | ||||||
2693 | .setMIFlags(Flags); | ||||||
2694 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2695 | } else if (Ty == S16) { | ||||||
2696 | // There's no f16 fmul_legacy, so extend to f32, multiply, and truncate back. | ||||||
2697 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||||
2698 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||||
2699 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||||
2700 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2701 | .addUse(Ext0.getReg(0)) | ||||||
2702 | .addUse(Ext1.getReg(0)) | ||||||
2703 | .setMIFlags(Flags); | ||||||
2704 | |||||||
2705 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||||
2706 | } else | ||||||
2707 | return false; | ||||||
2708 | |||||||
2709 | MI.eraseFromParent(); | ||||||
2710 | return true; | ||||||
2711 | } | ||||||
2712 | |||||||
2713 | // Find a source register, ignoring any possible source modifiers. | ||||||
2714 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||||
2715 | Register ModSrc = OrigSrc; | ||||||
2716 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||||
2717 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||||
2718 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2719 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2720 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2721 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2722 | return ModSrc; | ||||||
2723 | } | ||||||
2724 | |||||||
2725 | bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, | ||||||
2726 | MachineRegisterInfo &MRI, | ||||||
2727 | MachineIRBuilder &B) const { | ||||||
2728 | |||||||
2729 | const LLT S1 = LLT::scalar(1); | ||||||
2730 | const LLT S64 = LLT::scalar(64); | ||||||
2731 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2732 | Register OrigSrc = MI.getOperand(1).getReg(); | ||||||
2733 | unsigned Flags = MI.getFlags(); | ||||||
2734 | assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && | ||||||
2735 | "this should not have been custom lowered"); | ||||||
2736 | |||||||
2737 | // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) | ||||||
2738 | // is used instead. However, SI doesn't have V_FLOOR_F64, so the most | ||||||
2739 | // efficient way to implement it is using V_FRACT_F64. The workaround for the | ||||||
2740 | // V_FRACT bug is: | ||||||
2741 | // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) | ||||||
2742 | // | ||||||
2743 | // Convert floor(x) to (x - fract(x)) | ||||||
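| // 0x3fefffffffffffff below is the largest double strictly less than 1.0, so | ||||||
| // the min keeps fract(x) inside [0, 1) even where V_FRACT misbehaves. | ||||||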
2744 | |||||||
2745 | auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) | ||||||
2746 | .addUse(OrigSrc) | ||||||
2747 | .setMIFlags(Flags); | ||||||
2748 | |||||||
2749 | // Give source modifier matching some assistance before obscuring a foldable | ||||||
2750 | // pattern. | ||||||
2751 | |||||||
2752 | // TODO: We can avoid the neg on the fract? The input sign to fract | ||||||
2753 | // shouldn't matter? | ||||||
2754 | Register ModSrc = stripAnySourceMods(OrigSrc, MRI); | ||||||
2755 | |||||||
2756 | auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); | ||||||
2757 | |||||||
2758 | Register Min = MRI.createGenericVirtualRegister(S64); | ||||||
2759 | |||||||
2760 | // We don't need to concern ourselves with the snan handling difference, so | ||||||
2761 | // use the one which will directly select. | ||||||
2762 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2763 | if (MFI->getMode().IEEE) | ||||||
2764 | B.buildFMinNumIEEE(Min, Fract, Const, Flags); | ||||||
2765 | else | ||||||
2766 | B.buildFMinNum(Min, Fract, Const, Flags); | ||||||
2767 | |||||||
2768 | Register CorrectedFract = Min; | ||||||
2769 | if (!MI.getFlag(MachineInstr::FmNoNans)) { | ||||||
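| // FCMP_ORD src, src is true exactly when src is not a NaN, so the select | ||||||
| // keeps the clamped fract for ordered inputs and passes NaNs through. | ||||||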
2770 | auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); | ||||||
2771 | CorrectedFract = B.buildSelect(S64, IsNan, Min, ModSrc, Flags).getReg(0); | ||||||
2772 | } | ||||||
2773 | |||||||
2774 | auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); | ||||||
2775 | B.buildFAdd(Dst, OrigSrc, NegFract, Flags); | ||||||
2776 | |||||||
2777 | MI.eraseFromParent(); | ||||||
2778 | return true; | ||||||
2779 | } | ||||||
2780 | |||||||
2781 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||||
2782 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||||
2783 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||||
2784 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2785 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2786 | const LLT S32 = LLT::scalar(32); | ||||||
2787 | assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); | ||||||
2788 | |||||||
2789 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2790 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2791 | assert(MRI.getType(Src0) == LLT::scalar(16)); | ||||||
2792 | |||||||
2793 | auto Merge = B.buildMerge(S32, {Src0, Src1}); | ||||||
2794 | B.buildBitcast(Dst, Merge); | ||||||
2795 | |||||||
2796 | MI.eraseFromParent(); | ||||||
2797 | return true; | ||||||
2798 | } | ||||||
2799 | |||||||
2800 | // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to | ||||||
2801 | // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input | ||||||
2802 | // case with a single min instruction instead of a compare+select. | ||||||
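| // E.g. for a zero s32 input, FFBH/FFBL produce 0xffffffff, and the unsigned | ||||||
| // min against the bit width folds that to the expected result of 32. | ||||||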
2803 | bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, | ||||||
2804 | MachineRegisterInfo &MRI, | ||||||
2805 | MachineIRBuilder &B) const { | ||||||
2806 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2807 | Register Src = MI.getOperand(1).getReg(); | ||||||
2808 | LLT DstTy = MRI.getType(Dst); | ||||||
2809 | LLT SrcTy = MRI.getType(Src); | ||||||
2810 | |||||||
2811 | unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ | ||||||
2812 | ? AMDGPU::G_AMDGPU_FFBH_U32 | ||||||
2813 | : AMDGPU::G_AMDGPU_FFBL_B32; | ||||||
2814 | auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); | ||||||
2815 | B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); | ||||||
2816 | |||||||
2817 | MI.eraseFromParent(); | ||||||
2818 | return true; | ||||||
2819 | } | ||||||
2820 | |||||||
2821 | // Check that this is a G_XOR x, -1 | ||||||
2822 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | ||||||
2823 | if (MI.getOpcode() != TargetOpcode::G_XOR) | ||||||
2824 | return false; | ||||||
2825 | auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | ||||||
2826 | return ConstVal && *ConstVal == -1; | ||||||
2827 | } | ||||||
2828 | |||||||
2829 | // Return the branch instruction using the condition, or null if the usage is invalid. | ||||||
2830 | static MachineInstr * | ||||||
2831 | verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, | ||||||
2832 | MachineBasicBlock *&UncondBrTarget, bool &Negated) { | ||||||
2833 | Register CondDef = MI.getOperand(0).getReg(); | ||||||
2834 | if (!MRI.hasOneNonDBGUse(CondDef)) | ||||||
2835 | return nullptr; | ||||||
2836 | |||||||
2837 | MachineBasicBlock *Parent = MI.getParent(); | ||||||
2838 | MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); | ||||||
2839 | |||||||
2840 | if (isNot(MRI, *UseMI)) { | ||||||
2841 | Register NegatedCond = UseMI->getOperand(0).getReg(); | ||||||
2842 | if (!MRI.hasOneNonDBGUse(NegatedCond)) | ||||||
2843 | return nullptr; | ||||||
2844 | |||||||
2845 | // We're deleting the def of this value, so we need to remove it. | ||||||
2846 | eraseInstr(*UseMI, MRI); | ||||||
2847 | |||||||
2848 | UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); | ||||||
2849 | Negated = true; | ||||||
2850 | } | ||||||
2851 | |||||||
2852 | if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) | ||||||
2853 | return nullptr; | ||||||
2854 | |||||||
2855 | // Make sure the cond br is followed by a G_BR, or is the last instruction. | ||||||
2856 | MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); | ||||||
2857 | if (Next == Parent->end()) { | ||||||
2858 | MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); | ||||||
2859 | if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. | ||||||
2860 | return nullptr; | ||||||
2861 | UncondBrTarget = &*NextMBB; | ||||||
2862 | } else { | ||||||
2863 | if (Next->getOpcode() != AMDGPU::G_BR) | ||||||
2864 | return nullptr; | ||||||
2865 | Br = &*Next; | ||||||
2866 | UncondBrTarget = Br->getOperand(0).getMBB(); | ||||||
2867 | } | ||||||
2868 | |||||||
2869 | return UseMI; | ||||||
2870 | } | ||||||
2871 | |||||||
2872 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||||
2873 | const ArgDescriptor *Arg, | ||||||
2874 | const TargetRegisterClass *ArgRC, | ||||||
2875 | LLT ArgTy) const { | ||||||
2876 | MCRegister SrcReg = Arg->getRegister(); | ||||||
2877 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); | ||||||
2878 | assert(DstReg.isVirtual() && "Virtual register expected"); | ||||||
2879 | |||||||
2880 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, | ||||||
2881 | *ArgRC, B.getDebugLoc(), ArgTy); | ||||||
2882 | if (Arg->isMasked()) { | ||||||
2883 | // TODO: Should we try to emit this once in the entry block? | ||||||
2884 | const LLT S32 = LLT::scalar(32); | ||||||
2885 | const unsigned Mask = Arg->getMask(); | ||||||
2886 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); | ||||||
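| // Note: countTrailingZeros<unsigned>(0) is 32, so a zero Mask would make | ||||||
| // both the shift amount and the Mask >> Shift below undefined; masked | ||||||
| // arguments are presumably always created with a nonzero mask. | ||||||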
2887 | |||||||
2888 | Register AndMaskSrc = LiveIn; | ||||||
2889 | |||||||
2890 | if (Shift != 0) { | ||||||
2891 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||||
2892 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||||
2893 | } | ||||||
2894 | |||||||
2895 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||||
| |||||||
2896 | } else { | ||||||
2897 | B.buildCopy(DstReg, LiveIn); | ||||||
2898 | } | ||||||
2899 | |||||||
2900 | return true; | ||||||
2901 | } | ||||||
2902 | |||||||
2903 | bool AMDGPULegalizerInfo::loadInputValue( | ||||||
2904 | Register DstReg, MachineIRBuilder &B, | ||||||
2905 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
2906 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2907 | const ArgDescriptor *Arg; | ||||||
2908 | const TargetRegisterClass *ArgRC; | ||||||
2909 | LLT ArgTy; | ||||||
2910 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||||
2911 | |||||||
2912 | if (!Arg) { | ||||||
2913 | if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { | ||||||
2914 | // The intrinsic may appear when we have a zero-sized kernarg segment, in | ||||||
2915 | // which case the pointer argument may be missing and we use null. | ||||||
2916 | B.buildConstant(DstReg, 0); | ||||||
2917 | return true; | ||||||
2918 | } | ||||||
2919 | |||||||
2920 | // It's undefined behavior if a function marked with the amdgpu-no-* | ||||||
2921 | // attributes uses the corresponding intrinsic. | ||||||
2922 | B.buildUndef(DstReg); | ||||||
2923 | return true; | ||||||
2924 | } | ||||||
2925 | |||||||
2926 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||||
2927 | return false; // TODO: Handle these | ||||||
2928 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | ||||||
2929 | } | ||||||
2930 | |||||||
2931 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||||
2932 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||||
2933 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
2934 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | ||||||
2935 | return false; | ||||||
2936 | |||||||
2937 | MI.eraseFromParent(); | ||||||
2938 | return true; | ||||||
2939 | } | ||||||
2940 | |||||||
2941 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||||
2942 | MachineRegisterInfo &MRI, | ||||||
2943 | MachineIRBuilder &B) const { | ||||||
2944 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2945 | LLT DstTy = MRI.getType(Dst); | ||||||
2946 | LLT S16 = LLT::scalar(16); | ||||||
2947 | LLT S32 = LLT::scalar(32); | ||||||
2948 | LLT S64 = LLT::scalar(64); | ||||||
2949 | |||||||
2950 | if (DstTy == S16) | ||||||
2951 | return legalizeFDIV16(MI, MRI, B); | ||||||
2952 | if (DstTy == S32) | ||||||
2953 | return legalizeFDIV32(MI, MRI, B); | ||||||
2954 | if (DstTy == S64) | ||||||
2955 | return legalizeFDIV64(MI, MRI, B); | ||||||
2956 | |||||||
2957 | return false; | ||||||
2958 | } | ||||||
2959 | |||||||
2960 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, | ||||||
2961 | Register DstDivReg, | ||||||
2962 | Register DstRemReg, | ||||||
2963 | Register X, | ||||||
2964 | Register Y) const { | ||||||
2965 | const LLT S1 = LLT::scalar(1); | ||||||
2966 | const LLT S32 = LLT::scalar(32); | ||||||
2967 | |||||||
2968 | // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the | ||||||
2969 | // algorithm used here. | ||||||
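| // The initial estimate z ~= 2^32 / y is refined with one unsigned | ||||||
| // Newton-Raphson step, z' = z + umulh(z, -y * z), before the quotient | ||||||
| // estimate q = umulh(x, z) is corrected at most twice. | ||||||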
2970 | |||||||
2971 | // Initial estimate of inv(y). | ||||||
2972 | auto FloatY = B.buildUITOFP(S32, Y); | ||||||
2973 | auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); | ||||||
2974 | auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); | ||||||
2975 | auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); | ||||||
2976 | auto Z = B.buildFPTOUI(S32, ScaledY); | ||||||
2977 | |||||||
2978 | // One round of UNR. | ||||||
2979 | auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); | ||||||
2980 | auto NegYZ = B.buildMul(S32, NegY, Z); | ||||||
2981 | Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); | ||||||
2982 | |||||||
2983 | // Quotient/remainder estimate. | ||||||
2984 | auto Q = B.buildUMulH(S32, X, Z); | ||||||
2985 | auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); | ||||||
2986 | |||||||
2987 | // First quotient/remainder refinement. | ||||||
2988 | auto One = B.buildConstant(S32, 1); | ||||||
2989 | auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||||
2990 | if (DstDivReg) | ||||||
2991 | Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); | ||||||
2992 | R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); | ||||||
2993 | |||||||
2994 | // Second quotient/remainder refinement. | ||||||
2995 | Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||||
2996 | if (DstDivReg) | ||||||
2997 | B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); | ||||||
2998 | |||||||
2999 | if (DstRemReg) | ||||||
3000 | B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); | ||||||
3001 | } | ||||||
3002 | |||||||
3003 | // Build integer reciprocal sequence around V_RCP_IFLAG_F32 | ||||||
3004 | // | ||||||
3005 | // Return lo, hi of result | ||||||
3006 | // | ||||||
3007 | // %cvt.lo = G_UITOFP Val.lo | ||||||
3008 | // %cvt.hi = G_UITOFP Val.hi | ||||||
3009 | // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo | ||||||
3010 | // %rcp = G_AMDGPU_RCP_IFLAG %mad | ||||||
3011 | // %mul1 = G_FMUL %rcp, 0x5f7ffffc | ||||||
3012 | // %mul2 = G_FMUL %mul1, 2**(-32) | ||||||
3013 | // %trunc = G_INTRINSIC_TRUNC %mul2 | ||||||
3014 | // %mad2 = G_FMAD %trunc, -(2**32), %mul1 | ||||||
3015 | // return {G_FPTOUI %mad2, G_FPTOUI %trunc} | ||||||
3016 | static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, | ||||||
3017 | Register Val) { | ||||||
3018 | const LLT S32 = LLT::scalar(32); | ||||||
3019 | auto Unmerge = B.buildUnmerge(S32, Val); | ||||||
3020 | |||||||
3021 | auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); | ||||||
3022 | auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); | ||||||
3023 | |||||||
3024 | auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 | ||||||
3025 | B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); | ||||||
3026 | |||||||
3027 | auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); | ||||||
3028 | auto Mul1 = | ||||||
3029 | B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); | ||||||
3030 | |||||||
3031 | // 2**(-32) | ||||||
3032 | auto Mul2 = | ||||||
3033 | B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); | ||||||
3034 | auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); | ||||||
3035 | |||||||
3036 | // -(2**32) | ||||||
3037 | auto Mad2 = B.buildFMAD(S32, Trunc, | ||||||
3038 | B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); | ||||||
3039 | |||||||
3040 | auto ResultLo = B.buildFPTOUI(S32, Mad2); | ||||||
3041 | auto ResultHi = B.buildFPTOUI(S32, Trunc); | ||||||
3042 | |||||||
3043 | return {ResultLo.getReg(0), ResultHi.getReg(0)}; | ||||||
3044 | } | ||||||
3045 | |||||||
3046 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, | ||||||
3047 | Register DstDivReg, | ||||||
3048 | Register DstRemReg, | ||||||
3049 | Register Numer, | ||||||
3050 | Register Denom) const { | ||||||
3051 | const LLT S32 = LLT::scalar(32); | ||||||
3052 | const LLT S64 = LLT::scalar(64); | ||||||
3053 | const LLT S1 = LLT::scalar(1); | ||||||
3054 | Register RcpLo, RcpHi; | ||||||
3055 | |||||||
3056 | std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); | ||||||
3057 | |||||||
3058 | auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); | ||||||
3059 | |||||||
3060 | auto Zero64 = B.buildConstant(S64, 0); | ||||||
3061 | auto NegDenom = B.buildSub(S64, Zero64, Denom); | ||||||
3062 | |||||||
3063 | auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); | ||||||
3064 | auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); | ||||||
3065 | |||||||
3066 | auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); | ||||||
3067 | Register MulHi1_Lo = UnmergeMulHi1.getReg(0); | ||||||
3068 | Register MulHi1_Hi = UnmergeMulHi1.getReg(1); | ||||||
3069 | |||||||
3070 | auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); | ||||||
3071 | auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); | ||||||
3072 | auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); | ||||||
3073 | |||||||
3074 | auto MulLo2 = B.buildMul(S64, NegDenom, Add1); | ||||||
3075 | auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); | ||||||
3076 | auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); | ||||||
3077 | Register MulHi2_Lo = UnmergeMulHi2.getReg(0); | ||||||
3078 | Register MulHi2_Hi = UnmergeMulHi2.getReg(1); | ||||||
3079 | |||||||
3080 | auto Zero32 = B.buildConstant(S32, 0); | ||||||
3081 | auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); | ||||||
3082 | auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); | ||||||
3083 | auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); | ||||||
3084 | |||||||
3085 | auto UnmergeNumer = B.buildUnmerge(S32, Numer); | ||||||
3086 | Register NumerLo = UnmergeNumer.getReg(0); | ||||||
3087 | Register NumerHi = UnmergeNumer.getReg(1); | ||||||
3088 | |||||||
3089 | auto MulHi3 = B.buildUMulH(S64, Numer, Add2); | ||||||
3090 | auto Mul3 = B.buildMul(S64, Denom, MulHi3); | ||||||
3091 | auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); | ||||||
3092 | Register Mul3_Lo = UnmergeMul3.getReg(0); | ||||||
3093 | Register Mul3_Hi = UnmergeMul3.getReg(1); | ||||||
3094 | auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); | ||||||
3095 | auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); | ||||||
3096 | auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); | ||||||
3097 | auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); | ||||||
3098 | |||||||
3099 | auto UnmergeDenom = B.buildUnmerge(S32, Denom); | ||||||
3100 | Register DenomLo = UnmergeDenom.getReg(0); | ||||||
3101 | Register DenomHi = UnmergeDenom.getReg(1); | ||||||
3102 | |||||||
3103 | auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); | ||||||
3104 | auto C1 = B.buildSExt(S32, CmpHi); | ||||||
3105 | |||||||
3106 | auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); | ||||||
3107 | auto C2 = B.buildSExt(S32, CmpLo); | ||||||
3108 | |||||||
3109 | auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); | ||||||
3110 | auto C3 = B.buildSelect(S32, CmpEq, C2, C1); | ||||||
3111 | |||||||
3112 | // TODO: Here and below, portions of the code can be enclosed in if/endif. | ||||||
3113 | // Currently control flow is unconditional and we use 4 selects after the | ||||||
3114 | // potential endif to substitute for PHIs. | ||||||
3115 | |||||||
3116 | // if C3 != 0 ... | ||||||
3117 | auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); | ||||||
3118 | auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); | ||||||
3119 | auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); | ||||||
3120 | auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); | ||||||
3121 | |||||||
3122 | auto One64 = B.buildConstant(S64, 1); | ||||||
3123 | auto Add3 = B.buildAdd(S64, MulHi3, One64); | ||||||
3124 | |||||||
3125 | auto C4 = | ||||||
3126 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); | ||||||
3127 | auto C5 = | ||||||
3128 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); | ||||||
3129 | auto C6 = B.buildSelect( | ||||||
3130 | S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); | ||||||
3131 | |||||||
3132 | // if (C6 != 0) | ||||||
3133 | auto Add4 = B.buildAdd(S64, Add3, One64); | ||||||
3134 | auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); | ||||||
3135 | |||||||
3136 | auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); | ||||||
3137 | auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); | ||||||
3138 | auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); | ||||||
3139 | |||||||
3140 | // endif C6 | ||||||
3141 | // endif C3 | ||||||
3142 | |||||||
3143 | if (DstDivReg) { | ||||||
3144 | auto Sel1 = B.buildSelect( | ||||||
3145 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); | ||||||
3146 | B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | ||||||
3147 | Sel1, MulHi3); | ||||||
3148 | } | ||||||
3149 | |||||||
3150 | if (DstRemReg) { | ||||||
3151 | auto Sel2 = B.buildSelect( | ||||||
3152 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); | ||||||
3153 | B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | ||||||
3154 | Sel2, Sub1); | ||||||
3155 | } | ||||||
3156 | } | ||||||
3157 | |||||||
3158 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, | ||||||
3159 | MachineRegisterInfo &MRI, | ||||||
3160 | MachineIRBuilder &B) const { | ||||||
3161 | Register DstDivReg, DstRemReg; | ||||||
3162 | switch (MI.getOpcode()) { | ||||||
3163 | default: | ||||||
3164 | llvm_unreachable("Unexpected opcode!"); | ||||||
3165 | case AMDGPU::G_UDIV: { | ||||||
3166 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3167 | break; | ||||||
3168 | } | ||||||
3169 | case AMDGPU::G_UREM: { | ||||||
3170 | DstRemReg = MI.getOperand(0).getReg(); | ||||||
3171 | break; | ||||||
3172 | } | ||||||
3173 | case AMDGPU::G_UDIVREM: { | ||||||
3174 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3175 | DstRemReg = MI.getOperand(1).getReg(); | ||||||
3176 | break; | ||||||
3177 | } | ||||||
3178 | } | ||||||
3179 | |||||||
3180 | const LLT S64 = LLT::scalar(64); | ||||||
3181 | const LLT S32 = LLT::scalar(32); | ||||||
3182 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||||
3183 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||||
3184 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||||
3185 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
3186 | |||||||
3187 | if (Ty == S32) | ||||||
3188 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||||
3189 | else if (Ty == S64) | ||||||
3190 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||||
3191 | else | ||||||
3192 | return false; | ||||||
3193 | |||||||
3194 | MI.eraseFromParent(); | ||||||
3195 | return true; | ||||||
3196 | } | ||||||
3197 | |||||||
3198 | bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, | ||||||
3199 | MachineRegisterInfo &MRI, | ||||||
3200 | MachineIRBuilder &B) const { | ||||||
3201 | const LLT S64 = LLT::scalar(64); | ||||||
3202 | const LLT S32 = LLT::scalar(32); | ||||||
3203 | |||||||
3204 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
3205 | if (Ty != S32 && Ty != S64) | ||||||
3206 | return false; | ||||||
3207 | |||||||
3208 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||||
3209 | Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||||
3210 | Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||||
3211 | |||||||
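| // Take absolute values first, using the two's-complement identity | ||||||
| // |x| == (x + (x >> 31)) ^ (x >> 31) (bit 63 for s64). The quotient sign is | ||||||
| // sign(LHS) ^ sign(RHS) and the remainder takes sign(LHS); both are | ||||||
| // reapplied after the unsigned expansion. | ||||||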
3212 | auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); | ||||||
3213 | auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); | ||||||
3214 | auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); | ||||||
3215 | |||||||
3216 | LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); | ||||||
3217 | RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); | ||||||
3218 | |||||||
3219 | LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); | ||||||
3220 | RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); | ||||||
3221 | |||||||
3222 | Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; | ||||||
3223 | switch (MI.getOpcode()) { | ||||||
3224 | default: | ||||||
3225 | llvm_unreachable("Unexpected opcode!"); | ||||||
3226 | case AMDGPU::G_SDIV: { | ||||||
3227 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3228 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3229 | break; | ||||||
3230 | } | ||||||
3231 | case AMDGPU::G_SREM: { | ||||||
3232 | DstRemReg = MI.getOperand(0).getReg(); | ||||||
3233 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3234 | break; | ||||||
3235 | } | ||||||
3236 | case AMDGPU::G_SDIVREM: { | ||||||
3237 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3238 | DstRemReg = MI.getOperand(1).getReg(); | ||||||
3239 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3240 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3241 | break; | ||||||
3242 | } | ||||||
3243 | } | ||||||
3244 | |||||||
3245 | if (Ty == S32) | ||||||
3246 | legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||||
3247 | else | ||||||
3248 | legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||||
3249 | |||||||
3250 | if (DstDivReg) { | ||||||
3251 | auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); | ||||||
3252 | auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); | ||||||
3253 | B.buildSub(DstDivReg, SignXor, Sign); | ||||||
3254 | } | ||||||
3255 | |||||||
3256 | if (DstRemReg) { | ||||||
3257 | auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS | ||||||
3258 | auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); | ||||||
3259 | B.buildSub(DstRemReg, SignXor, Sign); | ||||||
3260 | } | ||||||
3261 | |||||||
3262 | MI.eraseFromParent(); | ||||||
3263 | return true; | ||||||
3264 | } | ||||||
3265 | |||||||
3266 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | ||||||
3267 | MachineRegisterInfo &MRI, | ||||||
3268 | MachineIRBuilder &B) const { | ||||||
3269 | Register Res = MI.getOperand(0).getReg(); | ||||||
3270 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3271 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3272 | uint16_t Flags = MI.getFlags(); | ||||||
3273 | LLT ResTy = MRI.getType(Res); | ||||||
3274 | |||||||
3275 | const MachineFunction &MF = B.getMF(); | ||||||
3276 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||||
3277 | MI.getFlag(MachineInstr::FmAfn); | ||||||
3278 | |||||||
3279 | if (!AllowInaccurateRcp) | ||||||
3280 | return false; | ||||||
3281 | |||||||
3282 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | ||||||
3283 | // 1 / x -> RCP(x) | ||||||
3284 | if (CLHS->isExactlyValue(1.0)) { | ||||||
3285 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
3286 | .addUse(RHS) | ||||||
3287 | .setMIFlags(Flags); | ||||||
3288 | |||||||
3289 | MI.eraseFromParent(); | ||||||
3290 | return true; | ||||||
3291 | } | ||||||
3292 | |||||||
3293 | // -1 / x -> RCP( FNEG(x) ) | ||||||
3294 | if (CLHS->isExactlyValue(-1.0)) { | ||||||
3295 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | ||||||
3296 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
3297 | .addUse(FNeg.getReg(0)) | ||||||
3298 | .setMIFlags(Flags); | ||||||
3299 | |||||||
3300 | MI.eraseFromParent(); | ||||||
3301 | return true; | ||||||
3302 | } | ||||||
3303 | } | ||||||
3304 | |||||||
3305 | // x / y -> x * (1.0 / y) | ||||||
3306 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
3307 | .addUse(RHS) | ||||||
3308 | .setMIFlags(Flags); | ||||||
3309 | B.buildFMul(Res, LHS, RCP, Flags); | ||||||
3310 | |||||||
3311 | MI.eraseFromParent(); | ||||||
3312 | return true; | ||||||
3313 | } | ||||||
3314 | |||||||
3315 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, | ||||||
3316 | MachineRegisterInfo &MRI, | ||||||
3317 | MachineIRBuilder &B) const { | ||||||
3318 | Register Res = MI.getOperand(0).getReg(); | ||||||
3319 | Register X = MI.getOperand(1).getReg(); | ||||||
3320 | Register Y = MI.getOperand(2).getReg(); | ||||||
3321 | uint16_t Flags = MI.getFlags(); | ||||||
3322 | LLT ResTy = MRI.getType(Res); | ||||||
3323 | |||||||
3324 | const MachineFunction &MF = B.getMF(); | ||||||
3325 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||||
3326 | MI.getFlag(MachineInstr::FmAfn); | ||||||
3327 | |||||||
3328 | if (!AllowInaccurateRcp) | ||||||
3329 | return false; | ||||||
3330 | |||||||
3331 | auto NegY = B.buildFNeg(ResTy, Y); | ||||||
3332 | auto One = B.buildFConstant(ResTy, 1.0); | ||||||
3333 | |||||||
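| // Refine rcp(y) with two Newton-Raphson steps, r' = fma(fma(-y, r, 1), r, r), | ||||||
| // then form x * r and correct the final product with one more fma pair. | ||||||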
3334 | auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
3335 | .addUse(Y) | ||||||
3336 | .setMIFlags(Flags); | ||||||
3337 | |||||||
3338 | auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); | ||||||
3339 | R = B.buildFMA(ResTy, Tmp0, R, R); | ||||||
3340 | |||||||
3341 | auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); | ||||||
3342 | R = B.buildFMA(ResTy, Tmp1, R, R); | ||||||
3343 | |||||||
3344 | auto Ret = B.buildFMul(ResTy, X, R); | ||||||
3345 | auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); | ||||||
3346 | |||||||
3347 | B.buildFMA(Res, Tmp2, R, Ret); | ||||||
3348 | MI.eraseFromParent(); | ||||||
3349 | return true; | ||||||
3350 | } | ||||||
3351 | |||||||
3352 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | ||||||
3353 | MachineRegisterInfo &MRI, | ||||||
3354 | MachineIRBuilder &B) const { | ||||||
3355 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||||
3356 | return true; | ||||||
3357 | |||||||
3358 | Register Res = MI.getOperand(0).getReg(); | ||||||
3359 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3360 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3361 | |||||||
3362 | uint16_t Flags = MI.getFlags(); | ||||||
3363 | |||||||
3364 | LLT S16 = LLT::scalar(16); | ||||||
3365 | LLT S32 = LLT::scalar(32); | ||||||
3366 | |||||||
3367 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | ||||||
3368 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | ||||||
3369 | |||||||
3370 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3371 | .addUse(RHSExt.getReg(0)) | ||||||
3372 | .setMIFlags(Flags); | ||||||
3373 | |||||||
3374 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | ||||||
3375 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | ||||||
3376 | |||||||
3377 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
3378 | .addUse(RDst.getReg(0)) | ||||||
3379 | .addUse(RHS) | ||||||
3380 | .addUse(LHS) | ||||||
3381 | .setMIFlags(Flags); | ||||||
3382 | |||||||
3383 | MI.eraseFromParent(); | ||||||
3384 | return true; | ||||||
3385 | } | ||||||
3386 | |||||||
3387 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | ||||||
3388 | // to enable denorm mode. When 'Enable' is false, disable denorm mode. | ||||||
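| // With S_DENORM_MODE the 4-bit immediate packs the FP32 denorm bits in [1:0] | ||||||
| // and the FP64/FP16 bits in [3:2] (hence the << 2 below). Older subtargets | ||||||
| // instead write the 2-bit FP32 field at offset 4 of the MODE register via | ||||||
| // S_SETREG (its width is encoded as WIDTH_M1 == 1). | ||||||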
3389 | static void toggleSPDenormMode(bool Enable, | ||||||
3390 | MachineIRBuilder &B, | ||||||
3391 | const GCNSubtarget &ST, | ||||||
3392 | AMDGPU::SIModeRegisterDefaults Mode) { | ||||||
3393 | // Set SP denorm mode to this value. | ||||||
3394 | unsigned SPDenormMode = | ||||||
3395 | Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); | ||||||
3396 | |||||||
3397 | if (ST.hasDenormModeInst()) { | ||||||
3398 | // Preserve default FP64FP16 denorm mode while updating FP32 mode. | ||||||
3399 | uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); | ||||||
3400 | |||||||
3401 | uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | ||||||
3402 | B.buildInstr(AMDGPU::S_DENORM_MODE) | ||||||
3403 | .addImm(NewDenormModeValue); | ||||||
3404 | |||||||
3405 | } else { | ||||||
3406 | // Select FP32 bit field in mode register. | ||||||
3407 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | ||||||
3408 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | ||||||
3409 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | ||||||
3410 | |||||||
3411 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | ||||||
3412 | .addImm(SPDenormMode) | ||||||
3413 | .addImm(SPDenormModeBitField); | ||||||
3414 | } | ||||||
3415 | } | ||||||
3416 | |||||||
3417 | bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, | ||||||
3418 | MachineRegisterInfo &MRI, | ||||||
3419 | MachineIRBuilder &B) const { | ||||||
3420 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||||
3421 | return true; | ||||||
3422 | |||||||
3423 | Register Res = MI.getOperand(0).getReg(); | ||||||
3424 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3425 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3426 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3427 | AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); | ||||||
3428 | |||||||
3429 | uint16_t Flags = MI.getFlags(); | ||||||
3430 | |||||||
3431 | LLT S32 = LLT::scalar(32); | ||||||
3432 | LLT S1 = LLT::scalar(1); | ||||||
3433 | |||||||
3434 | auto One = B.buildFConstant(S32, 1.0f); | ||||||
3435 | |||||||
3436 | auto DenominatorScaled = | ||||||
3437 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
3438 | .addUse(LHS) | ||||||
3439 | .addUse(RHS) | ||||||
3440 | .addImm(0) | ||||||
3441 | .setMIFlags(Flags); | ||||||
3442 | auto NumeratorScaled = | ||||||
3443 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
3444 | .addUse(LHS) | ||||||
3445 | .addUse(RHS) | ||||||
3446 | .addImm(1) | ||||||
3447 | .setMIFlags(Flags); | ||||||
3448 | |||||||
3449 | auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3450 | .addUse(DenominatorScaled.getReg(0)) | ||||||
3451 | .setMIFlags(Flags); | ||||||
3452 | auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); | ||||||
3453 | |||||||
3454 | // FIXME: Doesn't correctly model the FP mode switch, and the FP operations | ||||||
3455 | // aren't modeled as reading it. | ||||||
3456 | if (!Mode.allFP32Denormals()) | ||||||
3457 | toggleSPDenormMode(true, B, ST, Mode); | ||||||
3458 | |||||||
3459 | auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); | ||||||
3460 | auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); | ||||||
3461 | auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); | ||||||
3462 | auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); | ||||||
3463 | auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); | ||||||
3464 | auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); | ||||||
3465 | |||||||
3466 | if (!Mode.allFP32Denormals()) | ||||||
3467 | toggleSPDenormMode(false, B, ST, Mode); | ||||||
3468 | |||||||
3469 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) | ||||||
3470 | .addUse(Fma4.getReg(0)) | ||||||
3471 | .addUse(Fma1.getReg(0)) | ||||||
3472 | .addUse(Fma3.getReg(0)) | ||||||
3473 | .addUse(NumeratorScaled.getReg(1)) | ||||||
3474 | .setMIFlags(Flags); | ||||||
3475 | |||||||
3476 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
3477 | .addUse(Fmas.getReg(0)) | ||||||
3478 | .addUse(RHS) | ||||||
3479 | .addUse(LHS) | ||||||
3480 | .setMIFlags(Flags); | ||||||
3481 | |||||||
3482 | MI.eraseFromParent(); | ||||||
3483 | return true; | ||||||
3484 | } | ||||||
3485 | |||||||
3486 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | ||||||
3487 | MachineRegisterInfo &MRI, | ||||||
3488 | MachineIRBuilder &B) const { | ||||||
3489 | if (legalizeFastUnsafeFDIV64(MI, MRI, B)) | ||||||
3490 | return true; | ||||||
3491 | |||||||
3492 | Register Res = MI.getOperand(0).getReg(); | ||||||
3493 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3494 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3495 | |||||||
3496 | uint16_t Flags = MI.getFlags(); | ||||||
3497 | |||||||
3498 | LLT S64 = LLT::scalar(64); | ||||||
3499 | LLT S1 = LLT::scalar(1); | ||||||
3500 | |||||||
3501 | auto One = B.buildFConstant(S64, 1.0); | ||||||
3502 | |||||||
3503 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
3504 | .addUse(LHS) | ||||||
3505 | .addUse(RHS) | ||||||
3506 | .addImm(0) | ||||||
3507 | .setMIFlags(Flags); | ||||||
3508 | |||||||
3509 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | ||||||
3510 | |||||||
3511 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | ||||||
3512 | .addUse(DivScale0.getReg(0)) | ||||||
3513 | .setMIFlags(Flags); | ||||||
3514 | |||||||
3515 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | ||||||
3516 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | ||||||
3517 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | ||||||
3518 | |||||||
3519 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
3520 | .addUse(LHS) | ||||||
3521 | .addUse(RHS) | ||||||
3522 | .addImm(1) | ||||||
3523 | .setMIFlags(Flags); | ||||||
3524 | |||||||
3525 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | ||||||
3526 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | ||||||
3527 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | ||||||
3528 | |||||||
3529 | Register Scale; | ||||||
3530 | if (!ST.hasUsableDivScaleConditionOutput()) { | ||||||
3531 | // Work around a hardware bug on SI where the condition output from div_scale | ||||||
3532 | // is not usable. | ||||||
3533 | |||||||
3534 | LLT S32 = LLT::scalar(32); | ||||||
3535 | |||||||
3536 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | ||||||
3537 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | ||||||
3538 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | ||||||
3539 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | ||||||
3540 | |||||||
3541 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | ||||||
3542 | Scale1Unmerge.getReg(1)); | ||||||
3543 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | ||||||
3544 | Scale0Unmerge.getReg(1)); | ||||||
3545 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | ||||||
3546 | } else { | ||||||
3547 | Scale = DivScale1.getReg(1); | ||||||
3548 | } | ||||||
3549 | |||||||
3550 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | ||||||
3551 | .addUse(Fma4.getReg(0)) | ||||||
3552 | .addUse(Fma3.getReg(0)) | ||||||
3553 | .addUse(Mul.getReg(0)) | ||||||
3554 | .addUse(Scale) | ||||||
3555 | .setMIFlags(Flags); | ||||||
3556 | |||||||
3557 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) | ||||||
3558 | .addUse(Fmas.getReg(0)) | ||||||
3559 | .addUse(RHS) | ||||||
3560 | .addUse(LHS) | ||||||
3561 | .setMIFlags(Flags); | ||||||
3562 | |||||||
3563 | MI.eraseFromParent(); | ||||||
3564 | return true; | ||||||
3565 | } | ||||||
3566 | |||||||
3567 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||||
3568 | MachineRegisterInfo &MRI, | ||||||
3569 | MachineIRBuilder &B) const { | ||||||
3570 | Register Res = MI.getOperand(0).getReg(); | ||||||
3571 | Register LHS = MI.getOperand(2).getReg(); | ||||||
3572 | Register RHS = MI.getOperand(3).getReg(); | ||||||
3573 | uint16_t Flags = MI.getFlags(); | ||||||
3574 | |||||||
3575 | LLT S32 = LLT::scalar(32); | ||||||
3576 | LLT S1 = LLT::scalar(1); | ||||||
3577 | |||||||
3578 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||||
3579 | const APFloat C0Val(1.0f); | ||||||
3580 | |||||||
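| // Note (illustrative): 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 | ||||||
| // bit patterns. When |RHS| > 2^96, the denominator is pre-scaled by 2^-32 | ||||||
| // so its reciprocal stays comfortably in range; the final multiply by Sel | ||||||
| // cancels the scale: Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS. | ||||||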
3581 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||||
3582 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||||
3583 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); | ||||||
3584 | |||||||
3585 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||||
3586 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||||
3587 | |||||||
3588 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||||
3589 | |||||||
3590 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3591 | .addUse(Mul0.getReg(0)) | ||||||
3592 | .setMIFlags(Flags); | ||||||
3593 | |||||||
3594 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||||
3595 | |||||||
3596 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||||
3597 | |||||||
3598 | MI.eraseFromParent(); | ||||||
3599 | return true; | ||||||
3600 | } | ||||||
3601 | |||||||
3602 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | ||||||
3603 | // FIXME: Why do we handle this one but not other removed instructions? | ||||||
3604 | // | ||||||
3605 | // Reciprocal square root. The clamp prevents infinite results, clamping | ||||||
3606 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | ||||||
3607 | // +-max_float. | ||||||
3608 | bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, | ||||||
3609 | MachineRegisterInfo &MRI, | ||||||
3610 | MachineIRBuilder &B) const { | ||||||
3611 | if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) | ||||||
3612 | return true; | ||||||
3613 | |||||||
3614 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3615 | Register Src = MI.getOperand(2).getReg(); | ||||||
3616 | auto Flags = MI.getFlags(); | ||||||
3617 | |||||||
3618 | LLT Ty = MRI.getType(Dst); | ||||||
3619 | |||||||
3620 | const fltSemantics *FltSemantics; | ||||||
3621 | if (Ty == LLT::scalar(32)) | ||||||
3622 | FltSemantics = &APFloat::IEEEsingle(); | ||||||
3623 | else if (Ty == LLT::scalar(64)) | ||||||
3624 | FltSemantics = &APFloat::IEEEdouble(); | ||||||
3625 | else | ||||||
3626 | return false; | ||||||
3627 | |||||||
3628 | auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) | ||||||
3629 | .addUse(Src) | ||||||
3630 | .setMIFlags(Flags); | ||||||
3631 | |||||||
3632 | // We don't need to concern ourselves with the snan handling difference, since | ||||||
3633 | // the rsq has already quieted (or not) the input; use the variant that selects directly. | ||||||
3634 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3635 | const bool UseIEEE = MFI->getMode().IEEE; | ||||||
3636 | |||||||
3637 | auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); | ||||||
3638 | auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : | ||||||
3639 | B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); | ||||||
3640 | |||||||
3641 | auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); | ||||||
3642 | |||||||
3643 | if (UseIEEE) | ||||||
3644 | B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); | ||||||
3645 | else | ||||||
3646 | B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); | ||||||
3647 | MI.eraseFromParent(); | ||||||
3648 | return true; | ||||||
3649 | } | ||||||
3650 | |||||||
3651 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||||
3652 | switch (IID) { | ||||||
3653 | case Intrinsic::amdgcn_ds_fadd: | ||||||
3654 | return AMDGPU::G_ATOMICRMW_FADD; | ||||||
3655 | case Intrinsic::amdgcn_ds_fmin: | ||||||
3656 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||||
3657 | case Intrinsic::amdgcn_ds_fmax: | ||||||
3658 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||||
3659 | default: | ||||||
3660 | llvm_unreachable("not a DS FP intrinsic"); | ||||||
3661 | } | ||||||
3662 | } | ||||||
3663 | |||||||
3664 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | ||||||
3665 | MachineInstr &MI, | ||||||
3666 | Intrinsic::ID IID) const { | ||||||
3667 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
3668 | Observer.changingInstr(MI); | ||||||
3669 | |||||||
3670 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | ||||||
3671 | |||||||
3672 | // The remaining operands were used to set fields in the MemOperand on | ||||||
3673 | // construction. | ||||||
3674 | for (int I = 6; I > 3; --I) | ||||||
3675 | MI.RemoveOperand(I); | ||||||
3676 | |||||||
3677 | MI.RemoveOperand(1); // Remove the intrinsic ID. | ||||||
3678 | Observer.changedInstr(MI); | ||||||
3679 | return true; | ||||||
3680 | } | ||||||
3681 | |||||||
3682 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | ||||||
3683 | MachineRegisterInfo &MRI, | ||||||
3684 | MachineIRBuilder &B) const { | ||||||
3685 | uint64_t Offset = | ||||||
3686 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||||
3687 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||||
3688 | LLT DstTy = MRI.getType(DstReg); | ||||||
3689 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||||
3690 | |||||||
3691 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||||
3692 | if (!loadInputValue(KernargPtrReg, B, | ||||||
3693 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||||
3694 | return false; | ||||||
3695 | |||||||
3696 | // FIXME: This should be nuw | ||||||
3697 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||||
3698 | return true; | ||||||
3699 | } | ||||||
3700 | |||||||
3701 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||||
3702 | MachineRegisterInfo &MRI, | ||||||
3703 | MachineIRBuilder &B) const { | ||||||
3704 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3705 | if (!MFI->isEntryFunction()) { | ||||||
3706 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3707 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||||
3708 | } | ||||||
3709 | |||||||
3710 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
3711 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||||
3712 | return false; | ||||||
3713 | |||||||
3714 | MI.eraseFromParent(); | ||||||
3715 | return true; | ||||||
3716 | } | ||||||
3717 | |||||||
3718 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||||
3719 | MachineRegisterInfo &MRI, | ||||||
3720 | MachineIRBuilder &B, | ||||||
3721 | unsigned AddrSpace) const { | ||||||
3722 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||||
3723 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||||
3724 | Register Hi32 = Unmerge.getReg(1); | ||||||
3725 | |||||||
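| // A flat pointer lies within the queried segment iff its high 32 bits | ||||||
| // equal the segment's aperture value. | ||||||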
3726 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||||
3727 | MI.eraseFromParent(); | ||||||
3728 | return true; | ||||||
3729 | } | ||||||
3730 | |||||||
3731 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||||
3732 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||||
3733 | // split between the instruction's voffset and immoffset fields) and soffset | ||||||
3734 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||||
3735 | // the instruction's soffset field). This function takes the first kind of | ||||||
3736 | // offset and figures out how to split it between voffset and immoffset. | ||||||
3737 | std::pair<Register, unsigned> | ||||||
3738 | AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, | ||||||
3739 | Register OrigOffset) const { | ||||||
3740 | const unsigned MaxImm = 4095; | ||||||
3741 | Register BaseReg; | ||||||
3742 | unsigned ImmOffset; | ||||||
3743 | const LLT S32 = LLT::scalar(32); | ||||||
3744 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
3745 | |||||||
3746 | std::tie(BaseReg, ImmOffset) = | ||||||
3747 | AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); | ||||||
3748 | |||||||
3749 | // If BaseReg is a pointer, convert it to int. | ||||||
3750 | if (MRI.getType(BaseReg).isPointer()) | ||||||
3751 | BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); | ||||||
3752 | |||||||
3753 | // If the immediate value is too big for the immoffset field, put the value | ||||||
3754 | // and -4096 into the immoffset field so that the value that is copied/added | ||||||
3755 | // for the voffset field is a multiple of 4096, and it stands more chance | ||||||
3756 | // of being CSEd with the copy/add for another similar load/store. | ||||||
3757 | // However, do not do that rounding down to a multiple of 4096 if that is a | ||||||
3758 | // negative number, as it appears to be illegal to have a negative offset | ||||||
3759 | // in the vgpr, even if adding the immediate offset makes it positive. | ||||||
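| // Worked example (illustrative): ImmOffset = 4100 splits into Overflow = | ||||||
| // 4096 (added to the voffset register) and ImmOffset = 4. A negative | ||||||
| // 32-bit value such as -4000 instead yields ImmOffset = 0 with the whole | ||||||
| // constant carried in voffset. | ||||||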
3760 | unsigned Overflow = ImmOffset & ~MaxImm; | ||||||
3761 | ImmOffset -= Overflow; | ||||||
3762 | if ((int32_t)Overflow < 0) { | ||||||
3763 | Overflow += ImmOffset; | ||||||
3764 | ImmOffset = 0; | ||||||
3765 | } | ||||||
3766 | |||||||
3767 | if (Overflow != 0) { | ||||||
3768 | if (!BaseReg) { | ||||||
3769 | BaseReg = B.buildConstant(S32, Overflow).getReg(0); | ||||||
3770 | } else { | ||||||
3771 | auto OverflowVal = B.buildConstant(S32, Overflow); | ||||||
3772 | BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); | ||||||
3773 | } | ||||||
3774 | } | ||||||
3775 | |||||||
3776 | if (!BaseReg) | ||||||
3777 | BaseReg = B.buildConstant(S32, 0).getReg(0); | ||||||
3778 | |||||||
3779 | return std::make_pair(BaseReg, ImmOffset); | ||||||
3780 | } | ||||||
3781 | |||||||
3782 | /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. | ||||||
3783 | void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, | ||||||
3784 | Register VOffset, Register SOffset, | ||||||
3785 | unsigned ImmOffset, Register VIndex, | ||||||
3786 | MachineRegisterInfo &MRI) const { | ||||||
3787 | Optional<ValueAndVReg> MaybeVOffsetVal = | ||||||
3788 | getIConstantVRegValWithLookThrough(VOffset, MRI); | ||||||
3789 | Optional<ValueAndVReg> MaybeSOffsetVal = | ||||||
3790 | getIConstantVRegValWithLookThrough(SOffset, MRI); | ||||||
3791 | Optional<ValueAndVReg> MaybeVIndexVal = | ||||||
3792 | getIConstantVRegValWithLookThrough(VIndex, MRI); | ||||||
3793 | // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, | ||||||
3794 | // update the MMO with that offset. The stride is unknown so we can only do | ||||||
3795 | // this if VIndex is constant 0. | ||||||
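| // E.g. VOffset = 16, SOffset = 4, ImmOffset = 8 with VIndex = 0 gives an | ||||||
| // MMO offset of 28. | ||||||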
3796 | if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && | ||||||
3797 | MaybeVIndexVal->Value == 0) { | ||||||
3798 | uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + | ||||||
3799 | MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; | ||||||
3800 | MMO->setOffset(TotalOffset); | ||||||
3801 | } else { | ||||||
3802 | // We don't have a constant combined offset to use in the MMO. Give up. | ||||||
3803 | MMO->setValue((Value *)nullptr); | ||||||
3804 | } | ||||||
3805 | } | ||||||
3806 | |||||||
3807 | /// Handle register layout difference for f16 images for some subtargets. | ||||||
3808 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||||
3809 | MachineRegisterInfo &MRI, | ||||||
3810 | Register Reg, | ||||||
3811 | bool ImageStore) const { | ||||||
3812 | const LLT S16 = LLT::scalar(16); | ||||||
3813 | const LLT S32 = LLT::scalar(32); | ||||||
3814 | LLT StoreVT = MRI.getType(Reg); | ||||||
3815 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16); | ||||||
3816 | |||||||
3817 | if (ST.hasUnpackedD16VMem()) { | ||||||
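| // Unpacked layout sketch: each s16 element gets the low half of its own | ||||||
| // 32-bit register, e.g. <2 x s16> becomes a <2 x s32> of any-extended | ||||||
| // elements. | ||||||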
3818 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
3819 | |||||||
3820 | SmallVector<Register, 4> WideRegs; | ||||||
3821 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3822 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||||
3823 | |||||||
3824 | int NumElts = StoreVT.getNumElements(); | ||||||
3825 | |||||||
3826 | return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) | ||||||
3827 | .getReg(0); | ||||||
3828 | } | ||||||
3829 | |||||||
3830 | if (ImageStore && ST.hasImageStoreD16Bug()) { | ||||||
3831 | if (StoreVT.getNumElements() == 2) { | ||||||
3832 | SmallVector<Register, 4> PackedRegs; | ||||||
3833 | Reg = B.buildBitcast(S32, Reg).getReg(0); | ||||||
3834 | PackedRegs.push_back(Reg); | ||||||
3835 | PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); | ||||||
3836 | return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) | ||||||
3837 | .getReg(0); | ||||||
3838 | } | ||||||
3839 | |||||||
3840 | if (StoreVT.getNumElements() == 3) { | ||||||
3841 | SmallVector<Register, 4> PackedRegs; | ||||||
3842 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
3843 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3844 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||
3845 | PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); | ||||||
3846 | Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); | ||||||
3847 | return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); | ||||||
3848 | } | ||||||
3849 | |||||||
3850 | if (StoreVT.getNumElements() == 4) { | ||||||
3851 | SmallVector<Register, 4> PackedRegs; | ||||||
3852 | Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); | ||||||
3853 | auto Unmerge = B.buildUnmerge(S32, Reg); | ||||||
3854 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3855 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||
3856 | PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); | ||||||
3857 | return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) | ||||||
3858 | .getReg(0); | ||||||
3859 | } | ||||||
3860 | |||||||
3861 | llvm_unreachable("invalid data type"); | ||||||
3862 | } | ||||||
3863 | |||||||
3864 | if (StoreVT == LLT::fixed_vector(3, S16)) { | ||||||
3865 | Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) | ||||||
3866 | .getReg(0); | ||||||
3867 | } | ||||||
3868 | return Reg; | ||||||
3869 | } | ||||||
3870 | |||||||
3871 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||||
3872 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||||
3873 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
3874 | LLT Ty = MRI->getType(VData); | ||||||
3875 | |||||||
3876 | const LLT S16 = LLT::scalar(16); | ||||||
3877 | |||||||
3878 | // Fix up illegal register types for i8 and i16 stores. | ||||||
3879 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||||
3880 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||||
3881 | return AnyExt; | ||||||
3882 | } | ||||||
3883 | |||||||
3884 | if (Ty.isVector()) { | ||||||
3885 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||||
3886 | if (IsFormat) | ||||||
3887 | return handleD16VData(B, *MRI, VData); | ||||||
3888 | } | ||||||
3889 | } | ||||||
3890 | |||||||
3891 | return VData; | ||||||
3892 | } | ||||||
3893 | |||||||
3894 | bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, | ||||||
3895 | MachineRegisterInfo &MRI, | ||||||
3896 | MachineIRBuilder &B, | ||||||
3897 | bool IsTyped, | ||||||
3898 | bool IsFormat) const { | ||||||
3899 | Register VData = MI.getOperand(1).getReg(); | ||||||
3900 | LLT Ty = MRI.getType(VData); | ||||||
3901 | LLT EltTy = Ty.getScalarType(); | ||||||
3902 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||
3903 | const LLT S32 = LLT::scalar(32); | ||||||
3904 | |||||||
3905 | VData = fixStoreSourceType(B, VData, IsFormat); | ||||||
3906 | Register RSrc = MI.getOperand(2).getReg(); | ||||||
3907 | |||||||
3908 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
3909 | const int MemSize = MMO->getSize(); | ||||||
3910 | |||||||
3911 | unsigned ImmOffset; | ||||||
3912 | |||||||
3913 | // The typed intrinsics add an immediate after the registers. | ||||||
3914 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||
3915 | |||||||
3916 | // The struct intrinsic variants add one additional operand over raw. | ||||||
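| // E.g. the raw (non-typed) store is (intrinsic-id, vdata, rsrc, voffset, | ||||||
| // soffset, aux) = 6 operands, while the struct form inserts vindex after | ||||||
| // rsrc, giving the 7-operand count checked below. | ||||||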
3917 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3918 | Register VIndex; | ||||||
3919 | int OpOffset = 0; | ||||||
3920 | if (HasVIndex) { | ||||||
3921 | VIndex = MI.getOperand(3).getReg(); | ||||||
3922 | OpOffset = 1; | ||||||
3923 | } else { | ||||||
3924 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||
3925 | } | ||||||
3926 | |||||||
3927 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3928 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3929 | |||||||
3930 | unsigned Format = 0; | ||||||
3931 | if (IsTyped) { | ||||||
3932 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3933 | ++OpOffset; | ||||||
3934 | } | ||||||
3935 | |||||||
3936 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3937 | |||||||
3938 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||||
3939 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); | ||||||
3940 | |||||||
3941 | unsigned Opc; | ||||||
3942 | if (IsTyped) { | ||||||
3943 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : | ||||||
3944 | AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; | ||||||
3945 | } else if (IsFormat) { | ||||||
3946 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : | ||||||
3947 | AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; | ||||||
3948 | } else { | ||||||
3949 | switch (MemSize) { | ||||||
3950 | case 1: | ||||||
3951 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; | ||||||
3952 | break; | ||||||
3953 | case 2: | ||||||
3954 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; | ||||||
3955 | break; | ||||||
3956 | default: | ||||||
3957 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; | ||||||
3958 | break; | ||||||
3959 | } | ||||||
3960 | } | ||||||
3961 | |||||||
3962 | auto MIB = B.buildInstr(Opc) | ||||||
3963 | .addUse(VData) // vdata | ||||||
3964 | .addUse(RSrc) // rsrc | ||||||
3965 | .addUse(VIndex) // vindex | ||||||
3966 | .addUse(VOffset) // voffset | ||||||
3967 | .addUse(SOffset) // soffset | ||||||
3968 | .addImm(ImmOffset); // offset(imm) | ||||||
3969 | |||||||
3970 | if (IsTyped) | ||||||
3971 | MIB.addImm(Format); | ||||||
3972 | |||||||
3973 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
3974 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
3975 | .addMemOperand(MMO); | ||||||
3976 | |||||||
3977 | MI.eraseFromParent(); | ||||||
3978 | return true; | ||||||
3979 | } | ||||||
3980 | |||||||
3981 | bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, | ||||||
3982 | MachineRegisterInfo &MRI, | ||||||
3983 | MachineIRBuilder &B, | ||||||
3984 | bool IsFormat, | ||||||
3985 | bool IsTyped) const { | ||||||
3986 | // FIXME: Verifier should enforce 1 MMO for these intrinsics. | ||||||
3987 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
3988 | const LLT MemTy = MMO->getMemoryType(); | ||||||
3989 | const LLT S32 = LLT::scalar(32); | ||||||
3990 | |||||||
3991 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3992 | Register RSrc = MI.getOperand(2).getReg(); | ||||||
3993 | |||||||
3994 | // The typed intrinsics add an immediate after the registers. | ||||||
3995 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||
3996 | |||||||
3997 | // The struct intrinsic variants add one additional operand over raw. | ||||||
3998 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3999 | Register VIndex; | ||||||
4000 | int OpOffset = 0; | ||||||
4001 | if (HasVIndex) { | ||||||
4002 | VIndex = MI.getOperand(3).getReg(); | ||||||
4003 | OpOffset = 1; | ||||||
4004 | } else { | ||||||
4005 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||
4006 | } | ||||||
4007 | |||||||
4008 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||
4009 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
4010 | |||||||
4011 | unsigned Format = 0; | ||||||
4012 | if (IsTyped) { | ||||||
4013 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||
4014 | ++OpOffset; | ||||||
4015 | } | ||||||
4016 | |||||||
4017 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||
4018 | unsigned ImmOffset; | ||||||
4019 | |||||||
4020 | LLT Ty = MRI.getType(Dst); | ||||||
4021 | LLT EltTy = Ty.getScalarType(); | ||||||
4022 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||
4023 | const bool Unpacked = ST.hasUnpackedD16VMem(); | ||||||
4024 | |||||||
4025 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||||
4026 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); | ||||||
4027 | |||||||
4028 | unsigned Opc; | ||||||
4029 | |||||||
4030 | if (IsTyped) { | ||||||
4031 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : | ||||||
4032 | AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; | ||||||
4033 | } else if (IsFormat) { | ||||||
4034 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : | ||||||
4035 | AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; | ||||||
4036 | } else { | ||||||
4037 | switch (MemTy.getSizeInBits()) { | ||||||
4038 | case 8: | ||||||
4039 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; | ||||||
4040 | break; | ||||||
4041 | case 16: | ||||||
4042 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; | ||||||
4043 | break; | ||||||
4044 | default: | ||||||
4045 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; | ||||||
4046 | break; | ||||||
4047 | } | ||||||
4048 | } | ||||||
4049 | |||||||
4050 | Register LoadDstReg; | ||||||
4051 | |||||||
4052 | bool IsExtLoad = | ||||||
4053 | (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); | ||||||
4054 | LLT UnpackedTy = Ty.changeElementSize(32); | ||||||
4055 | |||||||
4056 | if (IsExtLoad) | ||||||
4057 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); | ||||||
4058 | else if (Unpacked && IsD16 && Ty.isVector()) | ||||||
4059 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); | ||||||
4060 | else | ||||||
4061 | LoadDstReg = Dst; | ||||||
4062 | |||||||
4063 | auto MIB = B.buildInstr(Opc) | ||||||
4064 | .addDef(LoadDstReg) // vdata | ||||||
4065 | .addUse(RSrc) // rsrc | ||||||
4066 | .addUse(VIndex) // vindex | ||||||
4067 | .addUse(VOffset) // voffset | ||||||
4068 | .addUse(SOffset) // soffset | ||||||
4069 | .addImm(ImmOffset); // offset(imm) | ||||||
4070 | |||||||
4071 | if (IsTyped) | ||||||
4072 | MIB.addImm(Format); | ||||||
4073 | |||||||
4074 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
4075 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
4076 | .addMemOperand(MMO); | ||||||
4077 | |||||||
4078 | if (LoadDstReg != Dst) { | ||||||
4079 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||||
4080 | |||||||
4081 | // The result was widened for the extending load; truncate it back. | ||||||
4082 | if (IsExtLoad) | ||||||
4083 | B.buildTrunc(Dst, LoadDstReg); | ||||||
4084 | else { | ||||||
4085 | // Repack to original 16-bit vector result | ||||||
4086 | // FIXME: G_TRUNC should work, but legalization currently fails | ||||||
4087 | auto Unmerge = B.buildUnmerge(S32, LoadDstReg); | ||||||
4088 | SmallVector<Register, 4> Repack; | ||||||
4089 | for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) | ||||||
4090 | Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); | ||||||
4091 | B.buildMerge(Dst, Repack); | ||||||
4092 | } | ||||||
4093 | } | ||||||
4094 | |||||||
4095 | MI.eraseFromParent(); | ||||||
4096 | return true; | ||||||
4097 | } | ||||||
4098 | |||||||
4099 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||||
4100 | MachineIRBuilder &B, | ||||||
4101 | bool IsInc) const { | ||||||
4102 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : | ||||||
4103 | AMDGPU::G_AMDGPU_ATOMIC_DEC; | ||||||
4104 | B.buildInstr(Opc) | ||||||
4105 | .addDef(MI.getOperand(0).getReg()) | ||||||
4106 | .addUse(MI.getOperand(2).getReg()) | ||||||
4107 | .addUse(MI.getOperand(3).getReg()) | ||||||
4108 | .cloneMemRefs(MI); | ||||||
4109 | MI.eraseFromParent(); | ||||||
4110 | return true; | ||||||
4111 | } | ||||||
4112 | |||||||
4113 | static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { | ||||||
4114 | switch (IntrID) { | ||||||
4115 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||
4116 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||
4117 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; | ||||||
4118 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||
4119 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||
4120 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; | ||||||
4121 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||
4122 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||
4123 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; | ||||||
4124 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||
4125 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||
4126 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; | ||||||
4127 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||
4128 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||
4129 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; | ||||||
4130 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||
4131 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||
4132 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; | ||||||
4133 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||
4134 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||
4135 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; | ||||||
4136 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||
4137 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||
4138 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; | ||||||
4139 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||
4140 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||
4141 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; | ||||||
4142 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||
4143 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||
4144 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; | ||||||
4145 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||
4146 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||
4147 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; | ||||||
4148 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||
4149 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||
4150 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; | ||||||
4151 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||
4152 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||
4153 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; | ||||||
4154 | case Intrinsic::amdgcn_buffer_atomic_fadd: | ||||||
4155 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||||
4156 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||||
4157 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; | ||||||
4158 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||||
4159 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||||
4160 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; | ||||||
4161 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||||
4162 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||||
4163 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; | ||||||
4164 | default: | ||||||
4165 | llvm_unreachable("unhandled atomic opcode"); | ||||||
4166 | } | ||||||
4167 | } | ||||||
4168 | |||||||
4169 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | ||||||
4170 | MachineIRBuilder &B, | ||||||
4171 | Intrinsic::ID IID) const { | ||||||
4172 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | ||||||
4173 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | ||||||
4174 | const bool HasReturn = MI.getNumExplicitDefs() != 0; | ||||||
4175 | |||||||
4176 | Register Dst; | ||||||
4177 | |||||||
4178 | int OpOffset = 0; | ||||||
4179 | if (HasReturn) { | ||||||
4180 | // A few FP atomics do not support return values. | ||||||
4181 | Dst = MI.getOperand(0).getReg(); | ||||||
4182 | } else { | ||||||
4183 | OpOffset = -1; | ||||||
4184 | } | ||||||
4185 | |||||||
4186 | Register VData = MI.getOperand(2 + OpOffset).getReg(); | ||||||
4187 | Register CmpVal; | ||||||
4188 | |||||||
4189 | if (IsCmpSwap) { | ||||||
4190 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | ||||||
4191 | ++OpOffset; | ||||||
4192 | } | ||||||
4193 | |||||||
4194 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | ||||||
4195 | const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; | ||||||
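| // E.g. a returning struct cmpswap has (def, intrinsic-id, vdata, cmp, | ||||||
| // rsrc, vindex, voffset, soffset, aux) = 9 operands; the raw forms drop | ||||||
| // vindex. | ||||||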
4196 | |||||||
4197 | // The struct intrinsic variants add one additional operand over raw. | ||||||
4198 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
4199 | Register VIndex; | ||||||
4200 | if (HasVIndex) { | ||||||
4201 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | ||||||
4202 | ++OpOffset; | ||||||
4203 | } else { | ||||||
4204 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | ||||||
4205 | } | ||||||
4206 | |||||||
4207 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
4208 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | ||||||
4209 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | ||||||
4210 | |||||||
4211 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
4212 | |||||||
4213 | unsigned ImmOffset; | ||||||
4214 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||||
4215 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); | ||||||
4216 | |||||||
4217 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); | ||||||
4218 | |||||||
4219 | if (HasReturn) | ||||||
4220 | MIB.addDef(Dst); | ||||||
4221 | |||||||
4222 | MIB.addUse(VData); // vdata | ||||||
4223 | |||||||
4224 | if (IsCmpSwap) | ||||||
4225 | MIB.addReg(CmpVal); | ||||||
4226 | |||||||
4227 | MIB.addUse(RSrc) // rsrc | ||||||
4228 | .addUse(VIndex) // vindex | ||||||
4229 | .addUse(VOffset) // voffset | ||||||
4230 | .addUse(SOffset) // soffset | ||||||
4231 | .addImm(ImmOffset) // offset(imm) | ||||||
4232 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
4233 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
4234 | .addMemOperand(MMO); | ||||||
4235 | |||||||
4236 | MI.eraseFromParent(); | ||||||
4237 | return true; | ||||||
4238 | } | ||||||
4239 | |||||||
4240 | /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized | ||||||
4241 | /// vector with s16 typed elements. | ||||||
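| /// E.g. two s16 coordinates (u, v) are combined into one <2 x s16> | ||||||
| /// build_vector; a trailing odd coordinate is instead paired with an undef | ||||||
| /// high half. | ||||||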
4242 | static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, | ||||||
4243 | SmallVectorImpl<Register> &PackedAddrs, | ||||||
4244 | unsigned ArgOffset, | ||||||
4245 | const AMDGPU::ImageDimIntrinsicInfo *Intr, | ||||||
4246 | bool IsA16, bool IsG16) { | ||||||
4247 | const LLT S16 = LLT::scalar(16); | ||||||
4248 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||||
4249 | auto EndIdx = Intr->VAddrEnd; | ||||||
4250 | |||||||
4251 | for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { | ||||||
4252 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||
4253 | if (!SrcOp.isReg()) | ||||||
4254 | continue; // _L to _LZ may have eliminated this. | ||||||
4255 | |||||||
4256 | Register AddrReg = SrcOp.getReg(); | ||||||
4257 | |||||||
4258 | if ((I < Intr->GradientStart) || | ||||||
4259 | (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || | ||||||
4260 | (I >= Intr->CoordStart && !IsA16)) { | ||||||
4261 | // Handle any gradient or coordinate operands that should not be packed | ||||||
4262 | if ((I < Intr->GradientStart) && IsA16 && | ||||||
4263 | (B.getMRI()->getType(AddrReg) == S16)) { | ||||||
4264 | // Special handling of bias when A16 is on. Bias is of type half but | ||||||
4265 | // occupies full 32-bit. | ||||||
4266 | PackedAddrs.push_back( | ||||||
4267 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||||
4268 | .getReg(0)); | ||||||
4269 | } else { | ||||||
4270 | AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); | ||||||
4271 | PackedAddrs.push_back(AddrReg); | ||||||
4272 | } | ||||||
4273 | } else { | ||||||
4274 | // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, | ||||||
4275 | // derivatives dx/dh and dx/dv are packed with undef. | ||||||
4276 | if (((I + 1) >= EndIdx) || | ||||||
4277 | ((Intr->NumGradients / 2) % 2 == 1 && | ||||||
4278 | (I == static_cast<unsigned>(Intr->GradientStart + | ||||||
4279 | (Intr->NumGradients / 2) - 1) || | ||||||
4280 | I == static_cast<unsigned>(Intr->GradientStart + | ||||||
4281 | Intr->NumGradients - 1))) || | ||||||
4282 | // Check for _L to _LZ optimization | ||||||
4283 | !MI.getOperand(ArgOffset + I + 1).isReg()) { | ||||||
4284 | PackedAddrs.push_back( | ||||||
4285 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||||
4286 | .getReg(0)); | ||||||
4287 | } else { | ||||||
4288 | PackedAddrs.push_back( | ||||||
4289 | B.buildBuildVector( | ||||||
4290 | V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) | ||||||
4291 | .getReg(0)); | ||||||
4292 | ++I; | ||||||
4293 | } | ||||||
4294 | } | ||||||
4295 | } | ||||||
4296 | } | ||||||
4297 | |||||||
4298 | /// Convert from separate vaddr components to a single vector address register, | ||||||
4299 | /// and replace the remaining operands with $noreg. | ||||||
4300 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||||
4301 | int DimIdx, int NumVAddrs) { | ||||||
4302 | const LLT S32 = LLT::scalar(32); | ||||||
4303 | |||||||
4304 | SmallVector<Register, 8> AddrRegs; | ||||||
4305 | for (int I = 0; I != NumVAddrs; ++I) { | ||||||
4306 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||
4307 | if (SrcOp.isReg()) { | ||||||
4308 | AddrRegs.push_back(SrcOp.getReg()); | ||||||
4309 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32); | ||||||
4310 | } | ||||||
4311 | } | ||||||
4312 | |||||||
4313 | int NumAddrRegs = AddrRegs.size(); | ||||||
4314 | if (NumAddrRegs != 1) { | ||||||
4315 | // Above 8 elements, round up to the next power of 2 (i.e. 16). | ||||||
4316 | if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { | ||||||
4317 | const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); | ||||||
4318 | auto Undef = B.buildUndef(S32); | ||||||
4319 | AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); | ||||||
4320 | NumAddrRegs = RoundedNumRegs; | ||||||
4321 | } | ||||||
4322 | |||||||
4323 | auto VAddr = | ||||||
4324 | B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); | ||||||
4325 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||||
4326 | } | ||||||
4327 | |||||||
4328 | for (int I = 1; I != NumVAddrs; ++I) { | ||||||
4329 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||
4330 | if (SrcOp.isReg()) | ||||||
4331 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||||
4332 | } | ||||||
4333 | } | ||||||
4334 | |||||||
4335 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | ||||||
4336 | /// | ||||||
4337 | /// Depending on the subtarget, load/store with 16-bit element data need to be | ||||||
4338 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | ||||||
4339 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | ||||||
4340 | /// registers. | ||||||
4341 | /// | ||||||
4342 | /// We don't want to directly select image instructions just yet, but also want | ||||||
4343 | /// to expose all register repacking to the legalizer/combiners. We also don't | ||||||
4344 | /// want a selected instruction entering RegBankSelect. In order to avoid | ||||||
4345 | /// defining a multitude of intermediate image instructions, directly hack on | ||||||
4346 | /// the intrinsic's arguments. In cases like a16 addresses, this requires | ||||||
4347 | /// padding now unnecessary arguments with $noreg. | ||||||
4348 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||||
4349 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | ||||||
4350 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | ||||||
4351 | |||||||
4352 | const unsigned NumDefs = MI.getNumExplicitDefs(); | ||||||
4353 | const unsigned ArgOffset = NumDefs + 1; | ||||||
4354 | bool IsTFE = NumDefs == 2; | ||||||
4355 | // We are only processing the operands of d16 image operations on subtargets | ||||||
4356 | // that use the unpacked register layout, or need to repack the TFE result. | ||||||
4357 | |||||||
4358 | // TODO: Do we need to guard against already legalized intrinsics? | ||||||
4359 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||||
4360 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | ||||||
4361 | |||||||
4362 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
4363 | const LLT S32 = LLT::scalar(32); | ||||||
4364 | const LLT S16 = LLT::scalar(16); | ||||||
4365 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||||
4366 | |||||||
4367 | unsigned DMask = 0; | ||||||
4368 | Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); | ||||||
4369 | LLT Ty = MRI->getType(VData); | ||||||
4370 | |||||||
4371 | // Check for 16 bit addresses and pack if true. | ||||||
4372 | LLT GradTy = | ||||||
4373 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | ||||||
4374 | LLT AddrTy = | ||||||
4375 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | ||||||
4376 | const bool IsG16 = GradTy == S16; | ||||||
4377 | const bool IsA16 = AddrTy == S16; | ||||||
4378 | const bool IsD16 = Ty.getScalarType() == S16; | ||||||
4379 | |||||||
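| // dmask selects which of the up-to-four result components participate; | ||||||
| // e.g. dmask = 0b0101 keeps components 0 and 2, so DMaskLanes = 2. | ||||||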
4380 | int DMaskLanes = 0; | ||||||
4381 | if (!BaseOpcode->Atomic) { | ||||||
4382 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | ||||||
4383 | if (BaseOpcode->Gather4) { | ||||||
4384 | DMaskLanes = 4; | ||||||
4385 | } else if (DMask != 0) { | ||||||
4386 | DMaskLanes = countPopulation(DMask); | ||||||
4387 | } else if (!IsTFE && !BaseOpcode->Store) { | ||||||
4388 | // If dmask is 0, this is a no-op load. This can be eliminated. | ||||||
4389 | B.buildUndef(MI.getOperand(0)); | ||||||
4390 | MI.eraseFromParent(); | ||||||
4391 | return true; | ||||||
4392 | } | ||||||
4393 | } | ||||||
4394 | |||||||
4395 | Observer.changingInstr(MI); | ||||||
4396 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | ||||||
4397 | |||||||
4398 | const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 | ||||||
4399 | : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; | ||||||
4400 | const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 | ||||||
4401 | : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | ||||||
4402 | unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; | ||||||
4403 | |||||||
4404 | // Track that we legalized this | ||||||
4405 | MI.setDesc(B.getTII().get(NewOpcode)); | ||||||
4406 | |||||||
4407 | // We expect an error flag since TFE is on and dmask is 0. Force dmask to be | ||||||
4408 | // at least 1; otherwise the instruction will fail. | ||||||
4409 | if (IsTFE && DMask == 0) { | ||||||
4410 | DMask = 0x1; | ||||||
4411 | DMaskLanes = 1; | ||||||
4412 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | ||||||
4413 | } | ||||||
4414 | |||||||
4415 | if (BaseOpcode->Atomic) { | ||||||
4416 | Register VData0 = MI.getOperand(2).getReg(); | ||||||
4417 | LLT Ty = MRI->getType(VData0); | ||||||
4418 | |||||||
4419 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | ||||||
4420 | if (Ty.isVector()) | ||||||
4421 | return false; | ||||||
4422 | |||||||
4423 | if (BaseOpcode->AtomicX2) { | ||||||
4424 | Register VData1 = MI.getOperand(3).getReg(); | ||||||
4425 | // The two values are packed in one register. | ||||||
4426 | LLT PackedTy = LLT::fixed_vector(2, Ty); | ||||||
4427 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | ||||||
4428 | MI.getOperand(2).setReg(Concat.getReg(0)); | ||||||
4429 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | ||||||
4430 | } | ||||||
4431 | } | ||||||
4432 | |||||||
4433 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | ||||||
4434 | |||||||
4435 | // Optimize _L to _LZ when _L is zero | ||||||
4436 | if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = | ||||||
4437 | AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { | ||||||
4438 | const ConstantFP *ConstantLod; | ||||||
4439 | |||||||
4440 | if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, | ||||||
4441 | m_GFCst(ConstantLod))) { | ||||||
4442 | if (ConstantLod->isZero() || ConstantLod->isNegative()) { | ||||||
4443 | // Set new opcode to _lz variant of _l, and change the intrinsic ID. | ||||||
4444 | const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = | ||||||
4445 | AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, | ||||||
4446 | Intr->Dim); | ||||||
4447 | |||||||
4448 | // The starting indexes should remain in the same place. | ||||||
4449 | --CorrectedNumVAddrs; | ||||||
4450 | |||||||
4451 | MI.getOperand(NumDefs).setIntrinsicID( | ||||||
4452 | static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); | ||||||
4453 | MI.RemoveOperand(ArgOffset + Intr->LodIndex); | ||||||
4454 | Intr = NewImageDimIntr; | ||||||
4455 | } | ||||||
4456 | } | ||||||
4457 | } | ||||||
4458 | |||||||
4459 | // Optimize _mip away, when 'lod' is zero | ||||||
4460 | if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { | ||||||
4461 | int64_t ConstantLod; | ||||||
4462 | if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, | ||||||
4463 | m_ICst(ConstantLod))) { | ||||||
4464 | if (ConstantLod == 0) { | ||||||
4465 | // TODO: Change the intrinsic opcode and remove the operand instead of | ||||||
4466 | // replacing it with 0, as is done for the _L to _LZ handling above. | ||||||
4467 | MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); | ||||||
4468 | --CorrectedNumVAddrs; | ||||||
4469 | } | ||||||
4470 | } | ||||||
4471 | } | ||||||
4472 | |||||||
4473 | // Rewrite the addressing register layout before doing anything else. | ||||||
4474 | if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { | ||||||
4475 | // 16 bit gradients are supported, but are tied to the A16 control | ||||||
4476 | // so both gradients and addresses must be 16 bit | ||||||
4477 | return false; | ||||||
4478 | } | ||||||
4479 | |||||||
4480 | if (IsA16 && !ST.hasA16()) { | ||||||
4481 | // A16 not supported | ||||||
4482 | return false; | ||||||
4483 | } | ||||||
4484 | |||||||
4485 | if (IsA16 || IsG16) { | ||||||
4486 | if (Intr->NumVAddrs > 1) { | ||||||
4487 | SmallVector<Register, 4> PackedRegs; | ||||||
4488 | |||||||
4489 | packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, | ||||||
4490 | IsG16); | ||||||
4491 | |||||||
4492 | // See also below in the non-a16 branch | ||||||
4493 | const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && | ||||||
4494 | PackedRegs.size() <= ST.getNSAMaxSize(); | ||||||
4495 | |||||||
4496 | if (!UseNSA && PackedRegs.size() > 1) { | ||||||
4497 | LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); | ||||||
4498 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | ||||||
4499 | PackedRegs[0] = Concat.getReg(0); | ||||||
4500 | PackedRegs.resize(1); | ||||||
4501 | } | ||||||
4502 | |||||||
4503 | const unsigned NumPacked = PackedRegs.size(); | ||||||
4504 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | ||||||
4505 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||
4506 | if (!SrcOp.isReg()) { | ||||||
4507 | assert(SrcOp.isImm() && SrcOp.getImm() == 0); | ||||||
4508 | continue; | ||||||
4509 | } | ||||||
4510 | |||||||
4511 | assert(SrcOp.getReg() != AMDGPU::NoRegister); | ||||||
4512 | |||||||
4513 | if (I - Intr->VAddrStart < NumPacked) | ||||||
4514 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | ||||||
4515 | else | ||||||
4516 | SrcOp.setReg(AMDGPU::NoRegister); | ||||||
4517 | } | ||||||
4518 | } | ||||||
4519 | } else { | ||||||
4520 | // If the register allocator cannot place the address registers contiguously | ||||||
4521 | // without introducing moves, then using the non-sequential address encoding | ||||||
4522 | // is always preferable, since it saves VALU instructions and is usually a | ||||||
4523 | // wash in terms of code size or even better. | ||||||
4524 | // | ||||||
4525 | // However, we currently have no way of hinting to the register allocator | ||||||
4526 | // that MIMG addresses should be placed contiguously when it is possible to | ||||||
4527 | // do so, so force non-NSA for the common 2-address case as a heuristic. | ||||||
4528 | // | ||||||
4529 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | ||||||
4530 | // allocation when possible. | ||||||
4531 | const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && | ||||||
4532 | CorrectedNumVAddrs <= ST.getNSAMaxSize(); | ||||||
4533 | |||||||
4534 | if (!UseNSA && Intr->NumVAddrs > 1) | ||||||
4535 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | ||||||
4536 | Intr->NumVAddrs); | ||||||
4537 | } | ||||||
4538 | |||||||
4539 | int Flags = 0; | ||||||
4540 | if (IsA16) | ||||||
4541 | Flags |= 1; | ||||||
4542 | if (IsG16) | ||||||
4543 | Flags |= 2; | ||||||
4544 | MI.addOperand(MachineOperand::CreateImm(Flags)); | ||||||
4545 | |||||||
4546 | if (BaseOpcode->Store) { // No TFE for stores? | ||||||
4547 | // TODO: Handle dmask trim | ||||||
4548 | if (!Ty.isVector() || !IsD16) | ||||||
4549 | return true; | ||||||
4550 | |||||||
4551 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | ||||||
4552 | if (RepackedReg != VData) { | ||||||
4553 | MI.getOperand(1).setReg(RepackedReg); | ||||||
4554 | } | ||||||
4555 | |||||||
4556 | return true; | ||||||
4557 | } | ||||||
4558 | |||||||
4559 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
4560 | const LLT EltTy = Ty.getScalarType(); | ||||||
4561 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||||
4562 | |||||||
4563 | // Confirm that the return type is large enough for the dmask specified | ||||||
4564 | if (NumElts < DMaskLanes) | ||||||
4565 | return false; | ||||||
4566 | |||||||
4567 | if (NumElts > 4 || DMaskLanes > 4) | ||||||
4568 | return false; | ||||||
4569 | |||||||
4570 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; | ||||||
4571 | const LLT AdjustedTy = | ||||||
4572 | Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); | ||||||
4573 | |||||||
4574 | // The raw dword aligned data component of the load. The only legal cases | ||||||
4575 | // where this matters should be when using the packed D16 format, for | ||||||
4576 | // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>. | ||||||
4577 | LLT RoundedTy; | ||||||
4578 | |||||||
4579 | // S32 vector to cover all data, plus the TFE result element. | ||||||
4580 | LLT TFETy; | ||||||
4581 | |||||||
4582 | // Register type to use for each loaded component. Will be S32 or V2S16. | ||||||
4583 | LLT RegTy; | ||||||
4584 | |||||||
4585 | if (IsD16 && ST.hasUnpackedD16VMem()) { | ||||||
4586 | RoundedTy = | ||||||
4587 | LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); | ||||||
4588 | TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); | ||||||
4589 | RegTy = S32; | ||||||
4590 | } else { | ||||||
4591 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
4592 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | ||||||
4593 | unsigned RoundedSize = 32 * RoundedElts; | ||||||
4594 | RoundedTy = LLT::scalarOrVector( | ||||||
4595 | ElementCount::getFixed(RoundedSize / EltSize), EltSize); | ||||||
4596 | TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); | ||||||
4597 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | ||||||
4598 | } | ||||||
4599 | |||||||
4600 | // The return type does not need adjustment. | ||||||
4601 | // TODO: Should we change s16 case to s32 or <2 x s16>? | ||||||
4602 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | ||||||
4603 | return true; | ||||||
4604 | |||||||
4605 | Register Dst1Reg; | ||||||
4606 | |||||||
4607 | // Insert after the instruction. | ||||||
4608 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||||
4609 | |||||||
4610 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | ||||||
4611 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | ||||||
4612 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | ||||||
4613 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | ||||||
4614 | |||||||
4615 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | ||||||
4616 | |||||||
4617 | MI.getOperand(0).setReg(NewResultReg); | ||||||
4618 | |||||||
4619 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||||
4620 | // type. The instruction really returns these two values in one contiguous | ||||||
4621 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||||
4622 | // return type to use a single register result. | ||||||
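| // E.g. a TFE load of <2 x s32> is rewritten to produce one <3 x s32> | ||||||
| // register; the extra dword holds the texture-fail status that the IR | ||||||
| // models as the second struct member. | ||||||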
4623 | |||||||
4624 | if (IsTFE) { | ||||||
4625 | Dst1Reg = MI.getOperand(1).getReg(); | ||||||
4626 | if (MRI->getType(Dst1Reg) != S32) | ||||||
4627 | return false; | ||||||
4628 | |||||||
4629 | // TODO: Make sure the TFE operand bit is set. | ||||||
4630 | MI.RemoveOperand(1); | ||||||
4631 | |||||||
4632 | // Handle the easy case that requires no repack instructions. | ||||||
4633 | if (Ty == S32) { | ||||||
4634 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | ||||||
4635 | return true; | ||||||
4636 | } | ||||||
4637 | } | ||||||
4638 | |||||||
4639 | // Now figure out how to copy the new result register back into the old | ||||||
4640 | // result. | ||||||
4641 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | ||||||
4642 | |||||||
4643 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | ||||||
4644 | |||||||
4645 | if (ResultNumRegs == 1) { | ||||||
4646 | assert(!IsTFE); | ||||||
4647 | ResultRegs[0] = NewResultReg; | ||||||
4648 | } else { | ||||||
4649 | // We have to repack into a new vector of some kind. | ||||||
4650 | for (int I = 0; I != NumDataRegs; ++I) | ||||||
4651 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | ||||||
4652 | B.buildUnmerge(ResultRegs, NewResultReg); | ||||||
4653 | |||||||
4654 | // Drop the final TFE element to get the data part. The TFE result is | ||||||
4655 | // directly written to the right place already. | ||||||
4656 | if (IsTFE) | ||||||
4657 | ResultRegs.resize(NumDataRegs); | ||||||
4658 | } | ||||||
4659 | |||||||
4660 | // For an s16 scalar result, we form an s32 result with a truncate regardless | ||||||
4661 | // of packed vs. unpacked. | ||||||
4662 | if (IsD16 && !Ty.isVector()) { | ||||||
4663 | B.buildTrunc(DstReg, ResultRegs[0]); | ||||||
4664 | return true; | ||||||
4665 | } | ||||||
4666 | |||||||
4667 | // Avoid a build/concat_vector of 1 entry. | ||||||
4668 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | ||||||
4669 | B.buildBitcast(DstReg, ResultRegs[0]); | ||||||
4670 | return true; | ||||||
4671 | } | ||||||
4672 | |||||||
4673 | assert(Ty.isVector()); | ||||||
4674 | |||||||
4675 | if (IsD16) { | ||||||
4676 | // For packed D16 results with TFE enabled, all the data components are | ||||||
4677 | // S32. Cast back to the expected type. | ||||||
4678 | // | ||||||
4679 | // TODO: We don't really need to use load s32 elements. We would only need one | ||||||
4680 | // cast for the TFE result if a multiple of v2s16 was used. | ||||||
4681 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | ||||||
4682 | for (Register &Reg : ResultRegs) | ||||||
4683 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | ||||||
4684 | } else if (ST.hasUnpackedD16VMem()) { | ||||||
4685 | for (Register &Reg : ResultRegs) | ||||||
4686 | Reg = B.buildTrunc(S16, Reg).getReg(0); | ||||||
4687 | } | ||||||
4688 | } | ||||||
4689 | |||||||
4690 | auto padWithUndef = [&](LLT Ty, int NumElts) { | ||||||
4691 | if (NumElts == 0) | ||||||
4692 | return; | ||||||
4693 | Register Undef = B.buildUndef(Ty).getReg(0); | ||||||
4694 | for (int I = 0; I != NumElts; ++I) | ||||||
4695 | ResultRegs.push_back(Undef); | ||||||
4696 | }; | ||||||
4697 | |||||||
4698 | // Pad out any elements eliminated due to the dmask. | ||||||
4699 | LLT ResTy = MRI->getType(ResultRegs[0]); | ||||||
4700 | if (!ResTy.isVector()) { | ||||||
4701 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | ||||||
4702 | B.buildBuildVector(DstReg, ResultRegs); | ||||||
4703 | return true; | ||||||
4704 | } | ||||||
4705 | |||||||
4706 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); | ||||||
4707 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | ||||||
4708 | |||||||
4709 | // Deal with the one annoying legal case. | ||||||
4710 | const LLT V3S16 = LLT::fixed_vector(3, 16); | ||||||
4711 | if (Ty == V3S16) { | ||||||
4712 | if (IsTFE) { | ||||||
4713 | if (ResultRegs.size() == 1) { | ||||||
4714 | NewResultReg = ResultRegs[0]; | ||||||
4715 | } else if (ResultRegs.size() == 2) { | ||||||
4716 | LLT V4S16 = LLT::fixed_vector(4, 16); | ||||||
4717 | NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); | ||||||
4718 | } else { | ||||||
4719 | return false; | ||||||
4720 | } | ||||||
4721 | } | ||||||
4722 | |||||||
4723 | if (MRI->getType(DstReg).getNumElements() < | ||||||
4724 | MRI->getType(NewResultReg).getNumElements()) { | ||||||
4725 | B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); | ||||||
4726 | } else { | ||||||
4727 | B.buildPadVectorWithUndefElements(DstReg, NewResultReg); | ||||||
4728 | } | ||||||
4729 | return true; | ||||||
4730 | } | ||||||
4731 | |||||||
4732 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | ||||||
4733 | B.buildConcatVectors(DstReg, ResultRegs); | ||||||
4734 | return true; | ||||||
4735 | } | ||||||
4736 | |||||||
4737 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||||
4738 | LegalizerHelper &Helper, MachineInstr &MI) const { | ||||||
4739 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
4740 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
4741 | |||||||
4742 | Register Dst = MI.getOperand(0).getReg(); | ||||||
4743 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
4744 | unsigned Size = Ty.getSizeInBits(); | ||||||
4745 | MachineFunction &MF = B.getMF(); | ||||||
4746 | |||||||
4747 | Observer.changingInstr(MI); | ||||||
4748 | |||||||
4749 | if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { | ||||||
4750 | Ty = getBitcastRegisterType(Ty); | ||||||
4751 | Helper.bitcastDst(MI, Ty, 0); | ||||||
4752 | Dst = MI.getOperand(0).getReg(); | ||||||
4753 | B.setInsertPt(B.getMBB(), MI); | ||||||
4754 | } | ||||||
4755 | |||||||
4756 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||||
4757 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||||
4758 | // allowed to add one. | ||||||
4759 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||||
4760 | MI.RemoveOperand(1); // Remove intrinsic ID | ||||||
4761 | |||||||
4762 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||||
4763 | // TODO: Should this use datalayout alignment? | ||||||
4764 | const unsigned MemSize = (Size + 7) / 8; | ||||||
4765 | const Align MemAlign(4); | ||||||
4766 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
4767 | MachinePointerInfo(), | ||||||
4768 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
4769 | MachineMemOperand::MOInvariant, | ||||||
4770 | MemSize, MemAlign); | ||||||
4771 | MI.addMemOperand(MF, MMO); | ||||||
4772 | |||||||
4773 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||||
4774 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||||
4775 | // out this needs to be converted to a vector load during RegBankSelect. | ||||||
4776 | if (!isPowerOf2_32(Size)) { | ||||||
4777 | if (Ty.isVector()) | ||||||
4778 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||||
4779 | else | ||||||
4780 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||||
4781 | } | ||||||
4782 | |||||||
4783 | Observer.changedInstr(MI); | ||||||
4784 | return true; | ||||||
4785 | } | ||||||
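
// [Editor's sketch, not part of the original file.] The power-of-2
// widening above rounds a 96-bit s-buffer result up to 128 bits. The
// hypothetical standalone function below (assuming <cassert> is
// transitively available) checks the rounding that getPow2ScalarType and
// getPow2VectorType apply.
static void checkSBufferWidening() {
  assert((1u << Log2_32_Ceil(96)) == 128u); // s96 -> s128
  assert((1u << Log2_32_Ceil(3)) == 4u);    // 3 elements -> 4 elements
}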
4786 | |||||||
4787 | // TODO: Move to selection | ||||||
4788 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||||
4789 | MachineRegisterInfo &MRI, | ||||||
4790 | MachineIRBuilder &B) const { | ||||||
4791 | if (!ST.isTrapHandlerEnabled() || | ||||||
4792 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | ||||||
4793 | return legalizeTrapEndpgm(MI, MRI, B); | ||||||
4794 | |||||||
4795 | if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { | ||||||
4796 | switch (*HsaAbiVer) { | ||||||
4797 | case ELF::ELFABIVERSION_AMDGPU_HSA_V2: | ||||||
4798 | case ELF::ELFABIVERSION_AMDGPU_HSA_V3: | ||||||
4799 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||
4800 | case ELF::ELFABIVERSION_AMDGPU_HSA_V4: | ||||||
4801 | return ST.supportsGetDoorbellID() ? | ||||||
4802 | legalizeTrapHsa(MI, MRI, B) : | ||||||
4803 | legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||
4804 | } | ||||||
4805 | } | ||||||
4806 | |||||||
4807 | llvm_unreachable("Unknown trap handler")::llvm::llvm_unreachable_internal("Unknown trap handler", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4807); | ||||||
4808 | } | ||||||
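
// [Editor's note: illustrative summary, not part of the original file.]
// The dispatch above, restated:
//   trap handler disabled or non-AMDHSA ABI -> legalizeTrapEndpgm (s_endpgm)
//   HSA ABI v2 / v3                         -> legalizeTrapHsaQueuePtr
//   HSA ABI v4, supportsGetDoorbellID()     -> legalizeTrapHsa
//   HSA ABI v4, no doorbell-ID support      -> legalizeTrapHsaQueuePtr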
4809 | |||||||
4810 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( | ||||||
4811 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4812 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | ||||||
4813 | MI.eraseFromParent(); | ||||||
4814 | return true; | ||||||
4815 | } | ||||||
4816 | |||||||
4817 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | ||||||
4818 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4819 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||||
4820 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||||
4821 | Register LiveIn = | ||||||
4822 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||
4823 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||||
4824 | return false; | ||||||
4825 | |||||||
4826 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||||
4827 | B.buildCopy(SGPR01, LiveIn); | ||||||
4828 | B.buildInstr(AMDGPU::S_TRAP) | ||||||
4829 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||||
4830 | .addReg(SGPR01, RegState::Implicit); | ||||||
4831 | |||||||
4832 | MI.eraseFromParent(); | ||||||
4833 | return true; | ||||||
4834 | } | ||||||
4835 | |||||||
4836 | bool AMDGPULegalizerInfo::legalizeTrapHsa( | ||||||
4837 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4838 | B.buildInstr(AMDGPU::S_TRAP) | ||||||
4839 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); | ||||||
4840 | MI.eraseFromParent(); | ||||||
4841 | return true; | ||||||
4842 | } | ||||||
4843 | |||||||
4844 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||||
4845 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4846 | // If this is a non-HSA path or the trap handler is disabled, report a | ||||||
4847 | // warning instead of emitting a trap. | ||||||
4848 | if (!ST.isTrapHandlerEnabled() || | ||||||
4849 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | ||||||
4850 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||||
4851 | "debugtrap handler not supported", | ||||||
4852 | MI.getDebugLoc(), DS_Warning); | ||||||
4853 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||||
4854 | Ctx.diagnose(NoTrap); | ||||||
4855 | } else { | ||||||
4856 | // Insert debug-trap instruction | ||||||
4857 | B.buildInstr(AMDGPU::S_TRAP) | ||||||
4858 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | ||||||
4859 | } | ||||||
4860 | |||||||
4861 | MI.eraseFromParent(); | ||||||
4862 | return true; | ||||||
4863 | } | ||||||
4864 | |||||||
4865 | bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, | ||||||
4866 | MachineIRBuilder &B) const { | ||||||
4867 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
4868 | const LLT S16 = LLT::scalar(16); | ||||||
4869 | const LLT S32 = LLT::scalar(32); | ||||||
4870 | |||||||
4871 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
4872 | Register NodePtr = MI.getOperand(2).getReg(); | ||||||
4873 | Register RayExtent = MI.getOperand(3).getReg(); | ||||||
4874 | Register RayOrigin = MI.getOperand(4).getReg(); | ||||||
4875 | Register RayDir = MI.getOperand(5).getReg(); | ||||||
4876 | Register RayInvDir = MI.getOperand(6).getReg(); | ||||||
4877 | Register TDescr = MI.getOperand(7).getReg(); | ||||||
4878 | |||||||
4879 | if (!ST.hasGFX10_AEncoding()) { | ||||||
4880 | DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), | ||||||
4881 | "intrinsic not supported on subtarget", | ||||||
4882 | MI.getDebugLoc()); | ||||||
4883 | B.getMF().getFunction().getContext().diagnose(BadIntrin); | ||||||
4884 | return false; | ||||||
4885 | } | ||||||
4886 | |||||||
4887 | const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; | ||||||
4888 | const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; | ||||||
4889 | const unsigned NumVDataDwords = 4; | ||||||
4890 | const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); | ||||||
4891 | const bool UseNSA = | ||||||
4892 | ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); | ||||||
4893 | const unsigned BaseOpcodes[2][2] = { | ||||||
4894 | {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, | ||||||
4895 | {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, | ||||||
4896 | AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; | ||||||
4897 | int Opcode; | ||||||
4898 | if (UseNSA) { | ||||||
4899 | Opcode = | ||||||
4900 | AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, | ||||||
4901 | NumVDataDwords, NumVAddrDwords); | ||||||
4902 | } else { | ||||||
4903 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], | ||||||
4904 | AMDGPU::MIMGEncGfx10Default, NumVDataDwords, | ||||||
4905 | PowerOf2Ceil(NumVAddrDwords)); | ||||||
4906 | } | ||||||
4907 | assert(Opcode != -1); | ||||||
4908 | |||||||
4909 | SmallVector<Register, 12> Ops; | ||||||
4910 | if (Is64) { | ||||||
4911 | auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); | ||||||
4912 | Ops.push_back(Unmerge.getReg(0)); | ||||||
4913 | Ops.push_back(Unmerge.getReg(1)); | ||||||
4914 | } else { | ||||||
4915 | Ops.push_back(NodePtr); | ||||||
4916 | } | ||||||
4917 | Ops.push_back(RayExtent); | ||||||
4918 | |||||||
4919 | auto packLanes = [&Ops, &S32, &B](Register Src) { | ||||||
4920 | auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); | ||||||
4921 | Ops.push_back(Unmerge.getReg(0)); | ||||||
4922 | Ops.push_back(Unmerge.getReg(1)); | ||||||
4923 | Ops.push_back(Unmerge.getReg(2)); | ||||||
4924 | }; | ||||||
4925 | |||||||
4926 | packLanes(RayOrigin); | ||||||
4927 | if (IsA16) { | ||||||
4928 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); | ||||||
4929 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); | ||||||
4930 | Register R1 = MRI.createGenericVirtualRegister(S32); | ||||||
4931 | Register R2 = MRI.createGenericVirtualRegister(S32); | ||||||
4932 | Register R3 = MRI.createGenericVirtualRegister(S32); | ||||||
4933 | B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); | ||||||
4934 | B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); | ||||||
4935 | B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); | ||||||
4936 | Ops.push_back(R1); | ||||||
4937 | Ops.push_back(R2); | ||||||
4938 | Ops.push_back(R3); | ||||||
4939 | } else { | ||||||
4940 | packLanes(RayDir); | ||||||
4941 | packLanes(RayInvDir); | ||||||
4942 | } | ||||||
4943 | |||||||
4944 | if (!UseNSA) { | ||||||
4945 | // Build a single vector containing all the operands prepared so far. | ||||||
4946 | LLT OpTy = LLT::fixed_vector(Ops.size(), 32); | ||||||
4947 | Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); | ||||||
4948 | Ops.clear(); | ||||||
4949 | Ops.push_back(MergedOps); | ||||||
4950 | } | ||||||
4951 | |||||||
4952 | auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) | ||||||
4953 | .addDef(DstReg) | ||||||
4954 | .addImm(Opcode); | ||||||
4955 | |||||||
4956 | for (Register R : Ops) { | ||||||
4957 | MIB.addUse(R); | ||||||
4958 | } | ||||||
4959 | |||||||
4960 | MIB.addUse(TDescr) | ||||||
4961 | .addImm(IsA16 ? 1 : 0) | ||||||
4962 | .cloneMemRefs(MI); | ||||||
4963 | |||||||
4964 | MI.eraseFromParent(); | ||||||
4965 | return true; | ||||||
4966 | } | ||||||
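
// [Editor's sketch, not part of the original file.] How the
// NumVAddrDwords choice above adds up: node_ptr takes 1 dword (2 when
// Is64), ray_extent 1, ray_origin 3, and ray_dir plus ray_inv_dir take 3
// dwords each with 32-bit components, but their six components pack into
// 3 dwords total when A16.
static_assert(1 + 1 + 3 + 3 + 3 == 11, "BVH, 32-bit components");
static_assert(2 + 1 + 3 + 3 + 3 == 12, "BVH64, 32-bit components");
static_assert(1 + 1 + 3 + 3 == 8, "BVH, A16");
static_assert(2 + 1 + 3 + 3 == 9, "BVH64, A16");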
4967 | |||||||
4968 | bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||||||
4969 | MachineInstr &MI) const { | ||||||
4970 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
4971 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
4972 | |||||||
4973 | // Replace the G_BRCOND use with the exec-manipulation and branch pseudos. | ||||||
4974 | auto IntrID = MI.getIntrinsicID(); | ||||||
4975 | switch (IntrID) { | ||||||
| |||||||
4976 | case Intrinsic::amdgcn_if: | ||||||
4977 | case Intrinsic::amdgcn_else: { | ||||||
4978 | MachineInstr *Br = nullptr; | ||||||
4979 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||
4980 | bool Negated = false; | ||||||
4981 | if (MachineInstr *BrCond = | ||||||
4982 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||
4983 | const SIRegisterInfo *TRI | ||||||
4984 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
4985 | |||||||
4986 | Register Def = MI.getOperand(1).getReg(); | ||||||
4987 | Register Use = MI.getOperand(3).getReg(); | ||||||
4988 | |||||||
4989 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||
4990 | |||||||
4991 | if (Negated) | ||||||
4992 | std::swap(CondBrTarget, UncondBrTarget); | ||||||
4993 | |||||||
4994 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||
4995 | if (IntrID == Intrinsic::amdgcn_if) { | ||||||
4996 | B.buildInstr(AMDGPU::SI_IF) | ||||||
4997 | .addDef(Def) | ||||||
4998 | .addUse(Use) | ||||||
4999 | .addMBB(UncondBrTarget); | ||||||
5000 | } else { | ||||||
5001 | B.buildInstr(AMDGPU::SI_ELSE) | ||||||
5002 | .addDef(Def) | ||||||
5003 | .addUse(Use) | ||||||
5004 | .addMBB(UncondBrTarget); | ||||||
5005 | } | ||||||
5006 | |||||||
5007 | if (Br) { | ||||||
5008 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||
5009 | } else { | ||||||
5010 | // The IRTranslator skips inserting the G_BR for fallthrough cases, but | ||||||
5011 | // since we're swapping branch targets it needs to be reinserted. | ||||||
5012 | // FIXME: IRTranslator should probably not do this | ||||||
5013 | B.buildBr(*CondBrTarget); | ||||||
5014 | } | ||||||
5015 | |||||||
5016 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||||
5017 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||||
5018 | MI.eraseFromParent(); | ||||||
5019 | BrCond->eraseFromParent(); | ||||||
5020 | return true; | ||||||
5021 | } | ||||||
5022 | |||||||
5023 | return false; | ||||||
5024 | } | ||||||
5025 | case Intrinsic::amdgcn_loop: { | ||||||
5026 | MachineInstr *Br = nullptr; | ||||||
5027 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||
5028 | bool Negated = false; | ||||||
5029 | if (MachineInstr *BrCond = | ||||||
5030 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||
5031 | const SIRegisterInfo *TRI | ||||||
5032 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
5033 | |||||||
5034 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||
5035 | Register Reg = MI.getOperand(2).getReg(); | ||||||
5036 | |||||||
5037 | if (Negated) | ||||||
5038 | std::swap(CondBrTarget, UncondBrTarget); | ||||||
5039 | |||||||
5040 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||
5041 | B.buildInstr(AMDGPU::SI_LOOP) | ||||||
5042 | .addUse(Reg) | ||||||
5043 | .addMBB(UncondBrTarget); | ||||||
5044 | |||||||
5045 | if (Br) | ||||||
5046 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||
5047 | else | ||||||
5048 | B.buildBr(*CondBrTarget); | ||||||
5049 | |||||||
5050 | MI.eraseFromParent(); | ||||||
5051 | BrCond->eraseFromParent(); | ||||||
5052 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||||
5053 | return true; | ||||||
5054 | } | ||||||
5055 | |||||||
5056 | return false; | ||||||
5057 | } | ||||||
5058 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||||
5059 | if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { | ||||||
5060 | // This only makes sense to call in a kernel, so just lower to null. | ||||||
5061 | B.buildConstant(MI.getOperand(0).getReg(), 0); | ||||||
5062 | MI.eraseFromParent(); | ||||||
5063 | return true; | ||||||
5064 | } | ||||||
5065 | |||||||
5066 | return legalizePreloadedArgIntrin( | ||||||
5067 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||
5068 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||||
5069 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||||
5070 | case Intrinsic::amdgcn_workitem_id_x: | ||||||
5071 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5072 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||||
5073 | case Intrinsic::amdgcn_workitem_id_y: | ||||||
5074 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5075 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||||
5076 | case Intrinsic::amdgcn_workitem_id_z: | ||||||
5077 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5078 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||||
5079 | case Intrinsic::amdgcn_workgroup_id_x: | ||||||
5080 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5081 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||||
5082 | case Intrinsic::amdgcn_workgroup_id_y: | ||||||
5083 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5084 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||||
5085 | case Intrinsic::amdgcn_workgroup_id_z: | ||||||
5086 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5087 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||||
5088 | case Intrinsic::amdgcn_dispatch_ptr: | ||||||
5089 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5090 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||||
5091 | case Intrinsic::amdgcn_queue_ptr: | ||||||
5092 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5093 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||||
5094 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||||
5095 | return legalizePreloadedArgIntrin( | ||||||
5096 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||||
5097 | case Intrinsic::amdgcn_dispatch_id: | ||||||
5098 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5099 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||||
5100 | case Intrinsic::amdgcn_fdiv_fast: | ||||||
5101 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||||
5102 | case Intrinsic::amdgcn_is_shared: | ||||||
5103 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||||
5104 | case Intrinsic::amdgcn_is_private: | ||||||
5105 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||||
5106 | case Intrinsic::amdgcn_wavefrontsize: { | ||||||
5107 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||||
5108 | MI.eraseFromParent(); | ||||||
5109 | return true; | ||||||
5110 | } | ||||||
5111 | case Intrinsic::amdgcn_s_buffer_load: | ||||||
5112 | return legalizeSBufferLoad(Helper, MI); | ||||||
5113 | case Intrinsic::amdgcn_raw_buffer_store: | ||||||
5114 | case Intrinsic::amdgcn_struct_buffer_store: | ||||||
5115 | return legalizeBufferStore(MI, MRI, B, false, false); | ||||||
5116 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||||
5117 | case Intrinsic::amdgcn_struct_buffer_store_format: | ||||||
5118 | return legalizeBufferStore(MI, MRI, B, false, true); | ||||||
5119 | case Intrinsic::amdgcn_raw_tbuffer_store: | ||||||
5120 | case Intrinsic::amdgcn_struct_tbuffer_store: | ||||||
5121 | return legalizeBufferStore(MI, MRI, B, true, true); | ||||||
5122 | case Intrinsic::amdgcn_raw_buffer_load: | ||||||
5123 | case Intrinsic::amdgcn_struct_buffer_load: | ||||||
5124 | return legalizeBufferLoad(MI, MRI, B, false, false); | ||||||
5125 | case Intrinsic::amdgcn_raw_buffer_load_format: | ||||||
5126 | case Intrinsic::amdgcn_struct_buffer_load_format: | ||||||
5127 | return legalizeBufferLoad(MI, MRI, B, true, false); | ||||||
5128 | case Intrinsic::amdgcn_raw_tbuffer_load: | ||||||
5129 | case Intrinsic::amdgcn_struct_tbuffer_load: | ||||||
5130 | return legalizeBufferLoad(MI, MRI, B, true, true); | ||||||
5131 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||
5132 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||
5133 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||
5134 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||
5135 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||
5136 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||
5137 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||
5138 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||
5139 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||
5140 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||
5141 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||
5142 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||
5143 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||
5144 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||
5145 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||
5146 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||
5147 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||
5148 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||
5149 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||
5150 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||
5151 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||
5152 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||
5153 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||
5154 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||
5155 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||||
5156 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||||
5157 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||
5158 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||
5159 | case Intrinsic::amdgcn_buffer_atomic_fadd: | ||||||
5160 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||||
5161 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||||
5162 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||||
5163 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||||
5164 | return legalizeBufferAtomic(MI, B, IntrID); | ||||||
5165 | case Intrinsic::amdgcn_atomic_inc: | ||||||
5166 | return legalizeAtomicIncDec(MI, B, true); | ||||||
5167 | case Intrinsic::amdgcn_atomic_dec: | ||||||
5168 | return legalizeAtomicIncDec(MI, B, false); | ||||||
5169 | case Intrinsic::trap: | ||||||
5170 | return legalizeTrapIntrinsic(MI, MRI, B); | ||||||
5171 | case Intrinsic::debugtrap: | ||||||
5172 | return legalizeDebugTrapIntrinsic(MI, MRI, B); | ||||||
5173 | case Intrinsic::amdgcn_rsq_clamp: | ||||||
5174 | return legalizeRsqClampIntrinsic(MI, MRI, B); | ||||||
5175 | case Intrinsic::amdgcn_ds_fadd: | ||||||
5176 | case Intrinsic::amdgcn_ds_fmin: | ||||||
5177 | case Intrinsic::amdgcn_ds_fmax: | ||||||
5178 | return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); | ||||||
5179 | case Intrinsic::amdgcn_image_bvh_intersect_ray: | ||||||
5180 | return legalizeBVHIntrinsic(MI, B); | ||||||
5181 | default: { | ||||||
5182 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||||||
5183 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | ||||||
5184 | return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); | ||||||
5185 | return true; | ||||||
5186 | } | ||||||
5187 | } | ||||||
5188 | |||||||
5189 | return true; | ||||||
5190 | } |
1 | //==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==//
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
10 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
11 | |
12 | #include "llvm/CodeGen/Register.h" |
13 | #include "llvm/Pass.h" |
14 | |
15 | namespace llvm { |
16 | |
17 | class Function; |
18 | class LLT; |
19 | class raw_ostream; |
20 | class TargetRegisterClass; |
21 | class TargetRegisterInfo; |
22 | |
23 | struct ArgDescriptor { |
24 | private: |
25 | friend struct AMDGPUFunctionArgInfo; |
26 | friend class AMDGPUArgumentUsageInfo; |
27 | |
28 | union { |
29 | MCRegister Reg; |
30 | unsigned StackOffset; |
31 | }; |
32 | |
33 | // Bitmask to locate argument within the register. |
34 | unsigned Mask; |
35 | |
36 | bool IsStack : 1; |
37 | bool IsSet : 1; |
38 | |
39 | public: |
40 | constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, |
41 | bool IsStack = false, bool IsSet = false) |
42 | : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} |
43 | |
44 | static constexpr ArgDescriptor createRegister(Register Reg, |
45 | unsigned Mask = ~0u) { |
46 | return ArgDescriptor(Reg, Mask, false, true); |
47 | } |
48 | |
49 | static constexpr ArgDescriptor createStack(unsigned Offset, |
50 | unsigned Mask = ~0u) { |
51 | return ArgDescriptor(Offset, Mask, true, true); |
52 | } |
53 | |
54 | static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, |
55 | unsigned Mask) { |
56 | return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); |
57 | } |
58 | |
59 | bool isSet() const { |
60 | return IsSet; |
61 | } |
62 | |
63 | explicit operator bool() const { |
64 | return isSet(); |
65 | } |
66 | |
67 | bool isRegister() const { |
68 | return !IsStack; |
69 | } |
70 | |
71 | MCRegister getRegister() const { |
72 | assert(!IsStack);
73 | return Reg; |
74 | } |
75 | |
76 | unsigned getStackOffset() const { |
77 | assert(IsStack);
78 | return StackOffset; |
79 | } |
80 | |
81 | unsigned getMask() const { |
82 | return Mask; |
83 | } |
84 | |
85 | bool isMasked() const { |
86 | return Mask != ~0u; |
87 | } |
88 | |
89 | void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; |
90 | }; |
91 | |
92 | inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { |
93 | Arg.print(OS); |
94 | return OS; |
95 | } |
96 | |
97 | struct AMDGPUFunctionArgInfo { |
98 | enum PreloadedValue { |
99 | // SGPRS: |
100 | PRIVATE_SEGMENT_BUFFER = 0, |
101 | DISPATCH_PTR = 1, |
102 | QUEUE_PTR = 2, |
103 | KERNARG_SEGMENT_PTR = 3, |
104 | DISPATCH_ID = 4, |
105 | FLAT_SCRATCH_INIT = 5, |
106 | WORKGROUP_ID_X = 10, |
107 | WORKGROUP_ID_Y = 11, |
108 | WORKGROUP_ID_Z = 12, |
109 | PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, |
110 | IMPLICIT_BUFFER_PTR = 15, |
111 | IMPLICIT_ARG_PTR = 16, |
112 | |
113 | // VGPRS: |
114 | WORKITEM_ID_X = 17, |
115 | WORKITEM_ID_Y = 18, |
116 | WORKITEM_ID_Z = 19, |
117 | FIRST_VGPR_VALUE = WORKITEM_ID_X |
118 | }; |
119 | |
120 | // Kernel input registers setup for the HSA ABI in allocation order. |
121 | |
122 | // User SGPRs in kernels |
123 | // XXX - Can these require argument spills? |
124 | ArgDescriptor PrivateSegmentBuffer; |
125 | ArgDescriptor DispatchPtr; |
126 | ArgDescriptor QueuePtr; |
127 | ArgDescriptor KernargSegmentPtr; |
128 | ArgDescriptor DispatchID; |
129 | ArgDescriptor FlatScratchInit; |
130 | ArgDescriptor PrivateSegmentSize; |
131 | |
132 | // System SGPRs in kernels. |
133 | ArgDescriptor WorkGroupIDX; |
134 | ArgDescriptor WorkGroupIDY; |
135 | ArgDescriptor WorkGroupIDZ; |
136 | ArgDescriptor WorkGroupInfo; |
137 | ArgDescriptor PrivateSegmentWaveByteOffset; |
138 | |
139 | // Pointer with offset from kernargsegmentptr to where special ABI arguments |
140 | // are passed to callable functions. |
141 | ArgDescriptor ImplicitArgPtr; |
142 | |
143 | // Input registers for non-HSA ABI |
144 | ArgDescriptor ImplicitBufferPtr; |
145 | |
146 | // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed |
147 | // into v0, 10 bits per dimension if packed-tid is set. |
148 | ArgDescriptor WorkItemIDX; |
149 | ArgDescriptor WorkItemIDY; |
150 | ArgDescriptor WorkItemIDZ; |
151 | |
152 | std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> |
153 | getPreloadedValue(PreloadedValue Value) const; |
154 | |
155 | static constexpr AMDGPUFunctionArgInfo fixedABILayout(); |
156 | }; |
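
// [Editor's sketch, not part of the original header.] With packed
// work-item IDs, the three dimensions share one VGPR and the Mask selects
// the 10 bits belonging to each dimension. The demo function below is
// hypothetical, and Register() is a placeholder value, not a real AMDGPU
// register; asserts are assumed available as in the accessors above.
inline void packedWorkItemIDMaskDemo() {
  const ArgDescriptor DimX = ArgDescriptor::createRegister(Register(), 0x3FFu);
  const ArgDescriptor DimY = ArgDescriptor::createArg(DimX, 0x3FFu << 10);
  assert(DimY.isMasked() && DimY.getMask() == (0x3FFu << 10));
}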
157 | |
158 | class AMDGPUArgumentUsageInfo : public ImmutablePass { |
159 | private: |
160 | DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; |
161 | |
162 | public: |
163 | static char ID; |
164 | |
165 | static const AMDGPUFunctionArgInfo ExternFunctionInfo; |
166 | static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; |
167 | |
168 | AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } |
169 | |
170 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
171 | AU.setPreservesAll(); |
172 | } |
173 | |
174 | bool doInitialization(Module &M) override; |
175 | bool doFinalization(Module &M) override; |
176 | |
177 | void print(raw_ostream &OS, const Module *M = nullptr) const override; |
178 | |
179 | void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { |
180 | ArgInfoMap[&F] = ArgInfo; |
181 | } |
182 | |
183 | const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; |
184 | }; |
185 | |
186 | } // end namespace llvm |
187 | |
188 | #endif |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file contains some functions that are useful for math stuff. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||||
14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||||
15 | |||||||
16 | #include "llvm/Support/Compiler.h" | ||||||
17 | #include <cassert> | ||||||
18 | #include <climits> | ||||||
19 | #include <cmath> | ||||||
20 | #include <cstdint> | ||||||
21 | #include <cstring> | ||||||
22 | #include <limits> | ||||||
23 | #include <type_traits> | ||||||
24 | |||||||
25 | #ifdef __ANDROID_NDK__ | ||||||
26 | #include <android/api-level.h> | ||||||
27 | #endif | ||||||
28 | |||||||
29 | #ifdef _MSC_VER | ||||||
30 | // Declare these intrinsics manually rather than including intrin.h. It's very | ||||||
31 | // expensive, and MathExtras.h is popular. | ||||||
32 | // #include <intrin.h> | ||||||
33 | extern "C" { | ||||||
34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||||
35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||||
37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
38 | } | ||||||
39 | #endif | ||||||
40 | |||||||
41 | namespace llvm { | ||||||
42 | |||||||
43 | /// The behavior an operation has on an input of 0. | ||||||
44 | enum ZeroBehavior { | ||||||
45 | /// The returned value is undefined. | ||||||
46 | ZB_Undefined, | ||||||
47 | /// The returned value is numeric_limits<T>::max() | ||||||
48 | ZB_Max, | ||||||
49 | /// The returned value is numeric_limits<T>::digits | ||||||
50 | ZB_Width | ||||||
51 | }; | ||||||
52 | |||||||
53 | /// Mathematical constants. | ||||||
54 | namespace numbers { | ||||||
55 | // TODO: Track C++20 std::numbers. | ||||||
56 | // TODO: Favor using the hexadecimal FP constants (requires C++17). | ||||||
57 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 | ||||||
58 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 | ||||||
59 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 | ||||||
60 | ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392 | ||||||
61 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) | ||||||
62 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) | ||||||
63 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 | ||||||
64 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 | ||||||
65 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 | ||||||
66 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 | ||||||
67 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193 | ||||||
68 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) | ||||||
69 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 | ||||||
70 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) | ||||||
71 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 | ||||||
72 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 | ||||||
73 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 | ||||||
74 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 | ||||||
75 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 | ||||||
76 | log2ef = 1.44269504F, // (0x1.715476P+0) | ||||||
77 | log10ef = .434294482F, // (0x1.bcb7b2P-2) | ||||||
78 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 | ||||||
79 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 | ||||||
80 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 | ||||||
81 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 | ||||||
82 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 | ||||||
83 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) | ||||||
84 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 | ||||||
85 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) | ||||||
86 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 | ||||||
87 | } // namespace numbers | ||||||
88 | |||||||
89 | namespace detail { | ||||||
90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||||
91 | static unsigned count(T Val, ZeroBehavior) { | ||||||
92 | if (!Val) | ||||||
93 | return std::numeric_limits<T>::digits; | ||||||
94 | if (Val & 0x1) | ||||||
95 | return 0; | ||||||
96 | |||||||
97 | // Bisection method. | ||||||
98 | unsigned ZeroBits = 0; | ||||||
99 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||||
100 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||||
101 | while (Shift) { | ||||||
102 | if ((Val & Mask) == 0) { | ||||||
103 | Val >>= Shift; | ||||||
104 | ZeroBits |= Shift; | ||||||
105 | } | ||||||
106 | Shift >>= 1; | ||||||
107 | Mask >>= Shift; | ||||||
108 | } | ||||||
109 | return ZeroBits; | ||||||
110 | } | ||||||
111 | }; | ||||||
112 | |||||||
113 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
114 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||||
115 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
116 | if (ZB != ZB_Undefined && Val == 0) | ||||||
117 | return 32; | ||||||
118 | |||||||
119 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__) | ||||||
120 | return __builtin_ctz(Val); | ||||||
121 | #elif defined(_MSC_VER) | ||||||
122 | unsigned long Index; | ||||||
123 | _BitScanForward(&Index, Val); | ||||||
124 | return Index; | ||||||
125 | #endif | ||||||
126 | } | ||||||
127 | }; | ||||||
128 | |||||||
129 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
130 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||||
131 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
132 | if (ZB != ZB_Undefined && Val == 0) | ||||||
133 | return 64; | ||||||
134 | |||||||
135 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) | ||||||
136 | return __builtin_ctzll(Val); | ||||||
137 | #elif defined(_MSC_VER) | ||||||
138 | unsigned long Index; | ||||||
139 | _BitScanForward64(&Index, Val); | ||||||
140 | return Index; | ||||||
141 | #endif | ||||||
142 | } | ||||||
143 | }; | ||||||
144 | #endif | ||||||
145 | #endif | ||||||
146 | } // namespace detail | ||||||
147 | |||||||
148 | /// Count the number of 0's from the least significant bit to the most | ||||||
149 | /// significant, stopping at the first 1. | ||||||
150 | /// | ||||||
151 | /// Only unsigned integral types are allowed. | ||||||
152 | /// | ||||||
153 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
154 | /// valid arguments. | ||||||
155 | template <typename T> | ||||||
156 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
157 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
158 | !std::numeric_limits<T>::is_signed, | ||||||
159 | "Only unsigned integral types are allowed."); | ||||||
160 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
161 | } | ||||||
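
// [Editor's sketch, not part of the original header.] A few sample
// values, in a hypothetical demo function:
inline void countTrailingZerosDemo() {
  assert(countTrailingZeros(8u) == 3u);         // 0b1000
  assert(countTrailingZeros(0u) == 32u);        // ZB_Width default
  assert(countTrailingZeros<uint8_t>(0) == 8u); // width follows the type
}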
162 | |||||||
163 | namespace detail { | ||||||
164 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { | ||||||
165 | static unsigned count(T Val, ZeroBehavior) { | ||||||
166 | if (!Val) | ||||||
167 | return std::numeric_limits<T>::digits; | ||||||
168 | |||||||
169 | // Bisection method. | ||||||
170 | unsigned ZeroBits = 0; | ||||||
171 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { | ||||||
172 | T Tmp = Val >> Shift; | ||||||
173 | if (Tmp) | ||||||
174 | Val = Tmp; | ||||||
175 | else | ||||||
176 | ZeroBits |= Shift; | ||||||
177 | } | ||||||
178 | return ZeroBits; | ||||||
179 | } | ||||||
180 | }; | ||||||
181 | |||||||
182 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
183 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||||
184 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
185 | if (ZB != ZB_Undefined && Val == 0) | ||||||
186 | return 32; | ||||||
187 | |||||||
188 | #if __has_builtin(__builtin_clz) || defined(__GNUC__) | ||||||
189 | return __builtin_clz(Val); | ||||||
190 | #elif defined(_MSC_VER) | ||||||
191 | unsigned long Index; | ||||||
192 | _BitScanReverse(&Index, Val); | ||||||
193 | return Index ^ 31; | ||||||
194 | #endif | ||||||
195 | } | ||||||
196 | }; | ||||||
197 | |||||||
198 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
199 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||||
200 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
201 | if (ZB != ZB_Undefined && Val == 0) | ||||||
202 | return 64; | ||||||
203 | |||||||
204 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__) | ||||||
205 | return __builtin_clzll(Val); | ||||||
206 | #elif defined(_MSC_VER) | ||||||
207 | unsigned long Index; | ||||||
208 | _BitScanReverse64(&Index, Val); | ||||||
209 | return Index ^ 63; | ||||||
210 | #endif | ||||||
211 | } | ||||||
212 | }; | ||||||
213 | #endif | ||||||
214 | #endif | ||||||
215 | } // namespace detail | ||||||
216 | |||||||
217 | /// Count the number of 0's from the most significant bit to the least | ||||||
218 | /// significant, stopping at the first 1. | ||||||
219 | /// | ||||||
220 | /// Only unsigned integral types are allowed. | ||||||
221 | /// | ||||||
222 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
223 | /// valid arguments. | ||||||
224 | template <typename T> | ||||||
225 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
226 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
227 | !std::numeric_limits<T>::is_signed, | ||||||
228 | "Only unsigned integral types are allowed."); | ||||||
229 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
230 | } | ||||||
231 | |||||||
232 | /// Get the index of the first set bit starting from the least | ||||||
233 | /// significant bit. | ||||||
234 | /// | ||||||
235 | /// Only unsigned integral types are allowed. | ||||||
236 | /// | ||||||
237 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
238 | /// valid arguments. | ||||||
239 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
240 | if (ZB == ZB_Max && Val == 0) | ||||||
241 | return std::numeric_limits<T>::max(); | ||||||
242 | |||||||
243 | return countTrailingZeros(Val, ZB_Undefined); | ||||||
244 | } | ||||||
245 | |||||||
246 | /// Create a bitmask with the N right-most bits set to 1, and all other | ||||||
247 | /// bits set to 0. Only unsigned types are allowed. | ||||||
248 | template <typename T> T maskTrailingOnes(unsigned N) { | ||||||
249 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); | ||||||
250 | const unsigned Bits = CHAR_BIT * sizeof(T); | ||||||
251 | assert(N <= Bits && "Invalid bit index"); | ||||||
252 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); | ||||||
253 | } | ||||||
254 | |||||||
255 | /// Create a bitmask with the N left-most bits set to 1, and all other | ||||||
256 | /// bits set to 0. Only unsigned types are allowed. | ||||||
257 | template <typename T> T maskLeadingOnes(unsigned N) { | ||||||
258 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
259 | } | ||||||
260 | |||||||
261 | /// Create a bitmask with the N right-most bits set to 0, and all other | ||||||
262 | /// bits set to 1. Only unsigned types are allowed. | ||||||
263 | template <typename T> T maskTrailingZeros(unsigned N) { | ||||||
264 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
265 | } | ||||||
266 | |||||||
267 | /// Create a bitmask with the N left-most bits set to 0, and all other | ||||||
268 | /// bits set to 1. Only unsigned types are allowed. | ||||||
269 | template <typename T> T maskLeadingZeros(unsigned N) { | ||||||
270 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
271 | } | ||||||
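
// [Editor's sketch, not part of the original header.] The four mask
// helpers above for N = 3 on uint8_t, in a hypothetical demo function:
inline void maskHelpersDemo() {
  assert(maskTrailingOnes<uint8_t>(3) == 0x07);  // 0000'0111
  assert(maskLeadingOnes<uint8_t>(3) == 0xE0);   // 1110'0000
  assert(maskTrailingZeros<uint8_t>(3) == 0xF8); // 1111'1000
  assert(maskLeadingZeros<uint8_t>(3) == 0x1F);  // 0001'1111
}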
272 | |||||||
273 | /// Get the index of the last set bit starting from the least | ||||||
274 | /// significant bit. | ||||||
275 | /// | ||||||
276 | /// Only unsigned integral types are allowed. | ||||||
277 | /// | ||||||
278 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
279 | /// valid arguments. | ||||||
280 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
281 | if (ZB == ZB_Max && Val == 0) | ||||||
282 | return std::numeric_limits<T>::max(); | ||||||
283 | |||||||
284 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ | ||||||
285 | // in the __builtin_clz intrinsic on x86. | ||||||
286 | return countLeadingZeros(Val, ZB_Undefined) ^ | ||||||
287 | (std::numeric_limits<T>::digits - 1); | ||||||
288 | } | ||||||
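
// [Editor's sketch, not part of the original header.] For Val = 0x10,
// countLeadingZeros gives 27 and 27 ^ 31 == 4, the index of the set bit;
// the XOR stands in for the subtraction 31 - 27. Hypothetical demo:
inline void findLastSetDemo() {
  assert(findLastSet(0x10u) == 4u);
  assert(findLastSet(0u) == std::numeric_limits<unsigned>::max()); // ZB_Max
}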
289 | |||||||
290 | /// Macro compressed bit reversal table for 256 bits. | ||||||
291 | /// | ||||||
292 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable | ||||||
293 | static const unsigned char BitReverseTable256[256] = { | ||||||
294 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 | ||||||
295 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) | ||||||
296 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) | ||||||
297 | R6(0), R6(2), R6(1), R6(3) | ||||||
298 | #undef R2 | ||||||
299 | #undef R4 | ||||||
300 | #undef R6 | ||||||
301 | }; | ||||||
302 | |||||||
303 | /// Reverse the bits in \p Val. | ||||||
304 | template <typename T> | ||||||
305 | T reverseBits(T Val) { | ||||||
306 | unsigned char in[sizeof(Val)]; | ||||||
307 | unsigned char out[sizeof(Val)]; | ||||||
308 | std::memcpy(in, &Val, sizeof(Val)); | ||||||
309 | for (unsigned i = 0; i < sizeof(Val); ++i) | ||||||
310 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; | ||||||
311 | std::memcpy(&Val, out, sizeof(Val)); | ||||||
312 | return Val; | ||||||
313 | } | ||||||
314 | |||||||
315 | #if __has_builtin(__builtin_bitreverse8) | ||||||
316 | template<> | ||||||
317 | inline uint8_t reverseBits<uint8_t>(uint8_t Val) { | ||||||
318 | return __builtin_bitreverse8(Val); | ||||||
319 | } | ||||||
320 | #endif | ||||||
321 | |||||||
322 | #if __has_builtin(__builtin_bitreverse16) | ||||||
323 | template<> | ||||||
324 | inline uint16_t reverseBits<uint16_t>(uint16_t Val) { | ||||||
325 | return __builtin_bitreverse16(Val); | ||||||
326 | } | ||||||
327 | #endif | ||||||
328 | |||||||
329 | #if __has_builtin(__builtin_bitreverse32) | ||||||
330 | template<> | ||||||
331 | inline uint32_t reverseBits<uint32_t>(uint32_t Val) { | ||||||
332 | return __builtin_bitreverse32(Val); | ||||||
333 | } | ||||||
334 | #endif | ||||||
335 | |||||||
336 | #if __has_builtin(__builtin_bitreverse64) | ||||||
337 | template<> | ||||||
338 | inline uint64_t reverseBits<uint64_t>(uint64_t Val) { | ||||||
339 | return __builtin_bitreverse64(Val); | ||||||
340 | } | ||||||
341 | #endif | ||||||
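
// [Editor's sketch, not part of the original header.] Whichever path is
// taken above (table lookup or builtin), bit i of the input moves to bit
// (width - 1 - i). Hypothetical demo:
inline void reverseBitsDemo() {
  assert(reverseBits<uint8_t>(0x01) == 0x80);
  assert(reverseBits<uint8_t>(0xC1) == 0x83); // 1100'0001 -> 1000'0011
}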
342 | |||||||
343 | // NOTE: The following support functions use the _32/_64 extensions instead of | ||||||
344 | // type overloading so that signed and unsigned integers can be used without | ||||||
345 | // ambiguity. | ||||||
346 | |||||||
347 | /// Return the high 32 bits of a 64 bit value. | ||||||
348 | constexpr inline uint32_t Hi_32(uint64_t Value) { | ||||||
349 | return static_cast<uint32_t>(Value >> 32); | ||||||
350 | } | ||||||
351 | |||||||
352 | /// Return the low 32 bits of a 64 bit value. | ||||||
353 | constexpr inline uint32_t Lo_32(uint64_t Value) { | ||||||
354 | return static_cast<uint32_t>(Value); | ||||||
355 | } | ||||||
356 | |||||||
357 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. | ||||||
358 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { | ||||||
359 | return ((uint64_t)High << 32) | (uint64_t)Low; | ||||||
360 | } | ||||||
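
// [Editor's sketch, not part of the original header.] Round trip through
// the split/join helpers above, in a hypothetical demo function:
inline void splitJoin64Demo() {
  const uint64_t V = UINT64_C(0x0123456789ABCDEF);
  assert(Hi_32(V) == 0x01234567u && Lo_32(V) == 0x89ABCDEFu);
  assert(Make_64(Hi_32(V), Lo_32(V)) == V);
}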
361 | |||||||
362 | /// Checks if an integer fits into the given bit width. | ||||||
363 | template <unsigned N> constexpr inline bool isInt(int64_t x) { | ||||||
364 | return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); | ||||||
365 | } | ||||||
366 | // Template specializations to get better code for common cases. | ||||||
367 | template <> constexpr inline bool isInt<8>(int64_t x) { | ||||||
368 | return static_cast<int8_t>(x) == x; | ||||||
369 | } | ||||||
370 | template <> constexpr inline bool isInt<16>(int64_t x) { | ||||||
371 | return static_cast<int16_t>(x) == x; | ||||||
372 | } | ||||||
373 | template <> constexpr inline bool isInt<32>(int64_t x) { | ||||||
374 | return static_cast<int32_t>(x) == x; | ||||||
375 | } | ||||||
376 | |||||||
377 | /// Checks if a signed integer is an N bit number shifted left by S. | ||||||
378 | template <unsigned N, unsigned S> | ||||||
379 | constexpr inline bool isShiftedInt(int64_t x) { | ||||||
380 | static_assert( | ||||||
381 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); | ||||||
382 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); | ||||||
383 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
384 | } | ||||||
385 | |||||||
386 | /// Checks if an unsigned integer fits into the given bit width. | ||||||
387 | /// | ||||||
388 | /// This is written as two functions rather than as simply | ||||||
389 | /// | ||||||
390 | /// return N >= 64 || X < (UINT64_C(1) << N); | ||||||
391 | /// | ||||||
392 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting | ||||||
393 | /// left too many places. | ||||||
394 | template <unsigned N> | ||||||
395 | constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { | ||||||
396 | static_assert(N > 0, "isUInt<0> doesn't make sense"); | ||||||
397 | return X < (UINT64_C(1) << (N)); | ||||||
398 | } | ||||||
399 | template <unsigned N> | ||||||
400 | constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) { | ||||||
401 | return true; | ||||||
402 | } | ||||||
403 | |||||||
404 | // Template specializations to get better code for common cases. | ||||||
405 | template <> constexpr inline bool isUInt<8>(uint64_t x) { | ||||||
406 | return static_cast<uint8_t>(x) == x; | ||||||
407 | } | ||||||
408 | template <> constexpr inline bool isUInt<16>(uint64_t x) { | ||||||
409 | return static_cast<uint16_t>(x) == x; | ||||||
410 | } | ||||||
411 | template <> constexpr inline bool isUInt<32>(uint64_t x) { | ||||||
412 | return static_cast<uint32_t>(x) == x; | ||||||
413 | } | ||||||
414 | |||||||
415 | /// Checks if an unsigned integer is an N bit number shifted left by S. | ||||||
416 | template <unsigned N, unsigned S> | ||||||
417 | constexpr inline bool isShiftedUInt(uint64_t x) { | ||||||
418 | static_assert( | ||||||
419 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); | ||||||
420 | static_assert(N + S <= 64, | ||||||
421 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); | ||||||
422 | // Per the two static_asserts above, S must be strictly less than 64. So | ||||||
423 | // 1 << S is not undefined behavior. | ||||||
424 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
425 | } | ||||||
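
// [Editor's sketch, not part of the original header.] 0x50 == 0b101'0000
// is the 3-bit value 0b101 shifted left by 4; 0x90 is not, since 0b1001
// needs four bits. Hypothetical demo:
inline void shiftedUIntDemo() {
  assert((isShiftedUInt<3, 4>(0x50)));
  assert((!isShiftedUInt<3, 4>(0x90)));
}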
426 | |||||||
427 | /// Gets the maximum value for a N-bit unsigned integer. | ||||||
428 | inline uint64_t maxUIntN(uint64_t N) { | ||||||
429 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
430 | |||||||
431 | // uint64_t(1) << 64 is undefined behavior, so we can't do | ||||||
432 | // (uint64_t(1) << N) - 1 | ||||||
433 | // without checking first that N != 64. But this works and doesn't have a | ||||||
434 | // branch. | ||||||
435 | return UINT64_MAX >> (64 - N); | ||||||
436 | } | ||||||
437 | |||||||
438 | /// Gets the minimum value for a N-bit signed integer. | ||||||
439 | inline int64_t minIntN(int64_t N) { | ||||||
440 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
441 | |||||||
442 | return UINT64_C(1) + ~(UINT64_C(1) << (N - 1)); | ||||||
443 | } | ||||||
444 | |||||||
445 | /// Gets the maximum value for a N-bit signed integer. | ||||||
446 | inline int64_t maxIntN(int64_t N) { | ||||||
447 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
448 | |||||||
449 | // This relies on two's complement wraparound when N == 64, so we convert to | ||||||
450 | // int64_t only at the very end to avoid UB. | ||||||
451 | return (UINT64_C(1) << (N - 1)) - 1; | ||||||
452 | } | ||||||
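
// [Editor's sketch, not part of the original header.] The N-bit bounds
// above for N = 8, plus the edge case the shift trick exists for:
inline void intBoundsDemo() {
  assert(maxUIntN(8) == 255u && minIntN(8) == -128 && maxIntN(8) == 127);
  assert(maxUIntN(64) == UINT64_MAX); // 64 - N == 0, so no 1 << 64 needed
}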
453 | |||||||
454 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. | ||||||
455 | inline bool isUIntN(unsigned N, uint64_t x) { | ||||||
456 | return N >= 64 || x <= maxUIntN(N); | ||||||
457 | } | ||||||
458 | |||||||
459 | /// Checks if a signed integer fits into the given (dynamic) bit width. | ||||||
460 | inline bool isIntN(unsigned N, int64_t x) { | ||||||
461 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); | ||||||
462 | } | ||||||
463 | |||||||
464 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
465 | /// least significant bit with the remainder zero (32 bit version). | ||||||
466 | /// Ex. isMask_32(0x0000FFFFU) == true. | ||||||
467 | constexpr inline bool isMask_32(uint32_t Value) { | ||||||
468 | return Value && ((Value + 1) & Value) == 0; | ||||||
469 | } | ||||||
470 | |||||||
471 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
472 | /// least significant bit with the remainder zero (64 bit version). | ||||||
473 | constexpr inline bool isMask_64(uint64_t Value) { | ||||||
474 | return Value && ((Value + 1) & Value) == 0; | ||||||
475 | } | ||||||
476 | |||||||
477 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
478 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. | ||||||
479 | constexpr inline bool isShiftedMask_32(uint32_t Value) { | ||||||
480 | return Value && isMask_32((Value - 1) | Value); | ||||||
481 | } | ||||||
482 | |||||||
483 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
484 | /// remainder zero (64 bit version.) | ||||||
485 | constexpr inline bool isShiftedMask_64(uint64_t Value) { | ||||||
486 | return Value && isMask_64((Value - 1) | Value); | ||||||
487 | } | ||||||
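
// [Editor's sketch, not part of the original header.] (Value - 1) | Value
// fills in the trailing zeros, reducing the shifted-mask test to the
// plain-mask test: 0x0000FF00 -> 0x0000FFFF. Hypothetical demo:
inline void maskPredicateDemo() {
  assert(isMask_32(0x0000FFFFu) && !isMask_32(0x0000FF00u));
  assert(isShiftedMask_32(0x0000FF00u) && !isShiftedMask_32(0x00FF00FFu));
}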
488 | |||||||
489 | /// Return true if the argument is a power of two > 0. | ||||||
490 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) | ||||||
491 | constexpr inline bool isPowerOf2_32(uint32_t Value) { | ||||||
492 | return Value && !(Value & (Value - 1)); | ||||||
493 | } | ||||||
494 | |||||||
495 | /// Return true if the argument is a power of two > 0 (64 bit edition.) | ||||||
496 | constexpr inline bool isPowerOf2_64(uint64_t Value) { | ||||||
497 | return Value && !(Value & (Value - 1)); | ||||||
498 | } | ||||||
499 | |||||||
500 | /// Count the number of ones from the most significant bit to the first | ||||||
501 | /// zero bit. | ||||||
502 | /// | ||||||
503 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. | ||||||
504 | /// Only unsigned integral types are allowed. | ||||||
505 | /// | ||||||
506 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
507 | /// ZB_Undefined are valid arguments. | ||||||
508 | template <typename T> | ||||||
509 | unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
510 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
511 | !std::numeric_limits<T>::is_signed, | ||||||
512 | "Only unsigned integral types are allowed."); | ||||||
513 | return countLeadingZeros<T>(~Value, ZB); | ||||||
514 | } | ||||||
515 | |||||||
516 | /// Count the number of ones from the least significant bit to the first | ||||||
517 | /// zero bit. | ||||||
518 | /// | ||||||
519 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. | ||||||
520 | /// Only unsigned integral types are allowed. | ||||||
521 | /// | ||||||
522 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
523 | /// ZB_Undefined are valid arguments. | ||||||
524 | template <typename T> | ||||||
525 | unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
526 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
527 | !std::numeric_limits<T>::is_signed, | ||||||
528 | "Only unsigned integral types are allowed."); | ||||||
529 | return countTrailingZeros<T>(~Value, ZB); | ||||||
530 | } | ||||||
531 | |||||||
532 | namespace detail { | ||||||
533 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { | ||||||
534 | static unsigned count(T Value) { | ||||||
535 | // Generic version, forward to 32 bits. | ||||||
536 | static_assert(SizeOfT <= 4, "Not implemented!"); | ||||||
537 | #if defined(__GNUC__) | ||||||
538 | return __builtin_popcount(Value); | ||||||
539 | #else | ||||||
540 | uint32_t v = Value; | ||||||
541 | v = v - ((v >> 1) & 0x55555555); | ||||||
542 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); | ||||||
543 | return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; | ||||||
544 | #endif | ||||||
545 | } | ||||||
546 | }; | ||||||
547 | |||||||
548 | template <typename T> struct PopulationCounter<T, 8> { | ||||||
549 | static unsigned count(T Value) { | ||||||
550 | #if defined(__GNUC__) | ||||||
551 | return __builtin_popcountll(Value); | ||||||
552 | #else | ||||||
553 | uint64_t v = Value; | ||||||
554 | v = v - ((v >> 1) & 0x5555555555555555ULL); | ||||||
555 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); | ||||||
556 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; | ||||||
557 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); | ||||||
558 | #endif | ||||||
559 | } | ||||||
560 | }; | ||||||
561 | } // namespace detail | ||||||
562 | |||||||
563 | /// Count the number of set bits in a value. | ||||||
564 | /// Ex. countPopulation(0xF000F000) = 8 | ||||||
565 | /// Returns 0 if the word is zero. | ||||||
566 | template <typename T> | ||||||
567 | inline unsigned countPopulation(T Value) { | ||||||
568 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
569 | !std::numeric_limits<T>::is_signed, | ||||||
570 | "Only unsigned integral types are allowed."); | ||||||
571 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); | ||||||
572 | } | ||||||
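
// [Editor's sketch, not part of the original header.] Sample values for
// the population count above, in a hypothetical demo function:
inline void countPopulationDemo() {
  assert(countPopulation(0xF000F000u) == 8u);
  assert(countPopulation(UINT64_C(0xFFFFFFFFFFFFFFFF)) == 64u);
  assert(countPopulation(0u) == 0u);
}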

/// Compile time Log2.
/// Valid only for positive powers of two.
template <size_t kValue> constexpr inline size_t CTLog2() {
  static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
                "Value is not a valid power of 2");
  return 1 + CTLog2<kValue / 2>();
}

template <> constexpr inline size_t CTLog2<1>() { return 0; }

/// Return the log base 2 of the specified value.
inline double Log2(double Value) {
#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
  return __builtin_log(Value) / __builtin_log(2.0);
#else
  return log2(Value);
#endif
}

/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (32 bit edition.)
/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
inline unsigned Log2_32(uint32_t Value) {
  return 31 - countLeadingZeros(Value);
}

/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64(uint64_t Value) {
  return 63 - countLeadingZeros(Value);
}

/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
/// (32 bit edition).
/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
inline unsigned Log2_32_Ceil(uint32_t Value) {
  return 32 - countLeadingZeros(Value - 1);
}

/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64_Ceil(uint64_t Value) {
  return 64 - countLeadingZeros(Value - 1);
}

/// Return the greatest common divisor of the values using Euclid's algorithm.
template <typename T>
inline T greatestCommonDivisor(T A, T B) {
  while (B) {
    T Tmp = B;
    B = A % B;
    A = Tmp;
  }
  return A;
}

inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
  return greatestCommonDivisor<uint64_t>(A, B);
}
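
// Worked trace of Euclid's algorithm (editorial sketch, not from the
// original header):
//   greatestCommonDivisor(48u, 18u):
//     (A, B) steps: (48, 18) -> (18, 12) -> (12, 6) -> (6, 0)
//   returns 6.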

/// This function takes a 64-bit integer and returns the bit equivalent double.
inline double BitsToDouble(uint64_t Bits) {
  double D;
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  memcpy(&D, &Bits, sizeof(Bits));
  return D;
}

/// This function takes a 32-bit integer and returns the bit equivalent float.
inline float BitsToFloat(uint32_t Bits) {
  float F;
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  memcpy(&F, &Bits, sizeof(Bits));
  return F;
}

/// This function takes a double and returns the bit equivalent 64-bit integer.
/// Note that copying doubles around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint64_t DoubleToBits(double Double) {
  uint64_t Bits;
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  memcpy(&Bits, &Double, sizeof(Double));
  return Bits;
}

/// This function takes a float and returns the bit equivalent 32-bit integer.
/// Note that copying floats around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint32_t FloatToBits(float Float) {
  uint32_t Bits;
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  memcpy(&Bits, &Float, sizeof(Float));
  return Bits;
}
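
// Usage note (editorial sketch, not from the original header): memcpy is the
// well-defined way to type-pun here, unlike a pointer cast or union read.
// Assuming IEEE-754 binary32:
//   FloatToBits(1.0f) == 0x3F800000u
//   BitsToFloat(FloatToBits(F)) reproduces F bit for bit, subject to the
//   NaN-copying caveat above.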

/// A and B are either alignments or offsets. Return the minimum alignment that
/// may be assumed after adding the two together.
constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
  // The largest power of 2 that divides both A and B.
  //
  // Replace "-Value" by "1+~Value" in the following commented code to avoid
  // MSVC warning C4146:
  //   return (A | B) & -(A | B);
  return (A | B) & (1 + ~(A | B));
}
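
// Compile-time checks (editorial sketch, not from the original header): the
// expression isolates the lowest set bit of (A | B), i.e. the largest power
// of two dividing both arguments.
static_assert(MinAlign(8, 4) == 4, "lowest set bit of 0b1100");
static_assert(MinAlign(16, 24) == 8, "lowest set bit of 0b11000");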

/// Returns the next power of two (in 64-bits) that is strictly greater than A.
/// Returns zero on overflow.
inline uint64_t NextPowerOf2(uint64_t A) {
  A |= (A >> 1);
  A |= (A >> 2);
  A |= (A >> 4);
  A |= (A >> 8);
  A |= (A >> 16);
  A |= (A >> 32);
  return A + 1;
}
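
// Worked trace (editorial sketch, not from the original header): the shifts
// smear the highest set bit downward so every lower bit becomes 1, then +1
// rolls over to the next power of two. For A = 5 (0b101):
//   after A |= A >> 1: 0b111; the remaining shifts change nothing;
//   A + 1 == 8.
// For A = ~0ULL the smear leaves ~0ULL and the +1 wraps to 0, which is the
// documented overflow result.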

/// Returns the power of two which is less than or equal to the given value.
/// Essentially, it is a floor operation across the domain of powers of two.
inline uint64_t PowerOf2Floor(uint64_t A) {
  if (!A) return 0;
  return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
}

/// Returns the power of two which is greater than or equal to the given value.
/// Essentially, it is a ceil operation across the domain of powers of two.
inline uint64_t PowerOf2Ceil(uint64_t A) {
  if (!A)
    return 0;
  return NextPowerOf2(A - 1);
}

/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
/// If non-zero \p Skew is specified, the return value will be a minimal
/// integer that is greater than or equal to \p Value and equal to
/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
///
/// Examples:
/// \code
/// alignTo(5, 8) = 8
/// alignTo(17, 8) = 24
/// alignTo(~0LL, 8) = 0
/// alignTo(321, 255) = 510
///
/// alignTo(5, 8, 7) = 7
/// alignTo(17, 8, 1) = 17
/// alignTo(~0LL, 8, 3) = 3
/// alignTo(321, 255, 42) = 552
/// \endcode
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}

/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
  static_assert(Align != 0u, "Align must be non-zero");
  return (Value + Align - 1) / Align * Align;
}
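
// Compile-time checks (editorial sketch, not from the original header): the
// templated overload is constexpr, so the documented examples can be
// verified directly.
static_assert(alignTo<8>(5) == 8, "rounds up to the next multiple");
static_assert(alignTo<8>(17) == 24, "17 -> 24");
static_assert(alignTo<8>(16) == 16, "multiples are unchanged");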

/// Returns the integer ceil(Numerator / Denominator).
inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  return alignTo(Numerator, Denominator) / Denominator;
}

/// Returns the integer nearest(Numerator / Denominator).
inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + (Denominator / 2)) / Denominator;
}

/// Returns the largest uint64_t that is less than or equal to \p Value and is
/// \p Skew mod \p Align. \p Align must be non-zero.
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  return (Value - Skew) / Align * Align + Skew;
}
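
// Worked examples (editorial sketch, not from the original header),
// mirroring the alignTo examples above:
//   alignDown(17, 8) == 16      // floor to the previous multiple of 8
//   alignDown(17, 8, 1) == 17   // 17 == 8*2 + 1 already
//   alignDown(21, 8, 1) == 17   // largest value <= 21 that is 1 (mod 8)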

/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 32, "Bit width out of range.");
  return int32_t(X << (32 - B)) >> (32 - B);
}
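
// Compile-time checks (editorial sketch, not from the original header): the
// left shift places the sign bit of the B-bit field at bit 31 and the signed
// right shift replicates it downward. (This relies on signed right shift
// being arithmetic, which the compilers LLVM supports provide.)
static_assert(SignExtend32<4>(0xF) == -1, "0b1111 is -1 in 4 bits");
static_assert(SignExtend32<4>(0x7) == 7, "0b0111 stays positive");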

/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
inline int32_t SignExtend32(uint32_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 32 && "Bit width out of range.");
  return int32_t(X << (32 - B)) >> (32 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 64, "Bit width out of range.");
  return int64_t(x << (64 - B)) >> (64 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
inline int64_t SignExtend64(uint64_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 64 && "Bit width out of range.");
  return int64_t(X << (64 - B)) >> (64 - B);
}

/// Subtract two unsigned integers, X and Y, of type T and return the absolute
/// value of the result.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
  return X > Y ? (X - Y) : (Y - X);
}

/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
  // Hacker's Delight, p. 29
  T Z = X + Y;
  Overflowed = (Z < X || Z < Y);
  if (Overflowed)
    return std::numeric_limits<T>::max();
  else
    return Z;
}

/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
  // because it fails for uint16_t (where multiplication can have undefined
  // behavior due to promotion to int), and requires a division in addition
  // to the multiplication.

  Overflowed = false;

  // Log2(Z) would be either Log2Z or Log2Z + 1.
  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
  // will necessarily be less than Log2Max as desired.
  int Log2Z = Log2_64(X) + Log2_64(Y);
  const T Max = std::numeric_limits<T>::max();
  int Log2Max = Log2_64(Max);
  if (Log2Z < Log2Max) {
    return X * Y;
  }
  if (Log2Z > Log2Max) {
    Overflowed = true;
    return Max;
  }

  // We're going to use the top bit, and maybe overflow one
  // bit past it. Multiply all but the bottom bit then add
  // that on at the end.
  T Z = (X >> 1) * Y;
  if (Z & ~(Max >> 1)) {
    Overflowed = true;
    return Max;
  }
  Z <<= 1;
  if (X & 1)
    return SaturatingAdd(Z, Y, ResultOverflowed);

  return Z;
}
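
// Worked example (editorial sketch, not from the original header), for
// uint8_t where Max == 255 and Log2Max == 7:
//   X == 8, Y == 8: Log2Z == 3 + 3 == 6 < 7, so 64 is returned directly.
//   X == 16, Y == 16: Log2Z == 8 > 7, saturates without multiplying.
//   X == 16, Y == 8: Log2Z == 4 + 3 == 7 == Log2Max, the ambiguous band;
//   the top-bit path computes Z == (16 >> 1) * 8 == 64, sees the top bit
//   clear, doubles to 128, and returns it (X is even); correct, no overflow.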

/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
/// the product. Clamp the result to the maximum representable value of T on
/// overflow. ResultOverflowed indicates if the result is larger than the
/// maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  T Product = SaturatingMultiply(X, Y, &Overflowed);
  if (Overflowed)
    return Product;

  return SaturatingAdd(A, Product, &Overflowed);
}

/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
extern const float huge_valf;

/// Add two signed integers, computing the two's complement truncated result,
/// returning true if overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Perform the unsigned addition.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX + UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Adding two positive numbers should result in a positive number.
  if (X > 0 && Y > 0)
    return Result <= 0;
  // Adding two negatives should result in a negative number.
  if (X < 0 && Y < 0)
    return Result >= 0;
  return false;
#endif
}
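
// Worked example of the sign rule in the fallback path (editorial sketch,
// not from the original header), for int8_t:
//   AddOverflow<int8_t>(127, 1, R): unsigned math gives 0x80, so R == -128;
//   both inputs are positive but R <= 0, so true (overflow) is returned.
//   AddOverflow<int8_t>(-100, 50, R): mixed signs can never overflow, so the
//   function falls through to `return false` with R == -50.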

/// Subtract two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Perform the unsigned subtraction.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX - UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Subtracting a positive number from a negative results in a negative number.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Subtracting a negative number from a positive results in a positive number.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  return false;
#endif
}

/// Multiply two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
  // Perform the unsigned multiplication on absolute values.
  using U = std::make_unsigned_t<T>;
  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U UResult = UX * UY;

  // Convert to signed.
  const bool IsNegative = (X < 0) ^ (Y < 0);
  Result = IsNegative ? (0 - UResult) : UResult;

  // If any of the args was 0, result is 0 and no overflow occurs.
  if (UX == 0 || UY == 0)
    return false;

  // UX and UY are in [1, 2^n], where n is the number of digits.
  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
  // positive) divided by an argument compares to the other.
  if (IsNegative)
    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
  else
    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
}
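
// Worked example (editorial sketch, not from the original header), for
// int8_t where max == 127 and the most negative value is -128:
//   MulOverflow<int8_t>(-128, 1, R): UX == 128, UY == 1, IsNegative is true,
//   and 128 > (127 + 1) / 1 is false; no overflow, R == -128.
//   MulOverflow<int8_t>(16, 8, R): UResult == 128, which converts to
//   R == -128, and 16 > 127 / 8 (== 15) is true; overflow is reported.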

} // End llvm namespace

#endif