File: | build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |
Warning: | line 3340, column 62 The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int' |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | ||||
2 | // | ||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
6 | // | ||||
7 | //===----------------------------------------------------------------------===// | ||||
8 | /// \file | ||||
9 | /// This file implements the targeting of the Machinelegalizer class for | ||||
10 | /// AMDGPU. | ||||
11 | /// \todo This should be generated by TableGen. | ||||
12 | //===----------------------------------------------------------------------===// | ||||
13 | |||||
14 | #include "AMDGPULegalizerInfo.h" | ||||
15 | |||||
16 | #include "AMDGPU.h" | ||||
17 | #include "AMDGPUGlobalISelUtils.h" | ||||
18 | #include "AMDGPUInstrInfo.h" | ||||
19 | #include "AMDGPUTargetMachine.h" | ||||
20 | #include "SIMachineFunctionInfo.h" | ||||
21 | #include "Utils/AMDGPUBaseInfo.h" | ||||
22 | #include "llvm/ADT/ScopeExit.h" | ||||
23 | #include "llvm/BinaryFormat/ELF.h" | ||||
24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | ||||
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||||
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||||
27 | #include "llvm/IR/DiagnosticInfo.h" | ||||
28 | #include "llvm/IR/IntrinsicsAMDGPU.h" | ||||
29 | #include "llvm/IR/IntrinsicsR600.h" | ||||
30 | |||||
// Debug category for -debug-only=; the rendered view duplicated the literal.
#define DEBUG_TYPE "amdgpu-legalinfo"
32 | |||||
33 | using namespace llvm; | ||||
34 | using namespace LegalizeActions; | ||||
35 | using namespace LegalizeMutations; | ||||
36 | using namespace LegalityPredicates; | ||||
37 | using namespace MIPatternMatch; | ||||
38 | |||||
39 | // Hack until load/store selection patterns support any tuple of legal types. | ||||
40 | static cl::opt<bool> EnableNewLegality( | ||||
41 | "amdgpu-global-isel-new-legality", | ||||
42 | cl::desc("Use GlobalISel desired legality, rather than try to use" | ||||
43 | "rules compatible with selection patterns"), | ||||
44 | cl::init(false), | ||||
45 | cl::ReallyHidden); | ||||
46 | |||||
// Widest value treated as a single register tuple by these rules (1024 bits =
// 32 dwords, matching the largest vector types declared below, e.g. v32s32).
static constexpr unsigned MaxRegisterSize = 1024;
48 | |||||
49 | // Round the number of elements to the next power of two elements | ||||
50 | static LLT getPow2VectorType(LLT Ty) { | ||||
51 | unsigned NElts = Ty.getNumElements(); | ||||
52 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); | ||||
53 | return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); | ||||
54 | } | ||||
55 | |||||
56 | // Round the number of bits to the next power of two bits | ||||
57 | static LLT getPow2ScalarType(LLT Ty) { | ||||
58 | unsigned Bits = Ty.getSizeInBits(); | ||||
59 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); | ||||
60 | return LLT::scalar(Pow2Bits); | ||||
61 | } | ||||
62 | |||||
63 | /// \returns true if this is an odd sized vector which should widen by adding an | ||||
64 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This | ||||
65 | /// excludes s1 vectors, which should always be scalarized. | ||||
66 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | ||||
67 | return [=](const LegalityQuery &Query) { | ||||
68 | const LLT Ty = Query.Types[TypeIdx]; | ||||
69 | if (!Ty.isVector()) | ||||
70 | return false; | ||||
71 | |||||
72 | const LLT EltTy = Ty.getElementType(); | ||||
73 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||
74 | return Ty.getNumElements() % 2 != 0 && | ||||
75 | EltSize > 1 && EltSize < 32 && | ||||
76 | Ty.getSizeInBits() % 32 != 0; | ||||
77 | }; | ||||
78 | } | ||||
79 | |||||
80 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { | ||||
81 | return [=](const LegalityQuery &Query) { | ||||
82 | const LLT Ty = Query.Types[TypeIdx]; | ||||
83 | return Ty.getSizeInBits() % 32 == 0; | ||||
84 | }; | ||||
85 | } | ||||
86 | |||||
87 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | ||||
88 | return [=](const LegalityQuery &Query) { | ||||
89 | const LLT Ty = Query.Types[TypeIdx]; | ||||
90 | const LLT EltTy = Ty.getScalarType(); | ||||
91 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | ||||
92 | }; | ||||
93 | } | ||||
94 | |||||
95 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | ||||
96 | return [=](const LegalityQuery &Query) { | ||||
97 | const LLT Ty = Query.Types[TypeIdx]; | ||||
98 | const LLT EltTy = Ty.getElementType(); | ||||
99 | return std::pair(TypeIdx, | ||||
100 | LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); | ||||
101 | }; | ||||
102 | } | ||||
103 | |||||
104 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { | ||||
105 | return [=](const LegalityQuery &Query) { | ||||
106 | const LLT Ty = Query.Types[TypeIdx]; | ||||
107 | const LLT EltTy = Ty.getElementType(); | ||||
108 | unsigned Size = Ty.getSizeInBits(); | ||||
109 | unsigned Pieces = (Size + 63) / 64; | ||||
110 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; | ||||
111 | return std::pair(TypeIdx, LLT::scalarOrVector( | ||||
112 | ElementCount::getFixed(NewNumElts), EltTy)); | ||||
113 | }; | ||||
114 | } | ||||
115 | |||||
116 | // Increase the number of vector elements to reach the next multiple of 32-bit | ||||
117 | // type. | ||||
118 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | ||||
119 | return [=](const LegalityQuery &Query) { | ||||
120 | const LLT Ty = Query.Types[TypeIdx]; | ||||
121 | |||||
122 | const LLT EltTy = Ty.getElementType(); | ||||
123 | const int Size = Ty.getSizeInBits(); | ||||
124 | const int EltSize = EltTy.getSizeInBits(); | ||||
125 | const int NextMul32 = (Size + 31) / 32; | ||||
126 | |||||
127 | assert(EltSize < 32)(static_cast <bool> (EltSize < 32) ? void (0) : __assert_fail ("EltSize < 32", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 127, __extension__ __PRETTY_FUNCTION__)); | ||||
128 | |||||
129 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | ||||
130 | return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); | ||||
131 | }; | ||||
132 | } | ||||
133 | |||||
134 | // Increase the number of vector elements to reach the next legal RegClass. | ||||
135 | static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { | ||||
136 | return [=](const LegalityQuery &Query) { | ||||
137 | const LLT Ty = Query.Types[TypeIdx]; | ||||
138 | const unsigned NumElts = Ty.getNumElements(); | ||||
139 | const unsigned EltSize = Ty.getElementType().getSizeInBits(); | ||||
140 | const unsigned MaxNumElts = MaxRegisterSize / EltSize; | ||||
141 | |||||
142 | assert(EltSize == 32 || EltSize == 64)(static_cast <bool> (EltSize == 32 || EltSize == 64) ? void (0) : __assert_fail ("EltSize == 32 || EltSize == 64", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 142, __extension__ __PRETTY_FUNCTION__)); | ||||
143 | assert(Ty.getSizeInBits() < MaxRegisterSize)(static_cast <bool> (Ty.getSizeInBits() < MaxRegisterSize ) ? void (0) : __assert_fail ("Ty.getSizeInBits() < MaxRegisterSize" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 143, __extension__ __PRETTY_FUNCTION__)); | ||||
144 | |||||
145 | unsigned NewNumElts; | ||||
146 | // Find the nearest legal RegClass that is larger than the current type. | ||||
147 | for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { | ||||
148 | if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) | ||||
149 | break; | ||||
150 | } | ||||
151 | |||||
152 | return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); | ||||
153 | }; | ||||
154 | } | ||||
155 | |||||
156 | static LLT getBitcastRegisterType(const LLT Ty) { | ||||
157 | const unsigned Size = Ty.getSizeInBits(); | ||||
158 | |||||
159 | if (Size <= 32) { | ||||
160 | // <2 x s8> -> s16 | ||||
161 | // <4 x s8> -> s32 | ||||
162 | return LLT::scalar(Size); | ||||
163 | } | ||||
164 | |||||
165 | return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); | ||||
166 | } | ||||
167 | |||||
168 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { | ||||
169 | return [=](const LegalityQuery &Query) { | ||||
170 | const LLT Ty = Query.Types[TypeIdx]; | ||||
171 | return std::pair(TypeIdx, getBitcastRegisterType(Ty)); | ||||
172 | }; | ||||
173 | } | ||||
174 | |||||
175 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { | ||||
176 | return [=](const LegalityQuery &Query) { | ||||
177 | const LLT Ty = Query.Types[TypeIdx]; | ||||
178 | unsigned Size = Ty.getSizeInBits(); | ||||
179 | assert(Size % 32 == 0)(static_cast <bool> (Size % 32 == 0) ? void (0) : __assert_fail ("Size % 32 == 0", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 179, __extension__ __PRETTY_FUNCTION__)); | ||||
180 | return std::pair( | ||||
181 | TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); | ||||
182 | }; | ||||
183 | } | ||||
184 | |||||
185 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | ||||
186 | return [=](const LegalityQuery &Query) { | ||||
187 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
188 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | ||||
189 | }; | ||||
190 | } | ||||
191 | |||||
192 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | ||||
193 | return [=](const LegalityQuery &Query) { | ||||
194 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
195 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | ||||
196 | }; | ||||
197 | } | ||||
198 | |||||
199 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | ||||
200 | return [=](const LegalityQuery &Query) { | ||||
201 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
202 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | ||||
203 | }; | ||||
204 | } | ||||
205 | |||||
206 | static bool isRegisterSize(unsigned Size) { | ||||
207 | return Size % 32 == 0 && Size <= MaxRegisterSize; | ||||
208 | } | ||||
209 | |||||
210 | static bool isRegisterVectorElementType(LLT EltTy) { | ||||
211 | const int EltSize = EltTy.getSizeInBits(); | ||||
212 | return EltSize == 16 || EltSize % 32 == 0; | ||||
213 | } | ||||
214 | |||||
215 | static bool isRegisterVectorType(LLT Ty) { | ||||
216 | const int EltSize = Ty.getElementType().getSizeInBits(); | ||||
217 | return EltSize == 32 || EltSize == 64 || | ||||
218 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | ||||
219 | EltSize == 128 || EltSize == 256; | ||||
220 | } | ||||
221 | |||||
222 | static bool isRegisterType(LLT Ty) { | ||||
223 | if (!isRegisterSize(Ty.getSizeInBits())) | ||||
224 | return false; | ||||
225 | |||||
226 | if (Ty.isVector()) | ||||
227 | return isRegisterVectorType(Ty); | ||||
228 | |||||
229 | return true; | ||||
230 | } | ||||
231 | |||||
// Any combination of 32 or 64-bit elements up the maximum register size, and
// multiples of v2s16.
// LegalityPredicate wrapper over the LLT overload above, applied to the
// queried type at TypeIdx.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
239 | |||||
240 | // RegisterType that doesn't have a corresponding RegClass. | ||||
241 | static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { | ||||
242 | return [=](const LegalityQuery &Query) { | ||||
243 | LLT Ty = Query.Types[TypeIdx]; | ||||
244 | return isRegisterType(Ty) && | ||||
245 | !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); | ||||
246 | }; | ||||
247 | } | ||||
248 | |||||
249 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { | ||||
250 | return [=](const LegalityQuery &Query) { | ||||
251 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
252 | if (!QueryTy.isVector()) | ||||
253 | return false; | ||||
254 | const LLT EltTy = QueryTy.getElementType(); | ||||
255 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; | ||||
256 | }; | ||||
257 | } | ||||
258 | |||||
259 | // If we have a truncating store or an extending load with a data size larger | ||||
260 | // than 32-bits, we need to reduce to a 32-bit type. | ||||
261 | static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { | ||||
262 | return [=](const LegalityQuery &Query) { | ||||
263 | const LLT Ty = Query.Types[TypeIdx]; | ||||
264 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | ||||
265 | Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); | ||||
266 | }; | ||||
267 | } | ||||
268 | |||||
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
/// \returns the widest single memory access (in bits) we allow for address
/// space \p AS before the access must be split.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
297 | |||||
/// \returns true if a load/store with the queried register type, memory type,
/// alignment and address space can be selected without splitting or custom
/// lowering.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space supports in one operation.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  // Whitelist of memory access sizes the selector can handle.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // 96-bit (dwordx3) accesses require hardware support.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates them.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
365 | |||||
366 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so | ||||
367 | // workaround this. Eventually it should ignore the type for loads and only care | ||||
368 | // about the size. Return true in cases where we will workaround this for now by | ||||
369 | // bitcasting. | ||||
370 | static bool loadStoreBitcastWorkaround(const LLT Ty) { | ||||
371 | if (EnableNewLegality) | ||||
372 | return false; | ||||
373 | |||||
374 | const unsigned Size = Ty.getSizeInBits(); | ||||
375 | if (Size <= 64) | ||||
376 | return false; | ||||
377 | if (!Ty.isVector()) | ||||
378 | return true; | ||||
379 | |||||
380 | LLT EltTy = Ty.getElementType(); | ||||
381 | if (EltTy.isPointer()) | ||||
382 | return true; | ||||
383 | |||||
384 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
385 | return EltSize != 32 && EltSize != 64; | ||||
386 | } | ||||
387 | |||||
388 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { | ||||
389 | const LLT Ty = Query.Types[0]; | ||||
390 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && | ||||
391 | !loadStoreBitcastWorkaround(Ty); | ||||
392 | } | ||||
393 | |||||
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Extending/truncating access: only bitcast small vectors (e.g. <4 x s8>).
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  // Register-like types the selector still can't handle get bitcast too.
  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
411 | |||||
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): `Opcode` (an unsigned opcode value) is passed where
  // maxSizeForAddrSpace() declares `bool IsLoad`, so any nonzero opcode
  // converts to true — confirm this is intentional rather than a dropped
  // `Opcode != AMDGPU::G_STORE` style check.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
448 | |||||
449 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, | ||||
450 | unsigned Opcode) { | ||||
451 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) | ||||
452 | return false; | ||||
453 | |||||
454 | return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, | ||||
455 | Query.MMODescrs[0].AlignInBits, | ||||
456 | Query.Types[1].getAddressSpace(), Opcode); | ||||
457 | } | ||||
458 | |||||
459 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | ||||
460 | const GCNTargetMachine &TM) | ||||
461 | : ST(ST_) { | ||||
462 | using namespace TargetOpcode; | ||||
463 | |||||
464 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | ||||
465 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | ||||
466 | }; | ||||
467 | |||||
468 | const LLT S1 = LLT::scalar(1); | ||||
469 | const LLT S8 = LLT::scalar(8); | ||||
470 | const LLT S16 = LLT::scalar(16); | ||||
471 | const LLT S32 = LLT::scalar(32); | ||||
472 | const LLT S64 = LLT::scalar(64); | ||||
473 | const LLT S128 = LLT::scalar(128); | ||||
474 | const LLT S256 = LLT::scalar(256); | ||||
475 | const LLT S512 = LLT::scalar(512); | ||||
476 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); | ||||
477 | |||||
478 | const LLT V2S8 = LLT::fixed_vector(2, 8); | ||||
479 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
480 | const LLT V4S16 = LLT::fixed_vector(4, 16); | ||||
481 | |||||
482 | const LLT V2S32 = LLT::fixed_vector(2, 32); | ||||
483 | const LLT V3S32 = LLT::fixed_vector(3, 32); | ||||
484 | const LLT V4S32 = LLT::fixed_vector(4, 32); | ||||
485 | const LLT V5S32 = LLT::fixed_vector(5, 32); | ||||
486 | const LLT V6S32 = LLT::fixed_vector(6, 32); | ||||
487 | const LLT V7S32 = LLT::fixed_vector(7, 32); | ||||
488 | const LLT V8S32 = LLT::fixed_vector(8, 32); | ||||
489 | const LLT V9S32 = LLT::fixed_vector(9, 32); | ||||
490 | const LLT V10S32 = LLT::fixed_vector(10, 32); | ||||
491 | const LLT V11S32 = LLT::fixed_vector(11, 32); | ||||
492 | const LLT V12S32 = LLT::fixed_vector(12, 32); | ||||
493 | const LLT V13S32 = LLT::fixed_vector(13, 32); | ||||
494 | const LLT V14S32 = LLT::fixed_vector(14, 32); | ||||
495 | const LLT V15S32 = LLT::fixed_vector(15, 32); | ||||
496 | const LLT V16S32 = LLT::fixed_vector(16, 32); | ||||
497 | const LLT V32S32 = LLT::fixed_vector(32, 32); | ||||
498 | |||||
499 | const LLT V2S64 = LLT::fixed_vector(2, 64); | ||||
500 | const LLT V3S64 = LLT::fixed_vector(3, 64); | ||||
501 | const LLT V4S64 = LLT::fixed_vector(4, 64); | ||||
502 | const LLT V5S64 = LLT::fixed_vector(5, 64); | ||||
503 | const LLT V6S64 = LLT::fixed_vector(6, 64); | ||||
504 | const LLT V7S64 = LLT::fixed_vector(7, 64); | ||||
505 | const LLT V8S64 = LLT::fixed_vector(8, 64); | ||||
506 | const LLT V16S64 = LLT::fixed_vector(16, 64); | ||||
507 | |||||
508 | std::initializer_list<LLT> AllS32Vectors = | ||||
509 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | ||||
510 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | ||||
511 | std::initializer_list<LLT> AllS64Vectors = | ||||
512 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | ||||
513 | |||||
514 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | ||||
515 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | ||||
516 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | ||||
517 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | ||||
518 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | ||||
519 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | ||||
520 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | ||||
521 | |||||
522 | const LLT CodePtr = FlatPtr; | ||||
523 | |||||
524 | const std::initializer_list<LLT> AddrSpaces64 = { | ||||
525 | GlobalPtr, ConstantPtr, FlatPtr | ||||
526 | }; | ||||
527 | |||||
528 | const std::initializer_list<LLT> AddrSpaces32 = { | ||||
529 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | ||||
530 | }; | ||||
531 | |||||
532 | const std::initializer_list<LLT> FPTypesBase = { | ||||
533 | S32, S64 | ||||
534 | }; | ||||
535 | |||||
536 | const std::initializer_list<LLT> FPTypes16 = { | ||||
537 | S32, S64, S16 | ||||
538 | }; | ||||
539 | |||||
540 | const std::initializer_list<LLT> FPTypesPK16 = { | ||||
541 | S32, S64, S16, V2S16 | ||||
542 | }; | ||||
543 | |||||
544 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; | ||||
545 | |||||
546 | // s1 for VCC branches, s32 for SCC branches. | ||||
547 | getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); | ||||
548 | |||||
549 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | ||||
550 | // elements for v3s16 | ||||
551 | getActionDefinitionsBuilder(G_PHI) | ||||
552 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) | ||||
553 | .legalFor(AllS32Vectors) | ||||
554 | .legalFor(AllS64Vectors) | ||||
555 | .legalFor(AddrSpaces64) | ||||
556 | .legalFor(AddrSpaces32) | ||||
557 | .legalIf(isPointer(0)) | ||||
558 | .clampScalar(0, S16, S256) | ||||
559 | .widenScalarToNextPow2(0, 32) | ||||
560 | .clampMaxNumElements(0, S32, 16) | ||||
561 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
562 | .scalarize(0); | ||||
563 | |||||
564 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { | ||||
565 | // Full set of gfx9 features. | ||||
566 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
567 | .legalFor({S32, S16, V2S16}) | ||||
568 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
569 | .scalarize(0) | ||||
570 | .minScalar(0, S16) | ||||
571 | .widenScalarToNextMultipleOf(0, 32) | ||||
572 | .maxScalar(0, S32); | ||||
573 | |||||
574 | getActionDefinitionsBuilder(G_MUL) | ||||
575 | .legalFor({S32, S16, V2S16}) | ||||
576 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
577 | .scalarize(0) | ||||
578 | .minScalar(0, S16) | ||||
579 | .widenScalarToNextMultipleOf(0, 32) | ||||
580 | .custom(); | ||||
581 | assert(ST.hasMad64_32())(static_cast <bool> (ST.hasMad64_32()) ? void (0) : __assert_fail ("ST.hasMad64_32()", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 581, __extension__ __PRETTY_FUNCTION__)); | ||||
582 | |||||
583 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) | ||||
584 | .legalFor({S32, S16, V2S16}) // Clamp modifier | ||||
585 | .minScalarOrElt(0, S16) | ||||
586 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
587 | .scalarize(0) | ||||
588 | .widenScalarToNextPow2(0, 32) | ||||
589 | .lower(); | ||||
590 | } else if (ST.has16BitInsts()) { | ||||
591 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
592 | .legalFor({S32, S16}) | ||||
593 | .minScalar(0, S16) | ||||
594 | .widenScalarToNextMultipleOf(0, 32) | ||||
595 | .maxScalar(0, S32) | ||||
596 | .scalarize(0); | ||||
597 | |||||
598 | getActionDefinitionsBuilder(G_MUL) | ||||
599 | .legalFor({S32, S16}) | ||||
600 | .scalarize(0) | ||||
601 | .minScalar(0, S16) | ||||
602 | .widenScalarToNextMultipleOf(0, 32) | ||||
603 | .custom(); | ||||
604 | assert(ST.hasMad64_32())(static_cast <bool> (ST.hasMad64_32()) ? void (0) : __assert_fail ("ST.hasMad64_32()", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 604, __extension__ __PRETTY_FUNCTION__)); | ||||
605 | |||||
606 | // Technically the saturating operations require clamp bit support, but this | ||||
607 | // was introduced at the same time as 16-bit operations. | ||||
608 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
609 | .legalFor({S32, S16}) // Clamp modifier | ||||
610 | .minScalar(0, S16) | ||||
611 | .scalarize(0) | ||||
612 | .widenScalarToNextPow2(0, 16) | ||||
613 | .lower(); | ||||
614 | |||||
615 | // We're just lowering this, but it helps get a better result to try to | ||||
616 | // coerce to the desired type first. | ||||
617 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||
618 | .minScalar(0, S16) | ||||
619 | .scalarize(0) | ||||
620 | .lower(); | ||||
621 | } else { | ||||
622 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
623 | .legalFor({S32}) | ||||
624 | .widenScalarToNextMultipleOf(0, 32) | ||||
625 | .clampScalar(0, S32, S32) | ||||
626 | .scalarize(0); | ||||
627 | |||||
628 | auto &Mul = getActionDefinitionsBuilder(G_MUL) | ||||
629 | .legalFor({S32}) | ||||
630 | .scalarize(0) | ||||
631 | .minScalar(0, S32) | ||||
632 | .widenScalarToNextMultipleOf(0, 32); | ||||
633 | |||||
634 | if (ST.hasMad64_32()) | ||||
635 | Mul.custom(); | ||||
636 | else | ||||
637 | Mul.maxScalar(0, S32); | ||||
638 | |||||
639 | if (ST.hasIntClamp()) { | ||||
640 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
641 | .legalFor({S32}) // Clamp modifier. | ||||
642 | .scalarize(0) | ||||
643 | .minScalarOrElt(0, S32) | ||||
644 | .lower(); | ||||
645 | } else { | ||||
646 | // Clamp bit support was added in VI, along with 16-bit operations. | ||||
647 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
648 | .minScalar(0, S32) | ||||
649 | .scalarize(0) | ||||
650 | .lower(); | ||||
651 | } | ||||
652 | |||||
653 | // FIXME: DAG expansion gets better results. The widening uses the smaller | ||||
654 | // range values and goes for the min/max lowering directly. | ||||
655 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||
656 | .minScalar(0, S32) | ||||
657 | .scalarize(0) | ||||
658 | .lower(); | ||||
659 | } | ||||
660 | |||||
661 | getActionDefinitionsBuilder( | ||||
662 | {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) | ||||
663 | .customFor({S32, S64}) | ||||
664 | .clampScalar(0, S32, S64) | ||||
665 | .widenScalarToNextPow2(0, 32) | ||||
666 | .scalarize(0); | ||||
667 | |||||
668 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | ||||
669 | .legalFor({S32}) | ||||
670 | .maxScalar(0, S32); | ||||
671 | |||||
672 | if (ST.hasVOP3PInsts()) { | ||||
673 | Mulh | ||||
674 | .clampMaxNumElements(0, S8, 2) | ||||
675 | .lowerFor({V2S8}); | ||||
676 | } | ||||
677 | |||||
678 | Mulh | ||||
679 | .scalarize(0) | ||||
680 | .lower(); | ||||
681 | |||||
  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  // Add/sub with carry in/out: s32 result, carry as s1 or s32 (the carry's
  // register-bank form is decided later).
  getActionDefinitionsBuilder(
    {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();
702 | |||||
703 | |||||
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  // Every global value that is not a private-address pointer gets custom
  // lowering.
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
      .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
736 | |||||
  // Base FP arithmetic: 32/64-bit always legal; 16-bit (and packed v2s16)
  // added below when the subtarget supports it.
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
      G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  // min/max: custom lowering for whichever FP type set the subtarget
  // supports, with vectors clamped to v2s16 on VOP3P targets.
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  // Sign-bit manipulation ops.
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);
795 | |||||
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      // With the fract bug, f64 floor needs a custom expansion; note customFor
      // is listed before legalFor so s64 takes the custom path.
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }
820 | |||||
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    // s16 -> s64 has no single instruction; go through s32.
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);
849 | |||||
  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // frem always expands via custom lowering; types below the supported
  // minimum are promoted first.
  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);
869 | |||||
  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);
887 | |||||
  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    // 64-bit integer sources need a custom multi-step conversion.
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    // 64-bit integer results need a custom multi-step conversion.
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();
913 | |||||
  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
      .customFor({S16, S32})
      .scalarize(0)
      .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    // Pre-CI targets have no 64-bit trunc/ceil/rint instructions.
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
941 | |||||
  // Pointer arithmetic: the offset operand must match the pointer's size.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);
951 | |||||
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);
983 | |||||
  // FIXME: fpow has a selection pattern that should move to custom lowering.
  // Base-2 exp/log have direct instructions (16-bit variants when available).
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  // Natural/base-10 exp/log and pow are expanded in custom lowering.
  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)
      .lower();
1004 | |||||
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // If no 16 bit instr is available, lower into different instructions.
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
        .scalarize(0)
        .lower();
  else
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)
        .scalarize(0)
        .lower();
1028 | |||||
  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);
1056 | |||||
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }
1103 | |||||
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer source (operand 1) to the pointer's width.
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::pair(
            1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
      return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
    });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer result (operand 0) to the pointer's width.
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::pair(
            0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
      return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
    });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
1139 | |||||
  // Predicate: returns true when a load/store is wider than one memory
  // instruction can handle for its address space, or is an odd size that
  // doesn't map onto the available dword access widths.
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                      Query.MMODescrs[0].Ordering !=
                                          AtomicOrdering::NotAtomic))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;  // 32-bit registers covered
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  // A required alignment of 0 means unaligned access is acceptable.
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1175 | |||||
  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
              if (MemSize > MaxSize)
                return std::pair(0, LLT::scalar(MaxSize));

              // Last resort: split down to the access's alignment in bits.
              // NOTE(review): assumes AlignInBits is a usable scalar width
              // here — confirm for under-aligned accesses.
              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::pair(0, EltTy);

                return std::pair(0,
                                 LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = llvm::bit_floor(DstSize);
                return std::pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }
1338 | |||||
  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                     .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                {S32, GlobalPtr, S16, 2 * 8},
                                                {S32, LocalPtr, S8, 8},
                                                {S32, LocalPtr, S16, 16},
                                                {S32, PrivatePtr, S8, 8},
                                                {S32, PrivatePtr, S16, 16},
                                                {S32, ConstantPtr, S8, 8},
                                                {S32, ConstantPtr, S16, 2 * 8}})
                     .legalIf(
                       [=](const LegalityQuery &Query) -> bool {
                         return isLoadStoreLegal(ST, Query);
                       });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();
1369 | |||||
  // Integer read-modify-write atomics: 32/64-bit on global, LDS and region
  // memory; flat added below when the subtarget has a flat address space.
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // FP add atomics: legality is built up feature by feature.
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }
1405 | |||||
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1432 | |||||
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
1477 | |||||
1478 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | ||||
1479 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | ||||
1480 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | ||||
1481 | unsigned IdxTypeIdx = 2; | ||||
1482 | |||||
1483 | getActionDefinitionsBuilder(Op) | ||||
1484 | .customIf([=](const LegalityQuery &Query) { | ||||
1485 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||
1486 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||
1487 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | ||||
1488 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||
1489 | const bool isLegalVecType = | ||||
1490 | !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); | ||||
1491 | return (EltSize == 32 || EltSize == 64) && | ||||
1492 | VecTy.getSizeInBits() % 32 == 0 && | ||||
1493 | VecTy.getSizeInBits() <= MaxRegisterSize && | ||||
1494 | IdxTy.getSizeInBits() == 32 && | ||||
1495 | isLegalVecType; | ||||
1496 | }) | ||||
1497 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), | ||||
1498 | bitcastToVectorElement32(VecTypeIdx)) | ||||
1499 | //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) | ||||
1500 | .bitcastIf( | ||||
1501 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), | ||||
1502 | [=](const LegalityQuery &Query) { | ||||
1503 | // For > 64-bit element types, try to turn this into a 64-bit | ||||
1504 | // element vector since we may be able to do better indexing | ||||
1505 | // if this is scalar. If not, fall back to 32. | ||||
1506 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||
1507 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||
1508 | const unsigned DstEltSize = EltTy.getSizeInBits(); | ||||
1509 | const unsigned VecSize = VecTy.getSizeInBits(); | ||||
1510 | |||||
1511 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; | ||||
1512 | return std::pair( | ||||
1513 | VecTypeIdx, | ||||
1514 | LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); | ||||
1515 | }) | ||||
1516 | .clampScalar(EltTypeIdx, S32, S64) | ||||
1517 | .clampScalar(VecTypeIdx, S32, S64) | ||||
1518 | .clampScalar(IdxTypeIdx, S32, S32) | ||||
1519 | .clampMaxNumElements(VecTypeIdx, S32, 32) | ||||
1520 | // TODO: Clamp elements for 64-bit vectors? | ||||
1521 | .moreElementsIf( | ||||
1522 | isIllegalRegisterType(VecTypeIdx), | ||||
1523 | moreElementsToNextExistingRegClass(VecTypeIdx)) | ||||
1524 | // It should only be necessary with variable indexes. | ||||
1525 | // As a last resort, lower to the stack | ||||
1526 | .lower(); | ||||
1527 | } | ||||
1528 | |||||
1529 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | ||||
1530 | .unsupportedIf([=](const LegalityQuery &Query) { | ||||
1531 | const LLT &EltTy = Query.Types[1].getElementType(); | ||||
1532 | return Query.Types[0] != EltTy; | ||||
1533 | }); | ||||
1534 | |||||
1535 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | ||||
1536 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | ||||
1537 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | ||||
1538 | |||||
1539 | // FIXME: Doesn't handle extract of illegal sizes. | ||||
1540 | getActionDefinitionsBuilder(Op) | ||||
1541 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | ||||
1542 | .lowerIf([=](const LegalityQuery &Query) { | ||||
1543 | // Sub-vector(or single element) insert and extract. | ||||
1544 | // TODO: verify immediate offset here since lower only works with | ||||
1545 | // whole elements. | ||||
1546 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1547 | return BigTy.isVector(); | ||||
1548 | }) | ||||
1549 | // FIXME: Multiples of 16 should not be legal. | ||||
1550 | .legalIf([=](const LegalityQuery &Query) { | ||||
1551 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1552 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||
1553 | return (BigTy.getSizeInBits() % 32 == 0) && | ||||
1554 | (LitTy.getSizeInBits() % 16 == 0); | ||||
1555 | }) | ||||
1556 | .widenScalarIf( | ||||
1557 | [=](const LegalityQuery &Query) { | ||||
1558 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1559 | return (BigTy.getScalarSizeInBits() < 16); | ||||
1560 | }, | ||||
1561 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | ||||
1562 | .widenScalarIf( | ||||
1563 | [=](const LegalityQuery &Query) { | ||||
1564 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||
1565 | return (LitTy.getScalarSizeInBits() < 16); | ||||
1566 | }, | ||||
1567 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | ||||
1568 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||
1569 | .widenScalarToNextPow2(BigTyIdx, 32); | ||||
1570 | |||||
1571 | } | ||||
1572 | |||||
1573 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||||
1574 | .legalForCartesianProduct(AllS32Vectors, {S32}) | ||||
1575 | .legalForCartesianProduct(AllS64Vectors, {S64}) | ||||
1576 | .clampNumElements(0, V16S32, V32S32) | ||||
1577 | .clampNumElements(0, V2S64, V16S64) | ||||
1578 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) | ||||
1579 | .moreElementsIf( | ||||
1580 | isIllegalRegisterType(0), | ||||
1581 | moreElementsToNextExistingRegClass(0)); | ||||
1582 | |||||
1583 | if (ST.hasScalarPackInsts()) { | ||||
1584 | BuildVector | ||||
1585 | // FIXME: Should probably widen s1 vectors straight to s32 | ||||
1586 | .minScalarOrElt(0, S16) | ||||
1587 | .minScalar(1, S16); | ||||
1588 | |||||
1589 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||
1590 | .legalFor({V2S16, S32}) | ||||
1591 | .lower(); | ||||
1592 | } else { | ||||
1593 | BuildVector.customFor({V2S16, S16}); | ||||
1594 | BuildVector.minScalarOrElt(0, S32); | ||||
1595 | |||||
1596 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||
1597 | .customFor({V2S16, S32}) | ||||
1598 | .lower(); | ||||
1599 | } | ||||
1600 | |||||
1601 | BuildVector.legalIf(isRegisterType(0)); | ||||
1602 | |||||
1603 | // FIXME: Clamp maximum size | ||||
1604 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | ||||
1605 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
1606 | .clampMaxNumElements(0, S32, 32) | ||||
1607 | .clampMaxNumElements(1, S16, 2) // TODO: Make 4? | ||||
1608 | .clampMaxNumElements(0, S16, 64); | ||||
1609 | |||||
1610 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | ||||
1611 | |||||
1612 | // Merge/Unmerge | ||||
1613 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | ||||
1614 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | ||||
1615 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | ||||
1616 | |||||
1617 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | ||||
1618 | const LLT Ty = Query.Types[TypeIdx]; | ||||
1619 | if (Ty.isVector()) { | ||||
1620 | const LLT &EltTy = Ty.getElementType(); | ||||
1621 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) | ||||
1622 | return true; | ||||
1623 | if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) | ||||
1624 | return true; | ||||
1625 | } | ||||
1626 | return false; | ||||
1627 | }; | ||||
1628 | |||||
1629 | auto &Builder = getActionDefinitionsBuilder(Op) | ||||
1630 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
1631 | .lowerFor({{S16, V2S16}}) | ||||
1632 | .lowerIf([=](const LegalityQuery &Query) { | ||||
1633 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1634 | return BigTy.getSizeInBits() == 32; | ||||
1635 | }) | ||||
1636 | // Try to widen to s16 first for small types. | ||||
1637 | // TODO: Only do this on targets with legal s16 shifts | ||||
1638 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) | ||||
1639 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | ||||
1640 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||
1641 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | ||||
1642 | elementTypeIs(1, S16)), | ||||
1643 | changeTo(1, V2S16)) | ||||
1644 | // Clamp the little scalar to s8-s256 and make it a power of 2. It's not | ||||
1645 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | ||||
1646 | // valid. | ||||
1647 | .clampScalar(LitTyIdx, S32, S512) | ||||
1648 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | ||||
1649 | // Break up vectors with weird elements into scalars | ||||
1650 | .fewerElementsIf( | ||||
1651 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, | ||||
1652 | scalarize(0)) | ||||
1653 | .fewerElementsIf( | ||||
1654 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, | ||||
1655 | scalarize(1)) | ||||
1656 | .clampScalar(BigTyIdx, S32, MaxScalar); | ||||
1657 | |||||
1658 | if (Op == G_MERGE_VALUES) { | ||||
1659 | Builder.widenScalarIf( | ||||
1660 | // TODO: Use 16-bit shifts if legal for 8-bit values? | ||||
1661 | [=](const LegalityQuery &Query) { | ||||
1662 | const LLT Ty = Query.Types[LitTyIdx]; | ||||
1663 | return Ty.getSizeInBits() < 32; | ||||
1664 | }, | ||||
1665 | changeTo(LitTyIdx, S32)); | ||||
1666 | } | ||||
1667 | |||||
1668 | Builder.widenScalarIf( | ||||
1669 | [=](const LegalityQuery &Query) { | ||||
1670 | const LLT Ty = Query.Types[BigTyIdx]; | ||||
1671 | return Ty.getSizeInBits() % 16 != 0; | ||||
1672 | }, | ||||
1673 | [=](const LegalityQuery &Query) { | ||||
1674 | // Pick the next power of 2, or a multiple of 64 over 128. | ||||
1675 | // Whichever is smaller. | ||||
1676 | const LLT &Ty = Query.Types[BigTyIdx]; | ||||
1677 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | ||||
1678 | if (NewSizeInBits >= 256) { | ||||
1679 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | ||||
1680 | if (RoundedTo < NewSizeInBits) | ||||
1681 | NewSizeInBits = RoundedTo; | ||||
1682 | } | ||||
1683 | return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | ||||
1684 | }) | ||||
1685 | // Any vectors left are the wrong size. Scalarize them. | ||||
1686 | .scalarize(0) | ||||
1687 | .scalarize(1); | ||||
1688 | } | ||||
1689 | |||||
1690 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||
1691 | // RegBankSelect. | ||||
1692 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | ||||
1693 | .legalFor({{S32}, {S64}}); | ||||
1694 | |||||
1695 | if (ST.hasVOP3PInsts()) { | ||||
1696 | SextInReg.lowerFor({{V2S16}}) | ||||
1697 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | ||||
1698 | // get more vector shift opportunities, since we'll get those when | ||||
1699 | // expanded. | ||||
1700 | .clampMaxNumElementsStrict(0, S16, 2); | ||||
1701 | } else if (ST.has16BitInsts()) { | ||||
1702 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | ||||
1703 | } else { | ||||
1704 | // Prefer to promote to s32 before lowering if we don't have 16-bit | ||||
1705 | // shifts. This avoid a lot of intermediate truncate and extend operations. | ||||
1706 | SextInReg.lowerFor({{S32}, {S64}}); | ||||
1707 | } | ||||
1708 | |||||
1709 | SextInReg | ||||
1710 | .scalarize(0) | ||||
1711 | .clampScalar(0, S32, S64) | ||||
1712 | .lower(); | ||||
1713 | |||||
1714 | getActionDefinitionsBuilder({G_ROTR, G_ROTL}) | ||||
1715 | .scalarize(0) | ||||
1716 | .lower(); | ||||
1717 | |||||
1718 | // TODO: Only Try to form v2s16 with legal packed instructions. | ||||
1719 | getActionDefinitionsBuilder(G_FSHR) | ||||
1720 | .legalFor({{S32, S32}}) | ||||
1721 | .lowerFor({{V2S16, V2S16}}) | ||||
1722 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
1723 | .scalarize(0) | ||||
1724 | .lower(); | ||||
1725 | |||||
1726 | if (ST.hasVOP3PInsts()) { | ||||
1727 | getActionDefinitionsBuilder(G_FSHL) | ||||
1728 | .lowerFor({{V2S16, V2S16}}) | ||||
1729 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
1730 | .scalarize(0) | ||||
1731 | .lower(); | ||||
1732 | } else { | ||||
1733 | getActionDefinitionsBuilder(G_FSHL) | ||||
1734 | .scalarize(0) | ||||
1735 | .lower(); | ||||
1736 | } | ||||
1737 | |||||
1738 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | ||||
1739 | .legalFor({S64}); | ||||
1740 | |||||
1741 | getActionDefinitionsBuilder(G_FENCE) | ||||
1742 | .alwaysLegal(); | ||||
1743 | |||||
1744 | getActionDefinitionsBuilder({G_SMULO, G_UMULO}) | ||||
1745 | .scalarize(0) | ||||
1746 | .minScalar(0, S32) | ||||
1747 | .lower(); | ||||
1748 | |||||
1749 | getActionDefinitionsBuilder({G_SBFX, G_UBFX}) | ||||
1750 | .legalFor({{S32, S32}, {S64, S32}}) | ||||
1751 | .clampScalar(1, S32, S32) | ||||
1752 | .clampScalar(0, S32, S64) | ||||
1753 | .widenScalarToNextPow2(0) | ||||
1754 | .scalarize(0); | ||||
1755 | |||||
1756 | getActionDefinitionsBuilder({ | ||||
1757 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | ||||
1758 | G_FCOPYSIGN, | ||||
1759 | |||||
1760 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | ||||
1761 | G_ATOMICRMW_NAND, | ||||
1762 | G_ATOMICRMW_FSUB, | ||||
1763 | G_READ_REGISTER, | ||||
1764 | G_WRITE_REGISTER, | ||||
1765 | |||||
1766 | G_SADDO, G_SSUBO, | ||||
1767 | |||||
1768 | // TODO: Implement | ||||
1769 | G_FMINIMUM, G_FMAXIMUM}).lower(); | ||||
1770 | |||||
1771 | getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) | ||||
1772 | .lower(); | ||||
1773 | |||||
1774 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | ||||
1775 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | ||||
1776 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | ||||
1777 | .unsupported(); | ||||
1778 | |||||
1779 | getLegacyLegalizerInfo().computeTables(); | ||||
1780 | verify(*ST.getInstrInfo()); | ||||
1781 | } | ||||
1782 | |||||
/// Dispatch point for all opcodes marked Custom in the legalization rules.
///
/// Routes \p MI to the matching legalize* handler. Handlers return true on
/// success (typically after erasing or rewriting \p MI via
/// \p Helper.MIRBuilder); an opcode with no case here returns false, which
/// signals a legalization failure for a Custom-marked operation.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    // log(x) = log2(x) * ln(2)
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    // log10(x) = log2(x) * (ln(2) / ln(10))
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
1865 | |||||
/// Materialize the 32-bit aperture value (the high half of the 64-bit segment
/// base) for address space \p AS, which must be LOCAL or PRIVATE.
///
/// Three strategies are tried, in order of preference:
///   1. Subtargets with aperture registers: read SRC_SHARED_BASE or
///      SRC_PRIVATE_BASE and extract its high 32 bits.
///   2. Code object v5+: load the base from the implicit kernel arguments.
///   3. Otherwise: load it from the queue pointer (amd_queue_t).
///
/// Returns an invalid Register if the required input value (kernarg segment
/// pointer or queue pointer) could not be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must emit extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // FIXME: It would be more natural to emit a COPY here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister (instead of extracting the HI 32 bits) which is an artificial
    // (unusable) register.
    //  Register TableGen definitions would need an overhaul to get rid of the
    //  artificial "HI" aperture registers and prevent this kind of issue from
    //  happening.
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    // getReg(1) selects the high 32 bits of the unmerged 64-bit value.
    return B.buildUnmerge(S32, Dst).getReg(1);
  }

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  Register LoadAddr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    // The aperture load is invariant: the implicit kernargs never change
    // during kernel execution.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(32), commonAlignment(Align(64), Offset));

    // Pointer address
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1951 | |||||
1952 | /// Return true if the value is a known valid address, such that a null check is | ||||
1953 | /// not necessary. | ||||
1954 | static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, | ||||
1955 | const AMDGPUTargetMachine &TM, unsigned AddrSpace) { | ||||
1956 | MachineInstr *Def = MRI.getVRegDef(Val); | ||||
1957 | switch (Def->getOpcode()) { | ||||
1958 | case AMDGPU::G_FRAME_INDEX: | ||||
1959 | case AMDGPU::G_GLOBAL_VALUE: | ||||
1960 | case AMDGPU::G_BLOCK_ADDR: | ||||
1961 | return true; | ||||
1962 | case AMDGPU::G_CONSTANT: { | ||||
1963 | const ConstantInt *CI = Def->getOperand(1).getCImm(); | ||||
1964 | return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); | ||||
1965 | } | ||||
1966 | default: | ||||
1967 | return false; | ||||
1968 | } | ||||
1969 | |||||
1970 | return false; | ||||
1971 | } | ||||
1972 | |||||
/// Custom legalization of G_ADDRSPACE_CAST.
///
/// Handles, in order:
///   - no-op casts (same representation): rewrite to G_BITCAST in place;
///   - flat -> local/private: take the low 32 bits, selecting the segment
///     null value when the flat pointer is (possibly) null;
///   - local/private -> flat: merge the 32-bit pointer with the segment
///     aperture, selecting the flat null value when the source is (possibly)
///     null;
///   - 64-bit <-> 32-bit constant address space: truncate or merge with the
///     function's known high address bits.
/// Any other combination is diagnosed as an invalid addrspacecast and the
/// result is replaced with undef.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same bit pattern in both spaces: just reinterpret in place.
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A null flat pointer must map to the segment null value, which may
    // differ from a zero bit pattern.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
    if (!ApertureReg.isValid())
      return false;

    // Coerce the type of the low half of the result so we can use merge_values.
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    // TODO: Should we allow mismatched types but matching sizes in merges to
    // avoid the ptrtoint?
    // Flat pointer = aperture (high 32 bits) : segment offset (low 32 bits).
    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();
      return true;
    }

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);
  // Keep the IR well-formed after diagnosing: define the result as undef.
  B.buildUndef(Dst);
  MI.eraseFromParent();
  return true;
}
2085 | |||||
2086 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||
2087 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2088 | MachineIRBuilder &B) const { | ||||
2089 | Register Src = MI.getOperand(1).getReg(); | ||||
2090 | LLT Ty = MRI.getType(Src); | ||||
2091 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64)(static_cast <bool> (Ty.isScalar() && Ty.getSizeInBits () == 64) ? void (0) : __assert_fail ("Ty.isScalar() && Ty.getSizeInBits() == 64" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 2091, __extension__ __PRETTY_FUNCTION__)); | ||||
2092 | |||||
2093 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||
2094 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||
2095 | |||||
2096 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||
2097 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||
2098 | |||||
2099 | // TODO: Should this propagate fast-math-flags? | ||||
2100 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||
2101 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||
2102 | |||||
2103 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||
2104 | auto Fabs = B.buildFAbs(Ty, Src); | ||||
2105 | |||||
2106 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||
2107 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||
2108 | MI.eraseFromParent(); | ||||
2109 | return true; | ||||
2110 | } | ||||
2111 | |||||
2112 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||
2113 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2114 | MachineIRBuilder &B) const { | ||||
2115 | |||||
2116 | const LLT S1 = LLT::scalar(1); | ||||
2117 | const LLT S64 = LLT::scalar(64); | ||||
2118 | |||||
2119 | Register Src = MI.getOperand(1).getReg(); | ||||
2120 | assert(MRI.getType(Src) == S64)(static_cast <bool> (MRI.getType(Src) == S64) ? void (0 ) : __assert_fail ("MRI.getType(Src) == S64", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2120, __extension__ __PRETTY_FUNCTION__)); | ||||
2121 | |||||
2122 | // result = trunc(src) | ||||
2123 | // if (src > 0.0 && src != result) | ||||
2124 | // result += 1.0 | ||||
2125 | |||||
2126 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | ||||
2127 | |||||
2128 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||
2129 | const auto One = B.buildFConstant(S64, 1.0); | ||||
2130 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | ||||
2131 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | ||||
2132 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | ||||
2133 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||
2134 | |||||
2135 | // TODO: Should this propagate fast-math-flags? | ||||
2136 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||
2137 | MI.eraseFromParent(); | ||||
2138 | return true; | ||||
2139 | } | ||||
2140 | |||||
2141 | bool AMDGPULegalizerInfo::legalizeFrem( | ||||
2142 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2143 | MachineIRBuilder &B) const { | ||||
2144 | Register DstReg = MI.getOperand(0).getReg(); | ||||
2145 | Register Src0Reg = MI.getOperand(1).getReg(); | ||||
2146 | Register Src1Reg = MI.getOperand(2).getReg(); | ||||
2147 | auto Flags = MI.getFlags(); | ||||
2148 | LLT Ty = MRI.getType(DstReg); | ||||
2149 | |||||
2150 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); | ||||
2151 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); | ||||
2152 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); | ||||
2153 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); | ||||
2154 | MI.eraseFromParent(); | ||||
2155 | return true; | ||||
2156 | } | ||||
2157 | |||||
2158 | static MachineInstrBuilder extractF64Exponent(Register Hi, | ||||
2159 | MachineIRBuilder &B) { | ||||
2160 | const unsigned FractBits = 52; | ||||
2161 | const unsigned ExpBits = 11; | ||||
2162 | LLT S32 = LLT::scalar(32); | ||||
2163 | |||||
2164 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||
2165 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||
2166 | |||||
2167 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | ||||
2168 | .addUse(Hi) | ||||
2169 | .addUse(Const0.getReg(0)) | ||||
2170 | .addUse(Const1.getReg(0)); | ||||
2171 | |||||
2172 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||
2173 | } | ||||
2174 | |||||
// Lower f64 G_INTRINSIC_TRUNC by clearing the fraction bits that lie below
// the binary point, as determined by the unbiased exponent.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  // Mask covering the full 52-bit fraction field.
  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  // Arithmetic-shifting the fraction mask right by Exp leaves set bits only
  // in the sub-integer fraction positions; clearing those bits in Src drops
  // the fractional part, i.e. truncates toward zero.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  // Exp < 0: |Src| < 1.0, so the result is a signed zero.
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  // Exp > 51: Src is already integral (no fraction bits below the point).
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
2219 | |||||
// Lower G_SITOFP/G_UITOFP from s64. For an f64 result, convert the two
// 32-bit halves separately and combine as hi * 2^32 + lo. For an f32
// result, pre-normalize the integer so one 32-bit conversion rounds
// correctly, then rescale with ldexp.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    // Only the high half carries the sign, so only it is converted with
    // signedness taken into account; the low half is always unsigned.
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    // Scale the converted high half by 2^32.
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
      .addUse(CvtHi.getReg(0))
      .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    // Count redundant sign bits in the high half (sffbh) minus one, capped
    // at 32 + (-1 when the halves have opposite sign bits) so the shift
    // never destroys significant bits.
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  // Shift the significant bits up into the high word.
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  // Adjust is 1 iff any bits remain in the low word; OR-ing it into the
  // low bit of the high word acts as a sticky bit for correct rounding.
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  // Undo the normalization shift in the floating-point domain.
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}
2280 | |||||
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
// Lower G_FPTOSI/G_FPTOUI with an s64 result from an f32 or f64 source by
// splitting the value into two 32-bit integer conversions.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  // K0 = 2^-32 and K1 = -2^32, given as raw IEEE-754 bit patterns for the
  // source's precision.
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  // Fma computes lof = fma(hif, -2^32, tf) == tf - hif * 2^32 (see above).
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  // Only the s64 signed path converts the high half signed; lof is always
  // non-negative, so the low half is always converted unsigned.
  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
               Sign);
  } else
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2352 | |||||
2353 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, | ||||
2354 | MachineInstr &MI) const { | ||||
2355 | MachineFunction &MF = Helper.MIRBuilder.getMF(); | ||||
2356 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
2357 | |||||
2358 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||
2359 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||
2360 | |||||
2361 | // With ieee_mode disabled, the instructions have the correct behavior | ||||
2362 | // already for G_FMINNUM/G_FMAXNUM | ||||
2363 | if (!MFI->getMode().IEEE) | ||||
2364 | return !IsIEEEOp; | ||||
2365 | |||||
2366 | if (IsIEEEOp) | ||||
2367 | return true; | ||||
2368 | |||||
2369 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||
2370 | } | ||||
2371 | |||||
2372 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||
2373 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2374 | MachineIRBuilder &B) const { | ||||
2375 | // TODO: Should move some of this into LegalizerHelper. | ||||
2376 | |||||
2377 | // TODO: Promote dynamic indexing of s16 to s32 | ||||
2378 | |||||
2379 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||
2380 | // constant before this, so we shouldn't need | ||||
2381 | // getIConstantVRegValWithLookThrough. | ||||
2382 | std::optional<ValueAndVReg> MaybeIdxVal = | ||||
2383 | getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | ||||
2384 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||
2385 | return true; | ||||
2386 | const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); | ||||
2387 | |||||
2388 | Register Dst = MI.getOperand(0).getReg(); | ||||
2389 | Register Vec = MI.getOperand(1).getReg(); | ||||
2390 | |||||
2391 | LLT VecTy = MRI.getType(Vec); | ||||
2392 | LLT EltTy = VecTy.getElementType(); | ||||
2393 | assert(EltTy == MRI.getType(Dst))(static_cast <bool> (EltTy == MRI.getType(Dst)) ? void ( 0) : __assert_fail ("EltTy == MRI.getType(Dst)", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2393, __extension__ __PRETTY_FUNCTION__)); | ||||
2394 | |||||
2395 | if (IdxVal < VecTy.getNumElements()) { | ||||
2396 | auto Unmerge = B.buildUnmerge(EltTy, Vec); | ||||
2397 | B.buildCopy(Dst, Unmerge.getReg(IdxVal)); | ||||
2398 | } else { | ||||
2399 | B.buildUndef(Dst); | ||||
2400 | } | ||||
2401 | |||||
2402 | MI.eraseFromParent(); | ||||
2403 | return true; | ||||
2404 | } | ||||
2405 | |||||
2406 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||
2407 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2408 | MachineIRBuilder &B) const { | ||||
2409 | // TODO: Should move some of this into LegalizerHelper. | ||||
2410 | |||||
2411 | // TODO: Promote dynamic indexing of s16 to s32 | ||||
2412 | |||||
2413 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||
2414 | // constant before this, so we shouldn't need | ||||
2415 | // getIConstantVRegValWithLookThrough. | ||||
2416 | std::optional<ValueAndVReg> MaybeIdxVal = | ||||
2417 | getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); | ||||
2418 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||
2419 | return true; | ||||
2420 | |||||
2421 | const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); | ||||
2422 | Register Dst = MI.getOperand(0).getReg(); | ||||
2423 | Register Vec = MI.getOperand(1).getReg(); | ||||
2424 | Register Ins = MI.getOperand(2).getReg(); | ||||
2425 | |||||
2426 | LLT VecTy = MRI.getType(Vec); | ||||
2427 | LLT EltTy = VecTy.getElementType(); | ||||
2428 | assert(EltTy == MRI.getType(Ins))(static_cast <bool> (EltTy == MRI.getType(Ins)) ? void ( 0) : __assert_fail ("EltTy == MRI.getType(Ins)", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2428, __extension__ __PRETTY_FUNCTION__)); | ||||
2429 | (void)Ins; | ||||
2430 | |||||
2431 | unsigned NumElts = VecTy.getNumElements(); | ||||
2432 | if (IdxVal < NumElts) { | ||||
2433 | SmallVector<Register, 8> SrcRegs; | ||||
2434 | for (unsigned i = 0; i < NumElts; ++i) | ||||
2435 | SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); | ||||
2436 | B.buildUnmerge(SrcRegs, Vec); | ||||
2437 | |||||
2438 | SrcRegs[IdxVal] = MI.getOperand(2).getReg(); | ||||
2439 | B.buildMergeLikeInstr(Dst, SrcRegs); | ||||
2440 | } else { | ||||
2441 | B.buildUndef(Dst); | ||||
2442 | } | ||||
2443 | |||||
2444 | MI.eraseFromParent(); | ||||
2445 | return true; | ||||
2446 | } | ||||
2447 | |||||
2448 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||
2449 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2450 | MachineIRBuilder &B) const { | ||||
2451 | |||||
2452 | Register DstReg = MI.getOperand(0).getReg(); | ||||
2453 | Register SrcReg = MI.getOperand(1).getReg(); | ||||
2454 | LLT Ty = MRI.getType(DstReg); | ||||
2455 | unsigned Flags = MI.getFlags(); | ||||
2456 | |||||
2457 | Register TrigVal; | ||||
2458 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); | ||||
2459 | if (ST.hasTrigReducedRange()) { | ||||
2460 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||
2461 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||
2462 | .addUse(MulVal.getReg(0)) | ||||
2463 | .setMIFlags(Flags).getReg(0); | ||||
2464 | } else | ||||
2465 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||
2466 | |||||
2467 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||
2468 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||
2469 | B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) | ||||
2470 | .addUse(TrigVal) | ||||
2471 | .setMIFlags(Flags); | ||||
2472 | MI.eraseFromParent(); | ||||
2473 | return true; | ||||
2474 | } | ||||
2475 | |||||
/// Materialize the address of \p GV (plus \p Offset) into \p DstReg using
/// the pc-relative SI_PC_ADD_REL_OFFSET pseudo. \p GAFlags selects the
/// relocation flavor (MO_NONE for a direct fixup, or e.g. MO_REL32 /
/// MO_GOTPCREL32 lo/hi pairs). A 32-bit \p PtrTy is computed in a 64-bit
/// temporary and then truncated with G_EXTRACT.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // For a 32-bit destination, compute the full 64-bit address in a scratch
  // register and truncate at the end.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  // +4 / +12 compensate for the distance from s_getpc to the lo/hi $symbol
  // encodings, as described in the comment above.
  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
2536 | |||||
// Lower G_GLOBAL_VALUE. LDS (local/region) globals resolve to a statically
// allocated offset (or the dynamic-LDS base); everything else becomes a
// pc-relative address, either directly or through a GOT load.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      // Keep the G_GLOBAL_VALUE and mark the operand for absolute low-32
      // relocation instead of folding to a constant address.
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        // The dynamic region starts right after all static LDS allocations,
        // so its address is the total static group size.
        auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Statically allocate this LDS global and use the resulting offset as
    // its address.
    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    // Direct pc-relative fixup, no relocation flags needed.
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    // Direct pc-relative address with REL32 lo/hi relocations.
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise, load the 64-bit address of the global from its GOT slot.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2638 | |||||
2639 | static LLT widenToNextPowerOf2(LLT Ty) { | ||||
2640 | if (Ty.isVector()) | ||||
2641 | return Ty.changeElementCount( | ||||
2642 | ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); | ||||
2643 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | ||||
2644 | } | ||||
2645 | |||||
// Custom lowering for loads:
//  * 32-bit constant-address-space pointers are addrspacecast to the
//    64-bit constant address space.
//  * Non-power-of-2-sized G_LOADs are widened to the next power of two
//    when shouldWidenLoad says the alignment/address space permit it.
// Returns true if the instruction was changed, false to fall through to
// further legalization.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Replace the pointer operand in place, notifying the observer of the
    // in-flight mutation.
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar result: load wide, then truncate to the requested width.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2727 | |||||
2728 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||
2729 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2730 | MachineIRBuilder &B) const { | ||||
2731 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
2732 | assert(Ty.isScalar())(static_cast <bool> (Ty.isScalar()) ? void (0) : __assert_fail ("Ty.isScalar()", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2732, __extension__ __PRETTY_FUNCTION__)); | ||||
2733 | |||||
2734 | MachineFunction &MF = B.getMF(); | ||||
2735 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
2736 | |||||
2737 | // TODO: Always legal with future ftz flag. | ||||
2738 | // FIXME: Do we need just output? | ||||
2739 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||
2740 | return true; | ||||
2741 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||
2742 | return true; | ||||
2743 | |||||
2744 | MachineIRBuilder HelperBuilder(MI); | ||||
2745 | GISelObserverWrapper DummyObserver; | ||||
2746 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||
2747 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||
2748 | } | ||||
2749 | |||||
2750 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||
2751 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
2752 | Register DstReg = MI.getOperand(0).getReg(); | ||||
2753 | Register PtrReg = MI.getOperand(1).getReg(); | ||||
2754 | Register CmpVal = MI.getOperand(2).getReg(); | ||||
2755 | Register NewVal = MI.getOperand(3).getReg(); | ||||
2756 | |||||
2757 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&(static_cast <bool> (AMDGPU::isFlatGlobalAddrSpace(MRI. getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && \"this should not have been custom lowered\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 2758, __extension__ __PRETTY_FUNCTION__)) | ||||
2758 | "this should not have been custom lowered")(static_cast <bool> (AMDGPU::isFlatGlobalAddrSpace(MRI. getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && \"this should not have been custom lowered\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 2758, __extension__ __PRETTY_FUNCTION__)); | ||||
2759 | |||||
2760 | LLT ValTy = MRI.getType(CmpVal); | ||||
2761 | LLT VecTy = LLT::fixed_vector(2, ValTy); | ||||
2762 | |||||
2763 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||
2764 | |||||
2765 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||
2766 | .addDef(DstReg) | ||||
2767 | .addUse(PtrReg) | ||||
2768 | .addUse(PackedVal) | ||||
2769 | .setMemRefs(MI.memoperands()); | ||||
2770 | |||||
2771 | MI.eraseFromParent(); | ||||
2772 | return true; | ||||
2773 | } | ||||
2774 | |||||
2775 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||
2776 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||
2777 | Register Dst = MI.getOperand(0).getReg(); | ||||
2778 | Register Src = MI.getOperand(1).getReg(); | ||||
2779 | LLT Ty = B.getMRI()->getType(Dst); | ||||
2780 | unsigned Flags = MI.getFlags(); | ||||
2781 | |||||
2782 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||
2783 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||
2784 | |||||
2785 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||
2786 | MI.eraseFromParent(); | ||||
2787 | return true; | ||||
2788 | } | ||||
2789 | |||||
2790 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||
2791 | MachineIRBuilder &B) const { | ||||
2792 | Register Dst = MI.getOperand(0).getReg(); | ||||
2793 | Register Src = MI.getOperand(1).getReg(); | ||||
2794 | unsigned Flags = MI.getFlags(); | ||||
2795 | LLT Ty = B.getMRI()->getType(Dst); | ||||
2796 | |||||
2797 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||
2798 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||
2799 | B.buildFExp2(Dst, Mul, Flags); | ||||
2800 | MI.eraseFromParent(); | ||||
2801 | return true; | ||||
2802 | } | ||||
2803 | |||||
2804 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||
2805 | MachineIRBuilder &B) const { | ||||
2806 | Register Dst = MI.getOperand(0).getReg(); | ||||
2807 | Register Src0 = MI.getOperand(1).getReg(); | ||||
2808 | Register Src1 = MI.getOperand(2).getReg(); | ||||
2809 | unsigned Flags = MI.getFlags(); | ||||
2810 | LLT Ty = B.getMRI()->getType(Dst); | ||||
2811 | const LLT S16 = LLT::scalar(16); | ||||
2812 | const LLT S32 = LLT::scalar(32); | ||||
2813 | |||||
2814 | if (Ty == S32) { | ||||
2815 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||
2816 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||
2817 | .addUse(Log.getReg(0)) | ||||
2818 | .addUse(Src1) | ||||
2819 | .setMIFlags(Flags); | ||||
2820 | B.buildFExp2(Dst, Mul, Flags); | ||||
2821 | } else if (Ty == S16) { | ||||
2822 | // There's no f16 fmul_legacy, so we need to convert for it. | ||||
2823 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||
2824 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||
2825 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||
2826 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||
2827 | .addUse(Ext0.getReg(0)) | ||||
2828 | .addUse(Ext1.getReg(0)) | ||||
2829 | .setMIFlags(Flags); | ||||
2830 | |||||
2831 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||
2832 | } else | ||||
2833 | return false; | ||||
2834 | |||||
2835 | MI.eraseFromParent(); | ||||
2836 | return true; | ||||
2837 | } | ||||
2838 | |||||
2839 | // Find a source register, ignoring any possible source modifiers. | ||||
2840 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||
2841 | Register ModSrc = OrigSrc; | ||||
2842 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||
2843 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||
2844 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||
2845 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||
2846 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||
2847 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||
2848 | return ModSrc; | ||||
2849 | } | ||||
2850 | |||||
/// Custom-lower a 64-bit G_FFLOOR on subtargets with the V_FRACT bug
/// (SI). Emits floor(x) = x - fract(x), with the fract result clamped and
/// NaN-corrected to work around the hardware bug.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
                   .addUse(OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest double strictly less than 1.0
  // (0.99999999999999999 in the workaround formula above).
  auto Const =
      B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  // Unless nnan is known, select the (possibly NaN) input through when the
  // source is unordered with itself, per the workaround formula.
  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  // floor(x) = x + (-fract(x))
  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
2907 | |||||
2908 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||
2909 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||
2910 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||
2911 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
2912 | Register Dst = MI.getOperand(0).getReg(); | ||||
2913 | const LLT S32 = LLT::scalar(32); | ||||
2914 | const LLT S16 = LLT::scalar(16); | ||||
2915 | assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16))(static_cast <bool> (MRI.getType(Dst) == LLT::fixed_vector (2, 16)) ? void (0) : __assert_fail ("MRI.getType(Dst) == LLT::fixed_vector(2, 16)" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 2915, __extension__ __PRETTY_FUNCTION__)); | ||||
2916 | |||||
2917 | Register Src0 = MI.getOperand(1).getReg(); | ||||
2918 | Register Src1 = MI.getOperand(2).getReg(); | ||||
2919 | |||||
2920 | if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { | ||||
2921 | assert(MRI.getType(Src0) == S32)(static_cast <bool> (MRI.getType(Src0) == S32) ? void ( 0) : __assert_fail ("MRI.getType(Src0) == S32", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2921, __extension__ __PRETTY_FUNCTION__)); | ||||
2922 | Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); | ||||
2923 | Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); | ||||
2924 | } | ||||
2925 | |||||
2926 | auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); | ||||
2927 | B.buildBitcast(Dst, Merge); | ||||
2928 | |||||
2929 | MI.eraseFromParent(); | ||||
2930 | return true; | ||||
2931 | } | ||||
2932 | |||||
// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
//
// Source and accumulation registers must all be 32-bits.
//
// \param Accum  32-bit result pieces, least significant first; updated
//               in-place. Incoming entries may be null registers.
// \param Src0   32-bit pieces of the first factor, least significant first.
// \param Src1   32-bit pieces of the second factor, least significant first.
// \param UsePartialMad64_32  use MAD_64_32 even for partial products whose
//               high half is discarded.
// \param SeparateOddAlignedProducts  accumulate odd-aligned partial products
//               into separate registers and add them in afterwards.
//
// TODO: When the multiply is uniform, we should produce a code sequence
// that is better suited to instruction selection on the SALU. Instead of
// the outer loop going over parts of the result, the outer loop should go
// over parts of one of the factors. This should result in instruction
// selection that makes full use of S_ADDC_U32 instructions.
void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
                                        MutableArrayRef<Register> Accum,
                                        ArrayRef<Register> Src0,
                                        ArrayRef<Register> Src1,
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  // Use (possibly empty) vectors of S1 registers to represent the set of
  // carries from one pair of positions to the next.
  using Carry = SmallVector<Register, 2>;

  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelKnownBits &KB = *Helper.getKnownBits();

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  // Lazily-materialized zero constants, shared by all uses below.
  Register Zero32;
  Register Zero64;

  auto getZero32 = [&]() -> Register {
    if (!Zero32)
      Zero32 = B.buildConstant(S32, 0).getReg(0);
    return Zero32;
  };
  auto getZero64 = [&]() -> Register {
    if (!Zero64)
      Zero64 = B.buildConstant(S64, 0).getReg(0);
    return Zero64;
  };

  // Precompute which source pieces are known to be zero so the corresponding
  // partial products can be skipped entirely.
  SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
  for (unsigned i = 0; i < Src0.size(); ++i) {
    Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
    Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
  }

  // Merge the given carries into the 32-bit LocalAccum, which is modified
  // in-place.
  //
  // Returns the carry-out, which is a single S1 register or null.
  auto mergeCarry =
      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
        if (CarryIn.empty())
          return Register();

        bool HaveCarryOut = true;
        Register CarryAccum;
        if (CarryIn.size() == 1) {
          if (!LocalAccum) {
            // A single carry into an empty accumulator is just a zext.
            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
            return Register();
          }

          CarryAccum = getZero32();
        } else {
          // Fold all but the last carry into CarryAccum via add-with-carry
          // against zero.
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            CarryAccum =
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
                    .getReg(0);
          }

          if (!LocalAccum) {
            LocalAccum = getZero32();
            HaveCarryOut = false;
          }
        }

        // Final add consumes the last carry and produces the carry-out.
        auto Add =
            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
        LocalAccum = Add.getReg(0);
        return HaveCarryOut ? Add.getReg(1) : Register();
      };

  // Build a multiply-add chain to compute
  //
  //   LocalAccum + (partial products at DstIndex)
  //              + (opportunistic subset of CarryIn)
  //
  // LocalAccum is an array of one or two 32-bit registers that are updated
  // in-place. The incoming registers may be null.
  //
  // In some edge cases, carry-ins can be consumed "for free". In that case,
  // the consumed carry bits are removed from CarryIn in-place.
  auto buildMadChain =
      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex,
          Carry &CarryIn) -> Carry {
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        Carry CarryOut;
        unsigned j0 = 0;

        // Use plain 32-bit multiplication for the most significant part of the
        // result by default.
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
          do {
            // Skip multiplication if one of the operands is 0
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
            if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
              LocalAccum[0] = Mul.getReg(0);
            } else {
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
              } else {
                // Consume one incoming carry "for free" with add-with-carry.
                LocalAccum[0] =
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
                        .getReg(0);
                CarryIn.pop_back();
              }
            }
            ++j0;
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        }

        // Build full 64-bit multiplies.
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          Register Tmp;

          // Seed the 64-bit accumulator from whatever 32-bit state we have.
          // HaveSmallAccum records that the high half is known-zero, so the
          // first MAD cannot produce a carry-out.
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            // Skip multiplication if one of the operands is 0.
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;

            ++j0;
          } while (j0 <= DstIndex);

          // Scatter the 64-bit accumulator back into the 32-bit pieces.
          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:      1 0
  //                                      ------
  //   Carries from previous iteration:   e o
  //   Even-aligned partial product sum:  E E .
  //   Odd-aligned partial product sum:     O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        // Accumulate odd-aligned products in fresh registers and then fold
        // them into Accum with explicit add/adde chains.
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = MutableArrayRef(SeparateOddOut)
                              .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          if (!IsHighest)
            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          else
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
                            SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
      }
    }
  }
}
3190 | |||||
3191 | // Custom narrowing of wide multiplies using wide multiply-add instructions. | ||||
3192 | // | ||||
3193 | // TODO: If the multiply is followed by an addition, we should attempt to | ||||
3194 | // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. | ||||
3195 | bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, | ||||
3196 | MachineInstr &MI) const { | ||||
3197 | assert(ST.hasMad64_32())(static_cast <bool> (ST.hasMad64_32()) ? void (0) : __assert_fail ("ST.hasMad64_32()", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3197, __extension__ __PRETTY_FUNCTION__)); | ||||
3198 | assert(MI.getOpcode() == TargetOpcode::G_MUL)(static_cast <bool> (MI.getOpcode() == TargetOpcode::G_MUL ) ? void (0) : __assert_fail ("MI.getOpcode() == TargetOpcode::G_MUL" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3198, __extension__ __PRETTY_FUNCTION__)); | ||||
3199 | |||||
3200 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
3201 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
3202 | |||||
3203 | Register DstReg = MI.getOperand(0).getReg(); | ||||
3204 | Register Src0 = MI.getOperand(1).getReg(); | ||||
3205 | Register Src1 = MI.getOperand(2).getReg(); | ||||
3206 | |||||
3207 | LLT Ty = MRI.getType(DstReg); | ||||
3208 | assert(Ty.isScalar())(static_cast <bool> (Ty.isScalar()) ? void (0) : __assert_fail ("Ty.isScalar()", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3208, __extension__ __PRETTY_FUNCTION__)); | ||||
3209 | |||||
3210 | unsigned Size = Ty.getSizeInBits(); | ||||
3211 | unsigned NumParts = Size / 32; | ||||
3212 | assert((Size % 32) == 0)(static_cast <bool> ((Size % 32) == 0) ? void (0) : __assert_fail ("(Size % 32) == 0", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3212, __extension__ __PRETTY_FUNCTION__)); | ||||
3213 | assert(NumParts >= 2)(static_cast <bool> (NumParts >= 2) ? void (0) : __assert_fail ("NumParts >= 2", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3213, __extension__ __PRETTY_FUNCTION__)); | ||||
3214 | |||||
3215 | // Whether to use MAD_64_32 for partial products whose high half is | ||||
3216 | // discarded. This avoids some ADD instructions but risks false dependency | ||||
3217 | // stalls on some subtargets in some cases. | ||||
3218 | const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; | ||||
3219 | |||||
3220 | // Whether to compute odd-aligned partial products separately. This is | ||||
3221 | // advisable on subtargets where the accumulator of MAD_64_32 must be placed | ||||
3222 | // in an even-aligned VGPR. | ||||
3223 | const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); | ||||
3224 | |||||
3225 | LLT S32 = LLT::scalar(32); | ||||
3226 | SmallVector<Register, 2> Src0Parts, Src1Parts; | ||||
3227 | for (unsigned i = 0; i < NumParts; ++i) { | ||||
3228 | Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); | ||||
3229 | Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); | ||||
3230 | } | ||||
3231 | B.buildUnmerge(Src0Parts, Src0); | ||||
3232 | B.buildUnmerge(Src1Parts, Src1); | ||||
3233 | |||||
3234 | SmallVector<Register, 2> AccumRegs(NumParts); | ||||
3235 | buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, | ||||
3236 | SeparateOddAlignedProducts); | ||||
3237 | |||||
3238 | B.buildMergeLikeInstr(DstReg, AccumRegs); | ||||
3239 | MI.eraseFromParent(); | ||||
3240 | return true; | ||||
3241 | } | ||||
3242 | |||||
3243 | // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to | ||||
3244 | // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input | ||||
3245 | // case with a single min instruction instead of a compare+select. | ||||
3246 | bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, | ||||
3247 | MachineRegisterInfo &MRI, | ||||
3248 | MachineIRBuilder &B) const { | ||||
3249 | Register Dst = MI.getOperand(0).getReg(); | ||||
3250 | Register Src = MI.getOperand(1).getReg(); | ||||
3251 | LLT DstTy = MRI.getType(Dst); | ||||
3252 | LLT SrcTy = MRI.getType(Src); | ||||
3253 | |||||
3254 | unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ | ||||
3255 | ? AMDGPU::G_AMDGPU_FFBH_U32 | ||||
3256 | : AMDGPU::G_AMDGPU_FFBL_B32; | ||||
3257 | auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); | ||||
3258 | B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); | ||||
3259 | |||||
3260 | MI.eraseFromParent(); | ||||
3261 | return true; | ||||
3262 | } | ||||
3263 | |||||
3264 | // Check that this is a G_XOR x, -1 | ||||
3265 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | ||||
3266 | if (MI.getOpcode() != TargetOpcode::G_XOR) | ||||
3267 | return false; | ||||
3268 | auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | ||||
3269 | return ConstVal && *ConstVal == -1; | ||||
3270 | } | ||||
3271 | |||||
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// On success, \p Br is set to the unconditional G_BR following the brcond (if
// any), \p UncondBrTarget to the block reached when the condition is false,
// and \p Negated to true if the condition was consumed through a NOT (G_XOR
// with -1), which this function erases as a side effect.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  // The condition must have exactly one (non-debug) use.
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  // Look through a single NOT of the condition, requiring the negated value
  // to also have a single use.
  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(*UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  // The (possibly negated) condition must feed a G_BRCOND in the same block.
  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    // Fall through to the next block in layout order.
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}
3314 | |||||
3315 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||
3316 | const ArgDescriptor *Arg, | ||||
3317 | const TargetRegisterClass *ArgRC, | ||||
3318 | LLT ArgTy) const { | ||||
3319 | MCRegister SrcReg = Arg->getRegister(); | ||||
3320 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected")(static_cast <bool> (Register::isPhysicalRegister(SrcReg ) && "Physical register expected") ? void (0) : __assert_fail ("Register::isPhysicalRegister(SrcReg) && \"Physical register expected\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3320, __extension__ __PRETTY_FUNCTION__)); | ||||
3321 | assert(DstReg.isVirtual() && "Virtual register expected")(static_cast <bool> (DstReg.isVirtual() && "Virtual register expected" ) ? void (0) : __assert_fail ("DstReg.isVirtual() && \"Virtual register expected\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3321, __extension__ __PRETTY_FUNCTION__)); | ||||
3322 | |||||
3323 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, | ||||
3324 | *ArgRC, B.getDebugLoc(), ArgTy); | ||||
3325 | if (Arg->isMasked()) { | ||||
3326 | // TODO: Should we try to emit this once in the entry block? | ||||
3327 | const LLT S32 = LLT::scalar(32); | ||||
3328 | const unsigned Mask = Arg->getMask(); | ||||
3329 | const unsigned Shift = llvm::countr_zero<unsigned>(Mask); | ||||
3330 | |||||
3331 | Register AndMaskSrc = LiveIn; | ||||
3332 | |||||
3333 | // TODO: Avoid clearing the high bits if we know workitem id y/z are always | ||||
3334 | // 0. | ||||
3335 | if (Shift
| ||||
3336 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||
3337 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||
3338 | } | ||||
3339 | |||||
3340 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||
| |||||
3341 | } else { | ||||
3342 | B.buildCopy(DstReg, LiveIn); | ||||
3343 | } | ||||
3344 | |||||
3345 | return true; | ||||
3346 | } | ||||
3347 | |||||
3348 | bool AMDGPULegalizerInfo::loadInputValue( | ||||
3349 | Register DstReg, MachineIRBuilder &B, | ||||
3350 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
3351 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
3352 | const ArgDescriptor *Arg; | ||||
3353 | const TargetRegisterClass *ArgRC; | ||||
3354 | LLT ArgTy; | ||||
3355 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||
3356 | |||||
3357 | if (!Arg) { | ||||
3358 | if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { | ||||
3359 | // The intrinsic may appear when we have a 0 sized kernarg segment, in which | ||||
3360 | // case the pointer argument may be missing and we use null. | ||||
3361 | B.buildConstant(DstReg, 0); | ||||
3362 | return true; | ||||
3363 | } | ||||
3364 | |||||
3365 | // It's undefined behavior if a function marked with the amdgpu-no-* | ||||
3366 | // attributes uses the corresponding intrinsic. | ||||
3367 | B.buildUndef(DstReg); | ||||
3368 | return true; | ||||
3369 | } | ||||
3370 | |||||
3371 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||
3372 | return false; // TODO: Handle these | ||||
3373 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | ||||
3374 | } | ||||
3375 | |||||
3376 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||
3377 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||
3378 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
3379 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | ||||
3380 | return false; | ||||
3381 | |||||
3382 | MI.eraseFromParent(); | ||||
3383 | return true; | ||||
3384 | } | ||||
3385 | |||||
3386 | static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, | ||||
3387 | int64_t C) { | ||||
3388 | B.buildConstant(MI.getOperand(0).getReg(), C); | ||||
3389 | MI.eraseFromParent(); | ||||
3390 | return true; | ||||
3391 | } | ||||
3392 | |||||
/// Legalize a workitem ID intrinsic for dimension \p Dim, using the known
/// maximum ID to fold to a constant, load the (possibly packed) preloaded
/// value, or assert known-zero high bits.
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
  // A dimension known to be 1-wide always has ID 0.
  if (MaxID == 0)
    return replaceWithConstant(B, MI, 0);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  Register DstReg = MI.getOperand(0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    MI.eraseFromParent();
    return true;
  }

  if (Arg->isMasked()) {
    // Don't bother inserting AssertZext for packed IDs since we're emitting the
    // masking operations anyway.
    //
    // TODO: We could assert the top bit is 0 for the source copy.
    if (!loadInputValue(DstReg, B, ArgType))
      return false;
  } else {
    // Load into a temporary, then assert the bits above bit_width(MaxID) are
    // zero so later combines can use the known range.
    Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    if (!loadInputValue(TmpReg, B, ArgType))
      return false;
    B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
  }

  MI.eraseFromParent();
  return true;
}
3432 | |||||
3433 | Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, | ||||
3434 | int64_t Offset) const { | ||||
3435 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
3436 | Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); | ||||
3437 | |||||
3438 | // TODO: If we passed in the base kernel offset we could have a better | ||||
3439 | // alignment than 4, but we don't really need it. | ||||
3440 | if (!loadInputValue(KernArgReg, B, | ||||
| |||||
3441 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
3442 | llvm_unreachable("failed to find kernarg segment ptr")::llvm::llvm_unreachable_internal("failed to find kernarg segment ptr" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3442); | ||||
3443 | |||||
3444 | auto COffset = B.buildConstant(LLT::scalar(64), Offset); | ||||
3445 | // TODO: Should get nuw | ||||
3446 | return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); | ||||
3447 | } | ||||
3448 | |||||
3449 | /// Legalize a value that's loaded from kernel arguments. This is only used by | ||||
3450 | /// legacy intrinsics. | ||||
3451 | bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, | ||||
3452 | MachineIRBuilder &B, | ||||
3453 | uint64_t Offset, | ||||
3454 | Align Alignment) const { | ||||
3455 | Register DstReg = MI.getOperand(0).getReg(); | ||||
3456 | |||||
3457 | assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&(static_cast <bool> (B.getMRI()->getType(DstReg) == LLT ::scalar(32) && "unexpected kernarg parameter type") ? void (0) : __assert_fail ("B.getMRI()->getType(DstReg) == LLT::scalar(32) && \"unexpected kernarg parameter type\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3458, __extension__ __PRETTY_FUNCTION__)) | ||||
3458 | "unexpected kernarg parameter type")(static_cast <bool> (B.getMRI()->getType(DstReg) == LLT ::scalar(32) && "unexpected kernarg parameter type") ? void (0) : __assert_fail ("B.getMRI()->getType(DstReg) == LLT::scalar(32) && \"unexpected kernarg parameter type\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3458, __extension__ __PRETTY_FUNCTION__)); | ||||
3459 | |||||
3460 | Register Ptr = getKernargParameterPtr(B, Offset); | ||||
3461 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
3462 | B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), | ||||
3463 | MachineMemOperand::MODereferenceable | | ||||
3464 | MachineMemOperand::MOInvariant); | ||||
3465 | MI.eraseFromParent(); | ||||
3466 | return true; | ||||
3467 | } | ||||
3468 | |||||
3469 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||
3470 | MachineRegisterInfo &MRI, | ||||
3471 | MachineIRBuilder &B) const { | ||||
3472 | Register Dst = MI.getOperand(0).getReg(); | ||||
3473 | LLT DstTy = MRI.getType(Dst); | ||||
3474 | LLT S16 = LLT::scalar(16); | ||||
3475 | LLT S32 = LLT::scalar(32); | ||||
3476 | LLT S64 = LLT::scalar(64); | ||||
3477 | |||||
3478 | if (DstTy == S16) | ||||
3479 | return legalizeFDIV16(MI, MRI, B); | ||||
3480 | if (DstTy == S32) | ||||
3481 | return legalizeFDIV32(MI, MRI, B); | ||||
3482 | if (DstTy == S64) | ||||
3483 | return legalizeFDIV64(MI, MRI, B); | ||||
3484 | |||||
3485 | return false; | ||||
3486 | } | ||||
3487 | |||||
/// Expand unsigned 32-bit division/remainder of \p X by \p Y via a
/// float-reciprocal based Newton-Raphson sequence. Either of \p DstDivReg /
/// \p DstRemReg may be a null Register, in which case that result is skipped.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y). 0x4f7ffffe is the bit pattern of a float just
  // below 2**32, scaling rcp(y) into a fixed-point estimate of 2**32/y that
  // rounds down.
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR (unsigned Newton-Raphson) to refine Z.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate: Q = hi(X*Z), R = X - Q*Y.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement: the estimate may be off by up to 2,
  // so conditionally bump Q and reduce R, twice.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
3530 | |||||
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  // Split the 64-bit value into its low/high 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Combine the halves into one float: hi * 2**32 + lo.
  auto Mad = B.buildFMAD(
      S32, CvtHi, // 2**32
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // Scale the reciprocal up by just under 2**64 (bit pattern 0x5f7ffffc).
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
      Mul1);

  // Low word is the residue after removing the high word's contribution.
  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
3575 | |||||
/// Expand unsigned 64-bit division/remainder of \p Numer by \p Denom. Either
/// destination register may be null to skip that result. Starts from the
/// approximate reciprocal built by emitReciprocalU64, refines it with two
/// Newton-Raphson steps carried out on 32-bit halves with explicit
/// carry/borrow chains, then corrects the quotient/remainder at most twice.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First Newton-Raphson refinement of the reciprocal:
  // Rcp += hi(Rcp * (-Denom * Rcp)), with the add done as a 32-bit
  // carry chain.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  // Second refinement step, same shape as the first.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate MulHi3 = hi(Numer * Rcp); remainder Sub1 = Numer -
  // Denom * MulHi3, computed with an explicit 32-bit borrow chain.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // Sub1_Mi deliberately omits the incoming borrow; later subtracts fold the
  // borrow back in (matches the expansion in AMDGPUCodeGenPrepare).
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 = -1 iff remainder >= Denom (64-bit compare built from 32-bit halves).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...  first correction: quotient + 1, remainder - Denom.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 = -1 iff the once-corrected remainder is still >= Denom.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)  second correction.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Select the final quotient/remainder based on how many corrections fired.
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
3687 | |||||
3688 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, | ||||
3689 | MachineRegisterInfo &MRI, | ||||
3690 | MachineIRBuilder &B) const { | ||||
3691 | Register DstDivReg, DstRemReg; | ||||
3692 | switch (MI.getOpcode()) { | ||||
3693 | default: | ||||
3694 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3694); | ||||
3695 | case AMDGPU::G_UDIV: { | ||||
3696 | DstDivReg = MI.getOperand(0).getReg(); | ||||
3697 | break; | ||||
3698 | } | ||||
3699 | case AMDGPU::G_UREM: { | ||||
3700 | DstRemReg = MI.getOperand(0).getReg(); | ||||
3701 | break; | ||||
3702 | } | ||||
3703 | case AMDGPU::G_UDIVREM: { | ||||
3704 | DstDivReg = MI.getOperand(0).getReg(); | ||||
3705 | DstRemReg = MI.getOperand(1).getReg(); | ||||
3706 | break; | ||||
3707 | } | ||||
3708 | } | ||||
3709 | |||||
3710 | const LLT S64 = LLT::scalar(64); | ||||
3711 | const LLT S32 = LLT::scalar(32); | ||||
3712 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||
3713 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||
3714 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||
3715 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
3716 | |||||
3717 | if (Ty == S32) | ||||
3718 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||
3719 | else if (Ty == S64) | ||||
3720 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||
3721 | else | ||||
3722 | return false; | ||||
3723 | |||||
3724 | MI.eraseFromParent(); | ||||
3725 | return true; | ||||
3726 | } | ||||
3727 | |||||
/// Legalize G_SDIV, G_SREM or G_SDIVREM by reducing to the unsigned
/// expansion: take absolute values via the sign-extend/add/xor trick, do the
/// unsigned div/rem, then re-apply the signs (quotient sign = sign(LHS) xor
/// sign(RHS); remainder sign = sign(LHS)). Returns false for types other
/// than s32/s64.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  // Sign masks: all-ones if negative, zero otherwise.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  // abs(x) = (x + sign) ^ sign.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3755);
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  // Unsigned expansion into temporaries; signs are re-applied below.
  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}
3795 | |||||
/// Try to lower G_FDIV to a reciprocal-based approximation. Only fires when
/// unsafe/afn math permits the inaccurate rcp; otherwise returns false so the
/// precise expansion is used instead.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  // Special-case constant numerators of +/-1.0 to a bare rcp.
  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}
3844 | |||||
/// Fast 64-bit fdiv lowering: rcp followed by two Newton-Raphson refinements
/// of the reciprocal and one of the final quotient. Only fires when
/// unsafe/afn math allows the inaccurate rcp; otherwise returns false.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
               .addUse(Y)
               .setMIFlags(Flags);

  // Two Newton-Raphson steps: R = R + R*(1 - Y*R).
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // Quotient estimate plus one residual correction:
  // Res = Ret + R*(X - Y*Ret).
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
3881 | |||||
/// Legalize a 16-bit G_FDIV: compute in 32-bit precision via rcp, truncate,
/// and let amdgcn.div.fixup handle special values (inf/nan/denorm cases).
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-only path when unsafe math allows it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  // Widen both operands to f32 so the f32 rcp has enough precision for an
  // exactly-rounded f16 result.
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  // div.fixup takes (quotient, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3916 | |||||
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value. Enabling uses FP_DENORM_FLUSH_NONE;
  // disabling restores the function's default SP denorm setting.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE immediate: bits [1:0] = SP mode, bits [3:2] = DP/HP mode.
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register: hwreg(HW_REG_MODE, 4, 2) --
    // offset 4, width-minus-one 1.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
3945 | |||||
/// Legalize a 32-bit G_FDIV with the precise div_scale / div_fmas /
/// div_fixup sequence, temporarily enabling FP32 denormals around the FMA
/// chain when the function's mode has them flushed.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-only path when unsafe math allows it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale's last immediate selects which operand is scaled (0 =
  // denominator, 1 = numerator); the S1 result flags when scaling occurred.
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Newton-Raphson refinement of the reciprocal and the quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  // div.fixup takes (quotient, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
4014 | |||||
/// Legalize a 64-bit G_FDIV with the precise div_scale / div_fmas /
/// div_fixup sequence, including the SI workaround for the unusable
/// div_scale condition output.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based path when unsafe math allows it.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Scale the denominator (immediate 0 selects it).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
                 .addUse(DivScale0.getReg(0))
                 .setMIFlags(Flags);

  // Newton-Raphson refinement of the reciprocal.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Scale the numerator (immediate 1 selects it).
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(1)
                       .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Recompute the "was scaled" condition by comparing the high halves of
    // the inputs against the scaled values.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  .addUse(Scale)
                  .setMIFlags(Flags);

  // div.fixup takes (quotient, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
4095 | |||||
4096 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||
4097 | MachineRegisterInfo &MRI, | ||||
4098 | MachineIRBuilder &B) const { | ||||
4099 | Register Res = MI.getOperand(0).getReg(); | ||||
4100 | Register LHS = MI.getOperand(2).getReg(); | ||||
4101 | Register RHS = MI.getOperand(3).getReg(); | ||||
4102 | uint16_t Flags = MI.getFlags(); | ||||
4103 | |||||
4104 | LLT S32 = LLT::scalar(32); | ||||
4105 | LLT S1 = LLT::scalar(1); | ||||
4106 | |||||
4107 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||
4108 | const APFloat C0Val(1.0f); | ||||
4109 | |||||
4110 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||
4111 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||
4112 | auto C2 = B.buildConstant(S32, llvm::bit_cast<uint32_t>(1.0f)); | ||||
4113 | |||||
4114 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||
4115 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||
4116 | |||||
4117 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||
4118 | |||||
4119 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||
4120 | .addUse(Mul0.getReg(0)) | ||||
4121 | .setMIFlags(Flags); | ||||
4122 | |||||
4123 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||
4124 | |||||
4125 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||
4126 | |||||
4127 | MI.eraseFromParent(); | ||||
4128 | return true; | ||||
4129 | } | ||||
4130 | |||||
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Pre-VI targets have the native instruction; leave the intrinsic alone.
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  // Only f32/f64 are supported; pick the matching float semantics for the
  // +-max_float clamp constants.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp from above to +max_float, then from below to -max_float.
  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
4179 | |||||
4180 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||
4181 | switch (IID) { | ||||
4182 | case Intrinsic::amdgcn_ds_fadd: | ||||
4183 | return AMDGPU::G_ATOMICRMW_FADD; | ||||
4184 | case Intrinsic::amdgcn_ds_fmin: | ||||
4185 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||
4186 | case Intrinsic::amdgcn_ds_fmax: | ||||
4187 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||
4188 | default: | ||||
4189 | llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4189); | ||||
4190 | } | ||||
4191 | } | ||||
4192 | |||||
4193 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | ||||
4194 | MachineInstr &MI, | ||||
4195 | Intrinsic::ID IID) const { | ||||
4196 | GISelChangeObserver &Observer = Helper.Observer; | ||||
4197 | Observer.changingInstr(MI); | ||||
4198 | |||||
4199 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | ||||
4200 | |||||
4201 | // The remaining operands were used to set fields in the MemOperand on | ||||
4202 | // construction. | ||||
4203 | for (int I = 6; I > 3; --I) | ||||
4204 | MI.removeOperand(I); | ||||
4205 | |||||
4206 | MI.removeOperand(1); // Remove the intrinsic ID. | ||||
4207 | Observer.changedInstr(MI); | ||||
4208 | return true; | ||||
4209 | } | ||||
4210 | |||||
4211 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | ||||
4212 | MachineRegisterInfo &MRI, | ||||
4213 | MachineIRBuilder &B) const { | ||||
4214 | uint64_t Offset = | ||||
4215 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||
4216 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||
4217 | LLT DstTy = MRI.getType(DstReg); | ||||
4218 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||
4219 | |||||
4220 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||
4221 | if (!loadInputValue(KernargPtrReg, B, | ||||
4222 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
4223 | return false; | ||||
4224 | |||||
4225 | // FIXME: This should be nuw | ||||
4226 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||
4227 | return true; | ||||
4228 | } | ||||
4229 | |||||
4230 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||
4231 | MachineRegisterInfo &MRI, | ||||
4232 | MachineIRBuilder &B) const { | ||||
4233 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
4234 | if (!MFI->isEntryFunction()) { | ||||
4235 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
4236 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||
4237 | } | ||||
4238 | |||||
4239 | Register DstReg = MI.getOperand(0).getReg(); | ||||
4240 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||
4241 | return false; | ||||
4242 | |||||
4243 | MI.eraseFromParent(); | ||||
4244 | return true; | ||||
4245 | } | ||||
4246 | |||||
4247 | bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, | ||||
4248 | MachineRegisterInfo &MRI, | ||||
4249 | MachineIRBuilder &B) const { | ||||
4250 | Function &F = B.getMF().getFunction(); | ||||
4251 | std::optional<uint32_t> KnownSize = | ||||
4252 | AMDGPUMachineFunction::getLDSKernelIdMetadata(F); | ||||
4253 | if (KnownSize.has_value()) | ||||
4254 | B.buildConstant(DstReg, *KnownSize); | ||||
4255 | return false; | ||||
4256 | } | ||||
4257 | |||||
4258 | bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, | ||||
4259 | MachineRegisterInfo &MRI, | ||||
4260 | MachineIRBuilder &B) const { | ||||
4261 | |||||
4262 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
4263 | if (!MFI->isEntryFunction()) { | ||||
4264 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
4265 | AMDGPUFunctionArgInfo::LDS_KERNEL_ID); | ||||
4266 | } | ||||
4267 | |||||
4268 | Register DstReg = MI.getOperand(0).getReg(); | ||||
4269 | if (!getLDSKernelId(DstReg, MRI, B)) | ||||
4270 | return false; | ||||
4271 | |||||
4272 | MI.eraseFromParent(); | ||||
4273 | return true; | ||||
4274 | } | ||||
4275 | |||||
4276 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||
4277 | MachineRegisterInfo &MRI, | ||||
4278 | MachineIRBuilder &B, | ||||
4279 | unsigned AddrSpace) const { | ||||
4280 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||
4281 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||
4282 | Register Hi32 = Unmerge.getReg(1); | ||||
4283 | |||||
4284 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||
4285 | MI.eraseFromParent(); | ||||
4286 | return true; | ||||
4287 | } | ||||
4288 | |||||
4289 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||
4290 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||
4291 | // split between the instruction's voffset and immoffset fields) and soffset | ||||
4292 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||
4293 | // the instruction's soffset field). This function takes the first kind of | ||||
4294 | // offset and figures out how to split it between voffset and immoffset. | ||||
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value the MUBUF instruction's immediate offset field can hold.
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // Peel a known-constant component off of the offset, if one is visible.
  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold any overflow back into the register component of the offset.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The voffset operand must always be a register, even when it is zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}
4340 | |||||
4341 | /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. | ||||
4342 | void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, | ||||
4343 | Register VOffset, Register SOffset, | ||||
4344 | unsigned ImmOffset, Register VIndex, | ||||
4345 | MachineRegisterInfo &MRI) const { | ||||
4346 | std::optional<ValueAndVReg> MaybeVOffsetVal = | ||||
4347 | getIConstantVRegValWithLookThrough(VOffset, MRI); | ||||
4348 | std::optional<ValueAndVReg> MaybeSOffsetVal = | ||||
4349 | getIConstantVRegValWithLookThrough(SOffset, MRI); | ||||
4350 | std::optional<ValueAndVReg> MaybeVIndexVal = | ||||
4351 | getIConstantVRegValWithLookThrough(VIndex, MRI); | ||||
4352 | // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, | ||||
4353 | // update the MMO with that offset. The stride is unknown so we can only do | ||||
4354 | // this if VIndex is constant 0. | ||||
4355 | if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && | ||||
4356 | MaybeVIndexVal->Value == 0) { | ||||
4357 | uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + | ||||
4358 | MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; | ||||
4359 | MMO->setOffset(TotalOffset); | ||||
4360 | } else { | ||||
4361 | // We don't have a constant combined offset to use in the MMO. Give up. | ||||
4362 | MMO->setValue((Value *)nullptr); | ||||
4363 | } | ||||
4364 | } | ||||
4365 | |||||
/// Handle register layout difference for f16 images for some subtargets.
/// Returns a register holding \p Reg repacked into the layout the subtarget
/// expects for a d16 store; \p Reg is returned unchanged when no repacking
/// is needed.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  // Unpacked subtargets place one 16-bit element in each 32-bit dword, so
  // widen every element individually.
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  // Subtargets with the image-store d16 bug need the packed data padded out
  // to full dwords with undef.
  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      // <2 x s16> packs into one dword; pad to <2 x s32>.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      // Pad <3 x s16> to <6 x s16> so it can be viewed as <3 x s32>.
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      // <4 x s16> packs into two dwords; pad to <4 x s32>.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // <3 x s16> is not a legal register type; widen it to <4 x s16> with undef.
  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
              .getReg(0);
  }
  return Reg;
}
4429 | |||||
4430 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||
4431 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||
4432 | MachineRegisterInfo *MRI = B.getMRI(); | ||||
4433 | LLT Ty = MRI->getType(VData); | ||||
4434 | |||||
4435 | const LLT S16 = LLT::scalar(16); | ||||
4436 | |||||
4437 | // Fixup illegal register types for i8 stores. | ||||
4438 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||
4439 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||
4440 | return AnyExt; | ||||
4441 | } | ||||
4442 | |||||
4443 | if (Ty.isVector()) { | ||||
4444 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||
4445 | if (IsFormat) | ||||
4446 | return handleD16VData(B, *MRI, VData); | ||||
4447 | } | ||||
4448 | } | ||||
4449 | |||||
4450 | return VData; | ||||
4451 | } | ||||
4452 | |||||
// Legalize a raw/struct buffer store intrinsic (optionally format or typed)
// into the corresponding G_AMDGPU_*BUFFER_STORE* pseudo instruction.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  // Widen/repack illegal source value types (i8/i16 scalars, f16 vectors).
  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex operand; use a constant zero.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the offset between the voffset register and the instruction's
  // immediate offset field, then record what is known in the MMO.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  // Select the pseudo: typed/format stores by element size, plain stores by
  // the total memory access size (byte/short/dword+).
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
4539 | |||||
/// Emit a buffer-load pseudo \p Opc defining \p LoadDstReg from the resource
/// descriptor \p RSrc and the given offset components. \p Format is only
/// appended for typed (tbuffer) loads, and \p HasVIndex selects the idxen
/// immediate.
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);
}
4560 | |||||
// Legalize a raw/struct buffer load intrinsic (optionally format or typed,
// optionally with a TFE status result) into the corresponding
// G_AMDGPU_*BUFFER_LOAD* pseudo, fixing up illegal result types.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  // A second explicit def means this is a TFE load with an extra status
  // result, which shifts every later operand index by one.
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; use a constant zero.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the offset between the voffset register and the instruction's
  // immediate offset field, then record what is known in the MMO.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    if (IsTFE)
      return false;
    // Plain loads are selected on the memory access size.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // A TFE load produces the value dwords plus one trailing status dword;
    // load into a wide temporary and split the pieces back out.
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Sub-dword results: load into a full 32-bit register and truncate.
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked D16 loads return one 16-bit element per 32-bit dword; load
    // the widened vector and repack into the requested element type.
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
4695 | |||||
4696 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||
4697 | MachineIRBuilder &B, | ||||
4698 | bool IsInc) const { | ||||
4699 | unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP : | ||||
4700 | AMDGPU::G_ATOMICRMW_UDEC_WRAP; | ||||
4701 | B.buildInstr(Opc) | ||||
4702 | .addDef(MI.getOperand(0).getReg()) | ||||
4703 | .addUse(MI.getOperand(2).getReg()) | ||||
4704 | .addUse(MI.getOperand(3).getReg()) | ||||
4705 | .cloneMemRefs(MI); | ||||
4706 | MI.eraseFromParent(); | ||||
4707 | return true; | ||||
4708 | } | ||||
4709 | |||||
/// Map a raw/struct buffer atomic intrinsic ID to the corresponding generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
4764 | |||||
// Legalize a raw/struct buffer atomic intrinsic into the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo instruction.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    // No result def; every source operand index shifts down by one.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  // Compare-swap carries the comparison value as an extra operand.
  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; use a constant zero.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the offset between the voffset register and the instruction's
  // immediate offset field, then record what is known in the MMO.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
4835 | |||||
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Walks the vaddr operand range [VAddrStart, VAddrEnd) of \p MI and appends
/// one dword-sized register per visited operand (or operand pair) to
/// \p PackedAddrs:
///  - Operands that must remain 32-bit (pre-gradient args, gradients without
///    G16, coordinates without A16) are bitcast s32 -> <2 x s16>, except a
///    16-bit bias under A16, which is widened with an undef upper half.
///  - Genuine 16-bit operands are packed pairwise into <2 x s16>; a trailing
///    odd element, or the odd last element of a gradient half, is padded
///    with undef.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // These three ranges each hold a full-dword (32-bit) value; emit each
    // as its own packed register rather than pairing with a neighbor.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument")(static_cast <bool> (I == Intr->BiasIndex && "Got unexpected 16-bit extra argument") ? void (0) : __assert_fail ("I == Intr->BiasIndex && \"Got unexpected 16-bit extra argument\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 4859, __extension__ __PRETTY_FUNCTION__));
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&(static_cast <bool> ((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && "Bias needs to be converted to 16 bit in A16 mode" ) ? void (0) : __assert_fail ("(!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && \"Bias needs to be converted to 16 bit in A16 mode\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 4867, __extension__ __PRETTY_FUNCTION__))
               "Bias needs to be converted to 16 bit in A16 mode")(static_cast <bool> ((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && "Bias needs to be converted to 16 bit in A16 mode" ) ? void (0) : __assert_fail ("(!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && \"Bias needs to be converted to 16 bit in A16 mode\"" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 4867, __extension__ __PRETTY_FUNCTION__));
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this s16 operand together with the following one into a
        // single <2 x s16> dword; the neighbor is consumed here, so skip it.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
4896 | |||||
4897 | /// Convert from separate vaddr components to a single vector address register, | ||||
4898 | /// and replace the remaining operands with $noreg. | ||||
4899 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||
4900 | int DimIdx, int NumVAddrs) { | ||||
4901 | const LLT S32 = LLT::scalar(32); | ||||
4902 | (void)S32; | ||||
4903 | SmallVector<Register, 8> AddrRegs; | ||||
4904 | for (int I = 0; I != NumVAddrs; ++I) { | ||||
4905 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||
4906 | if (SrcOp.isReg()) { | ||||
4907 | AddrRegs.push_back(SrcOp.getReg()); | ||||
4908 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32)(static_cast <bool> (B.getMRI()->getType(SrcOp.getReg ()) == S32) ? void (0) : __assert_fail ("B.getMRI()->getType(SrcOp.getReg()) == S32" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 4908, __extension__ __PRETTY_FUNCTION__)); | ||||
4909 | } | ||||
4910 | } | ||||
4911 | |||||
4912 | int NumAddrRegs = AddrRegs.size(); | ||||
4913 | if (NumAddrRegs != 1) { | ||||
4914 | auto VAddr = | ||||
4915 | B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); | ||||
4916 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||
4917 | } | ||||
4918 | |||||
4919 | for (int I = 1; I != NumVAddrs; ++I) { | ||||
4920 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||
4921 | if (SrcOp.isReg()) | ||||
4922 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||
4923 | } | ||||
4924 | } | ||||
4925 | |||||
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to exposes all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
///
/// Returns false when the intrinsic cannot be legalized on this subtarget
/// (e.g. A16 addresses without A16 support, or vector image atomics).
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  // Two explicit defs means the intrinsic also returns the TFE error dword
  // (see the TFE repacking below).
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  // For stores (no defs) the data operand follows the intrinsic ID;
  // otherwise operand 0 is the result.
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      // Gather4 always produces 4 lanes regardless of dmask.
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  // Notify the observer on every exit path from here on.
  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
  // dmask to be at least 1 otherwise the instruction will fail
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  const unsigned NSAMaxSize = ST.getNSAMaxSize();
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      // Pair 16-bit address/gradient components into <2 x s16> dwords.
      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() &&
                          PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                          (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
      const bool UsePartialNSA =
          UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

      if (UsePartialNSA) {
        // Pack registers that would go over NSAMaxSize into last VAddr register
        LLT PackedAddrTy =
            LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
        auto Concat = B.buildConcatVectors(
            PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
        PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
        PackedRegs.resize(NSAMaxSize);
      } else if (!UseNSA && PackedRegs.size() > 1) {
        // Non-NSA: collapse everything into a single wide vector register.
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      // Rewrite the vaddr operands in place: the first NumPacked slots take
      // the packed registers; any leftover slots become $noreg.
      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0)(static_cast <bool> (SrcOp.isImm() && SrcOp.getImm () == 0) ? void (0) : __assert_fail ("SrcOp.isImm() && SrcOp.getImm() == 0" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 5076, __extension__ __PRETTY_FUNCTION__));
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister)(static_cast <bool> (SrcOp.getReg() != AMDGPU::NoRegister ) ? void (0) : __assert_fail ("SrcOp.getReg() != AMDGPU::NoRegister" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 5080, __extension__ __PRETTY_FUNCTION__));

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // Partial NSA is allowed on GFX11 where the final register is a contiguous
    // set of the remaining addresses.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(B, MI,
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
    }
  }

  // Append an immediate recording the A16/G16 state (bit 0 = A16, bit 1 = G16).
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  // From here on we only handle the load result repacking.
  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    // Round the total data size up to a whole number of dwords.
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE)(static_cast <bool> (!IsTFE) ? void (0) : __assert_fail ("!IsTFE", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 5226, __extension__ __PRETTY_FUNCTION__));
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector())(static_cast <bool> (Ty.isVector()) ? void (0) : __assert_fail ("Ty.isVector()", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 5253, __extension__ __PRETTY_FUNCTION__));

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  // Append NumElts copies of a single undef of type Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16)(static_cast <bool> (!ST.hasUnpackedD16VMem() && ResTy == V2S16) ? void (0) : __assert_fail ("!ST.hasUnpackedD16VMem() && ResTy == V2S16" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 5286, __extension__ __PRETTY_FUNCTION__));
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
5316 | |||||
5317 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||
5318 | LegalizerHelper &Helper, MachineInstr &MI) const { | ||||
5319 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
5320 | GISelChangeObserver &Observer = Helper.Observer; | ||||
5321 | |||||
5322 | Register Dst = MI.getOperand(0).getReg(); | ||||
5323 | LLT Ty = B.getMRI()->getType(Dst); | ||||
5324 | unsigned Size = Ty.getSizeInBits(); | ||||
5325 | MachineFunction &MF = B.getMF(); | ||||
5326 | |||||
5327 | Observer.changingInstr(MI); | ||||
5328 | |||||
5329 | if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { | ||||
5330 | Ty = getBitcastRegisterType(Ty); | ||||
5331 | Helper.bitcastDst(MI, Ty, 0); | ||||
5332 | Dst = MI.getOperand(0).getReg(); | ||||
5333 | B.setInsertPt(B.getMBB(), MI); | ||||
5334 | } | ||||
5335 | |||||
5336 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||
5337 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||
5338 | // allowed to add one. | ||||
5339 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||
5340 | MI.removeOperand(1); // Remove intrinsic ID | ||||
5341 | |||||
5342 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||
5343 | // TODO: Should this use datalayout alignment? | ||||
5344 | const unsigned MemSize = (Size + 7) / 8; | ||||
5345 | const Align MemAlign(4); | ||||
5346 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
5347 | MachinePointerInfo(), | ||||
5348 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
5349 | MachineMemOperand::MOInvariant, | ||||
5350 | MemSize, MemAlign); | ||||
5351 | MI.addMemOperand(MF, MMO); | ||||
5352 | |||||
5353 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||
5354 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||
5355 | // out this needs to be converted to a vector load during RegBankSelect. | ||||
5356 | if (!isPowerOf2_32(Size)) { | ||||
5357 | if (Ty.isVector()) | ||||
5358 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||
5359 | else | ||||
5360 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||
5361 | } | ||||
5362 | |||||
5363 | Observer.changedInstr(MI); | ||||
5364 | return true; | ||||
5365 | } | ||||
5366 | |||||
5367 | // TODO: Move to selection | ||||
5368 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||
5369 | MachineRegisterInfo &MRI, | ||||
5370 | MachineIRBuilder &B) const { | ||||
5371 | if (!ST.isTrapHandlerEnabled() || | ||||
5372 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | ||||
5373 | return legalizeTrapEndpgm(MI, MRI, B); | ||||
5374 | |||||
5375 | const Module *M = B.getMF().getFunction().getParent(); | ||||
5376 | unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); | ||||
5377 | if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) | ||||
5378 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||
5379 | |||||
5380 | return ST.supportsGetDoorbellID() ? | ||||
5381 | legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||
5382 | } | ||||
5383 | |||||
5384 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( | ||||
5385 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5386 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | ||||
5387 | MI.eraseFromParent(); | ||||
5388 | return true; | ||||
5389 | } | ||||
5390 | |||||
5391 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | ||||
5392 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5393 | MachineFunction &MF = B.getMF(); | ||||
5394 | const LLT S64 = LLT::scalar(64); | ||||
5395 | |||||
5396 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||
5397 | // For code object version 5, queue_ptr is passed through implicit kernarg. | ||||
5398 | if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= | ||||
5399 | AMDGPU::AMDHSA_COV5) { | ||||
5400 | AMDGPUTargetLowering::ImplicitParameter Param = | ||||
5401 | AMDGPUTargetLowering::QUEUE_PTR; | ||||
5402 | uint64_t Offset = | ||||
5403 | ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); | ||||
5404 | |||||
5405 | Register KernargPtrReg = MRI.createGenericVirtualRegister( | ||||
5406 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
5407 | |||||
5408 | if (!loadInputValue(KernargPtrReg, B, | ||||
5409 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
5410 | return false; | ||||
5411 | |||||
5412 | // TODO: can we be smarter about machine pointer info? | ||||
5413 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
5414 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
5415 | PtrInfo, | ||||
5416 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
5417 | MachineMemOperand::MOInvariant, | ||||
5418 | LLT::scalar(64), commonAlignment(Align(64), Offset)); | ||||
5419 | |||||
5420 | // Pointer address | ||||
5421 | Register LoadAddr = MRI.createGenericVirtualRegister( | ||||
5422 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
5423 | B.buildPtrAdd(LoadAddr, KernargPtrReg, | ||||
5424 | B.buildConstant(LLT::scalar(64), Offset).getReg(0)); | ||||
5425 | // Load address | ||||
5426 | Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); | ||||
5427 | B.buildCopy(SGPR01, Temp); | ||||
5428 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5429 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||
5430 | .addReg(SGPR01, RegState::Implicit); | ||||
5431 | MI.eraseFromParent(); | ||||
5432 | return true; | ||||
5433 | } | ||||
5434 | |||||
5435 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||
5436 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||
5437 | Register LiveIn = | ||||
5438 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
5439 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||
5440 | return false; | ||||
5441 | |||||
5442 | B.buildCopy(SGPR01, LiveIn); | ||||
5443 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5444 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||
5445 | .addReg(SGPR01, RegState::Implicit); | ||||
5446 | |||||
5447 | MI.eraseFromParent(); | ||||
5448 | return true; | ||||
5449 | } | ||||
5450 | |||||
5451 | bool AMDGPULegalizerInfo::legalizeTrapHsa( | ||||
5452 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5453 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5454 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); | ||||
5455 | MI.eraseFromParent(); | ||||
5456 | return true; | ||||
5457 | } | ||||
5458 | |||||
5459 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||
5460 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5461 | // Is non-HSA path or trap-handler disabled? Then, report a warning | ||||
5462 | // accordingly | ||||
5463 | if (!ST.isTrapHandlerEnabled() || | ||||
5464 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | ||||
5465 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||
5466 | "debugtrap handler not supported", | ||||
5467 | MI.getDebugLoc(), DS_Warning); | ||||
5468 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||
5469 | Ctx.diagnose(NoTrap); | ||||
5470 | } else { | ||||
5471 | // Insert debug-trap instruction | ||||
5472 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5473 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | ||||
5474 | } | ||||
5475 | |||||
5476 | MI.eraseFromParent(); | ||||
5477 | return true; | ||||
5478 | } | ||||
5479 | |||||
// Lower amdgcn.image.bvh.intersect.ray to G_AMDGPU_INTRIN_BVH_INTERSECT_RAY,
// selecting the concrete MIMG opcode and packing the ray operands into the
// register layout the chosen encoding expects.
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  // Operand layout of the intrinsic call (operand 1 is the intrinsic ID).
  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  // The BVH intersect instructions require the GFX10 "A" encoding.
  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  // Pick the opcode from: node-pointer width (32/64-bit), address precision
  // (a16 = 16-bit dir/inv_dir lanes), and NSA vs. contiguous-VGPR encoding.
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  // GFX11+ NSA packs lanes into vector registers, so fewer VADDR operands.
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
  // Indexed as BaseOpcodes[Is64][IsA16].
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(
        BaseOpcodes[Is64][IsA16],
        IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    // GFX11+ NSA form: each xyz triple becomes a single v3s32 operand.
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      // a16: interleave dir/inv_dir pairwise into three 32-bit words, with
      // inv_dir in the low half and dir in the high half of each word.
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    // Pre-GFX11 (or non-NSA) form: one 32-bit register per address dword.
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      // a16: pack the six 16-bit dir/inv_dir lanes back-to-back into three
      // 32-bit registers.
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  // Trailing operands: texture descriptor, a16 flag, and the original memory
  // operands (for alias analysis / scheduling).
  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
5626 | |||||
5627 | bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, | ||||
5628 | MachineIRBuilder &B) const { | ||||
5629 | unsigned Opc; | ||||
5630 | int RoundMode = MI.getOperand(2).getImm(); | ||||
5631 | |||||
5632 | if (RoundMode == (int)RoundingMode::TowardPositive) | ||||
5633 | Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; | ||||
5634 | else if (RoundMode == (int)RoundingMode::TowardNegative) | ||||
5635 | Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; | ||||
5636 | else | ||||
5637 | return false; | ||||
5638 | |||||
5639 | B.buildInstr(Opc) | ||||
5640 | .addDef(MI.getOperand(0).getReg()) | ||||
5641 | .addUse(MI.getOperand(1).getReg()); | ||||
5642 | |||||
5643 | MI.eraseFromParent(); | ||||
5644 | |||||
5645 | return true; | ||||
5646 | } | ||||
5647 | |||||
// Central intrinsic-legalization dispatch: control-flow intrinsics get
// rewritten into exec-mask pseudos, ABI-argument intrinsics are loaded from
// their preloaded registers or kernarg memory, and memory intrinsics are
// forwarded to their dedicated legalizers.
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    // verifyCFIntrinsic locates the G_BRCOND consuming this intrinsic's
    // result (and any trailing G_BR); returns null if the pattern is invalid.
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      // A negated condition guards the opposite edge; swap the targets.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // The exec-mask values live in the wave-mask register class (SGPR
      // pair / single SGPR depending on wave size).
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  // Workitem/workgroup IDs and the various ABI pointers are read from the
  // registers the kernel ABI preloads them into.
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  // The legacy r600 queries read fixed offsets from the kernarg segment.
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
    // TODO: Could insert G_ASSERT_ZEXT from s16
  case Intrinsic::r600_read_local_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it directly.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  // Buffer stores/loads: the two bool arguments select (typed, format).
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // All buffer atomics share one legalizer, keyed on the intrinsic ID.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    // Image intrinsics are table-driven; anything else is already legal.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
1 | //===-- llvm/ADT/bit.h - C++20 <bit> ----------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file implements the C++20 <bit> header. |
11 | /// |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_ADT_BIT_H |
15 | #define LLVM_ADT_BIT_H |
16 | |
17 | #include "llvm/Support/Compiler.h" |
18 | #include <cstdint> |
19 | #include <limits> |
20 | #include <type_traits> |
21 | |
22 | #if !__has_builtin(__builtin_bit_cast)1 |
23 | #include <cstring> |
24 | #endif |
25 | |
26 | #if defined(_MSC_VER) && !defined(_DEBUG1) |
27 | #include <cstdlib> // for _byteswap_{ushort,ulong,uint64} |
28 | #endif |
29 | |
30 | #ifdef _MSC_VER |
31 | // Declare these intrinsics manually rather including intrin.h. It's very |
32 | // expensive, and bit.h is popular via MathExtras.h. |
33 | // #include <intrin.h> |
34 | extern "C" { |
35 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); |
36 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); |
37 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); |
38 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); |
39 | } |
40 | #endif |
41 | |
42 | namespace llvm { |
43 | |
// This implementation of bit_cast is different from the C++20 one in two ways:
//  - It isn't constexpr because that requires compiler support.
//  - It requires trivially-constructible To, to avoid UB in the implementation.

/// Reinterprets the object representation of \p from as a value of type \p To.
/// The two types must have the same size and be trivially copyable; \p To must
/// additionally be trivially constructible so the memcpy fallback can
/// default-construct it without UB.
template <
    typename To, typename From,
    typename = std::enable_if_t<sizeof(To) == sizeof(From)>,
    typename = std::enable_if_t<std::is_trivially_constructible<To>::value>,
    typename = std::enable_if_t<std::is_trivially_copyable<To>::value>,
    typename = std::enable_if_t<std::is_trivially_copyable<From>::value>>
[[nodiscard]] inline To bit_cast(const From &from) noexcept {
#if __has_builtin(__builtin_bit_cast)
  return __builtin_bit_cast(To, from);
#else
  // memcpy is the only standards-blessed way to type-pun without the builtin.
  To to;
  std::memcpy(&to, &from, sizeof(To));
  return to;
#endif
}
62 | |
/// Reverses the bytes in the given integer value V.
/// Dispatches on the width of T: 1-byte values are returned unchanged,
/// wider values use compiler/runtime byte-swap primitives where available
/// and portable shift/mask fallbacks otherwise.
template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
[[nodiscard]] constexpr T byteswap(T V) noexcept {
  if constexpr (sizeof(T) == 1) {
    // A single byte has nothing to reorder.
    return V;
  } else if constexpr (sizeof(T) == 2) {
    uint16_t UV = V;
#if defined(_MSC_VER) && !defined(_DEBUG)
    // The DLL version of the runtime lacks these functions (bug!?), but in a
    // release build they're replaced with BSWAP instructions anyway.
    return _byteswap_ushort(UV);
#else
    // Exchange the two bytes by shifting in both directions.
    return static_cast<uint16_t>((UV << 8) | (UV >> 8));
#endif
  } else if constexpr (sizeof(T) == 4) {
    uint32_t UV = V;
#if __has_builtin(__builtin_bswap32)
    return __builtin_bswap32(UV);
#elif defined(_MSC_VER) && !defined(_DEBUG)
    return _byteswap_ulong(UV);
#else
    // Portable fallback: move each byte to its mirrored position.
    return ((UV & 0x000000FF) << 24) | ((UV & 0x0000FF00) << 8) |
           ((UV & 0x00FF0000) >> 8) | ((UV & 0xFF000000) >> 24);
#endif
  } else if constexpr (sizeof(T) == 8) {
    uint64_t UV = V;
#if __has_builtin(__builtin_bswap64)
    return __builtin_bswap64(UV);
#elif defined(_MSC_VER) && !defined(_DEBUG)
    return _byteswap_uint64(UV);
#else
    // Swap each 32-bit half, then exchange the halves.
    uint64_t Hi = llvm::byteswap<uint32_t>(UV);
    uint32_t Lo = llvm::byteswap<uint32_t>(UV >> 32);
    return (Hi << 32) | Lo;
#endif
  } else {
    static_assert(!sizeof(T *), "Don't know how to handle the given type.");
    return 0;
  }
}
108 | |
/// Returns true iff exactly one bit of Value is set, i.e. Value is a nonzero
/// power of two.
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
[[nodiscard]] constexpr inline bool has_single_bit(T Value) noexcept {
  if (Value == 0)
    return false;
  // Clearing the lowest set bit yields zero exactly for powers of two.
  return (Value & (Value - 1)) == 0;
}
113 | |
namespace detail {
/// Generic fallback for counting trailing zero bits; specialized below for
/// 4- and 8-byte types when compiler intrinsics are available.
template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
  static unsigned count(T Val) {
    if (!Val)
      return std::numeric_limits<T>::digits;
    if (Val & 0x1)
      return 0;

    // Bisection method: at each step, test whether the low half of the
    // remaining bits is entirely zero. If so, those bits are trailing zeros:
    // shift them away and record their count. Shift halves each round, so
    // the loop runs log2(digits) times.
    unsigned ZeroBits = 0;
    T Shift = std::numeric_limits<T>::digits >> 1;
    T Mask = std::numeric_limits<T>::max() >> Shift;
    while (Shift) {
      if ((Val & Mask) == 0) {
        Val >>= Shift;
        ZeroBits |= Shift;
      }
      Shift >>= 1;
      Mask >>= Shift;
    }
    return ZeroBits;
  }
};

#if defined(__GNUC__) || defined(_MSC_VER)
/// 32-bit specialization using __builtin_ctz / _BitScanForward.
template <typename T> struct TrailingZerosCounter<T, 4> {
  static unsigned count(T Val) {
    // Guard the zero case: the intrinsics are undefined for 0.
    if (Val == 0)
      return 32;

#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
    return __builtin_ctz(Val);
#elif defined(_MSC_VER)
    unsigned long Index;
    _BitScanForward(&Index, Val);
    return Index;
#endif
  }
};

#if !defined(_MSC_VER) || defined(_M_X64)
/// 64-bit specialization; on MSVC only x64 provides _BitScanForward64.
template <typename T> struct TrailingZerosCounter<T, 8> {
  static unsigned count(T Val) {
    if (Val == 0)
      return 64;

#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
    return __builtin_ctzll(Val);
#elif defined(_MSC_VER)
    unsigned long Index;
    _BitScanForward64(&Index, Val);
    return Index;
#endif
  }
};
#endif
#endif
} // namespace detail
172 | |
/// Count number of 0's from the least significant bit to the most
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
///
/// Returns std::numeric_limits<T>::digits on an input of 0.
template <typename T> [[nodiscard]] int countr_zero(T Val) {
  static_assert(std::is_unsigned_v<T>,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T) so 4- and 8-byte types can use compiler intrinsics.
  return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val);
}
184 | |
namespace detail {
/// Generic fallback for counting leading zero bits; specialized below for
/// 4- and 8-byte types when compiler intrinsics are available.
template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
  static unsigned count(T Val) {
    if (!Val)
      return std::numeric_limits<T>::digits;

    // Bisection method: probe the upper half of the remaining bits. If it is
    // nonempty, keep it (discarding the low half); otherwise those positions
    // are all leading zeros and are added to the count.
    unsigned ZeroBits = 0;
    for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
      T Tmp = Val >> Shift;
      if (Tmp)
        Val = Tmp;
      else
        ZeroBits |= Shift;
    }
    return ZeroBits;
  }
};

#if defined(__GNUC__) || defined(_MSC_VER)
/// 32-bit specialization using __builtin_clz / _BitScanReverse.
template <typename T> struct LeadingZerosCounter<T, 4> {
  static unsigned count(T Val) {
    // Guard the zero case: the intrinsics are undefined for 0.
    if (Val == 0)
      return 32;

#if __has_builtin(__builtin_clz) || defined(__GNUC__)
    return __builtin_clz(Val);
#elif defined(_MSC_VER)
    unsigned long Index;
    _BitScanReverse(&Index, Val);
    // _BitScanReverse yields the index of the highest set bit; XOR with 31
    // converts that index into a leading-zero count.
    return Index ^ 31;
#endif
  }
};

#if !defined(_MSC_VER) || defined(_M_X64)
/// 64-bit specialization; on MSVC only x64 provides _BitScanReverse64.
template <typename T> struct LeadingZerosCounter<T, 8> {
  static unsigned count(T Val) {
    if (Val == 0)
      return 64;

#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
    return __builtin_clzll(Val);
#elif defined(_MSC_VER)
    unsigned long Index;
    _BitScanReverse64(&Index, Val);
    return Index ^ 63;
#endif
  }
};
#endif
#endif
} // namespace detail
238 | |
/// Count number of 0's from the most significant bit to the least
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
///
/// Returns std::numeric_limits<T>::digits on an input of 0.
template <typename T> [[nodiscard]] int countl_zero(T Val) {
  static_assert(std::is_unsigned_v<T>,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T) so 4- and 8-byte types can use compiler intrinsics.
  return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val);
}
250 | |
251 | /// Count the number of ones from the most significant bit to the first |
252 | /// zero bit. |
253 | /// |
254 | /// Ex. countl_one(0xFF0FFF00) == 8. |
255 | /// Only unsigned integral types are allowed. |
256 | /// |
257 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
258 | template <typename T> [[nodiscard]] int countl_one(T Value) { |
259 | static_assert(std::is_unsigned_v<T>, |
260 | "Only unsigned integral types are allowed."); |
261 | return llvm::countl_zero<T>(~Value); |
262 | } |
263 | |
264 | /// Count the number of ones from the least significant bit to the first |
265 | /// zero bit. |
266 | /// |
267 | /// Ex. countr_one(0x00FF00FF) == 8. |
268 | /// Only unsigned integral types are allowed. |
269 | /// |
270 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
271 | template <typename T> [[nodiscard]] int countr_one(T Value) { |
272 | static_assert(std::is_unsigned_v<T>, |
273 | "Only unsigned integral types are allowed."); |
274 | return llvm::countr_zero<T>(~Value); |
275 | } |
276 | |
277 | /// Returns the number of bits needed to represent Value if Value is nonzero. |
278 | /// Returns 0 otherwise. |
279 | /// |
280 | /// Ex. bit_width(5) == 3. |
281 | template <typename T> [[nodiscard]] int bit_width(T Value) { |
282 | static_assert(std::is_unsigned_v<T>, |
283 | "Only unsigned integral types are allowed."); |
284 | return std::numeric_limits<T>::digits - llvm::countl_zero(Value); |
285 | } |
286 | |
287 | /// Returns the largest integral power of two no greater than Value if Value is |
288 | /// nonzero. Returns 0 otherwise. |
289 | /// |
290 | /// Ex. bit_floor(5) == 4. |
291 | template <typename T> [[nodiscard]] T bit_floor(T Value) { |
292 | static_assert(std::is_unsigned_v<T>, |
293 | "Only unsigned integral types are allowed."); |
294 | if (!Value) |
295 | return 0; |
296 | return T(1) << (llvm::bit_width(Value) - 1); |
297 | } |
298 | |
299 | /// Returns the smallest integral power of two no smaller than Value if Value is |
300 | /// nonzero. Returns 1 otherwise. |
301 | /// |
302 | /// Ex. bit_ceil(5) == 8. |
303 | /// |
304 | /// The return value is undefined if the input is larger than the largest power |
305 | /// of two representable in T. |
306 | template <typename T> [[nodiscard]] T bit_ceil(T Value) { |
307 | static_assert(std::is_unsigned_v<T>, |
308 | "Only unsigned integral types are allowed."); |
309 | if (Value < 2) |
310 | return 1; |
311 | return T(1) << llvm::bit_width<T>(Value - 1u); |
312 | } |
313 | |
namespace detail {
/// Generic popcount for types of at most 4 bytes; an 8-byte specialization
/// follows below.
template <typename T, std::size_t SizeOfT> struct PopulationCounter {
  static int count(T Value) {
    // Generic version, forward to 32 bits.
    static_assert(SizeOfT <= 4, "Not implemented!");
#if defined(__GNUC__)
    return (int)__builtin_popcount(Value);
#else
    // SWAR bit count: sum adjacent bits into 2-bit fields, then 4-bit
    // fields, then accumulate the per-byte counts into the top byte via the
    // 0x1010101 multiply and extract it with the final shift.
    uint32_t v = Value;
    v = v - ((v >> 1) & 0x55555555);
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    return int(((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24);
#endif
  }
};

/// 8-byte specialization: 64-bit SWAR popcount (or the 64-bit builtin).
template <typename T> struct PopulationCounter<T, 8> {
  static int count(T Value) {
#if defined(__GNUC__)
    return (int)__builtin_popcountll(Value);
#else
    // Same SWAR scheme as above, widened to 64 bits.
    uint64_t v = Value;
    v = v - ((v >> 1) & 0x5555555555555555ULL);
    v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    return int((uint64_t)(v * 0x0101010101010101ULL) >> 56);
#endif
  }
};
} // namespace detail
344 | |
/// Count the number of set bits in a value.
/// Ex. popcount(0xF000F000) = 8
/// Returns 0 if the word is zero.
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
[[nodiscard]] inline int popcount(T Value) noexcept {
  // Dispatch on sizeof(T); 8-byte types get the 64-bit specialization.
  return detail::PopulationCounter<T, sizeof(T)>::count(Value);
}
352 | |
// Forward-declare rotr so that rotl can use it.
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
[[nodiscard]] constexpr T rotr(T V, int R);

/// Rotate V left by R bit positions; a negative R rotates right instead.
/// The rotation amount is taken modulo the bit width of T.
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
[[nodiscard]] constexpr T rotl(T V, int R) {
  unsigned N = std::numeric_limits<T>::digits;

  // NOTE(review): R is converted to unsigned for this %, so a negative R
  // already becomes its non-negative residue mod N (N is a power of two for
  // standard unsigned types), which makes the R < 0 branch below appear
  // unreachable in practice; kept as a defensive fallback — confirm before
  // relying on it.
  R = R % N;
  if (!R)
    return V;

  if (R < 0)
    return llvm::rotr(V, -R);

  // Here 0 < R < N, so both shift amounts are in range (no UB).
  return (V << R) | (V >> (N - R));
}
370 | |
/// Rotate V right by R bit positions; a negative R rotates left instead.
/// The rotation amount is taken modulo the bit width of T.
template <typename T, typename> [[nodiscard]] constexpr T rotr(T V, int R) {
  unsigned N = std::numeric_limits<T>::digits;

  // NOTE(review): as in rotl, R is converted to unsigned by this %, so a
  // negative R already becomes its residue mod N and the R < 0 branch below
  // is effectively dead; kept as a defensive fallback — confirm before
  // relying on it.
  R = R % N;
  if (!R)
    return V;

  if (R < 0)
    return llvm::rotl(V, -R);

  // Here 0 < R < N, so both shift amounts are in range (no UB).
  return (V >> R) | (V << (N - R));
}
383 | |
384 | } // namespace llvm |
385 | |
386 | #endif |