File: build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 3299, column 62: The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | ||||
2 | // | ||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
6 | // | ||||
7 | //===----------------------------------------------------------------------===// | ||||
8 | /// \file | ||||
9 | /// This file implements the targeting of the MachineLegalizer class for | ||||
10 | /// AMDGPU. | ||||
11 | /// \todo This should be generated by TableGen. | ||||
12 | //===----------------------------------------------------------------------===// | ||||
13 | |||||
14 | #include "AMDGPULegalizerInfo.h" | ||||
15 | |||||
16 | #include "AMDGPU.h" | ||||
17 | #include "AMDGPUGlobalISelUtils.h" | ||||
18 | #include "AMDGPUInstrInfo.h" | ||||
19 | #include "AMDGPUTargetMachine.h" | ||||
20 | #include "SIMachineFunctionInfo.h" | ||||
21 | #include "Utils/AMDGPUBaseInfo.h" | ||||
22 | #include "llvm/ADT/ScopeExit.h" | ||||
23 | #include "llvm/BinaryFormat/ELF.h" | ||||
24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | ||||
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||||
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||||
27 | #include "llvm/IR/DiagnosticInfo.h" | ||||
28 | #include "llvm/IR/IntrinsicsAMDGPU.h" | ||||
29 | #include "llvm/IR/IntrinsicsR600.h" | ||||
30 | |||||
31 | #define DEBUG_TYPE "amdgpu-legalinfo" | ||||
32 | |||||
33 | using namespace llvm; | ||||
34 | using namespace LegalizeActions; | ||||
35 | using namespace LegalizeMutations; | ||||
36 | using namespace LegalityPredicates; | ||||
37 | using namespace MIPatternMatch; | ||||
38 | |||||
39 | // Hack until load/store selection patterns support any tuple of legal types. | ||||
40 | static cl::opt<bool> EnableNewLegality( | ||||
41 | "amdgpu-global-isel-new-legality", | ||||
42 | cl::desc("Use GlobalISel desired legality, rather than try to use " | ||||
43 | "rules compatible with selection patterns"), | ||||
44 | cl::init(false), | ||||
45 | cl::ReallyHidden); | ||||
46 | |||||
47 | static constexpr unsigned MaxRegisterSize = 1024; | ||||
48 | |||||
49 | // Round the number of elements to the next power of two elements | ||||
50 | static LLT getPow2VectorType(LLT Ty) { | ||||
51 | unsigned NElts = Ty.getNumElements(); | ||||
52 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); | ||||
53 | return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); | ||||
54 | } | ||||
55 | |||||
56 | // Round the number of bits to the next power of two bits | ||||
57 | static LLT getPow2ScalarType(LLT Ty) { | ||||
58 | unsigned Bits = Ty.getSizeInBits(); | ||||
59 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); | ||||
60 | return LLT::scalar(Pow2Bits); | ||||
61 | } | ||||
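// Annotation (not part of the original source), a worked example of the two
// rounding helpers above: getPow2VectorType(<3 x s16>) yields <4 x s16>
// (Log2_32_Ceil(3) == 2, 1 << 2 == 4), and getPow2ScalarType(s48) yields s64
// (Log2_32_Ceil(48) == 6, 1 << 6 == 64).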
62 | |||||
63 | /// \returns true if this is an odd-sized vector which should be widened by adding an | ||||
64 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This | ||||
65 | /// excludes s1 vectors, which should always be scalarized. | ||||
66 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | ||||
67 | return [=](const LegalityQuery &Query) { | ||||
68 | const LLT Ty = Query.Types[TypeIdx]; | ||||
69 | if (!Ty.isVector()) | ||||
70 | return false; | ||||
71 | |||||
72 | const LLT EltTy = Ty.getElementType(); | ||||
73 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||
74 | return Ty.getNumElements() % 2 != 0 && | ||||
75 | EltSize > 1 && EltSize < 32 && | ||||
76 | Ty.getSizeInBits() % 32 != 0; | ||||
77 | }; | ||||
78 | } | ||||
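// Annotation (not part of the original source): isSmallOddVector matches types
// such as <3 x s16> (odd element count, 16-bit elements, 48 bits total), but
// not <3 x s32> (its total size is a multiple of 32) and not <3 x s1> (s1
// vectors are scalarized instead).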
79 | |||||
80 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { | ||||
81 | return [=](const LegalityQuery &Query) { | ||||
82 | const LLT Ty = Query.Types[TypeIdx]; | ||||
83 | return Ty.getSizeInBits() % 32 == 0; | ||||
84 | }; | ||||
85 | } | ||||
86 | |||||
87 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | ||||
88 | return [=](const LegalityQuery &Query) { | ||||
89 | const LLT Ty = Query.Types[TypeIdx]; | ||||
90 | const LLT EltTy = Ty.getScalarType(); | ||||
91 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | ||||
92 | }; | ||||
93 | } | ||||
94 | |||||
95 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | ||||
96 | return [=](const LegalityQuery &Query) { | ||||
97 | const LLT Ty = Query.Types[TypeIdx]; | ||||
98 | const LLT EltTy = Ty.getElementType(); | ||||
99 | return std::pair(TypeIdx, | ||||
100 | LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); | ||||
101 | }; | ||||
102 | } | ||||
103 | |||||
104 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { | ||||
105 | return [=](const LegalityQuery &Query) { | ||||
106 | const LLT Ty = Query.Types[TypeIdx]; | ||||
107 | const LLT EltTy = Ty.getElementType(); | ||||
108 | unsigned Size = Ty.getSizeInBits(); | ||||
109 | unsigned Pieces = (Size + 63) / 64; | ||||
110 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; | ||||
111 | return std::pair(TypeIdx, LLT::scalarOrVector( | ||||
112 | ElementCount::getFixed(NewNumElts), EltTy)); | ||||
113 | }; | ||||
114 | } | ||||
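// Annotation (not part of the original source): for <4 x s32> (128 bits) the
// mutation above computes Pieces = 2 and NewNumElts = (4 + 1) / 2 = 2, so the
// type is reduced to <2 x s32>, i.e. 64 bits per piece.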
115 | |||||
116 | // Increase the number of vector elements so the total size reaches the next | ||||
117 | // multiple of 32 bits. | ||||
118 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | ||||
119 | return [=](const LegalityQuery &Query) { | ||||
120 | const LLT Ty = Query.Types[TypeIdx]; | ||||
121 | |||||
122 | const LLT EltTy = Ty.getElementType(); | ||||
123 | const int Size = Ty.getSizeInBits(); | ||||
124 | const int EltSize = EltTy.getSizeInBits(); | ||||
125 | const int NextMul32 = (Size + 31) / 32; | ||||
126 | |||||
127 | assert(EltSize < 32); | ||||
128 | |||||
129 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | ||||
130 | return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); | ||||
131 | }; | ||||
132 | } | ||||
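// Annotation (not part of the original source): NextMul32 holds the number of
// 32-bit words needed for the type. For <3 x s16> (48 bits) that is 2 words,
// so NewNumElts = (64 + 15) / 16 = 4 and the result is <4 x s16>.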
133 | |||||
134 | static LLT getBitcastRegisterType(const LLT Ty) { | ||||
135 | const unsigned Size = Ty.getSizeInBits(); | ||||
136 | |||||
137 | if (Size <= 32) { | ||||
138 | // <2 x s8> -> s16 | ||||
139 | // <4 x s8> -> s32 | ||||
140 | return LLT::scalar(Size); | ||||
141 | } | ||||
142 | |||||
143 | return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); | ||||
144 | } | ||||
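// Annotation (not part of the original source): for example <2 x s8> -> s16,
// <4 x s8> -> s32, and <8 x s8> (64 bits) -> <2 x s32>; anything wider than
// 32 bits becomes one 32-bit element per dword.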
145 | |||||
146 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { | ||||
147 | return [=](const LegalityQuery &Query) { | ||||
148 | const LLT Ty = Query.Types[TypeIdx]; | ||||
149 | return std::pair(TypeIdx, getBitcastRegisterType(Ty)); | ||||
150 | }; | ||||
151 | } | ||||
152 | |||||
153 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { | ||||
154 | return [=](const LegalityQuery &Query) { | ||||
155 | const LLT Ty = Query.Types[TypeIdx]; | ||||
156 | unsigned Size = Ty.getSizeInBits(); | ||||
157 | assert(Size % 32 == 0); | ||||
158 | return std::pair( | ||||
159 | TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); | ||||
160 | }; | ||||
161 | } | ||||
162 | |||||
163 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | ||||
164 | return [=](const LegalityQuery &Query) { | ||||
165 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
166 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | ||||
167 | }; | ||||
168 | } | ||||
169 | |||||
170 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | ||||
171 | return [=](const LegalityQuery &Query) { | ||||
172 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
173 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | ||||
174 | }; | ||||
175 | } | ||||
176 | |||||
177 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | ||||
178 | return [=](const LegalityQuery &Query) { | ||||
179 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
180 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | ||||
181 | }; | ||||
182 | } | ||||
183 | |||||
184 | static bool isRegisterSize(unsigned Size) { | ||||
185 | return Size % 32 == 0 && Size <= MaxRegisterSize; | ||||
186 | } | ||||
187 | |||||
188 | static bool isRegisterVectorElementType(LLT EltTy) { | ||||
189 | const int EltSize = EltTy.getSizeInBits(); | ||||
190 | return EltSize == 16 || EltSize % 32 == 0; | ||||
191 | } | ||||
192 | |||||
193 | static bool isRegisterVectorType(LLT Ty) { | ||||
194 | const int EltSize = Ty.getElementType().getSizeInBits(); | ||||
195 | return EltSize == 32 || EltSize == 64 || | ||||
196 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | ||||
197 | EltSize == 128 || EltSize == 256; | ||||
198 | } | ||||
199 | |||||
200 | static bool isRegisterType(LLT Ty) { | ||||
201 | if (!isRegisterSize(Ty.getSizeInBits())) | ||||
202 | return false; | ||||
203 | |||||
204 | if (Ty.isVector()) | ||||
205 | return isRegisterVectorType(Ty); | ||||
206 | |||||
207 | return true; | ||||
208 | } | ||||
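// Annotation (not part of the original source): isRegisterType accepts e.g.
// s96, <2 x s16> and <5 x s32> (all multiples of 32 bits up to 1024), but
// rejects <3 x s16> (48 bits) and any scalar that is not a multiple of 32
// bits.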
209 | |||||
210 | // Any combination of 32 or 64-bit elements up to the maximum register size, and | ||||
211 | // multiples of v2s16. | ||||
212 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { | ||||
213 | return [=](const LegalityQuery &Query) { | ||||
214 | return isRegisterType(Query.Types[TypeIdx]); | ||||
215 | }; | ||||
216 | } | ||||
217 | |||||
218 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { | ||||
219 | return [=](const LegalityQuery &Query) { | ||||
220 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
221 | if (!QueryTy.isVector()) | ||||
222 | return false; | ||||
223 | const LLT EltTy = QueryTy.getElementType(); | ||||
224 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; | ||||
225 | }; | ||||
226 | } | ||||
227 | |||||
228 | // If we have a truncating store or an extending load with a data size larger | ||||
229 | // than 32-bits, we need to reduce to a 32-bit type. | ||||
230 | static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { | ||||
231 | return [=](const LegalityQuery &Query) { | ||||
232 | const LLT Ty = Query.Types[TypeIdx]; | ||||
233 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | ||||
234 | Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); | ||||
235 | }; | ||||
236 | } | ||||
237 | |||||
238 | // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we | ||||
239 | // handle some operations by just promoting the register during | ||||
240 | // selection. There are also d16 loads on GFX9+ which preserve the high bits. | ||||
241 | static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, | ||||
242 | bool IsLoad, bool IsAtomic) { | ||||
243 | switch (AS) { | ||||
244 | case AMDGPUAS::PRIVATE_ADDRESS: | ||||
245 | // FIXME: Private element size. | ||||
246 | return ST.enableFlatScratch() ? 128 : 32; | ||||
247 | case AMDGPUAS::LOCAL_ADDRESS: | ||||
248 | return ST.useDS128() ? 128 : 64; | ||||
249 | case AMDGPUAS::GLOBAL_ADDRESS: | ||||
250 | case AMDGPUAS::CONSTANT_ADDRESS: | ||||
251 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: | ||||
252 | // Treat constant and global as identical. SMRD loads are sometimes usable for | ||||
253 | // global loads (ideally constant address space should be eliminated) | ||||
254 | // depending on the context. Legality cannot be context dependent, but | ||||
255 | // RegBankSelect can split the load as necessary depending on the pointer | ||||
256 | // register bank/uniformity and if the memory is invariant or not written in a | ||||
257 | // kernel. | ||||
258 | return IsLoad ? 512 : 128; | ||||
259 | default: | ||||
260 | // FIXME: Flat addresses may contextually need to be split to 32-bit parts | ||||
261 | // if they may alias scratch depending on the subtarget. This needs to be | ||||
262 | // moved to custom handling to use addressMayBeAccessedAsPrivate | ||||
263 | return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; | ||||
264 | } | ||||
265 | } | ||||
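// Annotation (not part of the original source): under these limits a 256-bit
// global load fits the 512-bit load cap, while a 256-bit global store exceeds
// the 128-bit store cap and is split by the rules below; LDS accesses are
// capped at 64 bits unless useDS128() (128-bit DS operations) is available.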
266 | |||||
267 | static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, | ||||
268 | const LegalityQuery &Query) { | ||||
269 | const LLT Ty = Query.Types[0]; | ||||
270 | |||||
271 | // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD | ||||
272 | const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; | ||||
273 | |||||
274 | unsigned RegSize = Ty.getSizeInBits(); | ||||
275 | uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
276 | uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; | ||||
277 | unsigned AS = Query.Types[1].getAddressSpace(); | ||||
278 | |||||
279 | // All of these need to be custom lowered to cast the pointer operand. | ||||
280 | if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) | ||||
281 | return false; | ||||
282 | |||||
283 | // Do not handle extending vector loads. | ||||
284 | if (Ty.isVector() && MemSize != RegSize) | ||||
285 | return false; | ||||
286 | |||||
287 | // TODO: We should be able to widen loads if the alignment is high enough, but | ||||
288 | // we also need to modify the memory access size. | ||||
289 | #if 0 | ||||
290 | // Accept widening loads based on alignment. | ||||
291 | if (IsLoad && MemSize < Size) | ||||
292 | MemSize = std::max(MemSize, Align); | ||||
293 | #endif | ||||
294 | |||||
295 | // Only 1-byte and 2-byte to 32-bit extloads are valid. | ||||
296 | if (MemSize != RegSize && RegSize != 32) | ||||
297 | return false; | ||||
298 | |||||
299 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, | ||||
300 | Query.MMODescrs[0].Ordering != | ||||
301 | AtomicOrdering::NotAtomic)) | ||||
302 | return false; | ||||
303 | |||||
304 | switch (MemSize) { | ||||
305 | case 8: | ||||
306 | case 16: | ||||
307 | case 32: | ||||
308 | case 64: | ||||
309 | case 128: | ||||
310 | break; | ||||
311 | case 96: | ||||
312 | if (!ST.hasDwordx3LoadStores()) | ||||
313 | return false; | ||||
314 | break; | ||||
315 | case 256: | ||||
316 | case 512: | ||||
317 | // These may contextually need to be broken down. | ||||
318 | break; | ||||
319 | default: | ||||
320 | return false; | ||||
321 | } | ||||
322 | |||||
323 | assert(RegSize >= MemSize); | ||||
324 | |||||
325 | if (AlignBits < MemSize) { | ||||
326 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
327 | if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | ||||
328 | Align(AlignBits / 8))) | ||||
329 | return false; | ||||
330 | } | ||||
331 | |||||
332 | return true; | ||||
333 | } | ||||
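// Annotation (not part of the original source): for example, an s32 zextload
// of an s8 from global memory is accepted (8-bit memory, 32-bit register,
// given adequate alignment), an extending load from s16 into s64 is rejected
// (extloads must produce 32-bit registers), and a 96-bit access is only
// accepted when the subtarget has dwordx3 load/stores.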
334 | |||||
335 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so | ||||
336 | // work around this. Eventually it should ignore the type for loads and only care | ||||
337 | // about the size. Return true in cases where we will workaround this for now by | ||||
338 | // bitcasting. | ||||
339 | static bool loadStoreBitcastWorkaround(const LLT Ty) { | ||||
340 | if (EnableNewLegality) | ||||
341 | return false; | ||||
342 | |||||
343 | const unsigned Size = Ty.getSizeInBits(); | ||||
344 | if (Size <= 64) | ||||
345 | return false; | ||||
346 | if (!Ty.isVector()) | ||||
347 | return true; | ||||
348 | |||||
349 | LLT EltTy = Ty.getElementType(); | ||||
350 | if (EltTy.isPointer()) | ||||
351 | return true; | ||||
352 | |||||
353 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
354 | return EltSize != 32 && EltSize != 64; | ||||
355 | } | ||||
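// Annotation (not part of the original source): with the workaround enabled
// (i.e. -amdgpu-global-isel-new-legality left at its default), s96 and
// <8 x s16> both return true here and are later bitcast to 32-bit element
// types (<3 x s32>, <4 x s32>), while <2 x s64> and <4 x s32> return false
// and are handled directly.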
356 | |||||
357 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { | ||||
358 | const LLT Ty = Query.Types[0]; | ||||
359 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && | ||||
360 | !loadStoreBitcastWorkaround(Ty); | ||||
361 | } | ||||
362 | |||||
363 | /// Return true if a load or store of the type should be lowered with a bitcast | ||||
364 | /// to a different type. | ||||
365 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, | ||||
366 | const LLT MemTy) { | ||||
367 | const unsigned MemSizeInBits = MemTy.getSizeInBits(); | ||||
368 | const unsigned Size = Ty.getSizeInBits(); | ||||
369 | if (Size != MemSizeInBits) | ||||
370 | return Size <= 32 && Ty.isVector(); | ||||
371 | |||||
372 | if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) | ||||
373 | return true; | ||||
374 | |||||
375 | // Don't try to handle bitcasting vector ext loads for now. | ||||
376 | return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && | ||||
377 | (Size <= 32 || isRegisterSize(Size)) && | ||||
378 | !isRegisterVectorElementType(Ty.getElementType()); | ||||
379 | } | ||||
380 | |||||
381 | /// Return true if we should legalize a load by widening an odd sized memory | ||||
382 | /// access up to the alignment. Note that in this case the memory access itself | ||||
383 | /// changes, not the size of the result register. | ||||
384 | static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, | ||||
385 | uint64_t AlignInBits, unsigned AddrSpace, | ||||
386 | unsigned Opcode) { | ||||
387 | unsigned SizeInBits = MemoryTy.getSizeInBits(); | ||||
388 | // We don't want to widen cases that are naturally legal. | ||||
389 | if (isPowerOf2_32(SizeInBits)) | ||||
390 | return false; | ||||
391 | |||||
392 | // If we have 96-bit memory operations, we shouldn't touch them. Note we may | ||||
393 | // end up widening these for a scalar load during RegBankSelect, since there | ||||
394 | // aren't 96-bit scalar loads. | ||||
395 | if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) | ||||
396 | return false; | ||||
397 | |||||
398 | if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false)) | ||||
399 | return false; | ||||
400 | |||||
401 | // A load is known dereferenceable up to the alignment, so it's legal to widen | ||||
402 | // to it. | ||||
403 | // | ||||
404 | // TODO: Could check dereferenceable for less aligned cases. | ||||
405 | unsigned RoundedSize = NextPowerOf2(SizeInBits); | ||||
406 | if (AlignInBits < RoundedSize) | ||||
407 | return false; | ||||
408 | |||||
409 | // Do not widen if it would introduce a slow unaligned load. | ||||
410 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
411 | unsigned Fast = 0; | ||||
412 | return TLI->allowsMisalignedMemoryAccessesImpl( | ||||
413 | RoundedSize, AddrSpace, Align(AlignInBits / 8), | ||||
414 | MachineMemOperand::MOLoad, &Fast) && | ||||
415 | Fast; | ||||
416 | } | ||||
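// Annotation (not part of the original source): e.g. a 96-bit load on a
// subtarget without dwordx3 load/stores, with at least 128-bit alignment, is
// widened to a 128-bit load (NextPowerOf2(96) == 128), assuming the target
// lowering reports that access as fast.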
417 | |||||
418 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, | ||||
419 | unsigned Opcode) { | ||||
420 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) | ||||
421 | return false; | ||||
422 | |||||
423 | return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, | ||||
424 | Query.MMODescrs[0].AlignInBits, | ||||
425 | Query.Types[1].getAddressSpace(), Opcode); | ||||
426 | } | ||||
427 | |||||
428 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | ||||
429 | const GCNTargetMachine &TM) | ||||
430 | : ST(ST_) { | ||||
431 | using namespace TargetOpcode; | ||||
432 | |||||
433 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | ||||
434 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | ||||
435 | }; | ||||
436 | |||||
437 | const LLT S1 = LLT::scalar(1); | ||||
438 | const LLT S8 = LLT::scalar(8); | ||||
439 | const LLT S16 = LLT::scalar(16); | ||||
440 | const LLT S32 = LLT::scalar(32); | ||||
441 | const LLT S64 = LLT::scalar(64); | ||||
442 | const LLT S128 = LLT::scalar(128); | ||||
443 | const LLT S256 = LLT::scalar(256); | ||||
444 | const LLT S512 = LLT::scalar(512); | ||||
445 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); | ||||
446 | |||||
447 | const LLT V2S8 = LLT::fixed_vector(2, 8); | ||||
448 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
449 | const LLT V4S16 = LLT::fixed_vector(4, 16); | ||||
450 | |||||
451 | const LLT V2S32 = LLT::fixed_vector(2, 32); | ||||
452 | const LLT V3S32 = LLT::fixed_vector(3, 32); | ||||
453 | const LLT V4S32 = LLT::fixed_vector(4, 32); | ||||
454 | const LLT V5S32 = LLT::fixed_vector(5, 32); | ||||
455 | const LLT V6S32 = LLT::fixed_vector(6, 32); | ||||
456 | const LLT V7S32 = LLT::fixed_vector(7, 32); | ||||
457 | const LLT V8S32 = LLT::fixed_vector(8, 32); | ||||
458 | const LLT V9S32 = LLT::fixed_vector(9, 32); | ||||
459 | const LLT V10S32 = LLT::fixed_vector(10, 32); | ||||
460 | const LLT V11S32 = LLT::fixed_vector(11, 32); | ||||
461 | const LLT V12S32 = LLT::fixed_vector(12, 32); | ||||
462 | const LLT V13S32 = LLT::fixed_vector(13, 32); | ||||
463 | const LLT V14S32 = LLT::fixed_vector(14, 32); | ||||
464 | const LLT V15S32 = LLT::fixed_vector(15, 32); | ||||
465 | const LLT V16S32 = LLT::fixed_vector(16, 32); | ||||
466 | const LLT V32S32 = LLT::fixed_vector(32, 32); | ||||
467 | |||||
468 | const LLT V2S64 = LLT::fixed_vector(2, 64); | ||||
469 | const LLT V3S64 = LLT::fixed_vector(3, 64); | ||||
470 | const LLT V4S64 = LLT::fixed_vector(4, 64); | ||||
471 | const LLT V5S64 = LLT::fixed_vector(5, 64); | ||||
472 | const LLT V6S64 = LLT::fixed_vector(6, 64); | ||||
473 | const LLT V7S64 = LLT::fixed_vector(7, 64); | ||||
474 | const LLT V8S64 = LLT::fixed_vector(8, 64); | ||||
475 | const LLT V16S64 = LLT::fixed_vector(16, 64); | ||||
476 | |||||
477 | std::initializer_list<LLT> AllS32Vectors = | ||||
478 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | ||||
479 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | ||||
480 | std::initializer_list<LLT> AllS64Vectors = | ||||
481 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | ||||
482 | |||||
483 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | ||||
484 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | ||||
485 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | ||||
486 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | ||||
487 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | ||||
488 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | ||||
489 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | ||||
490 | |||||
491 | const LLT CodePtr = FlatPtr; | ||||
492 | |||||
493 | const std::initializer_list<LLT> AddrSpaces64 = { | ||||
494 | GlobalPtr, ConstantPtr, FlatPtr | ||||
495 | }; | ||||
496 | |||||
497 | const std::initializer_list<LLT> AddrSpaces32 = { | ||||
498 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | ||||
499 | }; | ||||
500 | |||||
501 | const std::initializer_list<LLT> FPTypesBase = { | ||||
502 | S32, S64 | ||||
503 | }; | ||||
504 | |||||
505 | const std::initializer_list<LLT> FPTypes16 = { | ||||
506 | S32, S64, S16 | ||||
507 | }; | ||||
508 | |||||
509 | const std::initializer_list<LLT> FPTypesPK16 = { | ||||
510 | S32, S64, S16, V2S16 | ||||
511 | }; | ||||
512 | |||||
513 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; | ||||
514 | |||||
515 | // s1 for VCC branches, s32 for SCC branches. | ||||
516 | getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); | ||||
517 | |||||
518 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | ||||
519 | // elements for v3s16 | ||||
520 | getActionDefinitionsBuilder(G_PHI) | ||||
521 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) | ||||
522 | .legalFor(AllS32Vectors) | ||||
523 | .legalFor(AllS64Vectors) | ||||
524 | .legalFor(AddrSpaces64) | ||||
525 | .legalFor(AddrSpaces32) | ||||
526 | .legalIf(isPointer(0)) | ||||
527 | .clampScalar(0, S16, S256) | ||||
528 | .widenScalarToNextPow2(0, 32) | ||||
529 | .clampMaxNumElements(0, S32, 16) | ||||
530 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
531 | .scalarize(0); | ||||
532 | |||||
533 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { | ||||
534 | // Full set of gfx9 features. | ||||
535 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
536 | .legalFor({S32, S16, V2S16}) | ||||
537 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
538 | .scalarize(0) | ||||
539 | .minScalar(0, S16) | ||||
540 | .widenScalarToNextMultipleOf(0, 32) | ||||
541 | .maxScalar(0, S32); | ||||
542 | |||||
543 | getActionDefinitionsBuilder(G_MUL) | ||||
544 | .legalFor({S32, S16, V2S16}) | ||||
545 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
546 | .scalarize(0) | ||||
547 | .minScalar(0, S16) | ||||
548 | .widenScalarToNextMultipleOf(0, 32) | ||||
549 | .custom(); | ||||
550 | assert(ST.hasMad64_32()); | ||||
551 | |||||
552 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) | ||||
553 | .legalFor({S32, S16, V2S16}) // Clamp modifier | ||||
554 | .minScalarOrElt(0, S16) | ||||
555 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
556 | .scalarize(0) | ||||
557 | .widenScalarToNextPow2(0, 32) | ||||
558 | .lower(); | ||||
559 | } else if (ST.has16BitInsts()) { | ||||
560 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
561 | .legalFor({S32, S16}) | ||||
562 | .minScalar(0, S16) | ||||
563 | .widenScalarToNextMultipleOf(0, 32) | ||||
564 | .maxScalar(0, S32) | ||||
565 | .scalarize(0); | ||||
566 | |||||
567 | getActionDefinitionsBuilder(G_MUL) | ||||
568 | .legalFor({S32, S16}) | ||||
569 | .scalarize(0) | ||||
570 | .minScalar(0, S16) | ||||
571 | .widenScalarToNextMultipleOf(0, 32) | ||||
572 | .custom(); | ||||
573 | assert(ST.hasMad64_32()); | ||||
574 | |||||
575 | // Technically the saturating operations require clamp bit support, but this | ||||
576 | // was introduced at the same time as 16-bit operations. | ||||
577 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
578 | .legalFor({S32, S16}) // Clamp modifier | ||||
579 | .minScalar(0, S16) | ||||
580 | .scalarize(0) | ||||
581 | .widenScalarToNextPow2(0, 16) | ||||
582 | .lower(); | ||||
583 | |||||
584 | // We're just lowering this, but it helps get a better result to try to | ||||
585 | // coerce to the desired type first. | ||||
586 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||
587 | .minScalar(0, S16) | ||||
588 | .scalarize(0) | ||||
589 | .lower(); | ||||
590 | } else { | ||||
591 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
592 | .legalFor({S32}) | ||||
593 | .widenScalarToNextMultipleOf(0, 32) | ||||
594 | .clampScalar(0, S32, S32) | ||||
595 | .scalarize(0); | ||||
596 | |||||
597 | auto &Mul = getActionDefinitionsBuilder(G_MUL) | ||||
598 | .legalFor({S32}) | ||||
599 | .scalarize(0) | ||||
600 | .minScalar(0, S32) | ||||
601 | .widenScalarToNextMultipleOf(0, 32); | ||||
602 | |||||
603 | if (ST.hasMad64_32()) | ||||
604 | Mul.custom(); | ||||
605 | else | ||||
606 | Mul.maxScalar(0, S32); | ||||
607 | |||||
608 | if (ST.hasIntClamp()) { | ||||
609 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
610 | .legalFor({S32}) // Clamp modifier. | ||||
611 | .scalarize(0) | ||||
612 | .minScalarOrElt(0, S32) | ||||
613 | .lower(); | ||||
614 | } else { | ||||
615 | // Clamp bit support was added in VI, along with 16-bit operations. | ||||
616 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
617 | .minScalar(0, S32) | ||||
618 | .scalarize(0) | ||||
619 | .lower(); | ||||
620 | } | ||||
621 | |||||
622 | // FIXME: DAG expansion gets better results. The widening uses the smaller | ||||
623 | // range values and goes for the min/max lowering directly. | ||||
624 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||
625 | .minScalar(0, S32) | ||||
626 | .scalarize(0) | ||||
627 | .lower(); | ||||
628 | } | ||||
629 | |||||
630 | getActionDefinitionsBuilder( | ||||
631 | {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) | ||||
632 | .customFor({S32, S64}) | ||||
633 | .clampScalar(0, S32, S64) | ||||
634 | .widenScalarToNextPow2(0, 32) | ||||
635 | .scalarize(0); | ||||
636 | |||||
637 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | ||||
638 | .legalFor({S32}) | ||||
639 | .maxScalar(0, S32); | ||||
640 | |||||
641 | if (ST.hasVOP3PInsts()) { | ||||
642 | Mulh | ||||
643 | .clampMaxNumElements(0, S8, 2) | ||||
644 | .lowerFor({V2S8}); | ||||
645 | } | ||||
646 | |||||
647 | Mulh | ||||
648 | .scalarize(0) | ||||
649 | .lower(); | ||||
650 | |||||
651 | // Report legal for any types we can handle anywhere. For the cases only legal | ||||
652 | // on the SALU, RegBankSelect will be able to re-legalize. | ||||
653 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) | ||||
654 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) | ||||
655 | .clampScalar(0, S32, S64) | ||||
656 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
657 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) | ||||
658 | .widenScalarToNextPow2(0) | ||||
659 | .scalarize(0); | ||||
660 | |||||
661 | getActionDefinitionsBuilder( | ||||
662 | {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) | ||||
663 | .legalFor({{S32, S1}, {S32, S32}}) | ||||
664 | .clampScalar(0, S32, S32) | ||||
665 | .scalarize(0); | ||||
666 | |||||
667 | getActionDefinitionsBuilder(G_BITCAST) | ||||
668 | // Don't worry about the size constraint. | ||||
669 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
670 | .lower(); | ||||
671 | |||||
672 | |||||
673 | getActionDefinitionsBuilder(G_CONSTANT) | ||||
674 | .legalFor({S1, S32, S64, S16, GlobalPtr, | ||||
675 | LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) | ||||
676 | .legalIf(isPointer(0)) | ||||
677 | .clampScalar(0, S32, S64) | ||||
678 | .widenScalarToNextPow2(0); | ||||
679 | |||||
680 | getActionDefinitionsBuilder(G_FCONSTANT) | ||||
681 | .legalFor({S32, S64, S16}) | ||||
682 | .clampScalar(0, S16, S64); | ||||
683 | |||||
684 | getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) | ||||
685 | .legalIf(isRegisterType(0)) | ||||
686 | // s1 and s16 are special cases because they have legal operations on | ||||
687 | // them, but don't really occupy registers in the normal way. | ||||
688 | .legalFor({S1, S16}) | ||||
689 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
690 | .clampScalarOrElt(0, S32, MaxScalar) | ||||
691 | .widenScalarToNextPow2(0, 32) | ||||
692 | .clampMaxNumElements(0, S32, 16); | ||||
693 | |||||
694 | getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); | ||||
695 | |||||
696 | // If the amount is divergent, we have to do a wave reduction to get the | ||||
697 | // maximum value, so this is expanded during RegBankSelect. | ||||
698 | getActionDefinitionsBuilder(G_DYN_STACKALLOC) | ||||
699 | .legalFor({{PrivatePtr, S32}}); | ||||
700 | |||||
701 | getActionDefinitionsBuilder(G_GLOBAL_VALUE) | ||||
702 | .customIf(typeIsNot(0, PrivatePtr)); | ||||
703 | |||||
704 | getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); | ||||
705 | |||||
706 | auto &FPOpActions = getActionDefinitionsBuilder( | ||||
707 | { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, | ||||
708 | G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) | ||||
709 | .legalFor({S32, S64}); | ||||
710 | auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) | ||||
711 | .customFor({S32, S64}); | ||||
712 | auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) | ||||
713 | .customFor({S32, S64}); | ||||
714 | |||||
715 | if (ST.has16BitInsts()) { | ||||
716 | if (ST.hasVOP3PInsts()) | ||||
717 | FPOpActions.legalFor({S16, V2S16}); | ||||
718 | else | ||||
719 | FPOpActions.legalFor({S16}); | ||||
720 | |||||
721 | TrigActions.customFor({S16}); | ||||
722 | FDIVActions.customFor({S16}); | ||||
723 | } | ||||
724 | |||||
725 | auto &MinNumMaxNum = getActionDefinitionsBuilder({ | ||||
726 | G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); | ||||
727 | |||||
728 | if (ST.hasVOP3PInsts()) { | ||||
729 | MinNumMaxNum.customFor(FPTypesPK16) | ||||
730 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
731 | .clampMaxNumElements(0, S16, 2) | ||||
732 | .clampScalar(0, S16, S64) | ||||
733 | .scalarize(0); | ||||
734 | } else if (ST.has16BitInsts()) { | ||||
735 | MinNumMaxNum.customFor(FPTypes16) | ||||
736 | .clampScalar(0, S16, S64) | ||||
737 | .scalarize(0); | ||||
738 | } else { | ||||
739 | MinNumMaxNum.customFor(FPTypesBase) | ||||
740 | .clampScalar(0, S32, S64) | ||||
741 | .scalarize(0); | ||||
742 | } | ||||
743 | |||||
744 | if (ST.hasVOP3PInsts()) | ||||
745 | FPOpActions.clampMaxNumElementsStrict(0, S16, 2); | ||||
746 | |||||
747 | FPOpActions | ||||
748 | .scalarize(0) | ||||
749 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||
750 | |||||
751 | TrigActions | ||||
752 | .scalarize(0) | ||||
753 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||
754 | |||||
755 | FDIVActions | ||||
756 | .scalarize(0) | ||||
757 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||
758 | |||||
759 | getActionDefinitionsBuilder({G_FNEG, G_FABS}) | ||||
760 | .legalFor(FPTypesPK16) | ||||
761 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
762 | .scalarize(0) | ||||
763 | .clampScalar(0, S16, S64); | ||||
764 | |||||
765 | if (ST.has16BitInsts()) { | ||||
766 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) | ||||
767 | .legalFor({S32, S64, S16}) | ||||
768 | .scalarize(0) | ||||
769 | .clampScalar(0, S16, S64); | ||||
770 | } else { | ||||
771 | getActionDefinitionsBuilder(G_FSQRT) | ||||
772 | .legalFor({S32, S64}) | ||||
773 | .scalarize(0) | ||||
774 | .clampScalar(0, S32, S64); | ||||
775 | |||||
776 | if (ST.hasFractBug()) { | ||||
777 | getActionDefinitionsBuilder(G_FFLOOR) | ||||
778 | .customFor({S64}) | ||||
779 | .legalFor({S32, S64}) | ||||
780 | .scalarize(0) | ||||
781 | .clampScalar(0, S32, S64); | ||||
782 | } else { | ||||
783 | getActionDefinitionsBuilder(G_FFLOOR) | ||||
784 | .legalFor({S32, S64}) | ||||
785 | .scalarize(0) | ||||
786 | .clampScalar(0, S32, S64); | ||||
787 | } | ||||
788 | } | ||||
789 | |||||
790 | getActionDefinitionsBuilder(G_FPTRUNC) | ||||
791 | .legalFor({{S32, S64}, {S16, S32}}) | ||||
792 | .scalarize(0) | ||||
793 | .lower(); | ||||
794 | |||||
795 | getActionDefinitionsBuilder(G_FPEXT) | ||||
796 | .legalFor({{S64, S32}, {S32, S16}}) | ||||
797 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) | ||||
798 | .scalarize(0); | ||||
799 | |||||
800 | auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); | ||||
801 | if (ST.has16BitInsts()) { | ||||
802 | FSubActions | ||||
803 | // Use actual fsub instruction | ||||
804 | .legalFor({S32, S16}) | ||||
805 | // Must use fadd + fneg | ||||
806 | .lowerFor({S64, V2S16}); | ||||
807 | } else { | ||||
808 | FSubActions | ||||
809 | // Use actual fsub instruction | ||||
810 | .legalFor({S32}) | ||||
811 | // Must use fadd + fneg | ||||
812 | .lowerFor({S64, S16, V2S16}); | ||||
813 | } | ||||
814 | |||||
815 | FSubActions | ||||
816 | .scalarize(0) | ||||
817 | .clampScalar(0, S32, S64); | ||||
818 | |||||
819 | // Whether this is legal depends on the floating point mode for the function. | ||||
820 | auto &FMad = getActionDefinitionsBuilder(G_FMAD); | ||||
821 | if (ST.hasMadF16() && ST.hasMadMacF32Insts()) | ||||
822 | FMad.customFor({S32, S16}); | ||||
823 | else if (ST.hasMadMacF32Insts()) | ||||
824 | FMad.customFor({S32}); | ||||
825 | else if (ST.hasMadF16()) | ||||
826 | FMad.customFor({S16}); | ||||
827 | FMad.scalarize(0) | ||||
828 | .lower(); | ||||
829 | |||||
830 | auto &FRem = getActionDefinitionsBuilder(G_FREM); | ||||
831 | if (ST.has16BitInsts()) { | ||||
832 | FRem.customFor({S16, S32, S64}); | ||||
833 | } else { | ||||
834 | FRem.minScalar(0, S32) | ||||
835 | .customFor({S32, S64}); | ||||
836 | } | ||||
837 | FRem.scalarize(0); | ||||
838 | |||||
839 | // TODO: Do we need to clamp maximum bitwidth? | ||||
840 | getActionDefinitionsBuilder(G_TRUNC) | ||||
841 | .legalIf(isScalar(0)) | ||||
842 | .legalFor({{V2S16, V2S32}}) | ||||
843 | .clampMaxNumElements(0, S16, 2) | ||||
844 | // Avoid scalarizing in cases that should be truly illegal. In unresolvable | ||||
845 | // situations (like an invalid implicit use), we don't want to infinite loop | ||||
846 | // in the legalizer. | ||||
847 | .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) | ||||
848 | .alwaysLegal(); | ||||
849 | |||||
850 | getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) | ||||
851 | .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, | ||||
852 | {S32, S1}, {S64, S1}, {S16, S1}}) | ||||
853 | .scalarize(0) | ||||
854 | .clampScalar(0, S32, S64) | ||||
855 | .widenScalarToNextPow2(1, 32); | ||||
856 | |||||
857 | // TODO: Split s1->s64 during regbankselect for VALU. | ||||
858 | auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) | ||||
859 | .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) | ||||
860 | .lowerIf(typeIs(1, S1)) | ||||
861 | .customFor({{S32, S64}, {S64, S64}}); | ||||
862 | if (ST.has16BitInsts()) | ||||
863 | IToFP.legalFor({{S16, S16}}); | ||||
864 | IToFP.clampScalar(1, S32, S64) | ||||
865 | .minScalar(0, S32) | ||||
866 | .scalarize(0) | ||||
867 | .widenScalarToNextPow2(1); | ||||
868 | |||||
869 | auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) | ||||
870 | .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) | ||||
871 | .customFor({{S64, S32}, {S64, S64}}) | ||||
872 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); | ||||
873 | if (ST.has16BitInsts()) | ||||
874 | FPToI.legalFor({{S16, S16}}); | ||||
875 | else | ||||
876 | FPToI.minScalar(1, S32); | ||||
877 | |||||
878 | FPToI.minScalar(0, S32) | ||||
879 | .widenScalarToNextPow2(0, 32) | ||||
880 | .scalarize(0) | ||||
881 | .lower(); | ||||
882 | |||||
883 | getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) | ||||
884 | .customFor({S16, S32}) | ||||
885 | .scalarize(0) | ||||
886 | .lower(); | ||||
887 | |||||
888 | // Lower roundeven into G_FRINT | ||||
889 | getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) | ||||
890 | .scalarize(0) | ||||
891 | .lower(); | ||||
892 | |||||
893 | if (ST.has16BitInsts()) { | ||||
894 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||
895 | .legalFor({S16, S32, S64}) | ||||
896 | .clampScalar(0, S16, S64) | ||||
897 | .scalarize(0); | ||||
898 | } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { | ||||
899 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||
900 | .legalFor({S32, S64}) | ||||
901 | .clampScalar(0, S32, S64) | ||||
902 | .scalarize(0); | ||||
903 | } else { | ||||
904 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||
905 | .legalFor({S32}) | ||||
906 | .customFor({S64}) | ||||
907 | .clampScalar(0, S32, S64) | ||||
908 | .scalarize(0); | ||||
909 | } | ||||
910 | |||||
911 | getActionDefinitionsBuilder(G_PTR_ADD) | ||||
912 | .legalIf(all(isPointer(0), sameSize(0, 1))) | ||||
913 | .scalarize(0) | ||||
914 | .scalarSameSizeAs(1, 0); | ||||
915 | |||||
916 | getActionDefinitionsBuilder(G_PTRMASK) | ||||
917 | .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) | ||||
918 | .scalarSameSizeAs(1, 0) | ||||
919 | .scalarize(0); | ||||
920 | |||||
921 | auto &CmpBuilder = | ||||
922 | getActionDefinitionsBuilder(G_ICMP) | ||||
923 | // The compare output type differs based on the register bank of the output, | ||||
924 | // so make both s1 and s32 legal. | ||||
925 | // | ||||
926 | // Scalar compares producing output in scc will be promoted to s32, as that | ||||
927 | // is the allocatable register type that will be needed for the copy from | ||||
928 | // scc. This will be promoted during RegBankSelect, and we assume something | ||||
929 | // before that won't try to use s32 result types. | ||||
930 | // | ||||
931 | // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg | ||||
932 | // bank. | ||||
933 | .legalForCartesianProduct( | ||||
934 | {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) | ||||
935 | .legalForCartesianProduct( | ||||
936 | {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); | ||||
937 | if (ST.has16BitInsts()) { | ||||
938 | CmpBuilder.legalFor({{S1, S16}}); | ||||
939 | } | ||||
940 | |||||
941 | CmpBuilder | ||||
942 | .widenScalarToNextPow2(1) | ||||
943 | .clampScalar(1, S32, S64) | ||||
944 | .scalarize(0) | ||||
945 | .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); | ||||
946 | |||||
947 | getActionDefinitionsBuilder(G_FCMP) | ||||
948 | .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) | ||||
949 | .widenScalarToNextPow2(1) | ||||
950 | .clampScalar(1, S32, S64) | ||||
951 | .scalarize(0); | ||||
952 | |||||
953 | // FIXME: fpow has a selection pattern that should move to custom lowering. | ||||
954 | auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); | ||||
955 | if (ST.has16BitInsts()) | ||||
956 | Exp2Ops.legalFor({S32, S16}); | ||||
957 | else | ||||
958 | Exp2Ops.legalFor({S32}); | ||||
959 | Exp2Ops.clampScalar(0, MinScalarFPTy, S32); | ||||
960 | Exp2Ops.scalarize(0); | ||||
961 | |||||
962 | auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); | ||||
963 | if (ST.has16BitInsts()) | ||||
964 | ExpOps.customFor({{S32}, {S16}}); | ||||
965 | else | ||||
966 | ExpOps.customFor({S32}); | ||||
967 | ExpOps.clampScalar(0, MinScalarFPTy, S32) | ||||
968 | .scalarize(0); | ||||
969 | |||||
970 | getActionDefinitionsBuilder(G_FPOWI) | ||||
971 | .clampScalar(0, MinScalarFPTy, S32) | ||||
972 | .lower(); | ||||
973 | |||||
974 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||
975 | getActionDefinitionsBuilder(G_CTPOP) | ||||
976 | .legalFor({{S32, S32}, {S32, S64}}) | ||||
977 | .clampScalar(0, S32, S32) | ||||
978 | .widenScalarToNextPow2(1, 32) | ||||
979 | .clampScalar(1, S32, S64) | ||||
980 | .scalarize(0) | ||||
981 | .widenScalarToNextPow2(0, 32); | ||||
982 | |||||
983 | // If no 16-bit instruction is available, lower into different instructions. | ||||
984 | if (ST.has16BitInsts()) | ||||
985 | getActionDefinitionsBuilder(G_IS_FPCLASS) | ||||
986 | .legalForCartesianProduct({S1}, FPTypes16) | ||||
987 | .widenScalarToNextPow2(1) | ||||
988 | .scalarize(0) | ||||
989 | .lower(); | ||||
990 | else | ||||
991 | getActionDefinitionsBuilder(G_IS_FPCLASS) | ||||
992 | .legalForCartesianProduct({S1}, FPTypesBase) | ||||
993 | .lowerFor({S1, S16}) | ||||
994 | .widenScalarToNextPow2(1) | ||||
995 | .scalarize(0) | ||||
996 | .lower(); | ||||
997 | |||||
998 | // The hardware instructions return a different result on 0 than the generic | ||||
999 | // instructions expect. The hardware produces -1, but these produce the | ||||
1000 | // bitwidth. | ||||
1001 | getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) | ||||
1002 | .scalarize(0) | ||||
1003 | .clampScalar(0, S32, S32) | ||||
1004 | .clampScalar(1, S32, S64) | ||||
1005 | .widenScalarToNextPow2(0, 32) | ||||
1006 | .widenScalarToNextPow2(1, 32) | ||||
1007 | .custom(); | ||||
1008 | |||||
1009 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||
1010 | getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) | ||||
1011 | .legalFor({{S32, S32}, {S32, S64}}) | ||||
1012 | .clampScalar(0, S32, S32) | ||||
1013 | .clampScalar(1, S32, S64) | ||||
1014 | .scalarize(0) | ||||
1015 | .widenScalarToNextPow2(0, 32) | ||||
1016 | .widenScalarToNextPow2(1, 32); | ||||
1017 | |||||
1018 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||
1019 | // RegBankSelect. | ||||
1020 | getActionDefinitionsBuilder(G_BITREVERSE) | ||||
1021 | .legalFor({S32, S64}) | ||||
1022 | .clampScalar(0, S32, S64) | ||||
1023 | .scalarize(0) | ||||
1024 | .widenScalarToNextPow2(0); | ||||
1025 | |||||
1026 | if (ST.has16BitInsts()) { | ||||
1027 | getActionDefinitionsBuilder(G_BSWAP) | ||||
1028 | .legalFor({S16, S32, V2S16}) | ||||
1029 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
1030 | // FIXME: Fixing non-power-of-2 before clamp is a workaround for | ||||
1031 | // narrowScalar limitation. | ||||
1032 | .widenScalarToNextPow2(0) | ||||
1033 | .clampScalar(0, S16, S32) | ||||
1034 | .scalarize(0); | ||||
1035 | |||||
1036 | if (ST.hasVOP3PInsts()) { | ||||
1037 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||
1038 | .legalFor({S32, S16, V2S16}) | ||||
1039 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
1040 | .clampMaxNumElements(0, S16, 2) | ||||
1041 | .minScalar(0, S16) | ||||
1042 | .widenScalarToNextPow2(0) | ||||
1043 | .scalarize(0) | ||||
1044 | .lower(); | ||||
1045 | } else { | ||||
1046 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||
1047 | .legalFor({S32, S16}) | ||||
1048 | .widenScalarToNextPow2(0) | ||||
1049 | .minScalar(0, S16) | ||||
1050 | .scalarize(0) | ||||
1051 | .lower(); | ||||
1052 | } | ||||
1053 | } else { | ||||
1054 | // TODO: Should have same legality without v_perm_b32 | ||||
1055 | getActionDefinitionsBuilder(G_BSWAP) | ||||
1056 | .legalFor({S32}) | ||||
1057 | .lowerIf(scalarNarrowerThan(0, 32)) | ||||
1058 | // FIXME: Fixing non-power-of-2 before clamp is a workaround for | ||||
1059 | // narrowScalar limitation. | ||||
1060 | .widenScalarToNextPow2(0) | ||||
1061 | .maxScalar(0, S32) | ||||
1062 | .scalarize(0) | ||||
1063 | .lower(); | ||||
1064 | |||||
1065 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||
1066 | .legalFor({S32}) | ||||
1067 | .minScalar(0, S32) | ||||
1068 | .widenScalarToNextPow2(0) | ||||
1069 | .scalarize(0) | ||||
1070 | .lower(); | ||||
1071 | } | ||||
1072 | |||||
1073 | getActionDefinitionsBuilder(G_INTTOPTR) | ||||
1074 | // List the common cases | ||||
1075 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||
1076 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||
1077 | .scalarize(0) | ||||
1078 | // Accept any address space as long as the size matches | ||||
1079 | .legalIf(sameSize(0, 1)) | ||||
1080 | .widenScalarIf(smallerThan(1, 0), | ||||
1081 | [](const LegalityQuery &Query) { | ||||
1082 | return std::pair( | ||||
1083 | 1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||
1084 | }) | ||||
1085 | .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { | ||||
1086 | return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||
1087 | }); | ||||
1088 | |||||
1089 | getActionDefinitionsBuilder(G_PTRTOINT) | ||||
1090 | // List the common cases | ||||
1091 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||
1092 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||
1093 | .scalarize(0) | ||||
1094 | // Accept any address space as long as the size matches | ||||
1095 | .legalIf(sameSize(0, 1)) | ||||
1096 | .widenScalarIf(smallerThan(0, 1), | ||||
1097 | [](const LegalityQuery &Query) { | ||||
1098 | return std::pair( | ||||
1099 | 0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||
1100 | }) | ||||
1101 | .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { | ||||
1102 | return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||
1103 | }); | ||||
1104 | |||||
1105 | getActionDefinitionsBuilder(G_ADDRSPACE_CAST) | ||||
1106 | .scalarize(0) | ||||
1107 | .custom(); | ||||
1108 | |||||
1109 | const auto needToSplitMemOp = [=](const LegalityQuery &Query, | ||||
1110 | bool IsLoad) -> bool { | ||||
1111 | const LLT DstTy = Query.Types[0]; | ||||
1112 | |||||
1113 | // Split vector extloads. | ||||
1114 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
1115 | |||||
1116 | if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) | ||||
1117 | return true; | ||||
1118 | |||||
1119 | const LLT PtrTy = Query.Types[1]; | ||||
1120 | unsigned AS = PtrTy.getAddressSpace(); | ||||
1121 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, | ||||
1122 | Query.MMODescrs[0].Ordering != | ||||
1123 | AtomicOrdering::NotAtomic)) | ||||
1124 | return true; | ||||
1125 | |||||
1126 | // Catch weird sized loads that don't evenly divide into the access sizes | ||||
1127 | // TODO: May be able to widen depending on alignment etc. | ||||
1128 | unsigned NumRegs = (MemSize + 31) / 32; | ||||
1129 | if (NumRegs == 3) { | ||||
1130 | if (!ST.hasDwordx3LoadStores()) | ||||
1131 | return true; | ||||
1132 | } else { | ||||
1133 | // If the alignment allows, these should have been widened. | ||||
1134 | if (!isPowerOf2_32(NumRegs)) | ||||
1135 | return true; | ||||
1136 | } | ||||
1137 | |||||
1138 | return false; | ||||
1139 | }; | ||||
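// Annotation (not part of the original source): needToSplitMemOp flags, for
// example, a 96-bit (3-dword) access on a subtarget without dwordx3
// load/stores, any access wider than maxSizeForAddrSpace for its address
// space (e.g. a 128-bit LDS access without ds128 support), and accesses whose
// 32-bit register count is not a power of two.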
1140 | |||||
1141 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; | ||||
1142 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; | ||||
1143 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; | ||||
1144 | |||||
1145 | // TODO: Refine based on subtargets which support unaligned access or 128-bit | ||||
1146 | // LDS | ||||
1147 | // TODO: Unsupported flat for SI. | ||||
1148 | |||||
1149 | for (unsigned Op : {G_LOAD, G_STORE}) { | ||||
1150 | const bool IsStore = Op == G_STORE; | ||||
1151 | |||||
1152 | auto &Actions = getActionDefinitionsBuilder(Op); | ||||
1153 | // Explicitly list some common cases. | ||||
1154 | // TODO: Does this help compile time at all? | ||||
1155 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, | ||||
1156 | {V2S32, GlobalPtr, V2S32, GlobalAlign32}, | ||||
1157 | {V4S32, GlobalPtr, V4S32, GlobalAlign32}, | ||||
1158 | {S64, GlobalPtr, S64, GlobalAlign32}, | ||||
1159 | {V2S64, GlobalPtr, V2S64, GlobalAlign32}, | ||||
1160 | {V2S16, GlobalPtr, V2S16, GlobalAlign32}, | ||||
1161 | {S32, GlobalPtr, S8, GlobalAlign8}, | ||||
1162 | {S32, GlobalPtr, S16, GlobalAlign16}, | ||||
1163 | |||||
1164 | {S32, LocalPtr, S32, 32}, | ||||
1165 | {S64, LocalPtr, S64, 32}, | ||||
1166 | {V2S32, LocalPtr, V2S32, 32}, | ||||
1167 | {S32, LocalPtr, S8, 8}, | ||||
1168 | {S32, LocalPtr, S16, 16}, | ||||
1169 | {V2S16, LocalPtr, S32, 32}, | ||||
1170 | |||||
1171 | {S32, PrivatePtr, S32, 32}, | ||||
1172 | {S32, PrivatePtr, S8, 8}, | ||||
1173 | {S32, PrivatePtr, S16, 16}, | ||||
1174 | {V2S16, PrivatePtr, S32, 32}, | ||||
1175 | |||||
1176 | {S32, ConstantPtr, S32, GlobalAlign32}, | ||||
1177 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}, | ||||
1178 | {V4S32, ConstantPtr, V4S32, GlobalAlign32}, | ||||
1179 | {S64, ConstantPtr, S64, GlobalAlign32}, | ||||
1180 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); | ||||
1181 | Actions.legalIf( | ||||
1182 | [=](const LegalityQuery &Query) -> bool { | ||||
1183 | return isLoadStoreLegal(ST, Query); | ||||
1184 | }); | ||||
1185 | |||||
1186 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||
1187 | // 64-bits. | ||||
1188 | // | ||||
1189 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||
1190 | // inserting addrspacecasts. | ||||
1191 | Actions.customIf(typeIs(1, Constant32Ptr)); | ||||
1192 | |||||
1193 | // Turn any illegal element vectors into something easier to deal | ||||
1194 | // with. These will ultimately produce 32-bit scalar shifts to extract the | ||||
1195 | // parts anyway. | ||||
1196 | // | ||||
1197 | // For odd 16-bit element vectors, prefer to split those into pieces with | ||||
1198 | // 16-bit vector parts. | ||||
1199 | Actions.bitcastIf( | ||||
1200 | [=](const LegalityQuery &Query) -> bool { | ||||
1201 | return shouldBitcastLoadStoreType(ST, Query.Types[0], | ||||
1202 | Query.MMODescrs[0].MemoryTy); | ||||
1203 | }, bitcastToRegisterType(0)); | ||||
1204 | |||||
1205 | if (!IsStore) { | ||||
1206 | // Widen suitably aligned loads by loading extra bytes. The standard | ||||
1207 | // legalization actions can't properly express widening memory operands. | ||||
1208 | Actions.customIf([=](const LegalityQuery &Query) -> bool { | ||||
1209 | return shouldWidenLoad(ST, Query, G_LOAD); | ||||
1210 | }); | ||||
1211 | } | ||||
1212 | |||||
1213 | // FIXME: load/store narrowing should be moved to lower action | ||||
1214 | Actions | ||||
1215 | .narrowScalarIf( | ||||
1216 | [=](const LegalityQuery &Query) -> bool { | ||||
1217 | return !Query.Types[0].isVector() && | ||||
1218 | needToSplitMemOp(Query, Op == G_LOAD); | ||||
1219 | }, | ||||
1220 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||
1221 | const LLT DstTy = Query.Types[0]; | ||||
1222 | const LLT PtrTy = Query.Types[1]; | ||||
1223 | |||||
1224 | const unsigned DstSize = DstTy.getSizeInBits(); | ||||
1225 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
1226 | |||||
1227 | // Split extloads. | ||||
1228 | if (DstSize > MemSize) | ||||
1229 | return std::pair(0, LLT::scalar(MemSize)); | ||||
1230 | |||||
1231 | unsigned MaxSize = maxSizeForAddrSpace( | ||||
1232 | ST, PtrTy.getAddressSpace(), Op == G_LOAD, | ||||
1233 | Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); | ||||
1234 | if (MemSize > MaxSize) | ||||
1235 | return std::pair(0, LLT::scalar(MaxSize)); | ||||
1236 | |||||
1237 | uint64_t Align = Query.MMODescrs[0].AlignInBits; | ||||
1238 | return std::pair(0, LLT::scalar(Align)); | ||||
1239 | }) | ||||
1240 | .fewerElementsIf( | ||||
1241 | [=](const LegalityQuery &Query) -> bool { | ||||
1242 | return Query.Types[0].isVector() && | ||||
1243 | needToSplitMemOp(Query, Op == G_LOAD); | ||||
1244 | }, | ||||
1245 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||
1246 | const LLT DstTy = Query.Types[0]; | ||||
1247 | const LLT PtrTy = Query.Types[1]; | ||||
1248 | |||||
1249 | LLT EltTy = DstTy.getElementType(); | ||||
1250 | unsigned MaxSize = maxSizeForAddrSpace( | ||||
1251 | ST, PtrTy.getAddressSpace(), Op == G_LOAD, | ||||
1252 | Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); | ||||
1253 | |||||
1254 | // FIXME: Handle widened to power of 2 results better. This ends | ||||
1255 | // up scalarizing. | ||||
1256 | // FIXME: 3 element stores scalarized on SI | ||||
1257 | |||||
1258 | // Split if it's too large for the address space. | ||||
1259 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
1260 | if (MemSize > MaxSize) { | ||||
1261 | unsigned NumElts = DstTy.getNumElements(); | ||||
1262 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
1263 | |||||
1264 | if (MaxSize % EltSize == 0) { | ||||
1265 | return std::pair( | ||||
1266 | 0, LLT::scalarOrVector( | ||||
1267 | ElementCount::getFixed(MaxSize / EltSize), EltTy)); | ||||
1268 | } | ||||
1269 | |||||
1270 | unsigned NumPieces = MemSize / MaxSize; | ||||
1271 | |||||
1272 | // FIXME: Refine when odd breakdowns handled | ||||
1273 | // The scalars will need to be re-legalized. | ||||
1274 | if (NumPieces == 1 || NumPieces >= NumElts || | ||||
1275 | NumElts % NumPieces != 0) | ||||
1276 | return std::pair(0, EltTy); | ||||
1277 | |||||
1278 | return std::pair(0, | ||||
1279 | LLT::fixed_vector(NumElts / NumPieces, EltTy)); | ||||
1280 | } | ||||
1281 | |||||
1282 | // FIXME: We could probably handle weird extending loads better. | ||||
1283 | if (DstTy.getSizeInBits() > MemSize) | ||||
1284 | return std::pair(0, EltTy); | ||||
1285 | |||||
1286 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
1287 | unsigned DstSize = DstTy.getSizeInBits(); | ||||
1288 | if (!isPowerOf2_32(DstSize)) { | ||||
1289 | // We're probably decomposing an odd sized store. Try to split | ||||
1290 | // to the widest type. TODO: Account for alignment. As-is it | ||||
1291 | // should be OK, since the new parts will be further legalized. | ||||
1292 | unsigned FloorSize = llvm::bit_floor(DstSize); | ||||
1293 | return std::pair( | ||||
1294 | 0, LLT::scalarOrVector( | ||||
1295 | ElementCount::getFixed(FloorSize / EltSize), EltTy)); | ||||
1296 | } | ||||
1297 | |||||
1298 | // May need relegalization for the scalars. | ||||
1299 | return std::pair(0, EltTy); | ||||
1300 | }) | ||||
1301 | .minScalar(0, S32) | ||||
1302 | .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) | ||||
1303 | .widenScalarToNextPow2(0) | ||||
1304 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) | ||||
1305 | .lower(); | ||||
1306 | } | ||||
1307 | |||||
1308 | // FIXME: Unaligned accesses not lowered. | ||||
1309 | auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) | ||||
1310 | .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, | ||||
1311 | {S32, GlobalPtr, S16, 2 * 8}, | ||||
1312 | {S32, LocalPtr, S8, 8}, | ||||
1313 | {S32, LocalPtr, S16, 16}, | ||||
1314 | {S32, PrivatePtr, S8, 8}, | ||||
1315 | {S32, PrivatePtr, S16, 16}, | ||||
1316 | {S32, ConstantPtr, S8, 8}, | ||||
1317 | {S32, ConstantPtr, S16, 2 * 8}}) | ||||
1318 | .legalIf( | ||||
1319 | [=](const LegalityQuery &Query) -> bool { | ||||
1320 | return isLoadStoreLegal(ST, Query); | ||||
1321 | }); | ||||
1322 | |||||
1323 | if (ST.hasFlatAddressSpace()) { | ||||
1324 | ExtLoads.legalForTypesWithMemDesc( | ||||
1325 | {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); | ||||
1326 | } | ||||
1327 | |||||
1328 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||
1329 | // 64-bits. | ||||
1330 | // | ||||
1331 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||
1332 | // inserting addrspacecasts. | ||||
1333 | ExtLoads.customIf(typeIs(1, Constant32Ptr)); | ||||
1334 | |||||
1335 | ExtLoads.clampScalar(0, S32, S32) | ||||
1336 | .widenScalarToNextPow2(0) | ||||
1337 | .lower(); | ||||
1338 | |||||
1339 | auto &Atomics = getActionDefinitionsBuilder( | ||||
1340 | {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, | ||||
1341 | G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, | ||||
1342 | G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, | ||||
1343 | G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) | ||||
1344 | .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, | ||||
1345 | {S64, GlobalPtr}, {S64, LocalPtr}, | ||||
1346 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||
1347 | if (ST.hasFlatAddressSpace()) { | ||||
1348 | Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); | ||||
1349 | } | ||||
1350 | |||||
1351 | auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); | ||||
1352 | if (ST.hasLDSFPAtomicAdd()) { | ||||
1353 | Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); | ||||
1354 | if (ST.hasGFX90AInsts()) | ||||
1355 | Atomic.legalFor({{S64, LocalPtr}}); | ||||
1356 | if (ST.hasGFX940Insts()) | ||||
1357 | Atomic.legalFor({{V2S16, LocalPtr}}); | ||||
1358 | } | ||||
1359 | if (ST.hasAtomicFaddInsts()) | ||||
1360 | Atomic.legalFor({{S32, GlobalPtr}}); | ||||
1361 | if (ST.hasFlatAtomicFaddF32Inst()) | ||||
1362 | Atomic.legalFor({{S32, FlatPtr}}); | ||||
1363 | |||||
1364 | if (ST.hasGFX90AInsts()) { | ||||
1365 | // These are legal with some caveats, and should have undergone expansion in | ||||
1366 | // the IR in most situations | ||||
1367 | // TODO: Move atomic expansion into legalizer | ||||
1368 | Atomic.legalFor({ | ||||
1369 | {S32, GlobalPtr}, | ||||
1370 | {S64, GlobalPtr}, | ||||
1371 | {S64, FlatPtr} | ||||
1372 | }); | ||||
1373 | } | ||||
1374 | |||||
1375 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output | ||||
1376 | // demarshalling | ||||
1377 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) | ||||
1378 | .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, | ||||
1379 | {S32, FlatPtr}, {S64, FlatPtr}}) | ||||
1380 | .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, | ||||
1381 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||
1382 | // TODO: Pointer types, any 32-bit or 64-bit vector | ||||
1383 | |||||
1384 | // Condition should be s32 for scalar, s1 for vector. | ||||
1385 | getActionDefinitionsBuilder(G_SELECT) | ||||
1386 | .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, | ||||
1387 | LocalPtr, FlatPtr, PrivatePtr, | ||||
1388 | LLT::fixed_vector(2, LocalPtr), | ||||
1389 | LLT::fixed_vector(2, PrivatePtr)}, | ||||
1390 | {S1, S32}) | ||||
1391 | .clampScalar(0, S16, S64) | ||||
1392 | .scalarize(1) | ||||
1393 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
1394 | .fewerElementsIf(numElementsNotEven(0), scalarize(0)) | ||||
1395 | .clampMaxNumElements(0, S32, 2) | ||||
1396 | .clampMaxNumElements(0, LocalPtr, 2) | ||||
1397 | .clampMaxNumElements(0, PrivatePtr, 2) | ||||
1398 | .scalarize(0) | ||||
1399 | .widenScalarToNextPow2(0) | ||||
1400 | .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); | ||||
1401 | |||||
1402 | // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can | ||||
1403 | // be more flexible with the shift amount type. | ||||
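| // (16/32/64-bit shifts use only the low 4/5/6 bits of the amount, | ||||
| // respectively.) | ||||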
1404 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) | ||||
1405 | .legalFor({{S32, S32}, {S64, S32}}); | ||||
1406 | if (ST.has16BitInsts()) { | ||||
1407 | if (ST.hasVOP3PInsts()) { | ||||
1408 | Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) | ||||
1409 | .clampMaxNumElements(0, S16, 2); | ||||
1410 | } else | ||||
1411 | Shifts.legalFor({{S16, S16}}); | ||||
1412 | |||||
1413 | // TODO: Support 16-bit shift amounts for all types | ||||
1414 | Shifts.widenScalarIf( | ||||
1415 | [=](const LegalityQuery &Query) { | ||||
1416 | // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a | ||||
1417 | // 32-bit amount. | ||||
1418 | const LLT ValTy = Query.Types[0]; | ||||
1419 | const LLT AmountTy = Query.Types[1]; | ||||
1420 | return ValTy.getSizeInBits() <= 16 && | ||||
1421 | AmountTy.getSizeInBits() < 16; | ||||
1422 | }, changeTo(1, S16)); | ||||
1423 | Shifts.maxScalarIf(typeIs(0, S16), 1, S16); | ||||
1424 | Shifts.clampScalar(1, S32, S32); | ||||
1425 | Shifts.widenScalarToNextPow2(0, 16); | ||||
1426 | Shifts.clampScalar(0, S16, S64); | ||||
1427 | |||||
1428 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||
1429 | .minScalar(0, S16) | ||||
1430 | .scalarize(0) | ||||
1431 | .lower(); | ||||
1432 | } else { | ||||
1433 | // Make sure we legalize the shift amount type first, as the general | ||||
1434 | // expansion for the shifted type will produce much worse code if it hasn't | ||||
1435 | // been truncated already. | ||||
1436 | Shifts.clampScalar(1, S32, S32); | ||||
1437 | Shifts.widenScalarToNextPow2(0, 32); | ||||
1438 | Shifts.clampScalar(0, S32, S64); | ||||
1439 | |||||
1440 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||
1441 | .minScalar(0, S32) | ||||
1442 | .scalarize(0) | ||||
1443 | .lower(); | ||||
1444 | } | ||||
1445 | Shifts.scalarize(0); | ||||
1446 | |||||
1447 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | ||||
1448 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | ||||
1449 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | ||||
1450 | unsigned IdxTypeIdx = 2; | ||||
1451 | |||||
1452 | getActionDefinitionsBuilder(Op) | ||||
1453 | .customIf([=](const LegalityQuery &Query) { | ||||
1454 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||
1455 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||
1456 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | ||||
1457 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||
1458 | return (EltSize == 32 || EltSize == 64) && | ||||
1459 | VecTy.getSizeInBits() % 32 == 0 && | ||||
1460 | VecTy.getSizeInBits() <= MaxRegisterSize && | ||||
1461 | IdxTy.getSizeInBits() == 32; | ||||
1462 | }) | ||||
1463 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), | ||||
1464 | bitcastToVectorElement32(VecTypeIdx)) | ||||
1465 | //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) | ||||
1466 | .bitcastIf( | ||||
1467 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), | ||||
1468 | [=](const LegalityQuery &Query) { | ||||
1469 | // For > 64-bit element types, try to turn this into a 64-bit | ||||
1470 | // element vector since we may be able to do better indexing | ||||
1471 | // if this is scalar. If not, fall back to 32. | ||||
1472 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||
1473 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||
1474 | const unsigned DstEltSize = EltTy.getSizeInBits(); | ||||
1475 | const unsigned VecSize = VecTy.getSizeInBits(); | ||||
1476 | |||||
1477 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; | ||||
1478 | return std::pair( | ||||
1479 | VecTypeIdx, | ||||
1480 | LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); | ||||
1481 | }) | ||||
1482 | .clampScalar(EltTypeIdx, S32, S64) | ||||
1483 | .clampScalar(VecTypeIdx, S32, S64) | ||||
1484 | .clampScalar(IdxTypeIdx, S32, S32) | ||||
1485 | .clampMaxNumElements(VecTypeIdx, S32, 32) | ||||
1486 | // TODO: Clamp elements for 64-bit vectors? | ||||
1487 | // It should only be necessary with variable indexes. | ||||
1488 | // As a last resort, lower to the stack | ||||
1489 | .lower(); | ||||
1490 | } | ||||
1491 | |||||
1492 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | ||||
1493 | .unsupportedIf([=](const LegalityQuery &Query) { | ||||
1494 | const LLT &EltTy = Query.Types[1].getElementType(); | ||||
1495 | return Query.Types[0] != EltTy; | ||||
1496 | }); | ||||
1497 | |||||
1498 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | ||||
1499 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | ||||
1500 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | ||||
1501 | |||||
1502 | // FIXME: Doesn't handle extract of illegal sizes. | ||||
1503 | getActionDefinitionsBuilder(Op) | ||||
1504 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | ||||
1505 | .lowerIf([=](const LegalityQuery &Query) { | ||||
1506 | // Sub-vector (or single element) insert and extract. | ||||
1507 | // TODO: verify immediate offset here since lower only works with | ||||
1508 | // whole elements. | ||||
1509 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1510 | return BigTy.isVector(); | ||||
1511 | }) | ||||
1512 | // FIXME: Multiples of 16 should not be legal. | ||||
1513 | .legalIf([=](const LegalityQuery &Query) { | ||||
1514 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1515 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||
1516 | return (BigTy.getSizeInBits() % 32 == 0) && | ||||
1517 | (LitTy.getSizeInBits() % 16 == 0); | ||||
1518 | }) | ||||
1519 | .widenScalarIf( | ||||
1520 | [=](const LegalityQuery &Query) { | ||||
1521 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1522 | return (BigTy.getScalarSizeInBits() < 16); | ||||
1523 | }, | ||||
1524 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | ||||
1525 | .widenScalarIf( | ||||
1526 | [=](const LegalityQuery &Query) { | ||||
1527 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||
1528 | return (LitTy.getScalarSizeInBits() < 16); | ||||
1529 | }, | ||||
1530 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | ||||
1531 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||
1532 | .widenScalarToNextPow2(BigTyIdx, 32); | ||||
1533 | |||||
1534 | } | ||||
1535 | |||||
1536 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||||
1537 | .legalForCartesianProduct(AllS32Vectors, {S32}) | ||||
1538 | .legalForCartesianProduct(AllS64Vectors, {S64}) | ||||
1539 | .clampNumElements(0, V16S32, V32S32) | ||||
1540 | .clampNumElements(0, V2S64, V16S64) | ||||
1541 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); | ||||
1542 | |||||
1543 | if (ST.hasScalarPackInsts()) { | ||||
1544 | BuildVector | ||||
1545 | // FIXME: Should probably widen s1 vectors straight to s32 | ||||
1546 | .minScalarOrElt(0, S16) | ||||
1547 | .minScalar(1, S16); | ||||
1548 | |||||
1549 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||
1550 | .legalFor({V2S16, S32}) | ||||
1551 | .lower(); | ||||
1552 | } else { | ||||
1553 | BuildVector.customFor({V2S16, S16}); | ||||
1554 | BuildVector.minScalarOrElt(0, S32); | ||||
1555 | |||||
1556 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||
1557 | .customFor({V2S16, S32}) | ||||
1558 | .lower(); | ||||
1559 | } | ||||
1560 | |||||
1561 | BuildVector.legalIf(isRegisterType(0)); | ||||
1562 | |||||
1563 | // FIXME: Clamp maximum size | ||||
1564 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | ||||
1565 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
1566 | .clampMaxNumElements(0, S32, 32) | ||||
1567 | .clampMaxNumElements(1, S16, 2) // TODO: Make 4? | ||||
1568 | .clampMaxNumElements(0, S16, 64); | ||||
1569 | |||||
1570 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | ||||
1571 | |||||
1572 | // Merge/Unmerge | ||||
1573 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | ||||
1574 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | ||||
1575 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | ||||
1576 | |||||
1577 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | ||||
1578 | const LLT Ty = Query.Types[TypeIdx]; | ||||
1579 | if (Ty.isVector()) { | ||||
1580 | const LLT &EltTy = Ty.getElementType(); | ||||
1581 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) | ||||
1582 | return true; | ||||
1583 | if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) | ||||
1584 | return true; | ||||
1585 | } | ||||
1586 | return false; | ||||
1587 | }; | ||||
1588 | |||||
1589 | auto &Builder = getActionDefinitionsBuilder(Op) | ||||
1590 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
1591 | .lowerFor({{S16, V2S16}}) | ||||
1592 | .lowerIf([=](const LegalityQuery &Query) { | ||||
1593 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
1594 | return BigTy.getSizeInBits() == 32; | ||||
1595 | }) | ||||
1596 | // Try to widen to s16 first for small types. | ||||
1597 | // TODO: Only do this on targets with legal s16 shifts | ||||
1598 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) | ||||
1599 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | ||||
1600 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||
1601 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | ||||
1602 | elementTypeIs(1, S16)), | ||||
1603 | changeTo(1, V2S16)) | ||||
1604 | // Clamp the little scalar to s32-s512 and make it a power of 2. It's not | ||||
1605 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | ||||
1606 | // valid. | ||||
1607 | .clampScalar(LitTyIdx, S32, S512) | ||||
1608 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | ||||
1609 | // Break up vectors with weird elements into scalars | ||||
1610 | .fewerElementsIf( | ||||
1611 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, | ||||
1612 | scalarize(0)) | ||||
1613 | .fewerElementsIf( | ||||
1614 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, | ||||
1615 | scalarize(1)) | ||||
1616 | .clampScalar(BigTyIdx, S32, MaxScalar); | ||||
1617 | |||||
1618 | if (Op == G_MERGE_VALUES) { | ||||
1619 | Builder.widenScalarIf( | ||||
1620 | // TODO: Use 16-bit shifts if legal for 8-bit values? | ||||
1621 | [=](const LegalityQuery &Query) { | ||||
1622 | const LLT Ty = Query.Types[LitTyIdx]; | ||||
1623 | return Ty.getSizeInBits() < 32; | ||||
1624 | }, | ||||
1625 | changeTo(LitTyIdx, S32)); | ||||
1626 | } | ||||
1627 | |||||
1628 | Builder.widenScalarIf( | ||||
1629 | [=](const LegalityQuery &Query) { | ||||
1630 | const LLT Ty = Query.Types[BigTyIdx]; | ||||
1631 | return Ty.getSizeInBits() % 16 != 0; | ||||
1632 | }, | ||||
1633 | [=](const LegalityQuery &Query) { | ||||
1634 | // Pick the next power of 2, or a multiple of 64 over 128, | ||||
1635 | // whichever is smaller. | ||||
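| // e.g. an 88-bit type is widened to 128 bits (next power of 2), while | ||||
| // a 264-bit type is widened to 320 bits (multiple of 64) rather than | ||||
| // 512. | ||||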
1636 | const LLT &Ty = Query.Types[BigTyIdx]; | ||||
1637 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | ||||
1638 | if (NewSizeInBits >= 256) { | ||||
1639 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | ||||
1640 | if (RoundedTo < NewSizeInBits) | ||||
1641 | NewSizeInBits = RoundedTo; | ||||
1642 | } | ||||
1643 | return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | ||||
1644 | }) | ||||
1645 | // Any vectors left are the wrong size. Scalarize them. | ||||
1646 | .scalarize(0) | ||||
1647 | .scalarize(1); | ||||
1648 | } | ||||
1649 | |||||
1650 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||
1651 | // RegBankSelect. | ||||
1652 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | ||||
1653 | .legalFor({{S32}, {S64}}); | ||||
1654 | |||||
1655 | if (ST.hasVOP3PInsts()) { | ||||
1656 | SextInReg.lowerFor({{V2S16}}) | ||||
1657 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | ||||
1658 | // get more vector shift opportunities, since we'll get those when | ||||
1659 | // expanded. | ||||
1660 | .clampMaxNumElementsStrict(0, S16, 2); | ||||
1661 | } else if (ST.has16BitInsts()) { | ||||
1662 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | ||||
1663 | } else { | ||||
1664 | // Prefer to promote to s32 before lowering if we don't have 16-bit | ||||
1665 | // shifts. This avoids a lot of intermediate truncate and extend operations. | ||||
1666 | SextInReg.lowerFor({{S32}, {S64}}); | ||||
1667 | } | ||||
1668 | |||||
1669 | SextInReg | ||||
1670 | .scalarize(0) | ||||
1671 | .clampScalar(0, S32, S64) | ||||
1672 | .lower(); | ||||
1673 | |||||
1674 | getActionDefinitionsBuilder({G_ROTR, G_ROTL}) | ||||
1675 | .scalarize(0) | ||||
1676 | .lower(); | ||||
1677 | |||||
1678 | // TODO: Only try to form v2s16 with legal packed instructions. | ||||
1679 | getActionDefinitionsBuilder(G_FSHR) | ||||
1680 | .legalFor({{S32, S32}}) | ||||
1681 | .lowerFor({{V2S16, V2S16}}) | ||||
1682 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
1683 | .scalarize(0) | ||||
1684 | .lower(); | ||||
1685 | |||||
1686 | if (ST.hasVOP3PInsts()) { | ||||
1687 | getActionDefinitionsBuilder(G_FSHL) | ||||
1688 | .lowerFor({{V2S16, V2S16}}) | ||||
1689 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
1690 | .scalarize(0) | ||||
1691 | .lower(); | ||||
1692 | } else { | ||||
1693 | getActionDefinitionsBuilder(G_FSHL) | ||||
1694 | .scalarize(0) | ||||
1695 | .lower(); | ||||
1696 | } | ||||
1697 | |||||
1698 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | ||||
1699 | .legalFor({S64}); | ||||
1700 | |||||
1701 | getActionDefinitionsBuilder(G_FENCE) | ||||
1702 | .alwaysLegal(); | ||||
1703 | |||||
1704 | getActionDefinitionsBuilder({G_SMULO, G_UMULO}) | ||||
1705 | .scalarize(0) | ||||
1706 | .minScalar(0, S32) | ||||
1707 | .lower(); | ||||
1708 | |||||
1709 | getActionDefinitionsBuilder({G_SBFX, G_UBFX}) | ||||
1710 | .legalFor({{S32, S32}, {S64, S32}}) | ||||
1711 | .clampScalar(1, S32, S32) | ||||
1712 | .clampScalar(0, S32, S64) | ||||
1713 | .widenScalarToNextPow2(0) | ||||
1714 | .scalarize(0); | ||||
1715 | |||||
1716 | getActionDefinitionsBuilder({ | ||||
1717 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | ||||
1718 | G_FCOPYSIGN, | ||||
1719 | |||||
1720 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | ||||
1721 | G_ATOMICRMW_NAND, | ||||
1722 | G_ATOMICRMW_FSUB, | ||||
1723 | G_READ_REGISTER, | ||||
1724 | G_WRITE_REGISTER, | ||||
1725 | |||||
1726 | G_SADDO, G_SSUBO, | ||||
1727 | |||||
1728 | // TODO: Implement | ||||
1729 | G_FMINIMUM, G_FMAXIMUM}).lower(); | ||||
1730 | |||||
1731 | getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) | ||||
1732 | .lower(); | ||||
1733 | |||||
1734 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | ||||
1735 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | ||||
1736 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | ||||
1737 | .unsupported(); | ||||
1738 | |||||
1739 | getLegacyLegalizerInfo().computeTables(); | ||||
1740 | verify(*ST.getInstrInfo()); | ||||
1741 | } | ||||
1742 | |||||
1743 | bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, | ||||
1744 | MachineInstr &MI) const { | ||||
1745 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
1746 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
1747 | |||||
1748 | switch (MI.getOpcode()) { | ||||
1749 | case TargetOpcode::G_ADDRSPACE_CAST: | ||||
1750 | return legalizeAddrSpaceCast(MI, MRI, B); | ||||
1751 | case TargetOpcode::G_FRINT: | ||||
1752 | return legalizeFrint(MI, MRI, B); | ||||
1753 | case TargetOpcode::G_FCEIL: | ||||
1754 | return legalizeFceil(MI, MRI, B); | ||||
1755 | case TargetOpcode::G_FREM: | ||||
1756 | return legalizeFrem(MI, MRI, B); | ||||
1757 | case TargetOpcode::G_INTRINSIC_TRUNC: | ||||
1758 | return legalizeIntrinsicTrunc(MI, MRI, B); | ||||
1759 | case TargetOpcode::G_SITOFP: | ||||
1760 | return legalizeITOFP(MI, MRI, B, true); | ||||
1761 | case TargetOpcode::G_UITOFP: | ||||
1762 | return legalizeITOFP(MI, MRI, B, false); | ||||
1763 | case TargetOpcode::G_FPTOSI: | ||||
1764 | return legalizeFPTOI(MI, MRI, B, true); | ||||
1765 | case TargetOpcode::G_FPTOUI: | ||||
1766 | return legalizeFPTOI(MI, MRI, B, false); | ||||
1767 | case TargetOpcode::G_FMINNUM: | ||||
1768 | case TargetOpcode::G_FMAXNUM: | ||||
1769 | case TargetOpcode::G_FMINNUM_IEEE: | ||||
1770 | case TargetOpcode::G_FMAXNUM_IEEE: | ||||
1771 | return legalizeMinNumMaxNum(Helper, MI); | ||||
1772 | case TargetOpcode::G_EXTRACT_VECTOR_ELT: | ||||
1773 | return legalizeExtractVectorElt(MI, MRI, B); | ||||
1774 | case TargetOpcode::G_INSERT_VECTOR_ELT: | ||||
1775 | return legalizeInsertVectorElt(MI, MRI, B); | ||||
1776 | case TargetOpcode::G_FSIN: | ||||
1777 | case TargetOpcode::G_FCOS: | ||||
1778 | return legalizeSinCos(MI, MRI, B); | ||||
1779 | case TargetOpcode::G_GLOBAL_VALUE: | ||||
1780 | return legalizeGlobalValue(MI, MRI, B); | ||||
1781 | case TargetOpcode::G_LOAD: | ||||
1782 | case TargetOpcode::G_SEXTLOAD: | ||||
1783 | case TargetOpcode::G_ZEXTLOAD: | ||||
1784 | return legalizeLoad(Helper, MI); | ||||
1785 | case TargetOpcode::G_FMAD: | ||||
1786 | return legalizeFMad(MI, MRI, B); | ||||
1787 | case TargetOpcode::G_FDIV: | ||||
1788 | return legalizeFDIV(MI, MRI, B); | ||||
1789 | case TargetOpcode::G_UDIV: | ||||
1790 | case TargetOpcode::G_UREM: | ||||
1791 | case TargetOpcode::G_UDIVREM: | ||||
1792 | return legalizeUnsignedDIV_REM(MI, MRI, B); | ||||
1793 | case TargetOpcode::G_SDIV: | ||||
1794 | case TargetOpcode::G_SREM: | ||||
1795 | case TargetOpcode::G_SDIVREM: | ||||
1796 | return legalizeSignedDIV_REM(MI, MRI, B); | ||||
1797 | case TargetOpcode::G_ATOMIC_CMPXCHG: | ||||
1798 | return legalizeAtomicCmpXChg(MI, MRI, B); | ||||
1799 | case TargetOpcode::G_FLOG: | ||||
1800 | return legalizeFlog(MI, B, numbers::ln2f); | ||||
1801 | case TargetOpcode::G_FLOG10: | ||||
1802 | return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); | ||||
1803 | case TargetOpcode::G_FEXP: | ||||
1804 | return legalizeFExp(MI, B); | ||||
1805 | case TargetOpcode::G_FPOW: | ||||
1806 | return legalizeFPow(MI, B); | ||||
1807 | case TargetOpcode::G_FFLOOR: | ||||
1808 | return legalizeFFloor(MI, MRI, B); | ||||
1809 | case TargetOpcode::G_BUILD_VECTOR: | ||||
1810 | case TargetOpcode::G_BUILD_VECTOR_TRUNC: | ||||
1811 | return legalizeBuildVector(MI, MRI, B); | ||||
1812 | case TargetOpcode::G_MUL: | ||||
1813 | return legalizeMul(Helper, MI); | ||||
1814 | case TargetOpcode::G_CTLZ: | ||||
1815 | case TargetOpcode::G_CTTZ: | ||||
1816 | return legalizeCTLZ_CTTZ(MI, MRI, B); | ||||
1817 | case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: | ||||
1818 | return legalizeFPTruncRound(MI, B); | ||||
1819 | default: | ||||
1820 | return false; | ||||
1821 | } | ||||
1822 | |||||
1823 | llvm_unreachable("expected switch to return"); | ||||
1824 | } | ||||
1825 | |||||
1826 | Register AMDGPULegalizerInfo::getSegmentAperture( | ||||
1827 | unsigned AS, | ||||
1828 | MachineRegisterInfo &MRI, | ||||
1829 | MachineIRBuilder &B) const { | ||||
1830 | MachineFunction &MF = B.getMF(); | ||||
1831 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||
1832 | const LLT S32 = LLT::scalar(32); | ||||
1833 | const LLT S64 = LLT::scalar(64); | ||||
1834 | |||||
1835 | assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); | ||||
1836 | |||||
1837 | if (ST.hasApertureRegs()) { | ||||
1838 | // Note: this register is somewhat broken. When used as a 32-bit operand, | ||||
1839 | // it only returns zeroes. The real value is in the upper 32 bits. | ||||
1840 | // Thus, we must emit an extract of the high 32 bits. | ||||
1841 | const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) | ||||
1842 | ? AMDGPU::SRC_SHARED_BASE | ||||
1843 | : AMDGPU::SRC_PRIVATE_BASE; | ||||
1844 | // FIXME: It would be more natural to emit a COPY here, but then copy | ||||
1845 | // coalescing would kick in and it would think it's okay to use the "HI" | ||||
1846 | // subregister (instead of extracting the HI 32 bits) which is an artificial | ||||
1847 | // (unusable) register. | ||||
1848 | // Register TableGen definitions would need an overhaul to get rid of the | ||||
1849 | // artificial "HI" aperture registers and prevent this kind of issue from | ||||
1850 | // happening. | ||||
1851 | Register Dst = MRI.createGenericVirtualRegister(S64); | ||||
1852 | MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); | ||||
1853 | B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); | ||||
1854 | return B.buildUnmerge(S32, Dst).getReg(1); | ||||
1855 | } | ||||
1856 | |||||
1857 | // TODO: can we be smarter about machine pointer info? | ||||
1858 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
1859 | Register LoadAddr = MRI.createGenericVirtualRegister( | ||||
1860 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
1861 | // For code object version 5, private_base and shared_base are passed through | ||||
1862 | // implicit kernargs. | ||||
1863 | if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= | ||||
1864 | AMDGPU::AMDHSA_COV5) { | ||||
1865 | AMDGPUTargetLowering::ImplicitParameter Param = | ||||
1866 | AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE | ||||
1867 | : AMDGPUTargetLowering::PRIVATE_BASE; | ||||
1868 | uint64_t Offset = | ||||
1869 | ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); | ||||
1870 | |||||
1871 | Register KernargPtrReg = MRI.createGenericVirtualRegister( | ||||
1872 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
1873 | |||||
1874 | if (!loadInputValue(KernargPtrReg, B, | ||||
1875 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
1876 | return Register(); | ||||
1877 | |||||
1878 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
1879 | PtrInfo, | ||||
1880 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
1881 | MachineMemOperand::MOInvariant, | ||||
1882 | LLT::scalar(32), commonAlignment(Align(64), Offset)); | ||||
1883 | |||||
1884 | // Pointer address | ||||
1885 | B.buildPtrAdd(LoadAddr, KernargPtrReg, | ||||
1886 | B.buildConstant(LLT::scalar(64), Offset).getReg(0)); | ||||
1887 | // Load address | ||||
1888 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | ||||
1889 | } | ||||
1890 | |||||
1891 | Register QueuePtr = MRI.createGenericVirtualRegister( | ||||
1892 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
1893 | |||||
1894 | if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||
1895 | return Register(); | ||||
1896 | |||||
1897 | // Offset into amd_queue_t for group_segment_aperture_base_hi / | ||||
1898 | // private_segment_aperture_base_hi. | ||||
1899 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; | ||||
1900 | |||||
1901 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
1902 | PtrInfo, | ||||
1903 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
1904 | MachineMemOperand::MOInvariant, | ||||
1905 | LLT::scalar(32), commonAlignment(Align(64), StructOffset)); | ||||
1906 | |||||
1907 | B.buildPtrAdd(LoadAddr, QueuePtr, | ||||
1908 | B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); | ||||
1909 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | ||||
1910 | } | ||||
1911 | |||||
1912 | /// Return true if the value is a known valid address, such that a null check is | ||||
1913 | /// not necessary. | ||||
1914 | static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, | ||||
1915 | const AMDGPUTargetMachine &TM, unsigned AddrSpace) { | ||||
1916 | MachineInstr *Def = MRI.getVRegDef(Val); | ||||
1917 | switch (Def->getOpcode()) { | ||||
1918 | case AMDGPU::G_FRAME_INDEX: | ||||
1919 | case AMDGPU::G_GLOBAL_VALUE: | ||||
1920 | case AMDGPU::G_BLOCK_ADDR: | ||||
1921 | return true; | ||||
1922 | case AMDGPU::G_CONSTANT: { | ||||
1923 | const ConstantInt *CI = Def->getOperand(1).getCImm(); | ||||
1924 | return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); | ||||
1925 | } | ||||
1926 | default: | ||||
1927 | return false; | ||||
1928 | } | ||||
1929 | |||||
1930 | return false; | ||||
1931 | } | ||||
1932 | |||||
1933 | bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( | ||||
1934 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
1935 | MachineIRBuilder &B) const { | ||||
1936 | MachineFunction &MF = B.getMF(); | ||||
1937 | |||||
1938 | const LLT S32 = LLT::scalar(32); | ||||
1939 | Register Dst = MI.getOperand(0).getReg(); | ||||
1940 | Register Src = MI.getOperand(1).getReg(); | ||||
1941 | |||||
1942 | LLT DstTy = MRI.getType(Dst); | ||||
1943 | LLT SrcTy = MRI.getType(Src); | ||||
1944 | unsigned DestAS = DstTy.getAddressSpace(); | ||||
1945 | unsigned SrcAS = SrcTy.getAddressSpace(); | ||||
1946 | |||||
1947 | // TODO: Avoid reloading from the queue ptr for each cast, or at least each | ||||
1948 | // vector element. | ||||
1949 | assert(!DstTy.isVector()); | ||||
1950 | |||||
1951 | const AMDGPUTargetMachine &TM | ||||
1952 | = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); | ||||
1953 | |||||
1954 | if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { | ||||
1955 | MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); | ||||
1956 | return true; | ||||
1957 | } | ||||
1958 | |||||
1959 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS && | ||||
1960 | (DestAS == AMDGPUAS::LOCAL_ADDRESS || | ||||
1961 | DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { | ||||
1962 | if (isKnownNonNull(Src, MRI, TM, SrcAS)) { | ||||
1963 | // Extract low 32-bits of the pointer. | ||||
1964 | B.buildExtract(Dst, Src, 0); | ||||
1965 | MI.eraseFromParent(); | ||||
1966 | return true; | ||||
1967 | } | ||||
1968 | |||||
1969 | unsigned NullVal = TM.getNullPointerValue(DestAS); | ||||
1970 | |||||
1971 | auto SegmentNull = B.buildConstant(DstTy, NullVal); | ||||
1972 | auto FlatNull = B.buildConstant(SrcTy, 0); | ||||
1973 | |||||
1974 | // Extract low 32-bits of the pointer. | ||||
1975 | auto PtrLo32 = B.buildExtract(DstTy, Src, 0); | ||||
1976 | |||||
1977 | auto CmpRes = | ||||
1978 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); | ||||
1979 | B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); | ||||
1980 | |||||
1981 | MI.eraseFromParent(); | ||||
1982 | return true; | ||||
1983 | } | ||||
1984 | |||||
1985 | if (DestAS == AMDGPUAS::FLAT_ADDRESS && | ||||
1986 | (SrcAS == AMDGPUAS::LOCAL_ADDRESS || | ||||
1987 | SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { | ||||
1988 | Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); | ||||
1989 | if (!ApertureReg.isValid()) | ||||
1990 | return false; | ||||
1991 | |||||
1992 | // Coerce the type of the low half of the result so we can use merge_values. | ||||
1993 | Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); | ||||
1994 | |||||
1995 | // TODO: Should we allow mismatched types but matching sizes in merges to | ||||
1996 | // avoid the ptrtoint? | ||||
1997 | auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); | ||||
1998 | |||||
1999 | if (isKnownNonNull(Src, MRI, TM, SrcAS)) { | ||||
2000 | B.buildCopy(Dst, BuildPtr); | ||||
2001 | MI.eraseFromParent(); | ||||
2002 | return true; | ||||
2003 | } | ||||
2004 | |||||
2005 | auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); | ||||
2006 | auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); | ||||
2007 | |||||
2008 | auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, | ||||
2009 | SegmentNull.getReg(0)); | ||||
2010 | |||||
2011 | B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); | ||||
2012 | |||||
2013 | MI.eraseFromParent(); | ||||
2014 | return true; | ||||
2015 | } | ||||
2016 | |||||
2017 | if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && | ||||
2018 | SrcTy.getSizeInBits() == 64) { | ||||
2019 | // Truncate. | ||||
2020 | B.buildExtract(Dst, Src, 0); | ||||
2021 | MI.eraseFromParent(); | ||||
2022 | return true; | ||||
2023 | } | ||||
2024 | |||||
2025 | if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && | ||||
2026 | DstTy.getSizeInBits() == 64) { | ||||
2027 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); | ||||
2028 | uint32_t AddrHiVal = Info->get32BitAddressHighBits(); | ||||
2029 | auto PtrLo = B.buildPtrToInt(S32, Src); | ||||
2030 | auto HighAddr = B.buildConstant(S32, AddrHiVal); | ||||
2031 | B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); | ||||
2032 | MI.eraseFromParent(); | ||||
2033 | return true; | ||||
2034 | } | ||||
2035 | |||||
2036 | DiagnosticInfoUnsupported InvalidAddrSpaceCast( | ||||
2037 | MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); | ||||
2038 | |||||
2039 | LLVMContext &Ctx = MF.getFunction().getContext(); | ||||
2040 | Ctx.diagnose(InvalidAddrSpaceCast); | ||||
2041 | B.buildUndef(Dst); | ||||
2042 | MI.eraseFromParent(); | ||||
2043 | return true; | ||||
2044 | } | ||||
2045 | |||||
2046 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||
2047 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2048 | MachineIRBuilder &B) const { | ||||
2049 | Register Src = MI.getOperand(1).getReg(); | ||||
2050 | LLT Ty = MRI.getType(Src); | ||||
2051 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64); | ||||
2052 | |||||
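| // The rounding trick below: adding copysign(2^52, Src) and subtracting | ||||
| // it again pushes the fractional bits out of the 52-bit mantissa, so | ||||
| // Tmp2 is Src rounded to the nearest integer. Inputs with a magnitude | ||||
| // above 0x1.fffffffffffffp+51 are already integral and are selected | ||||
| // through unchanged. | ||||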
2053 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||
2054 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||
2055 | |||||
2056 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||
2057 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||
2058 | |||||
2059 | // TODO: Should this propagate fast-math-flags? | ||||
2060 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||
2061 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||
2062 | |||||
2063 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||
2064 | auto Fabs = B.buildFAbs(Ty, Src); | ||||
2065 | |||||
2066 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||
2067 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||
2068 | MI.eraseFromParent(); | ||||
2069 | return true; | ||||
2070 | } | ||||
2071 | |||||
2072 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||
2073 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2074 | MachineIRBuilder &B) const { | ||||
2075 | |||||
2076 | const LLT S1 = LLT::scalar(1); | ||||
2077 | const LLT S64 = LLT::scalar(64); | ||||
2078 | |||||
2079 | Register Src = MI.getOperand(1).getReg(); | ||||
2080 | assert(MRI.getType(Src) == S64); | ||||
2081 | |||||
2082 | // result = trunc(src) | ||||
2083 | // if (src > 0.0 && src != result) | ||||
2084 | // result += 1.0 | ||||
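| // e.g. fceil(2.3): trunc = 2.0, and 2.3 > 0.0 && 2.3 != 2.0, so the | ||||
| // result is 3.0; fceil(-2.3): trunc = -2.0, the condition fails, and | ||||
| // the result stays -2.0. | ||||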
2085 | |||||
2086 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | ||||
2087 | |||||
2088 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||
2089 | const auto One = B.buildFConstant(S64, 1.0); | ||||
2090 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | ||||
2091 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | ||||
2092 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | ||||
2093 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||
2094 | |||||
2095 | // TODO: Should this propagate fast-math-flags? | ||||
2096 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||
2097 | MI.eraseFromParent(); | ||||
2098 | return true; | ||||
2099 | } | ||||
2100 | |||||
2101 | bool AMDGPULegalizerInfo::legalizeFrem( | ||||
2102 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2103 | MachineIRBuilder &B) const { | ||||
2104 | Register DstReg = MI.getOperand(0).getReg(); | ||||
2105 | Register Src0Reg = MI.getOperand(1).getReg(); | ||||
2106 | Register Src1Reg = MI.getOperand(2).getReg(); | ||||
2107 | auto Flags = MI.getFlags(); | ||||
2108 | LLT Ty = MRI.getType(DstReg); | ||||
2109 | |||||
2110 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); | ||||
2111 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); | ||||
2112 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); | ||||
2113 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); | ||||
2114 | MI.eraseFromParent(); | ||||
2115 | return true; | ||||
2116 | } | ||||
2117 | |||||
2118 | static MachineInstrBuilder extractF64Exponent(Register Hi, | ||||
2119 | MachineIRBuilder &B) { | ||||
2120 | const unsigned FractBits = 52; | ||||
2121 | const unsigned ExpBits = 11; | ||||
2122 | LLT S32 = LLT::scalar(32); | ||||
2123 | |||||
2124 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||
2125 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||
2126 | |||||
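| // Bits [62:52] of an IEEE-754 double hold the biased exponent; extract | ||||
| // them from the high word and subtract the bias (1023) to get the | ||||
| // unbiased exponent (e.g. 0 for 1.0). | ||||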
2127 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | ||||
2128 | .addUse(Hi) | ||||
2129 | .addUse(Const0.getReg(0)) | ||||
2130 | .addUse(Const1.getReg(0)); | ||||
2131 | |||||
2132 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||
2133 | } | ||||
2134 | |||||
2135 | bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( | ||||
2136 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2137 | MachineIRBuilder &B) const { | ||||
2138 | const LLT S1 = LLT::scalar(1); | ||||
2139 | const LLT S32 = LLT::scalar(32); | ||||
2140 | const LLT S64 = LLT::scalar(64); | ||||
2141 | |||||
2142 | Register Src = MI.getOperand(1).getReg(); | ||||
2143 | assert(MRI.getType(Src) == S64); | ||||
2144 | |||||
2145 | // TODO: Should this use extract since the low half is unused? | ||||
2146 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||
2147 | Register Hi = Unmerge.getReg(1); | ||||
2148 | |||||
2149 | // Extract the upper half, since this is where we will find the sign and | ||||
2150 | // exponent. | ||||
2151 | auto Exp = extractF64Exponent(Hi, B); | ||||
2152 | |||||
2153 | const unsigned FractBits = 52; | ||||
2154 | |||||
2155 | // Extract the sign bit. | ||||
2156 | const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); | ||||
2157 | auto SignBit = B.buildAnd(S32, Hi, SignBitMask); | ||||
2158 | |||||
2159 | const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); | ||||
2160 | |||||
2161 | const auto Zero32 = B.buildConstant(S32, 0); | ||||
2162 | |||||
2163 | // Extend back to 64-bits. | ||||
2164 | auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); | ||||
2165 | |||||
2166 | auto Shr = B.buildAShr(S64, FractMask, Exp); | ||||
2167 | auto Not = B.buildNot(S64, Shr); | ||||
2168 | auto Tmp0 = B.buildAnd(S64, Src, Not); | ||||
2169 | auto FiftyOne = B.buildConstant(S32, FractBits - 1); | ||||
2170 | |||||
2171 | auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); | ||||
2172 | auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); | ||||
2173 | |||||
2174 | auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); | ||||
2175 | B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); | ||||
2176 | MI.eraseFromParent(); | ||||
2177 | return true; | ||||
2178 | } | ||||
2179 | |||||
2180 | bool AMDGPULegalizerInfo::legalizeITOFP( | ||||
2181 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2182 | MachineIRBuilder &B, bool Signed) const { | ||||
2183 | |||||
2184 | Register Dst = MI.getOperand(0).getReg(); | ||||
2185 | Register Src = MI.getOperand(1).getReg(); | ||||
2186 | |||||
2187 | const LLT S64 = LLT::scalar(64); | ||||
2188 | const LLT S32 = LLT::scalar(32); | ||||
2189 | |||||
2190 | assert(MRI.getType(Src) == S64); | ||||
2191 | |||||
2192 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||
2193 | auto ThirtyTwo = B.buildConstant(S32, 32); | ||||
2194 | |||||
2195 | if (MRI.getType(Dst) == S64) { | ||||
2196 | auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) | ||||
2197 | : B.buildUITOFP(S64, Unmerge.getReg(1)); | ||||
2198 | |||||
2199 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | ||||
2200 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | ||||
2201 | .addUse(CvtHi.getReg(0)) | ||||
2202 | .addUse(ThirtyTwo.getReg(0)); | ||||
2203 | |||||
2204 | // TODO: Should this propagate fast-math-flags? | ||||
2205 | B.buildFAdd(Dst, LdExp, CvtLo); | ||||
2206 | MI.eraseFromParent(); | ||||
2207 | return true; | ||||
2208 | } | ||||
2209 | |||||
2210 | assert(MRI.getType(Dst) == S32); | ||||
2211 | |||||
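| // Shift the significant bits of the 64-bit source into the high word, | ||||
| // fold any nonzero discarded low bits into a sticky bit so rounding | ||||
| // still sees them, convert that 32-bit value, and rescale the result | ||||
| // by 2^(32 - ShAmt) with ldexp. | ||||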
2212 | auto One = B.buildConstant(S32, 1); | ||||
2213 | |||||
2214 | MachineInstrBuilder ShAmt; | ||||
2215 | if (Signed) { | ||||
2216 | auto ThirtyOne = B.buildConstant(S32, 31); | ||||
2217 | auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); | ||||
2218 | auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); | ||||
2219 | auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); | ||||
2220 | auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, | ||||
2221 | /*HasSideEffects=*/false) | ||||
2222 | .addUse(Unmerge.getReg(1)); | ||||
2223 | auto LS2 = B.buildSub(S32, LS, One); | ||||
2224 | ShAmt = B.buildUMin(S32, LS2, MaxShAmt); | ||||
2225 | } else | ||||
2226 | ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); | ||||
2227 | auto Norm = B.buildShl(S64, Src, ShAmt); | ||||
2228 | auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); | ||||
2229 | auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); | ||||
2230 | auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); | ||||
2231 | auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); | ||||
2232 | auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); | ||||
2233 | B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, | ||||
2234 | /*HasSideEffects=*/false) | ||||
2235 | .addUse(FVal.getReg(0)) | ||||
2236 | .addUse(Scale.getReg(0)); | ||||
2237 | MI.eraseFromParent(); | ||||
2238 | return true; | ||||
2239 | } | ||||
2240 | |||||
2241 | // TODO: Copied from DAG implementation. Verify logic and document how this | ||||
2242 | // actually works. | ||||
2243 | bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, | ||||
2244 | MachineRegisterInfo &MRI, | ||||
2245 | MachineIRBuilder &B, | ||||
2246 | bool Signed) const { | ||||
2247 | |||||
2248 | Register Dst = MI.getOperand(0).getReg(); | ||||
2249 | Register Src = MI.getOperand(1).getReg(); | ||||
2250 | |||||
2251 | const LLT S64 = LLT::scalar(64); | ||||
2252 | const LLT S32 = LLT::scalar(32); | ||||
2253 | |||||
2254 | const LLT SrcLT = MRI.getType(Src); | ||||
2255 | assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); | ||||
2256 | |||||
2257 | unsigned Flags = MI.getFlags(); | ||||
2258 | |||||
2259 | // The basic idea of converting a floating point number into a pair of 32-bit | ||||
2260 | // integers is illustrated as follows: | ||||
2261 | // | ||||
2262 | // tf := trunc(val); | ||||
2263 | // hif := floor(tf * 2^-32); | ||||
2264 | // lof := tf - hif * 2^32; // lof is always positive due to floor. | ||||
2265 | // hi := fptoi(hif); | ||||
2266 | // lo := fptoi(lof); | ||||
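| // | ||||
| // e.g. (in exact arithmetic) for val = 2^40 + 7: hif = 256 and lof = 7, | ||||
| // so hi and lo merge back to (256 << 32) + 7. | ||||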
2267 | // | ||||
2268 | auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); | ||||
2269 | MachineInstrBuilder Sign; | ||||
2270 | if (Signed && SrcLT == S32) { | ||||
2271 | // However, a 32-bit floating point number has only 23 bits mantissa and | ||||
2272 | // it's not enough to hold all the significant bits of `lof` if val is | ||||
2273 | // negative. To avoid the loss of precision, we need to take the absolute | ||||
2274 | // value after truncating and flip the result back based on the original | ||||
2275 | // signedness. | ||||
2276 | Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); | ||||
2277 | Trunc = B.buildFAbs(S32, Trunc, Flags); | ||||
2278 | } | ||||
2279 | MachineInstrBuilder K0, K1; | ||||
2280 | if (SrcLT == S64) { | ||||
2281 | K0 = B.buildFConstant( | ||||
2282 | S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000))); | ||||
2283 | K1 = B.buildFConstant( | ||||
2284 | S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); | ||||
2285 | } else { | ||||
2286 | K0 = B.buildFConstant( | ||||
2287 | S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000))); | ||||
2288 | K1 = B.buildFConstant( | ||||
2289 | S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000))); | ||||
2290 | } | ||||
2291 | |||||
2292 | auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); | ||||
2293 | auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); | ||||
2294 | auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); | ||||
2295 | |||||
2296 | auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) | ||||
2297 | : B.buildFPTOUI(S32, FloorMul); | ||||
2298 | auto Lo = B.buildFPTOUI(S32, Fma); | ||||
2299 | |||||
2300 | if (Signed && SrcLT == S32) { | ||||
2301 | // Flip the result based on the signedness, which is either all 0s or 1s. | ||||
2302 | Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); | ||||
2303 | // r := xor({lo, hi}, sign) - sign; | ||||
2304 | B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), | ||||
2305 | Sign); | ||||
2306 | } else | ||||
2307 | B.buildMergeLikeInstr(Dst, {Lo, Hi}); | ||||
2308 | MI.eraseFromParent(); | ||||
2309 | |||||
2310 | return true; | ||||
2311 | } | ||||
2312 | |||||
2313 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, | ||||
2314 | MachineInstr &MI) const { | ||||
2315 | MachineFunction &MF = Helper.MIRBuilder.getMF(); | ||||
2316 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
2317 | |||||
2318 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||
2319 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||
2320 | |||||
2321 | // With ieee_mode disabled, the instructions have the correct behavior | ||||
2322 | // already for G_FMINNUM/G_FMAXNUM | ||||
2323 | if (!MFI->getMode().IEEE) | ||||
2324 | return !IsIEEEOp; | ||||
2325 | |||||
2326 | if (IsIEEEOp) | ||||
2327 | return true; | ||||
2328 | |||||
2329 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||
2330 | } | ||||
2331 | |||||
2332 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||
2333 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2334 | MachineIRBuilder &B) const { | ||||
2335 | // TODO: Should move some of this into LegalizerHelper. | ||||
2336 | |||||
2337 | // TODO: Promote dynamic indexing of s16 to s32 | ||||
2338 | |||||
2339 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||
2340 | // constant before this, so we shouldn't need | ||||
2341 | // getIConstantVRegValWithLookThrough. | ||||
2342 | std::optional<ValueAndVReg> MaybeIdxVal = | ||||
2343 | getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | ||||
2344 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||
2345 | return true; | ||||
2346 | const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); | ||||
2347 | |||||
2348 | Register Dst = MI.getOperand(0).getReg(); | ||||
2349 | Register Vec = MI.getOperand(1).getReg(); | ||||
2350 | |||||
2351 | LLT VecTy = MRI.getType(Vec); | ||||
2352 | LLT EltTy = VecTy.getElementType(); | ||||
2353 | assert(EltTy == MRI.getType(Dst)); | ||||
2354 | |||||
2355 | if (IdxVal < VecTy.getNumElements()) { | ||||
2356 | auto Unmerge = B.buildUnmerge(EltTy, Vec); | ||||
2357 | B.buildCopy(Dst, Unmerge.getReg(IdxVal)); | ||||
2358 | } else { | ||||
2359 | B.buildUndef(Dst); | ||||
2360 | } | ||||
2361 | |||||
2362 | MI.eraseFromParent(); | ||||
2363 | return true; | ||||
2364 | } | ||||
2365 | |||||
2366 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||
2367 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2368 | MachineIRBuilder &B) const { | ||||
2369 | // TODO: Should move some of this into LegalizerHelper. | ||||
2370 | |||||
2371 | // TODO: Promote dynamic indexing of s16 to s32 | ||||
2372 | |||||
2373 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||
2374 | // constant before this, so we shouldn't need | ||||
2375 | // getIConstantVRegValWithLookThrough. | ||||
2376 | std::optional<ValueAndVReg> MaybeIdxVal = | ||||
2377 | getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); | ||||
2378 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||
2379 | return true; | ||||
2380 | |||||
2381 | const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); | ||||
2382 | Register Dst = MI.getOperand(0).getReg(); | ||||
2383 | Register Vec = MI.getOperand(1).getReg(); | ||||
2384 | Register Ins = MI.getOperand(2).getReg(); | ||||
2385 | |||||
2386 | LLT VecTy = MRI.getType(Vec); | ||||
2387 | LLT EltTy = VecTy.getElementType(); | ||||
2388 | assert(EltTy == MRI.getType(Ins)); | ||||
2389 | (void)Ins; | ||||
2390 | |||||
2391 | unsigned NumElts = VecTy.getNumElements(); | ||||
2392 | if (IdxVal < NumElts) { | ||||
2393 | SmallVector<Register, 8> SrcRegs; | ||||
2394 | for (unsigned i = 0; i < NumElts; ++i) | ||||
2395 | SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); | ||||
2396 | B.buildUnmerge(SrcRegs, Vec); | ||||
2397 | |||||
2398 | SrcRegs[IdxVal] = MI.getOperand(2).getReg(); | ||||
2399 | B.buildMergeLikeInstr(Dst, SrcRegs); | ||||
2400 | } else { | ||||
2401 | B.buildUndef(Dst); | ||||
2402 | } | ||||
2403 | |||||
2404 | MI.eraseFromParent(); | ||||
2405 | return true; | ||||
2406 | } | ||||
2407 | |||||
2408 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||
2409 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2410 | MachineIRBuilder &B) const { | ||||
2411 | |||||
2412 | Register DstReg = MI.getOperand(0).getReg(); | ||||
2413 | Register SrcReg = MI.getOperand(1).getReg(); | ||||
2414 | LLT Ty = MRI.getType(DstReg); | ||||
2415 | unsigned Flags = MI.getFlags(); | ||||
2416 | |||||
2417 | Register TrigVal; | ||||
2418 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); | ||||
2419 | if (ST.hasTrigReducedRange()) { | ||||
2420 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||
2421 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||
2422 | .addUse(MulVal.getReg(0)) | ||||
2423 | .setMIFlags(Flags).getReg(0); | ||||
2424 | } else | ||||
2425 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||
2426 | |||||
2427 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||
2428 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||
2429 | B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) | ||||
2430 | .addUse(TrigVal) | ||||
2431 | .setMIFlags(Flags); | ||||
2432 | MI.eraseFromParent(); | ||||
2433 | return true; | ||||
2434 | } | ||||
2435 | |||||
2436 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, | ||||
2437 | MachineIRBuilder &B, | ||||
2438 | const GlobalValue *GV, | ||||
2439 | int64_t Offset, | ||||
2440 | unsigned GAFlags) const { | ||||
2441 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); | ||||
2442 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | ||||
2443 | // to the following code sequence: | ||||
2444 | // | ||||
2445 | // For constant address space: | ||||
2446 | // s_getpc_b64 s[0:1] | ||||
2447 | // s_add_u32 s0, s0, $symbol | ||||
2448 | // s_addc_u32 s1, s1, 0 | ||||
2449 | // | ||||
2450 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||
2451 | // a fixup or relocation is emitted to replace $symbol with a literal | ||||
2452 | // constant, which is a pc-relative offset from the encoding of the $symbol | ||||
2453 | // operand to the global variable. | ||||
2454 | // | ||||
2455 | // For global address space: | ||||
2456 | // s_getpc_b64 s[0:1] | ||||
2457 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | ||||
2458 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | ||||
2459 | // | ||||
2460 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||
2461 | // fixups or relocations are emitted to replace $symbol@*@lo and | ||||
2462 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | ||||
2463 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | ||||
2464 | // operand to the global variable. | ||||
2465 | // | ||||
2466 | // What we want here is an offset from the value returned by s_getpc | ||||
2467 | // (which is the address of the s_add_u32 instruction) to the global | ||||
2468 | // variable, but since the encoding of $symbol starts 4 bytes after the start | ||||
2469 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | ||||
2470 | // small. This requires us to add 4 to the global variable offset in order to | ||||
2471 | // compute the correct address. Similarly for the s_addc_u32 instruction, the | ||||
2472 | // encoding of $symbol starts 12 bytes after the start of the s_add_u32 | ||||
2473 | // instruction. | ||||
2474 | |||||
2475 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
2476 | |||||
2477 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | ||||
2478 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | ||||
2479 | |||||
2480 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | ||||
2481 | .addDef(PCReg); | ||||
2482 | |||||
2483 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | ||||
2484 | if (GAFlags == SIInstrInfo::MO_NONE) | ||||
2485 | MIB.addImm(0); | ||||
2486 | else | ||||
2487 | MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); | ||||
2488 | |||||
2489 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||
2490 | |||||
2491 | if (PtrTy.getSizeInBits() == 32) | ||||
2492 | B.buildExtract(DstReg, PCReg, 0); | ||||
2493 | return true; | ||||
2494 | } | ||||
2495 | |||||
2496 | bool AMDGPULegalizerInfo::legalizeGlobalValue( | ||||
2497 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2498 | MachineIRBuilder &B) const { | ||||
2499 | Register DstReg = MI.getOperand(0).getReg(); | ||||
2500 | LLT Ty = MRI.getType(DstReg); | ||||
2501 | unsigned AS = Ty.getAddressSpace(); | ||||
2502 | |||||
2503 | const GlobalValue *GV = MI.getOperand(1).getGlobal(); | ||||
2504 | MachineFunction &MF = B.getMF(); | ||||
2505 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
2506 | |||||
2507 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { | ||||
2508 | if (!MFI->isModuleEntryFunction() && | ||||
2509 | !GV->getName().equals("llvm.amdgcn.module.lds")) { | ||||
2510 | const Function &Fn = MF.getFunction(); | ||||
2511 | DiagnosticInfoUnsupported BadLDSDecl( | ||||
2512 | Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), | ||||
2513 | DS_Warning); | ||||
2514 | Fn.getContext().diagnose(BadLDSDecl); | ||||
2515 | |||||
2516 | // We currently don't have a way to correctly allocate LDS objects that | ||||
2517 | // aren't directly associated with a kernel. We do force inlining of | ||||
2518 | // functions that use local objects. However, if these dead functions are | ||||
2519 | // not eliminated, we don't want a compile time error. Just emit a warning | ||||
2520 | // and a trap, since there should be no callable path here. | ||||
2521 | B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); | ||||
2522 | B.buildUndef(DstReg); | ||||
2523 | MI.eraseFromParent(); | ||||
2524 | return true; | ||||
2525 | } | ||||
2526 | |||||
2527 | // TODO: We could emit code to handle the initialization somewhere. | ||||
2528 | // We ignore the initializer for now and legalize it to allow selection. | ||||
2529 | // The initializer will be rejected with an error during assembly emission anyway. | ||||
2530 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
2531 | if (!TLI->shouldUseLDSConstAddress(GV)) { | ||||
2532 | MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); | ||||
2533 | return true; // Leave in place. | ||||
2534 | } | ||||
2535 | |||||
2536 | if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { | ||||
2537 | Type *Ty = GV->getValueType(); | ||||
2538 | // HIP uses an unsized array `extern __shared__ T s[]` or similar | ||||
2539 | // zero-sized type in other languages to declare the dynamic shared | ||||
2540 | // memory whose size is not known at compile time. They will be | ||||
2541 | // allocated by the runtime and placed directly after the statically | ||||
2542 | // allocated ones. They all share the same offset. | ||||
2543 | if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { | ||||
2544 | // Adjust alignment for that dynamic shared memory array. | ||||
2545 | MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); | ||||
2546 | LLT S32 = LLT::scalar(32); | ||||
2547 | auto Sz = | ||||
2548 | B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); | ||||
2549 | B.buildIntToPtr(DstReg, Sz); | ||||
2550 | MI.eraseFromParent(); | ||||
2551 | return true; | ||||
2552 | } | ||||
2553 | } | ||||
2554 | |||||
2555 | B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), | ||||
2556 | *cast<GlobalVariable>(GV))); | ||||
2557 | MI.eraseFromParent(); | ||||
2558 | return true; | ||||
2559 | } | ||||
2560 | |||||
2561 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
2562 | |||||
2563 | if (TLI->shouldEmitFixup(GV)) { | ||||
2564 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); | ||||
2565 | MI.eraseFromParent(); | ||||
2566 | return true; | ||||
2567 | } | ||||
2568 | |||||
2569 | if (TLI->shouldEmitPCReloc(GV)) { | ||||
2570 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); | ||||
2571 | MI.eraseFromParent(); | ||||
2572 | return true; | ||||
2573 | } | ||||
2574 | |||||
2575 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
2576 | Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); | ||||
2577 | |||||
2578 | LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; | ||||
2579 | MachineMemOperand *GOTMMO = MF.getMachineMemOperand( | ||||
2580 | MachinePointerInfo::getGOT(MF), | ||||
2581 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
2582 | MachineMemOperand::MOInvariant, | ||||
2583 | LoadTy, Align(8)); | ||||
2584 | |||||
2585 | buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); | ||||
2586 | |||||
2587 | if (Ty.getSizeInBits() == 32) { | ||||
2588 | // Truncate if this is a 32-bit constant address. | ||||
2589 | auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); | ||||
2590 | B.buildExtract(DstReg, Load, 0); | ||||
2591 | } else | ||||
2592 | B.buildLoad(DstReg, GOTAddr, *GOTMMO); | ||||
2593 | |||||
2594 | MI.eraseFromParent(); | ||||
2595 | return true; | ||||
2596 | } | ||||
2597 | |||||
2598 | static LLT widenToNextPowerOf2(LLT Ty) { | ||||
2599 | if (Ty.isVector()) | ||||
2600 | return Ty.changeElementCount( | ||||
2601 | ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); | ||||
2602 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | ||||
2603 | } | ||||
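// For example: s96 -> s128 and <3 x s16> -> <4 x s16>, while a type whose
// size (or element count) is already a power of two is returned unchanged.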
2604 | |||||
2605 | bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, | ||||
2606 | MachineInstr &MI) const { | ||||
2607 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
2608 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
2609 | GISelChangeObserver &Observer = Helper.Observer; | ||||
2610 | |||||
2611 | Register PtrReg = MI.getOperand(1).getReg(); | ||||
2612 | LLT PtrTy = MRI.getType(PtrReg); | ||||
2613 | unsigned AddrSpace = PtrTy.getAddressSpace(); | ||||
2614 | |||||
2615 | if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||
2616 | LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
2617 | auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); | ||||
2618 | Observer.changingInstr(MI); | ||||
2619 | MI.getOperand(1).setReg(Cast.getReg(0)); | ||||
2620 | Observer.changedInstr(MI); | ||||
2621 | return true; | ||||
2622 | } | ||||
2623 | |||||
2624 | if (MI.getOpcode() != AMDGPU::G_LOAD) | ||||
2625 | return false; | ||||
2626 | |||||
2627 | Register ValReg = MI.getOperand(0).getReg(); | ||||
2628 | LLT ValTy = MRI.getType(ValReg); | ||||
2629 | |||||
2630 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
2631 | const unsigned ValSize = ValTy.getSizeInBits(); | ||||
2632 | const LLT MemTy = MMO->getMemoryType(); | ||||
2633 | const Align MemAlign = MMO->getAlign(); | ||||
2634 | const unsigned MemSize = MemTy.getSizeInBits(); | ||||
2635 | const uint64_t AlignInBits = 8 * MemAlign.value(); | ||||
2636 | |||||
2637 | // Widen non-power-of-2 loads to the alignment if needed | ||||
2638 | if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { | ||||
2639 | const unsigned WideMemSize = PowerOf2Ceil(MemSize); | ||||
2640 | |||||
2641 | // This was already the correct extending load result type, so just adjust | ||||
2642 | // the memory type. | ||||
2643 | if (WideMemSize == ValSize) { | ||||
2644 | MachineFunction &MF = B.getMF(); | ||||
2645 | |||||
2646 | MachineMemOperand *WideMMO = | ||||
2647 | MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); | ||||
2648 | Observer.changingInstr(MI); | ||||
2649 | MI.setMemRefs(MF, {WideMMO}); | ||||
2650 | Observer.changedInstr(MI); | ||||
2651 | return true; | ||||
2652 | } | ||||
2653 | |||||
2654 | // Don't bother handling an edge case that should probably never be produced. | ||||
2655 | if (ValSize > WideMemSize) | ||||
2656 | return false; | ||||
2657 | |||||
2658 | LLT WideTy = widenToNextPowerOf2(ValTy); | ||||
2659 | |||||
2660 | Register WideLoad; | ||||
2661 | if (!WideTy.isVector()) { | ||||
2662 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||
2663 | B.buildTrunc(ValReg, WideLoad).getReg(0); | ||||
2664 | } else { | ||||
2665 | // Extract the subvector. | ||||
2666 | |||||
2667 | if (isRegisterType(ValTy)) { | ||||
2668 | // If this is a case where G_EXTRACT is legal, use it. | ||||
2669 | // (e.g. <3 x s32> -> <4 x s32>) | ||||
2670 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||
2671 | B.buildExtract(ValReg, WideLoad, 0); | ||||
2672 | } else { | ||||
2673 | // For cases where the widened type isn't a nice register value, unmerge | ||||
2674 | // from a widened register (e.g. <3 x s16> -> <4 x s16>) | ||||
2675 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||
2676 | B.buildDeleteTrailingVectorElements(ValReg, WideLoad); | ||||
2677 | } | ||||
2678 | } | ||||
2679 | |||||
2680 | MI.eraseFromParent(); | ||||
2681 | return true; | ||||
2682 | } | ||||
2683 | |||||
2684 | return false; | ||||
2685 | } | ||||
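// Worked example (illustrative): a sufficiently aligned G_LOAD of <3 x s32>
// (96 bits) from the constant address space is widened to a <4 x s32>
// (128-bit) load followed by a G_EXTRACT of the low 96 bits, taking the
// isRegisterType() branch above; a <3 x s16> load instead goes through the
// buildDeleteTrailingVectorElements() path.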
2686 | |||||
2687 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||
2688 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
2689 | MachineIRBuilder &B) const { | ||||
2690 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
2691 | assert(Ty.isScalar()); | ||||
2692 | |||||
2693 | MachineFunction &MF = B.getMF(); | ||||
2694 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
2695 | |||||
2696 | // TODO: Always legal with future ftz flag. | ||||
2697 | // FIXME: Do we need just output? | ||||
2698 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||
2699 | return true; | ||||
2700 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||
2701 | return true; | ||||
2702 | |||||
2703 | MachineIRBuilder HelperBuilder(MI); | ||||
2704 | GISelObserverWrapper DummyObserver; | ||||
2705 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||
2706 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||
2707 | } | ||||
2708 | |||||
2709 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||
2710 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
2711 | Register DstReg = MI.getOperand(0).getReg(); | ||||
2712 | Register PtrReg = MI.getOperand(1).getReg(); | ||||
2713 | Register CmpVal = MI.getOperand(2).getReg(); | ||||
2714 | Register NewVal = MI.getOperand(3).getReg(); | ||||
2715 | |||||
2716 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && | ||||
2717 | "this should not have been custom lowered"); | ||||
2718 | |||||
2719 | LLT ValTy = MRI.getType(CmpVal); | ||||
2720 | LLT VecTy = LLT::fixed_vector(2, ValTy); | ||||
2721 | |||||
2722 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||
2723 | |||||
2724 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||
2725 | .addDef(DstReg) | ||||
2726 | .addUse(PtrReg) | ||||
2727 | .addUse(PackedVal) | ||||
2728 | .setMemRefs(MI.memoperands()); | ||||
2729 | |||||
2730 | MI.eraseFromParent(); | ||||
2731 | return true; | ||||
2732 | } | ||||
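// Note (assumption about operand order): the <2 x T> vector packs the data as
// { new value, compare value }, which is believed to match the data-operand
// layout the AMDGPU cmpswap instructions expect.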
2733 | |||||
2734 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||
2735 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||
2736 | Register Dst = MI.getOperand(0).getReg(); | ||||
2737 | Register Src = MI.getOperand(1).getReg(); | ||||
2738 | LLT Ty = B.getMRI()->getType(Dst); | ||||
2739 | unsigned Flags = MI.getFlags(); | ||||
2740 | |||||
2741 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||
2742 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||
2743 | |||||
2744 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||
2745 | MI.eraseFromParent(); | ||||
2746 | return true; | ||||
2747 | } | ||||
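// In other words, log-of-any-base is lowered to a scaled log2, e.g.:
//   log10(x) = log2(x) * (1 / log2(10)) ~= log2(x) * 0.30103
//   ln(x)    = log2(x) * (1 / log2(e))  ~= log2(x) * 0.69315
// with the caller supplying the precomputed 1/log2(base) as Log2BaseInverted.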
2748 | |||||
2749 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||
2750 | MachineIRBuilder &B) const { | ||||
2751 | Register Dst = MI.getOperand(0).getReg(); | ||||
2752 | Register Src = MI.getOperand(1).getReg(); | ||||
2753 | unsigned Flags = MI.getFlags(); | ||||
2754 | LLT Ty = B.getMRI()->getType(Dst); | ||||
2755 | |||||
2756 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||
2757 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||
2758 | B.buildFExp2(Dst, Mul, Flags); | ||||
2759 | MI.eraseFromParent(); | ||||
2760 | return true; | ||||
2761 | } | ||||
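// This uses the identity exp(x) = exp2(x * log2(e)), log2(e) ~= 1.442695,
// mapping G_FEXP onto the natively supported exp2.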
2762 | |||||
2763 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||
2764 | MachineIRBuilder &B) const { | ||||
2765 | Register Dst = MI.getOperand(0).getReg(); | ||||
2766 | Register Src0 = MI.getOperand(1).getReg(); | ||||
2767 | Register Src1 = MI.getOperand(2).getReg(); | ||||
2768 | unsigned Flags = MI.getFlags(); | ||||
2769 | LLT Ty = B.getMRI()->getType(Dst); | ||||
2770 | const LLT S16 = LLT::scalar(16); | ||||
2771 | const LLT S32 = LLT::scalar(32); | ||||
2772 | |||||
2773 | if (Ty == S32) { | ||||
2774 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||
2775 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||
2776 | .addUse(Log.getReg(0)) | ||||
2777 | .addUse(Src1) | ||||
2778 | .setMIFlags(Flags); | ||||
2779 | B.buildFExp2(Dst, Mul, Flags); | ||||
2780 | } else if (Ty == S16) { | ||||
2781 | // There's no f16 fmul_legacy, so we need to convert for it. | ||||
2782 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||
2783 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||
2784 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||
2785 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||
2786 | .addUse(Ext0.getReg(0)) | ||||
2787 | .addUse(Ext1.getReg(0)) | ||||
2788 | .setMIFlags(Flags); | ||||
2789 | |||||
2790 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||
2791 | } else | ||||
2792 | return false; | ||||
2793 | |||||
2794 | MI.eraseFromParent(); | ||||
2795 | return true; | ||||
2796 | } | ||||
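// The sequence above implements pow(x, y) = exp2(y * log2(x)). The legacy
// multiply is presumably used so that y == 0 yields 0 even when log2(x) is
// infinite or NaN (legacy fmul treats 0 * anything as 0), giving pow(x, 0)
// == 1 for those edge cases.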
2797 | |||||
2798 | // Find a source register, ignoring any possible source modifiers. | ||||
2799 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||
2800 | Register ModSrc = OrigSrc; | ||||
2801 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||
2802 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||
2803 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||
2804 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||
2805 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||
2806 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||
2807 | return ModSrc; | ||||
2808 | } | ||||
2809 | |||||
2810 | bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, | ||||
2811 | MachineRegisterInfo &MRI, | ||||
2812 | MachineIRBuilder &B) const { | ||||
2813 | |||||
2814 | const LLT S1 = LLT::scalar(1); | ||||
2815 | const LLT S64 = LLT::scalar(64); | ||||
2816 | Register Dst = MI.getOperand(0).getReg(); | ||||
2817 | Register OrigSrc = MI.getOperand(1).getReg(); | ||||
2818 | unsigned Flags = MI.getFlags(); | ||||
2819 | assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && | ||||
2820 | "this should not have been custom lowered"); | ||||
2821 | |||||
2822 | // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) | ||||
2823 | // is used instead. However, SI doesn't have V_FLOOR_F64, so the most | ||||
2824 | // efficient way to implement it is using V_FRACT_F64. The workaround for the | ||||
2825 | // V_FRACT bug is: | ||||
2826 | // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) | ||||
2827 | // | ||||
2828 | // Convert floor(x) to (x - fract(x)) | ||||
2829 | |||||
2830 | auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) | ||||
2831 | .addUse(OrigSrc) | ||||
2832 | .setMIFlags(Flags); | ||||
2833 | |||||
2834 | // Give source modifier matching some assistance before obscuring a foldable | ||||
2835 | // pattern. | ||||
2836 | |||||
2837 | // TODO: We can avoid the neg on the fract? The input sign to fract | ||||
2838 | // shouldn't matter? | ||||
2839 | Register ModSrc = stripAnySourceMods(OrigSrc, MRI); | ||||
2840 | |||||
2841 | auto Const = | ||||
2842 | B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff)); | ||||
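// (0x3fefffffffffffff is the largest double strictly below 1.0, i.e. the
// 0.99999999999999999 clamp value from the comment above.)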
2843 | |||||
2844 | Register Min = MRI.createGenericVirtualRegister(S64); | ||||
2845 | |||||
2846 | // We don't need to concern ourselves with the snan handling difference, so | ||||
2847 | // use the one which will directly select. | ||||
2848 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
2849 | if (MFI->getMode().IEEE) | ||||
2850 | B.buildFMinNumIEEE(Min, Fract, Const, Flags); | ||||
2851 | else | ||||
2852 | B.buildFMinNum(Min, Fract, Const, Flags); | ||||
2853 | |||||
2854 | Register CorrectedFract = Min; | ||||
2855 | if (!MI.getFlag(MachineInstr::FmNoNans)) { | ||||
2856 | auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); | ||||
2857 | CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); | ||||
2858 | } | ||||
2859 | |||||
2860 | auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); | ||||
2861 | B.buildFAdd(Dst, OrigSrc, NegFract, Flags); | ||||
2862 | |||||
2863 | MI.eraseFromParent(); | ||||
2864 | return true; | ||||
2865 | } | ||||
2866 | |||||
2867 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||
2868 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||
2869 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||
2870 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
2871 | Register Dst = MI.getOperand(0).getReg(); | ||||
2872 | const LLT S32 = LLT::scalar(32); | ||||
2873 | const LLT S16 = LLT::scalar(16); | ||||
2874 | assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); | ||||
2875 | |||||
2876 | Register Src0 = MI.getOperand(1).getReg(); | ||||
2877 | Register Src1 = MI.getOperand(2).getReg(); | ||||
2878 | |||||
2879 | if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { | ||||
2880 | assert(MRI.getType(Src0) == S32); | ||||
2881 | Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); | ||||
2882 | Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); | ||||
2883 | } | ||||
2884 | |||||
2885 | auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); | ||||
2886 | B.buildBitcast(Dst, Merge); | ||||
2887 | |||||
2888 | MI.eraseFromParent(); | ||||
2889 | return true; | ||||
2890 | } | ||||
2891 | |||||
2892 | // Build a big integer multiply or multiply-add using MAD_64_32 instructions. | ||||
2893 | // | ||||
2894 | // Source and accumulation registers must all be 32 bits wide. | ||||
2895 | // | ||||
2896 | // TODO: When the multiply is uniform, we should produce a code sequence | ||||
2897 | // that is better suited to instruction selection on the SALU. Instead of | ||||
2898 | // the outer loop going over parts of the result, the outer loop should go | ||||
2899 | // over parts of one of the factors. This should result in instruction | ||||
2900 | // selection that makes full use of S_ADDC_U32 instructions. | ||||
2901 | void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, | ||||
2902 | MutableArrayRef<Register> Accum, | ||||
2903 | ArrayRef<Register> Src0, | ||||
2904 | ArrayRef<Register> Src1, | ||||
2905 | bool UsePartialMad64_32, | ||||
2906 | bool SeparateOddAlignedProducts) const { | ||||
2907 | // Use (possibly empty) vectors of S1 registers to represent the set of | ||||
2908 | // carries from one pair of positions to the next. | ||||
2909 | using Carry = SmallVector<Register, 2>; | ||||
2910 | |||||
2911 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
2912 | GISelKnownBits &KB = *Helper.getKnownBits(); | ||||
2913 | |||||
2914 | const LLT S1 = LLT::scalar(1); | ||||
2915 | const LLT S32 = LLT::scalar(32); | ||||
2916 | const LLT S64 = LLT::scalar(64); | ||||
2917 | |||||
2918 | Register Zero32; | ||||
2919 | Register Zero64; | ||||
2920 | |||||
2921 | auto getZero32 = [&]() -> Register { | ||||
2922 | if (!Zero32) | ||||
2923 | Zero32 = B.buildConstant(S32, 0).getReg(0); | ||||
2924 | return Zero32; | ||||
2925 | }; | ||||
2926 | auto getZero64 = [&]() -> Register { | ||||
2927 | if (!Zero64) | ||||
2928 | Zero64 = B.buildConstant(S64, 0).getReg(0); | ||||
2929 | return Zero64; | ||||
2930 | }; | ||||
2931 | |||||
2932 | SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; | ||||
2933 | for (unsigned i = 0; i < Src0.size(); ++i) { | ||||
2934 | Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); | ||||
2935 | Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); | ||||
2936 | } | ||||
2937 | |||||
2938 | // Merge the given carries into the 32-bit LocalAccum, which is modified | ||||
2939 | // in-place. | ||||
2940 | // | ||||
2941 | // Returns the carry-out, which is a single S1 register or null. | ||||
2942 | auto mergeCarry = | ||||
2943 | [&](Register &LocalAccum, const Carry &CarryIn) -> Register { | ||||
2944 | if (CarryIn.empty()) | ||||
2945 | return Register(); | ||||
2946 | |||||
2947 | bool HaveCarryOut = true; | ||||
2948 | Register CarryAccum; | ||||
2949 | if (CarryIn.size() == 1) { | ||||
2950 | if (!LocalAccum) { | ||||
2951 | LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); | ||||
2952 | return Register(); | ||||
2953 | } | ||||
2954 | |||||
2955 | CarryAccum = getZero32(); | ||||
2956 | } else { | ||||
2957 | CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); | ||||
2958 | for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { | ||||
2959 | CarryAccum = | ||||
2960 | B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) | ||||
2961 | .getReg(0); | ||||
2962 | } | ||||
2963 | |||||
2964 | if (!LocalAccum) { | ||||
2965 | LocalAccum = getZero32(); | ||||
2966 | HaveCarryOut = false; | ||||
2967 | } | ||||
2968 | } | ||||
2969 | |||||
2970 | auto Add = | ||||
2971 | B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); | ||||
2972 | LocalAccum = Add.getReg(0); | ||||
2973 | return HaveCarryOut ? Add.getReg(1) : Register(); | ||||
2974 | }; | ||||
2975 | |||||
2976 | // Build a multiply-add chain to compute | ||||
2977 | // | ||||
2978 | // LocalAccum + (partial products at DstIndex) | ||||
2979 | // + (opportunistic subset of CarryIn) | ||||
2980 | // | ||||
2981 | // LocalAccum is an array of one or two 32-bit registers that are updated | ||||
2982 | // in-place. The incoming registers may be null. | ||||
2983 | // | ||||
2984 | // In some edge cases, carry-ins can be consumed "for free". In that case, | ||||
2985 | // the consumed carry bits are removed from CarryIn in-place. | ||||
2986 | auto buildMadChain = | ||||
2987 | [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) | ||||
2988 | -> Carry { | ||||
2989 | assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || | ||||
2990 | (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); | ||||
2991 | |||||
2992 | Carry CarryOut; | ||||
2993 | unsigned j0 = 0; | ||||
2994 | |||||
2995 | // Use plain 32-bit multiplication for the most significant part of the | ||||
2996 | // result by default. | ||||
2997 | if (LocalAccum.size() == 1 && | ||||
2998 | (!UsePartialMad64_32 || !CarryIn.empty())) { | ||||
2999 | do { | ||||
3000 | // Skip multiplication if one of the operands is 0 | ||||
3001 | unsigned j1 = DstIndex - j0; | ||||
3002 | if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { | ||||
3003 | ++j0; | ||||
3004 | continue; | ||||
3005 | } | ||||
3006 | auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); | ||||
3007 | if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { | ||||
3008 | LocalAccum[0] = Mul.getReg(0); | ||||
3009 | } else { | ||||
3010 | if (CarryIn.empty()) { | ||||
3011 | LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); | ||||
3012 | } else { | ||||
3013 | LocalAccum[0] = | ||||
3014 | B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) | ||||
3015 | .getReg(0); | ||||
3016 | CarryIn.pop_back(); | ||||
3017 | } | ||||
3018 | } | ||||
3019 | ++j0; | ||||
3020 | } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); | ||||
3021 | } | ||||
3022 | |||||
3023 | // Build full 64-bit multiplies. | ||||
3024 | if (j0 <= DstIndex) { | ||||
3025 | bool HaveSmallAccum = false; | ||||
3026 | Register Tmp; | ||||
3027 | |||||
3028 | if (LocalAccum[0]) { | ||||
3029 | if (LocalAccum.size() == 1) { | ||||
3030 | Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); | ||||
3031 | HaveSmallAccum = true; | ||||
3032 | } else if (LocalAccum[1]) { | ||||
3033 | Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); | ||||
3034 | HaveSmallAccum = false; | ||||
3035 | } else { | ||||
3036 | Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); | ||||
3037 | HaveSmallAccum = true; | ||||
3038 | } | ||||
3039 | } else { | ||||
3040 | assert(LocalAccum.size() == 1 || !LocalAccum[1]); | ||||
3041 | Tmp = getZero64(); | ||||
3042 | HaveSmallAccum = true; | ||||
3043 | } | ||||
3044 | |||||
3045 | do { | ||||
3046 | unsigned j1 = DstIndex - j0; | ||||
3047 | if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { | ||||
3048 | ++j0; | ||||
3049 | continue; | ||||
3050 | } | ||||
3051 | auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, | ||||
3052 | {Src0[j0], Src1[j1], Tmp}); | ||||
3053 | Tmp = Mad.getReg(0); | ||||
3054 | if (!HaveSmallAccum) | ||||
3055 | CarryOut.push_back(Mad.getReg(1)); | ||||
3056 | HaveSmallAccum = false; | ||||
3057 | |||||
3058 | ++j0; | ||||
3059 | } while (j0 <= DstIndex); | ||||
3060 | |||||
3061 | auto Unmerge = B.buildUnmerge(S32, Tmp); | ||||
3062 | LocalAccum[0] = Unmerge.getReg(0); | ||||
3063 | if (LocalAccum.size() > 1) | ||||
3064 | LocalAccum[1] = Unmerge.getReg(1); | ||||
3065 | } | ||||
3066 | |||||
3067 | return CarryOut; | ||||
3068 | }; | ||||
3069 | |||||
3070 | // Outer multiply loop, iterating over destination parts from least | ||||
3071 | // significant to most significant parts. | ||||
3072 | // | ||||
3073 | // The columns of the following diagram correspond to the destination parts | ||||
3074 | // affected by one iteration of the outer loop (ignoring boundary | ||||
3075 | // conditions). | ||||
3076 | // | ||||
3077 | // Dest index relative to 2 * i: 1 0 -1 | ||||
3078 | // ------ | ||||
3079 | // Carries from previous iteration: e o | ||||
3080 | // Even-aligned partial product sum: E E . | ||||
3081 | // Odd-aligned partial product sum: O O | ||||
3082 | // | ||||
3083 | // 'o' is OddCarry, 'e' is EvenCarry. | ||||
3084 | // EE and OO are computed from partial products via buildMadChain and use | ||||
3085 | // accumulation where possible and appropriate. | ||||
3086 | // | ||||
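// Worked example (sketch) for a 64 x 64 -> 64 bit multiply, i.e. two 32-bit
// parts per operand, (a1:a0) * (b1:b0):
//   Accum[0] = lo(a0*b0)
//   Accum[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0)      (mod 2^32)
// The even-aligned product a0*b0 is handled at i == 0, and the odd-aligned
// products a0*b1 and a1*b0 are handled at i == 1 via the 2*i-1 path below.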
3087 | Register SeparateOddCarry; | ||||
3088 | Carry EvenCarry; | ||||
3089 | Carry OddCarry; | ||||
3090 | |||||
3091 | for (unsigned i = 0; i <= Accum.size() / 2; ++i) { | ||||
3092 | Carry OddCarryIn = std::move(OddCarry); | ||||
3093 | Carry EvenCarryIn = std::move(EvenCarry); | ||||
3094 | OddCarry.clear(); | ||||
3095 | EvenCarry.clear(); | ||||
3096 | |||||
3097 | // Partial products at offset 2 * i. | ||||
3098 | if (2 * i < Accum.size()) { | ||||
3099 | auto LocalAccum = Accum.drop_front(2 * i).take_front(2); | ||||
3100 | EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); | ||||
3101 | } | ||||
3102 | |||||
3103 | // Partial products at offset 2 * i - 1. | ||||
3104 | if (i > 0) { | ||||
3105 | if (!SeparateOddAlignedProducts) { | ||||
3106 | auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); | ||||
3107 | OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); | ||||
3108 | } else { | ||||
3109 | bool IsHighest = 2 * i >= Accum.size(); | ||||
3110 | Register SeparateOddOut[2]; | ||||
3111 | auto LocalAccum = MutableArrayRef(SeparateOddOut) | ||||
3112 | .take_front(IsHighest ? 1 : 2); | ||||
3113 | OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); | ||||
3114 | |||||
3115 | MachineInstr *Lo; | ||||
3116 | |||||
3117 | if (i == 1) { | ||||
3118 | if (!IsHighest) | ||||
3119 | Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); | ||||
3120 | else | ||||
3121 | Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); | ||||
3122 | } else { | ||||
3123 | Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], | ||||
3124 | SeparateOddCarry); | ||||
3125 | } | ||||
3126 | Accum[2 * i - 1] = Lo->getOperand(0).getReg(); | ||||
3127 | |||||
3128 | if (!IsHighest) { | ||||
3129 | auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], | ||||
3130 | Lo->getOperand(1).getReg()); | ||||
3131 | Accum[2 * i] = Hi.getReg(0); | ||||
3132 | SeparateOddCarry = Hi.getReg(1); | ||||
3133 | } | ||||
3134 | } | ||||
3135 | } | ||||
3136 | |||||
3137 | // Add in the carries from the previous iteration | ||||
3138 | if (i > 0) { | ||||
3139 | if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) | ||||
3140 | EvenCarryIn.push_back(CarryOut); | ||||
3141 | |||||
3142 | if (2 * i < Accum.size()) { | ||||
3143 | if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) | ||||
3144 | OddCarry.push_back(CarryOut); | ||||
3145 | } | ||||
3146 | } | ||||
3147 | } | ||||
3148 | } | ||||
3149 | |||||
3150 | // Custom narrowing of wide multiplies using wide multiply-add instructions. | ||||
3151 | // | ||||
3152 | // TODO: If the multiply is followed by an addition, we should attempt to | ||||
3153 | // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. | ||||
3154 | bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, | ||||
3155 | MachineInstr &MI) const { | ||||
3156 | assert(ST.hasMad64_32()); | ||||
3157 | assert(MI.getOpcode() == TargetOpcode::G_MUL); | ||||
3158 | |||||
3159 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
3160 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
3161 | |||||
3162 | Register DstReg = MI.getOperand(0).getReg(); | ||||
3163 | Register Src0 = MI.getOperand(1).getReg(); | ||||
3164 | Register Src1 = MI.getOperand(2).getReg(); | ||||
3165 | |||||
3166 | LLT Ty = MRI.getType(DstReg); | ||||
3167 | assert(Ty.isScalar()); | ||||
3168 | |||||
3169 | unsigned Size = Ty.getSizeInBits(); | ||||
3170 | unsigned NumParts = Size / 32; | ||||
3171 | assert((Size % 32) == 0); | ||||
3172 | assert(NumParts >= 2); | ||||
3173 | |||||
3174 | // Whether to use MAD_64_32 for partial products whose high half is | ||||
3175 | // discarded. This avoids some ADD instructions but risks false dependency | ||||
3176 | // stalls on some subtargets in some cases. | ||||
3177 | const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; | ||||
3178 | |||||
3179 | // Whether to compute odd-aligned partial products separately. This is | ||||
3180 | // advisable on subtargets where the accumulator of MAD_64_32 must be placed | ||||
3181 | // in an even-aligned VGPR. | ||||
3182 | const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); | ||||
3183 | |||||
3184 | LLT S32 = LLT::scalar(32); | ||||
3185 | SmallVector<Register, 2> Src0Parts, Src1Parts; | ||||
3186 | for (unsigned i = 0; i < NumParts; ++i) { | ||||
3187 | Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); | ||||
3188 | Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); | ||||
3189 | } | ||||
3190 | B.buildUnmerge(Src0Parts, Src0); | ||||
3191 | B.buildUnmerge(Src1Parts, Src1); | ||||
3192 | |||||
3193 | SmallVector<Register, 2> AccumRegs(NumParts); | ||||
3194 | buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, | ||||
3195 | SeparateOddAlignedProducts); | ||||
3196 | |||||
3197 | B.buildMergeLikeInstr(DstReg, AccumRegs); | ||||
3198 | MI.eraseFromParent(); | ||||
3199 | return true; | ||||
3200 | } | ||||
3201 | |||||
3202 | // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to | ||||
3203 | // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input | ||||
3204 | // case with a single min instruction instead of a compare+select. | ||||
3205 | bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, | ||||
3206 | MachineRegisterInfo &MRI, | ||||
3207 | MachineIRBuilder &B) const { | ||||
3208 | Register Dst = MI.getOperand(0).getReg(); | ||||
3209 | Register Src = MI.getOperand(1).getReg(); | ||||
3210 | LLT DstTy = MRI.getType(Dst); | ||||
3211 | LLT SrcTy = MRI.getType(Src); | ||||
3212 | |||||
3213 | unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ | ||||
3214 | ? AMDGPU::G_AMDGPU_FFBH_U32 | ||||
3215 | : AMDGPU::G_AMDGPU_FFBL_B32; | ||||
3216 | auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); | ||||
3217 | B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); | ||||
3218 | |||||
3219 | MI.eraseFromParent(); | ||||
3220 | return true; | ||||
3221 | } | ||||
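// Worked example (sketch): for a 32-bit G_CTLZ of zero, ffbh is expected to
// return -1 (0xffffffff); the unsigned min against the source bit width then
// clamps the result to 32, the defined G_CTLZ value, without an explicit
// compare+select.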
3222 | |||||
3223 | // Check that this is a G_XOR x, -1 | ||||
3224 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | ||||
3225 | if (MI.getOpcode() != TargetOpcode::G_XOR) | ||||
3226 | return false; | ||||
3227 | auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | ||||
3228 | return ConstVal && *ConstVal == -1; | ||||
3229 | } | ||||
3230 | |||||
3231 | // Return the use branch instruction, or null if the usage is invalid. | ||||
3232 | static MachineInstr * | ||||
3233 | verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, | ||||
3234 | MachineBasicBlock *&UncondBrTarget, bool &Negated) { | ||||
3235 | Register CondDef = MI.getOperand(0).getReg(); | ||||
3236 | if (!MRI.hasOneNonDBGUse(CondDef)) | ||||
3237 | return nullptr; | ||||
3238 | |||||
3239 | MachineBasicBlock *Parent = MI.getParent(); | ||||
3240 | MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); | ||||
3241 | |||||
3242 | if (isNot(MRI, *UseMI)) { | ||||
3243 | Register NegatedCond = UseMI->getOperand(0).getReg(); | ||||
3244 | if (!MRI.hasOneNonDBGUse(NegatedCond)) | ||||
3245 | return nullptr; | ||||
3246 | |||||
3247 | // We're deleting the def of this value, so we need to remove it. | ||||
3248 | eraseInstr(*UseMI, MRI); | ||||
3249 | |||||
3250 | UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); | ||||
3251 | Negated = true; | ||||
3252 | } | ||||
3253 | |||||
3254 | if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) | ||||
3255 | return nullptr; | ||||
3256 | |||||
3257 | // Make sure the cond br is followed by a G_BR, or is the last instruction. | ||||
3258 | MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); | ||||
3259 | if (Next == Parent->end()) { | ||||
3260 | MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); | ||||
3261 | if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. | ||||
3262 | return nullptr; | ||||
3263 | UncondBrTarget = &*NextMBB; | ||||
3264 | } else { | ||||
3265 | if (Next->getOpcode() != AMDGPU::G_BR) | ||||
3266 | return nullptr; | ||||
3267 | Br = &*Next; | ||||
3268 | UncondBrTarget = Br->getOperand(0).getMBB(); | ||||
3269 | } | ||||
3270 | |||||
3271 | return UseMI; | ||||
3272 | } | ||||
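// The use pattern being checked is roughly (sketch):
//   %cond:_(s1) = G_INTRINSIC ...            ; MI, the control-flow intrinsic
//   [%ncond:_(s1) = G_XOR %cond, -1]         ; optional negation, folded above
//   G_BRCOND %cond (or %ncond), %bb.target
//   G_BR %bb.fallthrough                     ; or fall through to the next MBB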
3273 | |||||
3274 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||
3275 | const ArgDescriptor *Arg, | ||||
3276 | const TargetRegisterClass *ArgRC, | ||||
3277 | LLT ArgTy) const { | ||||
3278 | MCRegister SrcReg = Arg->getRegister(); | ||||
3279 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); | ||||
3280 | assert(DstReg.isVirtual() && "Virtual register expected"); | ||||
3281 | |||||
3282 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, | ||||
3283 | *ArgRC, B.getDebugLoc(), ArgTy); | ||||
3284 | if (Arg->isMasked()) { | ||||
3285 | // TODO: Should we try to emit this once in the entry block? | ||||
3286 | const LLT S32 = LLT::scalar(32); | ||||
3287 | const unsigned Mask = Arg->getMask(); | ||||
3288 | const unsigned Shift = llvm::countr_zero<unsigned>(Mask); | ||||
3289 | |||||
3290 | Register AndMaskSrc = LiveIn; | ||||
3291 | |||||
3292 | // TODO: Avoid clearing the high bits if we know workitem id y/z are always | ||||
3293 | // 0. | ||||
3294 | if (Shift != 0) { | ||||
3295 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||
3296 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||
3297 | } | ||||
3298 | |||||
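// Illustrative example (assuming the usual packed workitem-ID layout): if the
// Y component occupies bits [19:10] of the packed register, the descriptor
// has Mask = 0xffc00, so Shift = 10 and the line below computes
// (LiveIn >> 10) & 0x3ff. The static-analyzer warning on the buildAnd line
// corresponds to a path where Mask == 0 (making Shift == 32); a masked
// argument descriptor presumably always carries a nonzero mask, so that shift
// amount should not arise in practice.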
3299 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||
3300 | } else { | ||||
3301 | B.buildCopy(DstReg, LiveIn); | ||||
3302 | } | ||||
3303 | |||||
3304 | return true; | ||||
3305 | } | ||||
3306 | |||||
3307 | bool AMDGPULegalizerInfo::loadInputValue( | ||||
3308 | Register DstReg, MachineIRBuilder &B, | ||||
3309 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
3310 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
3311 | const ArgDescriptor *Arg; | ||||
3312 | const TargetRegisterClass *ArgRC; | ||||
3313 | LLT ArgTy; | ||||
3314 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||
3315 | |||||
3316 | if (!Arg) { | ||||
3317 | if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { | ||||
3318 | // The intrinsic may appear when we have a zero-sized kernarg segment, in which | ||||
3319 | // case the pointer argument may be missing and we use null. | ||||
3320 | B.buildConstant(DstReg, 0); | ||||
3321 | return true; | ||||
3322 | } | ||||
3323 | |||||
3324 | // It's undefined behavior if a function marked with the amdgpu-no-* | ||||
3325 | // attributes uses the corresponding intrinsic. | ||||
3326 | B.buildUndef(DstReg); | ||||
3327 | return true; | ||||
3328 | } | ||||
3329 | |||||
3330 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||
3331 | return false; // TODO: Handle these | ||||
3332 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | ||||
3333 | } | ||||
3334 | |||||
3335 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||
3336 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||
3337 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
3338 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | ||||
3339 | return false; | ||||
3340 | |||||
3341 | MI.eraseFromParent(); | ||||
3342 | return true; | ||||
3343 | } | ||||
3344 | |||||
3345 | static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, | ||||
3346 | int64_t C) { | ||||
3347 | B.buildConstant(MI.getOperand(0).getReg(), C); | ||||
3348 | MI.eraseFromParent(); | ||||
3349 | return true; | ||||
3350 | } | ||||
3351 | |||||
3352 | bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( | ||||
3353 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||
3354 | unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
3355 | unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); | ||||
3356 | if (MaxID == 0) | ||||
3357 | return replaceWithConstant(B, MI, 0); | ||||
3358 | |||||
3359 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
3360 | const ArgDescriptor *Arg; | ||||
3361 | const TargetRegisterClass *ArgRC; | ||||
3362 | LLT ArgTy; | ||||
3363 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||
3364 | |||||
3365 | Register DstReg = MI.getOperand(0).getReg(); | ||||
3366 | if (!Arg) { | ||||
3367 | // It's undefined behavior if a function marked with the amdgpu-no-* | ||||
3368 | // attributes uses the corresponding intrinsic. | ||||
3369 | B.buildUndef(DstReg); | ||||
3370 | MI.eraseFromParent(); | ||||
3371 | return true; | ||||
3372 | } | ||||
3373 | |||||
3374 | if (Arg->isMasked()) { | ||||
3375 | // Don't bother inserting AssertZext for packed IDs since we're emitting the | ||||
3376 | // masking operations anyway. | ||||
3377 | // | ||||
3378 | // TODO: We could assert the top bit is 0 for the source copy. | ||||
3379 | if (!loadInputValue(DstReg, B, ArgType)) | ||||
3380 | return false; | ||||
3381 | } else { | ||||
3382 | Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); | ||||
3383 | if (!loadInputValue(TmpReg, B, ArgType)) | ||||
3384 | return false; | ||||
3385 | B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); | ||||
3386 | } | ||||
3387 | |||||
3388 | MI.eraseFromParent(); | ||||
3389 | return true; | ||||
3390 | } | ||||
3391 | |||||
3392 | Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, | ||||
3393 | int64_t Offset) const { | ||||
3394 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
3395 | Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); | ||||
3396 | |||||
3397 | // TODO: If we passed in the base kernel offset we could have a better | ||||
3398 | // alignment than 4, but we don't really need it. | ||||
3399 | if (!loadInputValue(KernArgReg, B, | ||||
3400 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
3401 | llvm_unreachable("failed to find kernarg segment ptr")::llvm::llvm_unreachable_internal("failed to find kernarg segment ptr" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3401); | ||||
3402 | |||||
3403 | auto COffset = B.buildConstant(LLT::scalar(64), Offset); | ||||
3404 | // TODO: Should get nuw | ||||
3405 | return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); | ||||
3406 | } | ||||
3407 | |||||
3408 | /// Legalize a value that's loaded from kernel arguments. This is only used by | ||||
3409 | /// legacy intrinsics. | ||||
3410 | bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, | ||||
3411 | MachineIRBuilder &B, | ||||
3412 | uint64_t Offset, | ||||
3413 | Align Alignment) const { | ||||
3414 | Register DstReg = MI.getOperand(0).getReg(); | ||||
3415 | |||||
3416 | assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && | ||||
3417 | "unexpected kernarg parameter type"); | ||||
3418 | |||||
3419 | Register Ptr = getKernargParameterPtr(B, Offset); | ||||
3420 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
3421 | B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), | ||||
3422 | MachineMemOperand::MODereferenceable | | ||||
3423 | MachineMemOperand::MOInvariant); | ||||
3424 | MI.eraseFromParent(); | ||||
3425 | return true; | ||||
3426 | } | ||||
3427 | |||||
3428 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||
3429 | MachineRegisterInfo &MRI, | ||||
3430 | MachineIRBuilder &B) const { | ||||
3431 | Register Dst = MI.getOperand(0).getReg(); | ||||
3432 | LLT DstTy = MRI.getType(Dst); | ||||
3433 | LLT S16 = LLT::scalar(16); | ||||
3434 | LLT S32 = LLT::scalar(32); | ||||
3435 | LLT S64 = LLT::scalar(64); | ||||
3436 | |||||
3437 | if (DstTy == S16) | ||||
3438 | return legalizeFDIV16(MI, MRI, B); | ||||
3439 | if (DstTy == S32) | ||||
3440 | return legalizeFDIV32(MI, MRI, B); | ||||
3441 | if (DstTy == S64) | ||||
3442 | return legalizeFDIV64(MI, MRI, B); | ||||
3443 | |||||
3444 | return false; | ||||
3445 | } | ||||
3446 | |||||
3447 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, | ||||
3448 | Register DstDivReg, | ||||
3449 | Register DstRemReg, | ||||
3450 | Register X, | ||||
3451 | Register Y) const { | ||||
3452 | const LLT S1 = LLT::scalar(1); | ||||
3453 | const LLT S32 = LLT::scalar(32); | ||||
3454 | |||||
3455 | // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the | ||||
3456 | // algorithm used here. | ||||
3457 | |||||
3458 | // Initial estimate of inv(y). | ||||
3459 | auto FloatY = B.buildUITOFP(S32, Y); | ||||
3460 | auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); | ||||
3461 | auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); | ||||
3462 | auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); | ||||
3463 | auto Z = B.buildFPTOUI(S32, ScaledY); | ||||
3464 | |||||
3465 | // One round of UNR. | ||||
3466 | auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); | ||||
3467 | auto NegYZ = B.buildMul(S32, NegY, Z); | ||||
3468 | Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); | ||||
3469 | |||||
3470 | // Quotient/remainder estimate. | ||||
3471 | auto Q = B.buildUMulH(S32, X, Z); | ||||
3472 | auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); | ||||
3473 | |||||
3474 | // First quotient/remainder refinement. | ||||
3475 | auto One = B.buildConstant(S32, 1); | ||||
3476 | auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||
3477 | if (DstDivReg) | ||||
3478 | Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); | ||||
3479 | R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); | ||||
3480 | |||||
3481 | // Second quotient/remainder refinement. | ||||
3482 | Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||
3483 | if (DstDivReg) | ||||
3484 | B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); | ||||
3485 | |||||
3486 | if (DstRemReg) | ||||
3487 | B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); | ||||
3488 | } | ||||
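// Rough scalar equivalent of the sequence above (sketch; umulh() denotes the
// high 32 bits of a 32x32 multiply, and the exact rounding of the initial
// estimate is glossed over):
//   z  = (uint32_t)((1.0f / (float)y) * 0x1p32f);  // ~1/y in 0.32 fixed point
//   z += umulh(z, (0u - y) * z);                   // one Newton-Raphson step
//   q  = umulh(x, z);  r = x - q * y;
//   if (r >= y) { q += 1; r -= y; }                // at most two corrections
//   if (r >= y) { q += 1; r -= y; }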
3489 | |||||
3490 | // Build integer reciprocal sequence around V_RCP_IFLAG_F32 | ||||
3491 | // | ||||
3492 | // Return lo, hi of result | ||||
3493 | // | ||||
3494 | // %cvt.lo = G_UITOFP Val.lo | ||||
3495 | // %cvt.hi = G_UITOFP Val.hi | ||||
3496 | // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo | ||||
3497 | // %rcp = G_AMDGPU_RCP_IFLAG %mad | ||||
3498 | // %mul1 = G_FMUL %rcp, 0x5f7ffffc | ||||
3499 | // %mul2 = G_FMUL %mul1, 2**(-32) | ||||
3500 | // %trunc = G_INTRINSIC_TRUNC %mul2 | ||||
3501 | // %mad2 = G_FMAD %trunc, -(2**32), %mul1 | ||||
3502 | // return {G_FPTOUI %mad2, G_FPTOUI %trunc} | ||||
3503 | static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, | ||||
3504 | Register Val) { | ||||
3505 | const LLT S32 = LLT::scalar(32); | ||||
3506 | auto Unmerge = B.buildUnmerge(S32, Val); | ||||
3507 | |||||
3508 | auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); | ||||
3509 | auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); | ||||
3510 | |||||
3511 | auto Mad = B.buildFMAD( | ||||
3512 | S32, CvtHi, // 2**32 | ||||
3513 | B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); | ||||
3514 | |||||
3515 | auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); | ||||
3516 | auto Mul1 = B.buildFMul( | ||||
3517 | S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); | ||||
3518 | |||||
3519 | // 2**(-32) | ||||
3520 | auto Mul2 = B.buildFMul( | ||||
3521 | S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); | ||||
3522 | auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); | ||||
3523 | |||||
3524 | // -(2**32) | ||||
3525 | auto Mad2 = B.buildFMAD( | ||||
3526 | S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), | ||||
3527 | Mul1); | ||||
3528 | |||||
3529 | auto ResultLo = B.buildFPTOUI(S32, Mad2); | ||||
3530 | auto ResultHi = B.buildFPTOUI(S32, Trunc); | ||||
3531 | |||||
3532 | return {ResultLo.getReg(0), ResultHi.getReg(0)}; | ||||
3533 | } | ||||
3534 | |||||
3535 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, | ||||
3536 | Register DstDivReg, | ||||
3537 | Register DstRemReg, | ||||
3538 | Register Numer, | ||||
3539 | Register Denom) const { | ||||
3540 | const LLT S32 = LLT::scalar(32); | ||||
3541 | const LLT S64 = LLT::scalar(64); | ||||
3542 | const LLT S1 = LLT::scalar(1); | ||||
3543 | Register RcpLo, RcpHi; | ||||
3544 | |||||
3545 | std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); | ||||
3546 | |||||
3547 | auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); | ||||
3548 | |||||
3549 | auto Zero64 = B.buildConstant(S64, 0); | ||||
3550 | auto NegDenom = B.buildSub(S64, Zero64, Denom); | ||||
3551 | |||||
3552 | auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); | ||||
3553 | auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); | ||||
3554 | |||||
3555 | auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); | ||||
3556 | Register MulHi1_Lo = UnmergeMulHi1.getReg(0); | ||||
3557 | Register MulHi1_Hi = UnmergeMulHi1.getReg(1); | ||||
3558 | |||||
3559 | auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); | ||||
3560 | auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); | ||||
3561 | auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); | ||||
3562 | |||||
3563 | auto MulLo2 = B.buildMul(S64, NegDenom, Add1); | ||||
3564 | auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); | ||||
3565 | auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); | ||||
3566 | Register MulHi2_Lo = UnmergeMulHi2.getReg(0); | ||||
3567 | Register MulHi2_Hi = UnmergeMulHi2.getReg(1); | ||||
3568 | |||||
3569 | auto Zero32 = B.buildConstant(S32, 0); | ||||
3570 | auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); | ||||
3571 | auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); | ||||
3572 | auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); | ||||
3573 | |||||
3574 | auto UnmergeNumer = B.buildUnmerge(S32, Numer); | ||||
3575 | Register NumerLo = UnmergeNumer.getReg(0); | ||||
3576 | Register NumerHi = UnmergeNumer.getReg(1); | ||||
3577 | |||||
3578 | auto MulHi3 = B.buildUMulH(S64, Numer, Add2); | ||||
3579 | auto Mul3 = B.buildMul(S64, Denom, MulHi3); | ||||
3580 | auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); | ||||
3581 | Register Mul3_Lo = UnmergeMul3.getReg(0); | ||||
3582 | Register Mul3_Hi = UnmergeMul3.getReg(1); | ||||
3583 | auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); | ||||
3584 | auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); | ||||
3585 | auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); | ||||
3586 | auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); | ||||
3587 | |||||
3588 | auto UnmergeDenom = B.buildUnmerge(S32, Denom); | ||||
3589 | Register DenomLo = UnmergeDenom.getReg(0); | ||||
3590 | Register DenomHi = UnmergeDenom.getReg(1); | ||||
3591 | |||||
3592 | auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); | ||||
3593 | auto C1 = B.buildSExt(S32, CmpHi); | ||||
3594 | |||||
3595 | auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); | ||||
3596 | auto C2 = B.buildSExt(S32, CmpLo); | ||||
3597 | |||||
3598 | auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); | ||||
3599 | auto C3 = B.buildSelect(S32, CmpEq, C2, C1); | ||||
3600 | |||||
3601 | // TODO: Here and below, portions of the code could be enclosed in if/endif | ||||
3602 | // blocks. Currently the control flow is unconditional and we have 4 selects | ||||
3603 | // after the potential endif to substitute for PHIs. | ||||
3604 | |||||
3605 | // if C3 != 0 ... | ||||
3606 | auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); | ||||
3607 | auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); | ||||
3608 | auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); | ||||
3609 | auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); | ||||
3610 | |||||
3611 | auto One64 = B.buildConstant(S64, 1); | ||||
3612 | auto Add3 = B.buildAdd(S64, MulHi3, One64); | ||||
3613 | |||||
3614 | auto C4 = | ||||
3615 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); | ||||
3616 | auto C5 = | ||||
3617 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); | ||||
3618 | auto C6 = B.buildSelect( | ||||
3619 | S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); | ||||
3620 | |||||
3621 | // if (C6 != 0) | ||||
3622 | auto Add4 = B.buildAdd(S64, Add3, One64); | ||||
3623 | auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); | ||||
3624 | |||||
3625 | auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); | ||||
3626 | auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); | ||||
3627 | auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); | ||||
3628 | |||||
3629 | // endif C6 | ||||
3630 | // endif C3 | ||||
3631 | |||||
3632 | if (DstDivReg) { | ||||
3633 | auto Sel1 = B.buildSelect( | ||||
3634 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); | ||||
3635 | B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | ||||
3636 | Sel1, MulHi3); | ||||
3637 | } | ||||
3638 | |||||
3639 | if (DstRemReg) { | ||||
3640 | auto Sel2 = B.buildSelect( | ||||
3641 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); | ||||
3642 | B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | ||||
3643 | Sel2, Sub1); | ||||
3644 | } | ||||
3645 | } | ||||
3646 | |||||
3647 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, | ||||
3648 | MachineRegisterInfo &MRI, | ||||
3649 | MachineIRBuilder &B) const { | ||||
3650 | Register DstDivReg, DstRemReg; | ||||
3651 | switch (MI.getOpcode()) { | ||||
3652 | default: | ||||
3653 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3653); | ||||
3654 | case AMDGPU::G_UDIV: { | ||||
3655 | DstDivReg = MI.getOperand(0).getReg(); | ||||
3656 | break; | ||||
3657 | } | ||||
3658 | case AMDGPU::G_UREM: { | ||||
3659 | DstRemReg = MI.getOperand(0).getReg(); | ||||
3660 | break; | ||||
3661 | } | ||||
3662 | case AMDGPU::G_UDIVREM: { | ||||
3663 | DstDivReg = MI.getOperand(0).getReg(); | ||||
3664 | DstRemReg = MI.getOperand(1).getReg(); | ||||
3665 | break; | ||||
3666 | } | ||||
3667 | } | ||||
3668 | |||||
3669 | const LLT S64 = LLT::scalar(64); | ||||
3670 | const LLT S32 = LLT::scalar(32); | ||||
3671 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||
3672 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||
3673 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||
3674 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
3675 | |||||
3676 | if (Ty == S32) | ||||
3677 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||
3678 | else if (Ty == S64) | ||||
3679 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||
3680 | else | ||||
3681 | return false; | ||||
3682 | |||||
3683 | MI.eraseFromParent(); | ||||
3684 | return true; | ||||
3685 | } | ||||
3686 | |||||
3687 | bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, | ||||
3688 | MachineRegisterInfo &MRI, | ||||
3689 | MachineIRBuilder &B) const { | ||||
3690 | const LLT S64 = LLT::scalar(64); | ||||
3691 | const LLT S32 = LLT::scalar(32); | ||||
3692 | |||||
3693 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
3694 | if (Ty != S32 && Ty != S64) | ||||
3695 | return false; | ||||
3696 | |||||
3697 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||
3698 | Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||
3699 | Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||
3700 | |||||
3701 | auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); | ||||
3702 | auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); | ||||
3703 | auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); | ||||
3704 | |||||
3705 | LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); | ||||
3706 | RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); | ||||
3707 | |||||
3708 | LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); | ||||
3709 | RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); | ||||
3710 | |||||
3711 | Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; | ||||
3712 | switch (MI.getOpcode()) { | ||||
3713 | default: | ||||
3714 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3714); | ||||
3715 | case AMDGPU::G_SDIV: { | ||||
3716 | DstDivReg = MI.getOperand(0).getReg(); | ||||
3717 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||
3718 | break; | ||||
3719 | } | ||||
3720 | case AMDGPU::G_SREM: { | ||||
3721 | DstRemReg = MI.getOperand(0).getReg(); | ||||
3722 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||
3723 | break; | ||||
3724 | } | ||||
3725 | case AMDGPU::G_SDIVREM: { | ||||
3726 | DstDivReg = MI.getOperand(0).getReg(); | ||||
3727 | DstRemReg = MI.getOperand(1).getReg(); | ||||
3728 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||
3729 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||
3730 | break; | ||||
3731 | } | ||||
3732 | } | ||||
3733 | |||||
3734 | if (Ty == S32) | ||||
3735 | legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||
3736 | else | ||||
3737 | legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||
3738 | |||||
3739 | if (DstDivReg) { | ||||
3740 | auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); | ||||
3741 | auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); | ||||
3742 | B.buildSub(DstDivReg, SignXor, Sign); | ||||
3743 | } | ||||
3744 | |||||
3745 | if (DstRemReg) { | ||||
3746 | auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS | ||||
3747 | auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); | ||||
3748 | B.buildSub(DstRemReg, SignXor, Sign); | ||||
3749 | } | ||||
3750 | |||||
3751 | MI.eraseFromParent(); | ||||
3752 | return true; | ||||
3753 | } | ||||
3754 | |||||
3755 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | ||||
3756 | MachineRegisterInfo &MRI, | ||||
3757 | MachineIRBuilder &B) const { | ||||
3758 | Register Res = MI.getOperand(0).getReg(); | ||||
3759 | Register LHS = MI.getOperand(1).getReg(); | ||||
3760 | Register RHS = MI.getOperand(2).getReg(); | ||||
3761 | uint16_t Flags = MI.getFlags(); | ||||
3762 | LLT ResTy = MRI.getType(Res); | ||||
3763 | |||||
3764 | const MachineFunction &MF = B.getMF(); | ||||
3765 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||
3766 | MI.getFlag(MachineInstr::FmAfn); | ||||
3767 | |||||
3768 | if (!AllowInaccurateRcp) | ||||
3769 | return false; | ||||
3770 | |||||
3771 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | ||||
3772 | // 1 / x -> RCP(x) | ||||
3773 | if (CLHS->isExactlyValue(1.0)) { | ||||
3774 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||
3775 | .addUse(RHS) | ||||
3776 | .setMIFlags(Flags); | ||||
3777 | |||||
3778 | MI.eraseFromParent(); | ||||
3779 | return true; | ||||
3780 | } | ||||
3781 | |||||
3782 | // -1 / x -> RCP( FNEG(x) ) | ||||
3783 | if (CLHS->isExactlyValue(-1.0)) { | ||||
3784 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | ||||
3785 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||
3786 | .addUse(FNeg.getReg(0)) | ||||
3787 | .setMIFlags(Flags); | ||||
3788 | |||||
3789 | MI.eraseFromParent(); | ||||
3790 | return true; | ||||
3791 | } | ||||
3792 | } | ||||
3793 | |||||
3794 | // x / y -> x * (1.0 / y) | ||||
3795 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||
3796 | .addUse(RHS) | ||||
3797 | .setMIFlags(Flags); | ||||
3798 | B.buildFMul(Res, LHS, RCP, Flags); | ||||
3799 | |||||
3800 | MI.eraseFromParent(); | ||||
3801 | return true; | ||||
3802 | } | ||||
3803 | |||||
3804 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, | ||||
3805 | MachineRegisterInfo &MRI, | ||||
3806 | MachineIRBuilder &B) const { | ||||
3807 | Register Res = MI.getOperand(0).getReg(); | ||||
3808 | Register X = MI.getOperand(1).getReg(); | ||||
3809 | Register Y = MI.getOperand(2).getReg(); | ||||
3810 | uint16_t Flags = MI.getFlags(); | ||||
3811 | LLT ResTy = MRI.getType(Res); | ||||
3812 | |||||
3813 | const MachineFunction &MF = B.getMF(); | ||||
3814 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||
3815 | MI.getFlag(MachineInstr::FmAfn); | ||||
3816 | |||||
3817 | if (!AllowInaccurateRcp) | ||||
3818 | return false; | ||||
3819 | |||||
3820 | auto NegY = B.buildFNeg(ResTy, Y); | ||||
3821 | auto One = B.buildFConstant(ResTy, 1.0); | ||||
3822 | |||||
3823 | auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||
3824 | .addUse(Y) | ||||
3825 | .setMIFlags(Flags); | ||||
3826 | |||||
3827 | auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); | ||||
3828 | R = B.buildFMA(ResTy, Tmp0, R, R); | ||||
3829 | |||||
3830 | auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); | ||||
3831 | R = B.buildFMA(ResTy, Tmp1, R, R); | ||||
3832 | |||||
3833 | auto Ret = B.buildFMul(ResTy, X, R); | ||||
3834 | auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); | ||||
3835 | |||||
3836 | B.buildFMA(Res, Tmp2, R, Ret); | ||||
3837 | MI.eraseFromParent(); | ||||
3838 | return true; | ||||
3839 | } | ||||
3840 | |||||
3841 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | ||||
3842 | MachineRegisterInfo &MRI, | ||||
3843 | MachineIRBuilder &B) const { | ||||
3844 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||
3845 | return true; | ||||
3846 | |||||
3847 | Register Res = MI.getOperand(0).getReg(); | ||||
3848 | Register LHS = MI.getOperand(1).getReg(); | ||||
3849 | Register RHS = MI.getOperand(2).getReg(); | ||||
3850 | |||||
3851 | uint16_t Flags = MI.getFlags(); | ||||
3852 | |||||
3853 | LLT S16 = LLT::scalar(16); | ||||
3854 | LLT S32 = LLT::scalar(32); | ||||
3855 | |||||
3856 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | ||||
3857 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | ||||
3858 | |||||
3859 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||
3860 | .addUse(RHSExt.getReg(0)) | ||||
3861 | .setMIFlags(Flags); | ||||
3862 | |||||
3863 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | ||||
3864 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | ||||
3865 | |||||
3866 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||
3867 | .addUse(RDst.getReg(0)) | ||||
3868 | .addUse(RHS) | ||||
3869 | .addUse(LHS) | ||||
3870 | .setMIFlags(Flags); | ||||
3871 | |||||
3872 | MI.eraseFromParent(); | ||||
3873 | return true; | ||||
3874 | } | ||||
3875 | |||||
3876 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | ||||
3877 | // to enable denorm mode. When 'Enable' is false, disable denorm mode. | ||||
3878 | static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, | ||||
3879 | const GCNSubtarget &ST, | ||||
3880 | SIModeRegisterDefaults Mode) { | ||||
3881 | // Set SP denorm mode to this value. | ||||
3882 | unsigned SPDenormMode = | ||||
3883 | Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); | ||||
3884 | |||||
3885 | if (ST.hasDenormModeInst()) { | ||||
3886 | // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode. | ||||
3887 | uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); | ||||
3888 | |||||
3889 | uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | ||||
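// For example, with SPDenormMode == FP_DENORM_FLUSH_NONE (3) and a default
// FP64/FP16 denorm value of 3, this yields 3 | (3 << 2) == 0xF, i.e. all
// denormals enabled (values shown for illustration only).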
3890 | B.buildInstr(AMDGPU::S_DENORM_MODE) | ||||
3891 | .addImm(NewDenormModeValue); | ||||
3892 | |||||
3893 | } else { | ||||
3894 | // Select FP32 bit field in mode register. | ||||
3895 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | ||||
3896 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | ||||
3897 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | ||||
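// The encoded operand selects a 2-bit field (WIDTH_M1 == 1) at bit offset 4 of
// the MODE register, which is where the FP32 denorm controls live.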
3898 | |||||
3899 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | ||||
3900 | .addImm(SPDenormMode) | ||||
3901 | .addImm(SPDenormModeBitField); | ||||
3902 | } | ||||
3903 | } | ||||
3904 | |||||
3905 | bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, | ||||
3906 | MachineRegisterInfo &MRI, | ||||
3907 | MachineIRBuilder &B) const { | ||||
3908 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||
3909 | return true; | ||||
3910 | |||||
3911 | Register Res = MI.getOperand(0).getReg(); | ||||
3912 | Register LHS = MI.getOperand(1).getReg(); | ||||
3913 | Register RHS = MI.getOperand(2).getReg(); | ||||
3914 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
3915 | SIModeRegisterDefaults Mode = MFI->getMode(); | ||||
3916 | |||||
3917 | uint16_t Flags = MI.getFlags(); | ||||
3918 | |||||
3919 | LLT S32 = LLT::scalar(32); | ||||
3920 | LLT S1 = LLT::scalar(1); | ||||
3921 | |||||
3922 | auto One = B.buildFConstant(S32, 1.0f); | ||||
3923 | |||||
3924 | auto DenominatorScaled = | ||||
3925 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||
3926 | .addUse(LHS) | ||||
3927 | .addUse(RHS) | ||||
3928 | .addImm(0) | ||||
3929 | .setMIFlags(Flags); | ||||
3930 | auto NumeratorScaled = | ||||
3931 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||
3932 | .addUse(LHS) | ||||
3933 | .addUse(RHS) | ||||
3934 | .addImm(1) | ||||
3935 | .setMIFlags(Flags); | ||||
3936 | |||||
3937 | auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||
3938 | .addUse(DenominatorScaled.getReg(0)) | ||||
3939 | .setMIFlags(Flags); | ||||
3940 | auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); | ||||
3941 | |||||
3942 | // FIXME: Doesn't correctly model the FP mode switch, and the FP operations | ||||
3943 | // aren't modeled as reading it. | ||||
3944 | if (!Mode.allFP32Denormals()) | ||||
3945 | toggleSPDenormMode(true, B, ST, Mode); | ||||
3946 | |||||
3947 | auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); | ||||
3948 | auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); | ||||
3949 | auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); | ||||
3950 | auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); | ||||
3951 | auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); | ||||
3952 | auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); | ||||
3953 | |||||
3954 | if (!Mode.allFP32Denormals()) | ||||
3955 | toggleSPDenormMode(false, B, ST, Mode); | ||||
3956 | |||||
3957 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) | ||||
3958 | .addUse(Fma4.getReg(0)) | ||||
3959 | .addUse(Fma1.getReg(0)) | ||||
3960 | .addUse(Fma3.getReg(0)) | ||||
3961 | .addUse(NumeratorScaled.getReg(1)) | ||||
3962 | .setMIFlags(Flags); | ||||
3963 | |||||
3964 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||
3965 | .addUse(Fmas.getReg(0)) | ||||
3966 | .addUse(RHS) | ||||
3967 | .addUse(LHS) | ||||
3968 | .setMIFlags(Flags); | ||||
3969 | |||||
3970 | MI.eraseFromParent(); | ||||
3971 | return true; | ||||
3972 | } | ||||
3973 | |||||
3974 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | ||||
3975 | MachineRegisterInfo &MRI, | ||||
3976 | MachineIRBuilder &B) const { | ||||
3977 | if (legalizeFastUnsafeFDIV64(MI, MRI, B)) | ||||
3978 | return true; | ||||
3979 | |||||
3980 | Register Res = MI.getOperand(0).getReg(); | ||||
3981 | Register LHS = MI.getOperand(1).getReg(); | ||||
3982 | Register RHS = MI.getOperand(2).getReg(); | ||||
3983 | |||||
3984 | uint16_t Flags = MI.getFlags(); | ||||
3985 | |||||
3986 | LLT S64 = LLT::scalar(64); | ||||
3987 | LLT S1 = LLT::scalar(1); | ||||
3988 | |||||
3989 | auto One = B.buildFConstant(S64, 1.0); | ||||
3990 | |||||
3991 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||
3992 | .addUse(LHS) | ||||
3993 | .addUse(RHS) | ||||
3994 | .addImm(0) | ||||
3995 | .setMIFlags(Flags); | ||||
3996 | |||||
3997 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | ||||
3998 | |||||
3999 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | ||||
4000 | .addUse(DivScale0.getReg(0)) | ||||
4001 | .setMIFlags(Flags); | ||||
4002 | |||||
4003 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | ||||
4004 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | ||||
4005 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | ||||
4006 | |||||
4007 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||
4008 | .addUse(LHS) | ||||
4009 | .addUse(RHS) | ||||
4010 | .addImm(1) | ||||
4011 | .setMIFlags(Flags); | ||||
4012 | |||||
4013 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | ||||
4014 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | ||||
4015 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | ||||
4016 | |||||
4017 | Register Scale; | ||||
4018 | if (!ST.hasUsableDivScaleConditionOutput()) { | ||||
4019 | // Workaround a hardware bug on SI where the condition output from div_scale | ||||
4020 | // is not usable. | ||||
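// The replacement recomputes that condition by comparing the high 32 bits of
// the numerator against DivScale1 and of the denominator against DivScale0,
// then XORing the two compares (a description of the code below, not of the
// hardware behaviour).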
4021 | |||||
4022 | LLT S32 = LLT::scalar(32); | ||||
4023 | |||||
4024 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | ||||
4025 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | ||||
4026 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | ||||
4027 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | ||||
4028 | |||||
4029 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | ||||
4030 | Scale1Unmerge.getReg(1)); | ||||
4031 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | ||||
4032 | Scale0Unmerge.getReg(1)); | ||||
4033 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | ||||
4034 | } else { | ||||
4035 | Scale = DivScale1.getReg(1); | ||||
4036 | } | ||||
4037 | |||||
4038 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | ||||
4039 | .addUse(Fma4.getReg(0)) | ||||
4040 | .addUse(Fma3.getReg(0)) | ||||
4041 | .addUse(Mul.getReg(0)) | ||||
4042 | .addUse(Scale) | ||||
4043 | .setMIFlags(Flags); | ||||
4044 | |||||
4045 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false) | ||||
4046 | .addUse(Fmas.getReg(0)) | ||||
4047 | .addUse(RHS) | ||||
4048 | .addUse(LHS) | ||||
4049 | .setMIFlags(Flags); | ||||
4050 | |||||
4051 | MI.eraseFromParent(); | ||||
4052 | return true; | ||||
4053 | } | ||||
4054 | |||||
4055 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||
4056 | MachineRegisterInfo &MRI, | ||||
4057 | MachineIRBuilder &B) const { | ||||
4058 | Register Res = MI.getOperand(0).getReg(); | ||||
4059 | Register LHS = MI.getOperand(2).getReg(); | ||||
4060 | Register RHS = MI.getOperand(3).getReg(); | ||||
4061 | uint16_t Flags = MI.getFlags(); | ||||
4062 | |||||
4063 | LLT S32 = LLT::scalar(32); | ||||
4064 | LLT S1 = LLT::scalar(1); | ||||
4065 | |||||
4066 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||
4067 | const APFloat C0Val(1.0f); | ||||
4068 | |||||
4069 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||
4070 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||
4071 | auto C2 = B.buildConstant(S32, llvm::bit_cast<uint32_t>(1.0f)); | ||||
4072 | |||||
4073 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||
4074 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||
4075 | |||||
4076 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||
4077 | |||||
4078 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||
4079 | .addUse(Mul0.getReg(0)) | ||||
4080 | .setMIFlags(Flags); | ||||
4081 | |||||
4082 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||
4083 | |||||
4084 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||
4085 | |||||
4086 | MI.eraseFromParent(); | ||||
4087 | return true; | ||||
4088 | } | ||||
4089 | |||||
4090 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | ||||
4091 | // FIXME: Why do we handle this one but not other removed instructions? | ||||
4092 | // | ||||
4093 | // Reciprocal square root. The clamp prevents infinite results, clamping | ||||
4094 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | ||||
4095 | // +-max_float. | ||||
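// The expansion below is essentially clamp(rsq(x), -max_float, +max_float),
// built from fminnum/fmaxnum (or their IEEE variants when IEEE mode is on).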
4096 | bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, | ||||
4097 | MachineRegisterInfo &MRI, | ||||
4098 | MachineIRBuilder &B) const { | ||||
4099 | if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) | ||||
4100 | return true; | ||||
4101 | |||||
4102 | Register Dst = MI.getOperand(0).getReg(); | ||||
4103 | Register Src = MI.getOperand(2).getReg(); | ||||
4104 | auto Flags = MI.getFlags(); | ||||
4105 | |||||
4106 | LLT Ty = MRI.getType(Dst); | ||||
4107 | |||||
4108 | const fltSemantics *FltSemantics; | ||||
4109 | if (Ty == LLT::scalar(32)) | ||||
4110 | FltSemantics = &APFloat::IEEEsingle(); | ||||
4111 | else if (Ty == LLT::scalar(64)) | ||||
4112 | FltSemantics = &APFloat::IEEEdouble(); | ||||
4113 | else | ||||
4114 | return false; | ||||
4115 | |||||
4116 | auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) | ||||
4117 | .addUse(Src) | ||||
4118 | .setMIFlags(Flags); | ||||
4119 | |||||
4120 | // We don't need to concern ourselves with the snan handling difference, since | ||||
4121 | // the rsq has already quieted it (or not); just use the variant that will directly select. | ||||
4122 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
4123 | const bool UseIEEE = MFI->getMode().IEEE; | ||||
4124 | |||||
4125 | auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); | ||||
4126 | auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : | ||||
4127 | B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); | ||||
4128 | |||||
4129 | auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); | ||||
4130 | |||||
4131 | if (UseIEEE) | ||||
4132 | B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); | ||||
4133 | else | ||||
4134 | B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); | ||||
4135 | MI.eraseFromParent(); | ||||
4136 | return true; | ||||
4137 | } | ||||
4138 | |||||
4139 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||
4140 | switch (IID) { | ||||
4141 | case Intrinsic::amdgcn_ds_fadd: | ||||
4142 | return AMDGPU::G_ATOMICRMW_FADD; | ||||
4143 | case Intrinsic::amdgcn_ds_fmin: | ||||
4144 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||
4145 | case Intrinsic::amdgcn_ds_fmax: | ||||
4146 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||
4147 | default: | ||||
4148 | llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4148); | ||||
4149 | } | ||||
4150 | } | ||||
4151 | |||||
4152 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | ||||
4153 | MachineInstr &MI, | ||||
4154 | Intrinsic::ID IID) const { | ||||
4155 | GISelChangeObserver &Observer = Helper.Observer; | ||||
4156 | Observer.changingInstr(MI); | ||||
4157 | |||||
4158 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | ||||
4159 | |||||
4160 | // The remaining operands were used to set fields in the MemOperand on | ||||
4161 | // construction. | ||||
4162 | for (int I = 6; I > 3; --I) | ||||
4163 | MI.removeOperand(I); | ||||
4164 | |||||
4165 | MI.removeOperand(1); // Remove the intrinsic ID. | ||||
4166 | Observer.changedInstr(MI); | ||||
4167 | return true; | ||||
4168 | } | ||||
4169 | |||||
4170 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | ||||
4171 | MachineRegisterInfo &MRI, | ||||
4172 | MachineIRBuilder &B) const { | ||||
4173 | uint64_t Offset = | ||||
4174 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||
4175 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||
4176 | LLT DstTy = MRI.getType(DstReg); | ||||
4177 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||
4178 | |||||
4179 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||
4180 | if (!loadInputValue(KernargPtrReg, B, | ||||
4181 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
4182 | return false; | ||||
4183 | |||||
4184 | // FIXME: This should be nuw | ||||
4185 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||
4186 | return true; | ||||
4187 | } | ||||
4188 | |||||
4189 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||
4190 | MachineRegisterInfo &MRI, | ||||
4191 | MachineIRBuilder &B) const { | ||||
4192 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
4193 | if (!MFI->isEntryFunction()) { | ||||
4194 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
4195 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||
4196 | } | ||||
4197 | |||||
4198 | Register DstReg = MI.getOperand(0).getReg(); | ||||
4199 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||
4200 | return false; | ||||
4201 | |||||
4202 | MI.eraseFromParent(); | ||||
4203 | return true; | ||||
4204 | } | ||||
4205 | |||||
4206 | bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, | ||||
4207 | MachineRegisterInfo &MRI, | ||||
4208 | MachineIRBuilder &B) const { | ||||
4209 | Function &F = B.getMF().getFunction(); | ||||
4210 | std::optional<uint32_t> KnownSize = | ||||
4211 | AMDGPUMachineFunction::getLDSKernelIdMetadata(F); | ||||
4212 | if (KnownSize.has_value()) | ||||
4213 | B.buildConstant(DstReg, *KnownSize); | ||||
4214 | return false; | ||||
4215 | } | ||||
4216 | |||||
4217 | bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, | ||||
4218 | MachineRegisterInfo &MRI, | ||||
4219 | MachineIRBuilder &B) const { | ||||
4220 | |||||
4221 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
4222 | if (!MFI->isEntryFunction()) { | ||||
4223 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
4224 | AMDGPUFunctionArgInfo::LDS_KERNEL_ID); | ||||
4225 | } | ||||
4226 | |||||
4227 | Register DstReg = MI.getOperand(0).getReg(); | ||||
4228 | if (!getLDSKernelId(DstReg, MRI, B)) | ||||
4229 | return false; | ||||
4230 | |||||
4231 | MI.eraseFromParent(); | ||||
4232 | return true; | ||||
4233 | } | ||||
4234 | |||||
4235 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||
4236 | MachineRegisterInfo &MRI, | ||||
4237 | MachineIRBuilder &B, | ||||
4238 | unsigned AddrSpace) const { | ||||
4239 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||
4240 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||
4241 | Register Hi32 = Unmerge.getReg(1); | ||||
4242 | |||||
4243 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||
4244 | MI.eraseFromParent(); | ||||
4245 | return true; | ||||
4246 | } | ||||
4247 | |||||
4248 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||
4249 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||
4250 | // split between the instruction's voffset and immoffset fields) and soffset | ||||
4251 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||
4252 | // the instruction's soffset field). This function takes the first kind of | ||||
4253 | // offset and figures out how to split it between voffset and immoffset. | ||||
4254 | std::pair<Register, unsigned> | ||||
4255 | AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, | ||||
4256 | Register OrigOffset) const { | ||||
4257 | const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); | ||||
4258 | Register BaseReg; | ||||
4259 | unsigned ImmOffset; | ||||
4260 | const LLT S32 = LLT::scalar(32); | ||||
4261 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
4262 | |||||
4263 | std::tie(BaseReg, ImmOffset) = | ||||
4264 | AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); | ||||
4265 | |||||
4266 | // If BaseReg is a pointer, convert it to int. | ||||
4267 | if (MRI.getType(BaseReg).isPointer()) | ||||
4268 | BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); | ||||
4269 | |||||
4270 | // If the immediate value is too big for the immoffset field, put only bits | ||||
4271 | // that would normally fit in the immoffset field. The remaining value that | ||||
4272 | // is copied/added for the voffset field is a large power of 2, and it | ||||
4273 | // stands more chance of being CSEd with the copy/add for another similar | ||||
4274 | // load/store. | ||||
4275 | // However, do not do that rounding down if that is a negative | ||||
4276 | // number, as it appears to be illegal to have a negative offset in the | ||||
4277 | // vgpr, even if adding the immediate offset makes it positive. | ||||
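// Worked example (assuming a 12-bit immediate field, i.e. MaxImm == 4095): a
// combined offset of 4104 splits into Overflow == 4096, which is added to the
// voffset register below, and ImmOffset == 8, which stays in the immediate.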
4278 | unsigned Overflow = ImmOffset & ~MaxImm; | ||||
4279 | ImmOffset -= Overflow; | ||||
4280 | if ((int32_t)Overflow < 0) { | ||||
4281 | Overflow += ImmOffset; | ||||
4282 | ImmOffset = 0; | ||||
4283 | } | ||||
4284 | |||||
4285 | if (Overflow != 0) { | ||||
4286 | if (!BaseReg) { | ||||
4287 | BaseReg = B.buildConstant(S32, Overflow).getReg(0); | ||||
4288 | } else { | ||||
4289 | auto OverflowVal = B.buildConstant(S32, Overflow); | ||||
4290 | BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); | ||||
4291 | } | ||||
4292 | } | ||||
4293 | |||||
4294 | if (!BaseReg) | ||||
4295 | BaseReg = B.buildConstant(S32, 0).getReg(0); | ||||
4296 | |||||
4297 | return std::pair(BaseReg, ImmOffset); | ||||
4298 | } | ||||
4299 | |||||
4300 | /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. | ||||
4301 | void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, | ||||
4302 | Register VOffset, Register SOffset, | ||||
4303 | unsigned ImmOffset, Register VIndex, | ||||
4304 | MachineRegisterInfo &MRI) const { | ||||
4305 | std::optional<ValueAndVReg> MaybeVOffsetVal = | ||||
4306 | getIConstantVRegValWithLookThrough(VOffset, MRI); | ||||
4307 | std::optional<ValueAndVReg> MaybeSOffsetVal = | ||||
4308 | getIConstantVRegValWithLookThrough(SOffset, MRI); | ||||
4309 | std::optional<ValueAndVReg> MaybeVIndexVal = | ||||
4310 | getIConstantVRegValWithLookThrough(VIndex, MRI); | ||||
4311 | // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, | ||||
4312 | // update the MMO with that offset. The stride is unknown so we can only do | ||||
4313 | // this if VIndex is constant 0. | ||||
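// For example (illustrative values): VOffset == 16, SOffset == 32,
// ImmOffset == 8 and VIndex == 0 give a combined MMO offset of 56.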
4314 | if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && | ||||
4315 | MaybeVIndexVal->Value == 0) { | ||||
4316 | uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + | ||||
4317 | MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; | ||||
4318 | MMO->setOffset(TotalOffset); | ||||
4319 | } else { | ||||
4320 | // We don't have a constant combined offset to use in the MMO. Give up. | ||||
4321 | MMO->setValue((Value *)nullptr); | ||||
4322 | } | ||||
4323 | } | ||||
4324 | |||||
4325 | /// Handle register layout difference for f16 images for some subtargets. | ||||
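/// For example, on subtargets with unpacked d16 VMEM a <4 x s16> store value
/// is unmerged and each element any-extended, yielding a <4 x s32> value whose
/// low 16 bits carry the data (a sketch of the handling implemented below).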
4326 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||
4327 | MachineRegisterInfo &MRI, | ||||
4328 | Register Reg, | ||||
4329 | bool ImageStore) const { | ||||
4330 | const LLT S16 = LLT::scalar(16); | ||||
4331 | const LLT S32 = LLT::scalar(32); | ||||
4332 | LLT StoreVT = MRI.getType(Reg); | ||||
4333 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16); | ||||
4334 | |||||
4335 | if (ST.hasUnpackedD16VMem()) { | ||||
4336 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||
4337 | |||||
4338 | SmallVector<Register, 4> WideRegs; | ||||
4339 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||
4340 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||
4341 | |||||
4342 | int NumElts = StoreVT.getNumElements(); | ||||
4343 | |||||
4344 | return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) | ||||
4345 | .getReg(0); | ||||
4346 | } | ||||
4347 | |||||
4348 | if (ImageStore && ST.hasImageStoreD16Bug()) { | ||||
4349 | if (StoreVT.getNumElements() == 2) { | ||||
4350 | SmallVector<Register, 4> PackedRegs; | ||||
4351 | Reg = B.buildBitcast(S32, Reg).getReg(0); | ||||
4352 | PackedRegs.push_back(Reg); | ||||
4353 | PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); | ||||
4354 | return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) | ||||
4355 | .getReg(0); | ||||
4356 | } | ||||
4357 | |||||
4358 | if (StoreVT.getNumElements() == 3) { | ||||
4359 | SmallVector<Register, 4> PackedRegs; | ||||
4360 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||
4361 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||
4362 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||
4363 | PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); | ||||
4364 | Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); | ||||
4365 | return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); | ||||
4366 | } | ||||
4367 | |||||
4368 | if (StoreVT.getNumElements() == 4) { | ||||
4369 | SmallVector<Register, 4> PackedRegs; | ||||
4370 | Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); | ||||
4371 | auto Unmerge = B.buildUnmerge(S32, Reg); | ||||
4372 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||
4373 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||
4374 | PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); | ||||
4375 | return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) | ||||
4376 | .getReg(0); | ||||
4377 | } | ||||
4378 | |||||
4379 | llvm_unreachable("invalid data type")::llvm::llvm_unreachable_internal("invalid data type", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4379); | ||||
4380 | } | ||||
4381 | |||||
4382 | if (StoreVT == LLT::fixed_vector(3, S16)) { | ||||
4383 | Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) | ||||
4384 | .getReg(0); | ||||
4385 | } | ||||
4386 | return Reg; | ||||
4387 | } | ||||
4388 | |||||
4389 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||
4390 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||
4391 | MachineRegisterInfo *MRI = B.getMRI(); | ||||
4392 | LLT Ty = MRI->getType(VData); | ||||
4393 | |||||
4394 | const LLT S16 = LLT::scalar(16); | ||||
4395 | |||||
4396 | // Fixup illegal register types for i8 stores. | ||||
4397 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||
4398 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||
4399 | return AnyExt; | ||||
4400 | } | ||||
4401 | |||||
4402 | if (Ty.isVector()) { | ||||
4403 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||
4404 | if (IsFormat) | ||||
4405 | return handleD16VData(B, *MRI, VData); | ||||
4406 | } | ||||
4407 | } | ||||
4408 | |||||
4409 | return VData; | ||||
4410 | } | ||||
4411 | |||||
4412 | bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, | ||||
4413 | MachineRegisterInfo &MRI, | ||||
4414 | MachineIRBuilder &B, | ||||
4415 | bool IsTyped, | ||||
4416 | bool IsFormat) const { | ||||
4417 | Register VData = MI.getOperand(1).getReg(); | ||||
4418 | LLT Ty = MRI.getType(VData); | ||||
4419 | LLT EltTy = Ty.getScalarType(); | ||||
4420 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||
4421 | const LLT S32 = LLT::scalar(32); | ||||
4422 | |||||
4423 | VData = fixStoreSourceType(B, VData, IsFormat); | ||||
4424 | Register RSrc = MI.getOperand(2).getReg(); | ||||
4425 | |||||
4426 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
4427 | const int MemSize = MMO->getSize(); | ||||
4428 | |||||
4429 | unsigned ImmOffset; | ||||
4430 | |||||
4431 | // The typed intrinsics add an immediate after the registers. | ||||
4432 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||
4433 | |||||
4434 | // The struct intrinsic variants add one additional operand over raw. | ||||
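// Roughly (operand counts inferred from the checks here): an untyped raw store
// is (intrinsic id, vdata, rsrc, voffset, soffset, aux) == 6 operands, the
// struct form inserts vindex after rsrc for 7, and the typed forms add a
// format immediate, giving 7 and 8 operands respectively.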
4435 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||
4436 | Register VIndex; | ||||
4437 | int OpOffset = 0; | ||||
4438 | if (HasVIndex) { | ||||
4439 | VIndex = MI.getOperand(3).getReg(); | ||||
4440 | OpOffset = 1; | ||||
4441 | } else { | ||||
4442 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||
4443 | } | ||||
4444 | |||||
4445 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||
4446 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||
4447 | |||||
4448 | unsigned Format = 0; | ||||
4449 | if (IsTyped) { | ||||
4450 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||
4451 | ++OpOffset; | ||||
4452 | } | ||||
4453 | |||||
4454 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||
4455 | |||||
4456 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||
4457 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); | ||||
4458 | |||||
4459 | unsigned Opc; | ||||
4460 | if (IsTyped) { | ||||
4461 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : | ||||
4462 | AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; | ||||
4463 | } else if (IsFormat) { | ||||
4464 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : | ||||
4465 | AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; | ||||
4466 | } else { | ||||
4467 | switch (MemSize) { | ||||
4468 | case 1: | ||||
4469 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; | ||||
4470 | break; | ||||
4471 | case 2: | ||||
4472 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; | ||||
4473 | break; | ||||
4474 | default: | ||||
4475 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; | ||||
4476 | break; | ||||
4477 | } | ||||
4478 | } | ||||
4479 | |||||
4480 | auto MIB = B.buildInstr(Opc) | ||||
4481 | .addUse(VData) // vdata | ||||
4482 | .addUse(RSrc) // rsrc | ||||
4483 | .addUse(VIndex) // vindex | ||||
4484 | .addUse(VOffset) // voffset | ||||
4485 | .addUse(SOffset) // soffset | ||||
4486 | .addImm(ImmOffset); // offset(imm) | ||||
4487 | |||||
4488 | if (IsTyped) | ||||
4489 | MIB.addImm(Format); | ||||
4490 | |||||
4491 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||
4492 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||
4493 | .addMemOperand(MMO); | ||||
4494 | |||||
4495 | MI.eraseFromParent(); | ||||
4496 | return true; | ||||
4497 | } | ||||
4498 | |||||
4499 | static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, | ||||
4500 | Register VIndex, Register VOffset, Register SOffset, | ||||
4501 | unsigned ImmOffset, unsigned Format, | ||||
4502 | unsigned AuxiliaryData, MachineMemOperand *MMO, | ||||
4503 | bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { | ||||
4504 | auto MIB = B.buildInstr(Opc) | ||||
4505 | .addDef(LoadDstReg) // vdata | ||||
4506 | .addUse(RSrc) // rsrc | ||||
4507 | .addUse(VIndex) // vindex | ||||
4508 | .addUse(VOffset) // voffset | ||||
4509 | .addUse(SOffset) // soffset | ||||
4510 | .addImm(ImmOffset); // offset(imm) | ||||
4511 | |||||
4512 | if (IsTyped) | ||||
4513 | MIB.addImm(Format); | ||||
4514 | |||||
4515 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||
4516 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||
4517 | .addMemOperand(MMO); | ||||
4518 | } | ||||
4519 | |||||
4520 | bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, | ||||
4521 | MachineRegisterInfo &MRI, | ||||
4522 | MachineIRBuilder &B, | ||||
4523 | bool IsFormat, | ||||
4524 | bool IsTyped) const { | ||||
4525 | // FIXME: Verifier should enforce 1 MMO for these intrinsics. | ||||
4526 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
4527 | const LLT MemTy = MMO->getMemoryType(); | ||||
4528 | const LLT S32 = LLT::scalar(32); | ||||
4529 | |||||
4530 | Register Dst = MI.getOperand(0).getReg(); | ||||
4531 | |||||
4532 | Register StatusDst; | ||||
4533 | int OpOffset = 0; | ||||
4534 | assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); | ||||
4535 | bool IsTFE = MI.getNumExplicitDefs() == 2; | ||||
4536 | if (IsTFE) { | ||||
4537 | StatusDst = MI.getOperand(1).getReg(); | ||||
4538 | ++OpOffset; | ||||
4539 | } | ||||
4540 | |||||
4541 | Register RSrc = MI.getOperand(2 + OpOffset).getReg(); | ||||
4542 | |||||
4543 | // The typed intrinsics add an immediate after the registers. | ||||
4544 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||
4545 | |||||
4546 | // The struct intrinsic variants add one additional operand over raw. | ||||
4547 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; | ||||
4548 | Register VIndex; | ||||
4549 | if (HasVIndex) { | ||||
4550 | VIndex = MI.getOperand(3 + OpOffset).getReg(); | ||||
4551 | ++OpOffset; | ||||
4552 | } else { | ||||
4553 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||
4554 | } | ||||
4555 | |||||
4556 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||
4557 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||
4558 | |||||
4559 | unsigned Format = 0; | ||||
4560 | if (IsTyped) { | ||||
4561 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||
4562 | ++OpOffset; | ||||
4563 | } | ||||
4564 | |||||
4565 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||
4566 | unsigned ImmOffset; | ||||
4567 | |||||
4568 | LLT Ty = MRI.getType(Dst); | ||||
4569 | LLT EltTy = Ty.getScalarType(); | ||||
4570 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||
4571 | const bool Unpacked = ST.hasUnpackedD16VMem(); | ||||
4572 | |||||
4573 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||
4574 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); | ||||
4575 | |||||
4576 | unsigned Opc; | ||||
4577 | |||||
4578 | // TODO: Support TFE for typed and narrow loads. | ||||
4579 | if (IsTyped) { | ||||
4580 | if (IsTFE) | ||||
4581 | return false; | ||||
4582 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : | ||||
4583 | AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; | ||||
4584 | } else if (IsFormat) { | ||||
4585 | if (IsD16) { | ||||
4586 | if (IsTFE) | ||||
4587 | return false; | ||||
4588 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; | ||||
4589 | } else { | ||||
4590 | Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE | ||||
4591 | : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; | ||||
4592 | } | ||||
4593 | } else { | ||||
4594 | if (IsTFE) | ||||
4595 | return false; | ||||
4596 | switch (MemTy.getSizeInBits()) { | ||||
4597 | case 8: | ||||
4598 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; | ||||
4599 | break; | ||||
4600 | case 16: | ||||
4601 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; | ||||
4602 | break; | ||||
4603 | default: | ||||
4604 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; | ||||
4605 | break; | ||||
4606 | } | ||||
4607 | } | ||||
4608 | |||||
4609 | if (IsTFE) { | ||||
4610 | unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); | ||||
4611 | unsigned NumLoadDWords = NumValueDWords + 1; | ||||
4612 | LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); | ||||
4613 | Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); | ||||
4614 | buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, | ||||
4615 | Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
4616 | if (NumValueDWords == 1) { | ||||
4617 | B.buildUnmerge({Dst, StatusDst}, LoadDstReg); | ||||
4618 | } else { | ||||
4619 | SmallVector<Register, 5> LoadElts; | ||||
4620 | for (unsigned I = 0; I != NumValueDWords; ++I) | ||||
4621 | LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); | ||||
4622 | LoadElts.push_back(StatusDst); | ||||
4623 | B.buildUnmerge(LoadElts, LoadDstReg); | ||||
4624 | LoadElts.truncate(NumValueDWords); | ||||
4625 | B.buildMergeLikeInstr(Dst, LoadElts); | ||||
4626 | } | ||||
4627 | } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || | ||||
4628 | (IsD16 && !Ty.isVector())) { | ||||
4629 | Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); | ||||
4630 | buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, | ||||
4631 | Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
4632 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||
4633 | B.buildTrunc(Dst, LoadDstReg); | ||||
4634 | } else if (Unpacked && IsD16 && Ty.isVector()) { | ||||
4635 | LLT UnpackedTy = Ty.changeElementSize(32); | ||||
4636 | Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); | ||||
4637 | buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, | ||||
4638 | Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
4639 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||
4640 | // FIXME: G_TRUNC should work, but legalization currently fails | ||||
4641 | auto Unmerge = B.buildUnmerge(S32, LoadDstReg); | ||||
4642 | SmallVector<Register, 4> Repack; | ||||
4643 | for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) | ||||
4644 | Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); | ||||
4645 | B.buildMergeLikeInstr(Dst, Repack); | ||||
4646 | } else { | ||||
4647 | buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, | ||||
4648 | AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
4649 | } | ||||
4650 | |||||
4651 | MI.eraseFromParent(); | ||||
4652 | return true; | ||||
4653 | } | ||||
4654 | |||||
4655 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||
4656 | MachineIRBuilder &B, | ||||
4657 | bool IsInc) const { | ||||
4658 | unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP : | ||||
4659 | AMDGPU::G_ATOMICRMW_UDEC_WRAP; | ||||
4660 | B.buildInstr(Opc) | ||||
4661 | .addDef(MI.getOperand(0).getReg()) | ||||
4662 | .addUse(MI.getOperand(2).getReg()) | ||||
4663 | .addUse(MI.getOperand(3).getReg()) | ||||
4664 | .cloneMemRefs(MI); | ||||
4665 | MI.eraseFromParent(); | ||||
4666 | return true; | ||||
4667 | } | ||||
4668 | |||||
4669 | static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { | ||||
4670 | switch (IntrID) { | ||||
4671 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||
4672 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||
4673 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; | ||||
4674 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||
4675 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||
4676 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; | ||||
4677 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||
4678 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||
4679 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; | ||||
4680 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||
4681 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||
4682 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; | ||||
4683 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||
4684 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||
4685 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; | ||||
4686 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||
4687 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||
4688 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; | ||||
4689 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||
4690 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||
4691 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; | ||||
4692 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||
4693 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||
4694 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; | ||||
4695 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||
4696 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||
4697 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; | ||||
4698 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||
4699 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||
4700 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; | ||||
4701 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||
4702 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||
4703 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; | ||||
4704 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||
4705 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||
4706 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; | ||||
4707 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||
4708 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||
4709 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; | ||||
4710 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||
4711 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||
4712 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; | ||||
4713 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||
4714 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||
4715 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; | ||||
4716 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||
4717 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||
4718 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; | ||||
4719 | default: | ||||
4720 | llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4720); | ||||
4721 | } | ||||
4722 | } | ||||
4723 | |||||
4724 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | ||||
4725 | MachineIRBuilder &B, | ||||
4726 | Intrinsic::ID IID) const { | ||||
4727 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | ||||
4728 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | ||||
4729 | const bool HasReturn = MI.getNumExplicitDefs() != 0; | ||||
4730 | |||||
4731 | Register Dst; | ||||
4732 | |||||
4733 | int OpOffset = 0; | ||||
4734 | if (HasReturn) { | ||||
4735 | // A few FP atomics do not support return values. | ||||
4736 | Dst = MI.getOperand(0).getReg(); | ||||
4737 | } else { | ||||
4738 | OpOffset = -1; | ||||
4739 | } | ||||
4740 | |||||
4741 | Register VData = MI.getOperand(2 + OpOffset).getReg(); | ||||
4742 | Register CmpVal; | ||||
4743 | |||||
4744 | if (IsCmpSwap) { | ||||
4745 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | ||||
4746 | ++OpOffset; | ||||
4747 | } | ||||
4748 | |||||
4749 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | ||||
4750 | const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; | ||||
4751 | |||||
4752 | // The struct intrinsic variants add one additional operand over raw. | ||||
4753 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||
4754 | Register VIndex; | ||||
4755 | if (HasVIndex) { | ||||
4756 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | ||||
4757 | ++OpOffset; | ||||
4758 | } else { | ||||
4759 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | ||||
4760 | } | ||||
4761 | |||||
4762 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||
4763 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | ||||
4764 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | ||||
4765 | |||||
4766 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
4767 | |||||
4768 | unsigned ImmOffset; | ||||
4769 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||
4770 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); | ||||
4771 | |||||
4772 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); | ||||
4773 | |||||
4774 | if (HasReturn) | ||||
4775 | MIB.addDef(Dst); | ||||
4776 | |||||
4777 | MIB.addUse(VData); // vdata | ||||
4778 | |||||
4779 | if (IsCmpSwap) | ||||
4780 | MIB.addReg(CmpVal); | ||||
4781 | |||||
4782 | MIB.addUse(RSrc) // rsrc | ||||
4783 | .addUse(VIndex) // vindex | ||||
4784 | .addUse(VOffset) // voffset | ||||
4785 | .addUse(SOffset) // soffset | ||||
4786 | .addImm(ImmOffset) // offset(imm) | ||||
4787 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||
4788 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||
4789 | .addMemOperand(MMO); | ||||
4790 | |||||
4791 | MI.eraseFromParent(); | ||||
4792 | return true; | ||||
4793 | } | ||||
4794 | |||||
4795 | /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized | ||||
4796 | /// vector with s16 typed elements. | ||||
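/// For example, two consecutive s16 coordinates (x, y) are combined into one
/// <2 x s16> build_vector, while a trailing odd coordinate is instead paired
/// with an undef half (see the packing loop below).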
4797 | static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, | ||||
4798 | SmallVectorImpl<Register> &PackedAddrs, | ||||
4799 | unsigned ArgOffset, | ||||
4800 | const AMDGPU::ImageDimIntrinsicInfo *Intr, | ||||
4801 | bool IsA16, bool IsG16) { | ||||
4802 | const LLT S16 = LLT::scalar(16); | ||||
4803 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
4804 | auto EndIdx = Intr->VAddrEnd; | ||||
4805 | |||||
4806 | for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { | ||||
4807 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||
4808 | if (!SrcOp.isReg()) | ||||
4809 | continue; // _L to _LZ may have eliminated this. | ||||
4810 | |||||
4811 | Register AddrReg = SrcOp.getReg(); | ||||
4812 | |||||
4813 | if ((I < Intr->GradientStart) || | ||||
4814 | (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || | ||||
4815 | (I >= Intr->CoordStart && !IsA16)) { | ||||
4816 | if ((I < Intr->GradientStart) && IsA16 && | ||||
4817 | (B.getMRI()->getType(AddrReg) == S16)) { | ||||
4818 | assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); | ||||
4819 | // Special handling of bias when A16 is on. Bias is of type half but | ||||
4820 | // occupies full 32-bit. | ||||
4821 | PackedAddrs.push_back( | ||||
4822 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||
4823 | .getReg(0)); | ||||
4824 | } else { | ||||
4825 | assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && | ||||
4826 | "Bias needs to be converted to 16 bit in A16 mode"); | ||||
4827 | // Handle any gradient or coordinate operands that should not be packed | ||||
4828 | AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); | ||||
4829 | PackedAddrs.push_back(AddrReg); | ||||
4830 | } | ||||
4831 | } else { | ||||
4832 | // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, | ||||
4833 | // derivatives dx/dh and dx/dv are packed with undef. | ||||
4834 | if (((I + 1) >= EndIdx) || | ||||
4835 | ((Intr->NumGradients / 2) % 2 == 1 && | ||||
4836 | (I == static_cast<unsigned>(Intr->GradientStart + | ||||
4837 | (Intr->NumGradients / 2) - 1) || | ||||
4838 | I == static_cast<unsigned>(Intr->GradientStart + | ||||
4839 | Intr->NumGradients - 1))) || | ||||
4840 | // Check for _L to _LZ optimization | ||||
4841 | !MI.getOperand(ArgOffset + I + 1).isReg()) { | ||||
4842 | PackedAddrs.push_back( | ||||
4843 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||
4844 | .getReg(0)); | ||||
4845 | } else { | ||||
4846 | PackedAddrs.push_back( | ||||
4847 | B.buildBuildVector( | ||||
4848 | V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) | ||||
4849 | .getReg(0)); | ||||
4850 | ++I; | ||||
4851 | } | ||||
4852 | } | ||||
4853 | } | ||||
4854 | } | ||||
4855 | |||||
4856 | /// Convert from separate vaddr components to a single vector address register, | ||||
4857 | /// and replace the remaining operands with $noreg. | ||||
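/// For example, three s32 vaddr operands become a single <3 x s32>
/// build_vector placed in the first vaddr slot, and the remaining two operand
/// slots are set to $noreg.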
4858 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||
4859 | int DimIdx, int NumVAddrs) { | ||||
4860 | const LLT S32 = LLT::scalar(32); | ||||
4861 | (void)S32; | ||||
4862 | SmallVector<Register, 8> AddrRegs; | ||||
4863 | for (int I = 0; I != NumVAddrs; ++I) { | ||||
4864 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||
4865 | if (SrcOp.isReg()) { | ||||
4866 | AddrRegs.push_back(SrcOp.getReg()); | ||||
4867 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32); | ||||
4868 | } | ||||
4869 | } | ||||
4870 | |||||
4871 | int NumAddrRegs = AddrRegs.size(); | ||||
4872 | if (NumAddrRegs != 1) { | ||||
4873 | auto VAddr = | ||||
4874 | B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); | ||||
4875 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||
4876 | } | ||||
4877 | |||||
4878 | for (int I = 1; I != NumVAddrs; ++I) { | ||||
4879 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||
4880 | if (SrcOp.isReg()) | ||||
4881 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||
4882 | } | ||||
4883 | } | ||||
4884 | |||||
4885 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | ||||
4886 | /// | ||||
4887 | /// Depending on the subtarget, load/store with 16-bit element data need to be | ||||
4888 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | ||||
4889 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | ||||
4890 | /// registers. | ||||
4891 | /// | ||||
4892 | /// We don't want to directly select image instructions just yet, but also want | ||||
4893 | /// to expose all register repacking to the legalizer/combiners. We also don't | ||||
4894 | /// want a selected instruction entering RegBankSelect. In order to avoid | ||||
4895 | /// defining a multitude of intermediate image instructions, directly hack on | ||||
4896 | /// the intrinsic's arguments. In cases like a16 addresses, this requires | ||||
4897 | /// padding now-unnecessary arguments with $noreg. | ||||
4898 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||
4899 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | ||||
4900 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | ||||
4901 | |||||
4902 | const MachineFunction &MF = *MI.getMF(); | ||||
4903 | const unsigned NumDefs = MI.getNumExplicitDefs(); | ||||
4904 | const unsigned ArgOffset = NumDefs + 1; | ||||
4905 | bool IsTFE = NumDefs == 2; | ||||
4906 | // We are only processing the operands of d16 image operations on subtargets | ||||
4907 | // that use the unpacked register layout, or need to repack the TFE result. | ||||
4908 | |||||
4909 | // TODO: Do we need to guard against already legalized intrinsics? | ||||
4910 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||
4911 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | ||||
4912 | |||||
4913 | MachineRegisterInfo *MRI = B.getMRI(); | ||||
4914 | const LLT S32 = LLT::scalar(32); | ||||
4915 | const LLT S16 = LLT::scalar(16); | ||||
4916 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
4917 | |||||
4918 | unsigned DMask = 0; | ||||
4919 | Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); | ||||
4920 | LLT Ty = MRI->getType(VData); | ||||
4921 | |||||
4922 | // Check for 16 bit addresses and pack if true. | ||||
4923 | LLT GradTy = | ||||
4924 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | ||||
4925 | LLT AddrTy = | ||||
4926 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | ||||
4927 | const bool IsG16 = | ||||
4928 | ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; | ||||
4929 | const bool IsA16 = AddrTy == S16; | ||||
4930 | const bool IsD16 = Ty.getScalarType() == S16; | ||||
4931 | |||||
4932 | int DMaskLanes = 0; | ||||
4933 | if (!BaseOpcode->Atomic) { | ||||
4934 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | ||||
4935 | if (BaseOpcode->Gather4) { | ||||
4936 | DMaskLanes = 4; | ||||
4937 | } else if (DMask != 0) { | ||||
4938 | DMaskLanes = llvm::popcount(DMask); | ||||
4939 | } else if (!IsTFE && !BaseOpcode->Store) { | ||||
4940 | // If dmask is 0, this is a no-op load. This can be eliminated. | ||||
4941 | B.buildUndef(MI.getOperand(0)); | ||||
4942 | MI.eraseFromParent(); | ||||
4943 | return true; | ||||
4944 | } | ||||
4945 | } | ||||
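For orientation, a minimal standalone sketch of the dmask accounting above; the helper name and signature are illustrative, not part of this file. Gather4 operations always produce four lanes, otherwise the lane count is the number of set bits in dmask, and a zero dmask on a plain, non-TFE load is folded away as undef.

#include <bit>
#include <cstdint>

// Hypothetical helper mirroring the DMaskLanes computation in the listing.
static unsigned dmaskLanes(bool IsGather4, uint32_t DMask) {
  if (IsGather4)
    return 4;                                          // gather4 always returns 4 lanes
  return static_cast<unsigned>(std::popcount(DMask));  // 0 when DMask == 0
}

// e.g. dmaskLanes(false, 0b1011) == 3 and dmaskLanes(true, 0b0001) == 4.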
4946 | |||||
4947 | Observer.changingInstr(MI); | ||||
4948 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | ||||
4949 | |||||
4950 | const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 | ||||
4951 | : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; | ||||
4952 | const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 | ||||
4953 | : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | ||||
4954 | unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; | ||||
4955 | |||||
4956 | // Track that we legalized this | ||||
4957 | MI.setDesc(B.getTII().get(NewOpcode)); | ||||
4958 | |||||
4959 | // Expecting to get an error flag since TFE is on and dmask is 0. Force | ||||
4960 | // dmask to be at least 1, otherwise the instruction will fail. | ||||
4961 | if (IsTFE && DMask == 0) { | ||||
4962 | DMask = 0x1; | ||||
4963 | DMaskLanes = 1; | ||||
4964 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | ||||
4965 | } | ||||
4966 | |||||
4967 | if (BaseOpcode->Atomic) { | ||||
4968 | Register VData0 = MI.getOperand(2).getReg(); | ||||
4969 | LLT Ty = MRI->getType(VData0); | ||||
4970 | |||||
4971 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | ||||
4972 | if (Ty.isVector()) | ||||
4973 | return false; | ||||
4974 | |||||
4975 | if (BaseOpcode->AtomicX2) { | ||||
4976 | Register VData1 = MI.getOperand(3).getReg(); | ||||
4977 | // The two values are packed in one register. | ||||
4978 | LLT PackedTy = LLT::fixed_vector(2, Ty); | ||||
4979 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | ||||
4980 | MI.getOperand(2).setReg(Concat.getReg(0)); | ||||
4981 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | ||||
4982 | } | ||||
4983 | } | ||||
4984 | |||||
4985 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | ||||
4986 | |||||
4987 | // Rewrite the addressing register layout before doing anything else. | ||||
4988 | if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { | ||||
4989 | // 16 bit gradients are supported, but are tied to the A16 control | ||||
4990 | // so both gradients and addresses must be 16 bit | ||||
4991 | return false; | ||||
4992 | } | ||||
4993 | |||||
4994 | if (IsA16 && !ST.hasA16()) { | ||||
4995 | // A16 not supported | ||||
4996 | return false; | ||||
4997 | } | ||||
4998 | |||||
4999 | const unsigned NSAMaxSize = ST.getNSAMaxSize(); | ||||
5000 | const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); | ||||
5001 | |||||
5002 | if (IsA16 || IsG16) { | ||||
5003 | if (Intr->NumVAddrs > 1) { | ||||
5004 | SmallVector<Register, 4> PackedRegs; | ||||
5005 | |||||
5006 | packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, | ||||
5007 | IsG16); | ||||
5008 | |||||
5009 | // See also below in the non-a16 branch | ||||
5010 | const bool UseNSA = ST.hasNSAEncoding() && | ||||
5011 | PackedRegs.size() >= ST.getNSAThreshold(MF) && | ||||
5012 | (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); | ||||
5013 | const bool UsePartialNSA = | ||||
5014 | UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; | ||||
5015 | |||||
5016 | if (UsePartialNSA) { | ||||
5017 | // Pack registers that would go over NSAMaxSize into last VAddr register | ||||
5018 | LLT PackedAddrTy = | ||||
5019 | LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); | ||||
5020 | auto Concat = B.buildConcatVectors( | ||||
5021 | PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); | ||||
5022 | PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); | ||||
5023 | PackedRegs.resize(NSAMaxSize); | ||||
5024 | } else if (!UseNSA && PackedRegs.size() > 1) { | ||||
5025 | LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); | ||||
5026 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | ||||
5027 | PackedRegs[0] = Concat.getReg(0); | ||||
5028 | PackedRegs.resize(1); | ||||
5029 | } | ||||
5030 | |||||
5031 | const unsigned NumPacked = PackedRegs.size(); | ||||
5032 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | ||||
5033 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||
5034 | if (!SrcOp.isReg()) { | ||||
5035 | assert(SrcOp.isImm() && SrcOp.getImm() == 0); | ||||
5036 | continue; | ||||
5037 | } | ||||
5038 | |||||
5039 | assert(SrcOp.getReg() != AMDGPU::NoRegister); | ||||
5040 | |||||
5041 | if (I - Intr->VAddrStart < NumPacked) | ||||
5042 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | ||||
5043 | else | ||||
5044 | SrcOp.setReg(AMDGPU::NoRegister); | ||||
5045 | } | ||||
5046 | } | ||||
5047 | } else { | ||||
5048 | // If the register allocator cannot place the address registers contiguously | ||||
5049 | // without introducing moves, then using the non-sequential address encoding | ||||
5050 | // is always preferable, since it saves VALU instructions and is usually a | ||||
5051 | // wash in terms of code size or even better. | ||||
5052 | // | ||||
5053 | // However, we currently have no way of hinting to the register allocator | ||||
5054 | // that MIMG addresses should be placed contiguously when it is possible to | ||||
5055 | // do so, so force non-NSA for the common 2-address case as a heuristic. | ||||
5056 | // | ||||
5057 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | ||||
5058 | // allocation when possible. | ||||
5059 | // | ||||
5060 | // Partial NSA is allowed on GFX11 where the final register is a contiguous | ||||
5061 | // set of the remaining addresses. | ||||
5062 | const bool UseNSA = ST.hasNSAEncoding() && | ||||
5063 | CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && | ||||
5064 | (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); | ||||
5065 | const bool UsePartialNSA = | ||||
5066 | UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; | ||||
5067 | |||||
5068 | if (UsePartialNSA) { | ||||
5069 | convertImageAddrToPacked(B, MI, | ||||
5070 | ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, | ||||
5071 | Intr->NumVAddrs - NSAMaxSize + 1); | ||||
5072 | } else if (!UseNSA && Intr->NumVAddrs > 1) { | ||||
5073 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | ||||
5074 | Intr->NumVAddrs); | ||||
5075 | } | ||||
5076 | } | ||||
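Both the packed (A16/G16) branch above and this non-packed branch make the same encoding choice. A compact distillation follows, with hypothetical names (chooseNSA and its parameters are not the in-tree API), assuming the thresholds behave as the code above suggests.

// NSA is only used once the address count reaches the subtarget threshold and
// either fits within the NSA limit or partial NSA is available; with partial
// NSA the overflow addresses are folded into the final contiguous register.
struct NSAChoice {
  bool UseNSA;
  bool UsePartialNSA;
};

static NSAChoice chooseNSA(unsigned NumVAddrs, unsigned Threshold,
                           unsigned NSAMaxSize, bool HasPartialNSA,
                           bool HasNSAEncoding) {
  const bool UseNSA = HasNSAEncoding && NumVAddrs >= Threshold &&
                      (NumVAddrs <= NSAMaxSize || HasPartialNSA);
  const bool UsePartialNSA = UseNSA && HasPartialNSA && NumVAddrs > NSAMaxSize;
  return {UseNSA, UsePartialNSA};
}

// e.g. with Threshold = 3, NSAMaxSize = 5: chooseNSA(7, 3, 5, true, true)
// yields {true, true}, so addresses 5..7 are packed into the fifth register.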
5077 | |||||
5078 | int Flags = 0; | ||||
5079 | if (IsA16) | ||||
5080 | Flags |= 1; | ||||
5081 | if (IsG16) | ||||
5082 | Flags |= 2; | ||||
5083 | MI.addOperand(MachineOperand::CreateImm(Flags)); | ||||
5084 | |||||
5085 | if (BaseOpcode->Store) { // No TFE for stores? | ||||
5086 | // TODO: Handle dmask trim | ||||
5087 | if (!Ty.isVector() || !IsD16) | ||||
5088 | return true; | ||||
5089 | |||||
5090 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | ||||
5091 | if (RepackedReg != VData) { | ||||
5092 | MI.getOperand(1).setReg(RepackedReg); | ||||
5093 | } | ||||
5094 | |||||
5095 | return true; | ||||
5096 | } | ||||
5097 | |||||
5098 | Register DstReg = MI.getOperand(0).getReg(); | ||||
5099 | const LLT EltTy = Ty.getScalarType(); | ||||
5100 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||
5101 | |||||
5102 | // Confirm that the return type is large enough for the dmask specified | ||||
5103 | if (NumElts < DMaskLanes) | ||||
5104 | return false; | ||||
5105 | |||||
5106 | if (NumElts > 4 || DMaskLanes > 4) | ||||
5107 | return false; | ||||
5108 | |||||
5109 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; | ||||
5110 | const LLT AdjustedTy = | ||||
5111 | Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); | ||||
5112 | |||||
5113 | // The raw dword-aligned data component of the load. The only legal cases | ||||
5114 | // where this matters should be when using the packed D16 format, for | ||||
5115 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. | ||||
5116 | LLT RoundedTy; | ||||
5117 | |||||
5118 | // S32 vector to cover all data, plus TFE result element. | ||||
5119 | LLT TFETy; | ||||
5120 | |||||
5121 | // Register type to use for each loaded component. Will be S32 or V2S16. | ||||
5122 | LLT RegTy; | ||||
5123 | |||||
5124 | if (IsD16 && ST.hasUnpackedD16VMem()) { | ||||
5125 | RoundedTy = | ||||
5126 | LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); | ||||
5127 | TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); | ||||
5128 | RegTy = S32; | ||||
5129 | } else { | ||||
5130 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
5131 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | ||||
5132 | unsigned RoundedSize = 32 * RoundedElts; | ||||
5133 | RoundedTy = LLT::scalarOrVector( | ||||
5134 | ElementCount::getFixed(RoundedSize / EltSize), EltSize); | ||||
5135 | TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); | ||||
5136 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | ||||
5137 | } | ||||
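The rounding here is plain dword arithmetic. Below is a small sketch with a worked packed-d16 example; roundToDwords is a hypothetical helper, not part of this file.

// The dmask-trimmed payload is rounded up to whole 32-bit dwords; the TFE
// variant carries one extra dword for the error/status result.
struct RoundedShape {
  unsigned RoundedElts; // element count of EltSizeBits after dword rounding
  unsigned TFEDwords;   // dwords of the combined data + TFE result
};

static RoundedShape roundToDwords(unsigned NumElts, unsigned EltSizeBits) {
  unsigned Bits = NumElts * EltSizeBits;
  unsigned Dwords = (Bits + 31) / 32;
  return {Dwords * 32 / EltSizeBits, Dwords + 1};
}

// e.g. a packed-d16 <3 x s16> load: roundToDwords(3, 16) == {4, 3}, i.e. the
// data is loaded as <4 x s16> and the TFE form as <3 x s32>.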
5138 | |||||
5139 | // The return type does not need adjustment. | ||||
5140 | // TODO: Should we change s16 case to s32 or <2 x s16>? | ||||
5141 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | ||||
5142 | return true; | ||||
5143 | |||||
5144 | Register Dst1Reg; | ||||
5145 | |||||
5146 | // Insert after the instruction. | ||||
5147 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||
5148 | |||||
5149 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | ||||
5150 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | ||||
5151 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | ||||
5152 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | ||||
5153 | |||||
5154 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | ||||
5155 | |||||
5156 | MI.getOperand(0).setReg(NewResultReg); | ||||
5157 | |||||
5158 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||
5159 | // type. The instruction really returns these two values in one contiguous | ||||
5160 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||
5161 | // return type to use a single register result. | ||||
5162 | |||||
5163 | if (IsTFE) { | ||||
5164 | Dst1Reg = MI.getOperand(1).getReg(); | ||||
5165 | if (MRI->getType(Dst1Reg) != S32) | ||||
5166 | return false; | ||||
5167 | |||||
5168 | // TODO: Make sure the TFE operand bit is set. | ||||
5169 | MI.removeOperand(1); | ||||
5170 | |||||
5171 | // Handle the easy case that requires no repack instructions. | ||||
5172 | if (Ty == S32) { | ||||
5173 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | ||||
5174 | return true; | ||||
5175 | } | ||||
5176 | } | ||||
5177 | |||||
5178 | // Now figure out how to copy the new result register back into the old | ||||
5179 | // result. | ||||
5180 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | ||||
5181 | |||||
5182 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | ||||
5183 | |||||
5184 | if (ResultNumRegs == 1) { | ||||
5185 | assert(!IsTFE); | ||||
5186 | ResultRegs[0] = NewResultReg; | ||||
5187 | } else { | ||||
5188 | // We have to repack into a new vector of some kind. | ||||
5189 | for (int I = 0; I != NumDataRegs; ++I) | ||||
5190 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | ||||
5191 | B.buildUnmerge(ResultRegs, NewResultReg); | ||||
5192 | |||||
5193 | // Drop the final TFE element to get the data part. The TFE result is | ||||
5194 | // directly written to the right place already. | ||||
5195 | if (IsTFE) | ||||
5196 | ResultRegs.resize(NumDataRegs); | ||||
5197 | } | ||||
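As a rough illustration of the repack above (splitTFEResult is a hypothetical helper, not the in-tree API): with TFE the loaded data and the status dword arrive as one contiguous result, so the final piece peels off as the error flag and only the data pieces are recombined into the original return type.

#include <utility>
#include <vector>

// Split the unmerged pieces of a TFE load result into data payload + status.
static std::pair<std::vector<unsigned>, unsigned>
splitTFEResult(std::vector<unsigned> Pieces) {
  unsigned Status = Pieces.back(); // the extra dword written when TFE is set
  Pieces.pop_back();               // the remaining pieces cover the loaded data
  return {Pieces, Status};
}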
5198 | |||||
5199 | // For an s16 scalar result, we form an s32 result with a truncate regardless | ||||
5200 | // of packed vs. unpacked. | ||||
5201 | if (IsD16 && !Ty.isVector()) { | ||||
5202 | B.buildTrunc(DstReg, ResultRegs[0]); | ||||
5203 | return true; | ||||
5204 | } | ||||
5205 | |||||
5206 | // Avoid a build/concat_vector of 1 entry. | ||||
5207 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | ||||
5208 | B.buildBitcast(DstReg, ResultRegs[0]); | ||||
5209 | return true; | ||||
5210 | } | ||||
5211 | |||||
5212 | assert(Ty.isVector()); | ||||
5213 | |||||
5214 | if (IsD16) { | ||||
5215 | // For packed D16 results with TFE enabled, all the data components are | ||||
5216 | // S32. Cast back to the expected type. | ||||
5217 | // | ||||
5218 | // TODO: We don't really need to load s32 elements. We would only need one | ||||
5219 | // cast for the TFE result if a multiple of v2s16 was used. | ||||
5220 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | ||||
5221 | for (Register &Reg : ResultRegs) | ||||
5222 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | ||||
5223 | } else if (ST.hasUnpackedD16VMem()) { | ||||
5224 | for (Register &Reg : ResultRegs) | ||||
5225 | Reg = B.buildTrunc(S16, Reg).getReg(0); | ||||
5226 | } | ||||
5227 | } | ||||
5228 | |||||
5229 | auto padWithUndef = [&](LLT Ty, int NumElts) { | ||||
5230 | if (NumElts == 0) | ||||
5231 | return; | ||||
5232 | Register Undef = B.buildUndef(Ty).getReg(0); | ||||
5233 | for (int I = 0; I != NumElts; ++I) | ||||
5234 | ResultRegs.push_back(Undef); | ||||
5235 | }; | ||||
5236 | |||||
5237 | // Pad out any elements eliminated due to the dmask. | ||||
5238 | LLT ResTy = MRI->getType(ResultRegs[0]); | ||||
5239 | if (!ResTy.isVector()) { | ||||
5240 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | ||||
5241 | B.buildBuildVector(DstReg, ResultRegs); | ||||
5242 | return true; | ||||
5243 | } | ||||
5244 | |||||
5245 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); | ||||
5246 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | ||||
5247 | |||||
5248 | // Deal with the one annoying legal case. | ||||
5249 | const LLT V3S16 = LLT::fixed_vector(3, 16); | ||||
5250 | if (Ty == V3S16) { | ||||
5251 | if (IsTFE) { | ||||
5252 | if (ResultRegs.size() == 1) { | ||||
5253 | NewResultReg = ResultRegs[0]; | ||||
5254 | } else if (ResultRegs.size() == 2) { | ||||
5255 | LLT V4S16 = LLT::fixed_vector(4, 16); | ||||
5256 | NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); | ||||
5257 | } else { | ||||
5258 | return false; | ||||
5259 | } | ||||
5260 | } | ||||
5261 | |||||
5262 | if (MRI->getType(DstReg).getNumElements() < | ||||
5263 | MRI->getType(NewResultReg).getNumElements()) { | ||||
5264 | B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); | ||||
5265 | } else { | ||||
5266 | B.buildPadVectorWithUndefElements(DstReg, NewResultReg); | ||||
5267 | } | ||||
5268 | return true; | ||||
5269 | } | ||||
5270 | |||||
5271 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | ||||
5272 | B.buildConcatVectors(DstReg, ResultRegs); | ||||
5273 | return true; | ||||
5274 | } | ||||
5275 | |||||
5276 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||
5277 | LegalizerHelper &Helper, MachineInstr &MI) const { | ||||
5278 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
5279 | GISelChangeObserver &Observer = Helper.Observer; | ||||
5280 | |||||
5281 | Register Dst = MI.getOperand(0).getReg(); | ||||
5282 | LLT Ty = B.getMRI()->getType(Dst); | ||||
5283 | unsigned Size = Ty.getSizeInBits(); | ||||
5284 | MachineFunction &MF = B.getMF(); | ||||
5285 | |||||
5286 | Observer.changingInstr(MI); | ||||
5287 | |||||
5288 | if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { | ||||
5289 | Ty = getBitcastRegisterType(Ty); | ||||
5290 | Helper.bitcastDst(MI, Ty, 0); | ||||
5291 | Dst = MI.getOperand(0).getReg(); | ||||
5292 | B.setInsertPt(B.getMBB(), MI); | ||||
5293 | } | ||||
5294 | |||||
5295 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||
5296 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||
5297 | // allowed to add one. | ||||
5298 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||
5299 | MI.removeOperand(1); // Remove intrinsic ID | ||||
5300 | |||||
5301 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||
5302 | // TODO: Should this use datalayout alignment? | ||||
5303 | const unsigned MemSize = (Size + 7) / 8; | ||||
5304 | const Align MemAlign(4); | ||||
5305 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
5306 | MachinePointerInfo(), | ||||
5307 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
5308 | MachineMemOperand::MOInvariant, | ||||
5309 | MemSize, MemAlign); | ||||
5310 | MI.addMemOperand(MF, MMO); | ||||
5311 | |||||
5312 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||
5313 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||
5314 | // out this needs to be converted to a vector load during RegBankSelect. | ||||
5315 | if (!isPowerOf2_32(Size)) { | ||||
5316 | if (Ty.isVector()) | ||||
5317 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||
5318 | else | ||||
5319 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||
5320 | } | ||||
5321 | |||||
5322 | Observer.changedInstr(MI); | ||||
5323 | return true; | ||||
5324 | } | ||||
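A minimal sketch of the power-of-two widening applied to odd result sizes here, assuming it behaves like getPow2ScalarType used above; nextPow2Bits is an illustrative name, not an in-tree helper.

// Round a bit width up to the next power of two, e.g. 96 -> 128, 48 -> 64,
// while exact powers such as 32 are left unchanged.
static unsigned nextPow2Bits(unsigned Bits) {
  unsigned Pow2 = 1;
  while (Pow2 < Bits)
    Pow2 <<= 1;
  return Pow2;
}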
5325 | |||||
5326 | // TODO: Move to selection | ||||
5327 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||
5328 | MachineRegisterInfo &MRI, | ||||
5329 | MachineIRBuilder &B) const { | ||||
5330 | if (!ST.isTrapHandlerEnabled() || | ||||
5331 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | ||||
5332 | return legalizeTrapEndpgm(MI, MRI, B); | ||||
5333 | |||||
5334 | const Module *M = B.getMF().getFunction().getParent(); | ||||
5335 | unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); | ||||
5336 | if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) | ||||
5337 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||
5338 | |||||
5339 | return ST.supportsGetDoorbellID() ? | ||||
5340 | legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||
5341 | } | ||||
5342 | |||||
5343 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( | ||||
5344 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5345 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | ||||
5346 | MI.eraseFromParent(); | ||||
5347 | return true; | ||||
5348 | } | ||||
5349 | |||||
5350 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | ||||
5351 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5352 | MachineFunction &MF = B.getMF(); | ||||
5353 | const LLT S64 = LLT::scalar(64); | ||||
5354 | |||||
5355 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||
5356 | // For code object version 5, queue_ptr is passed through implicit kernarg. | ||||
5357 | if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= | ||||
5358 | AMDGPU::AMDHSA_COV5) { | ||||
5359 | AMDGPUTargetLowering::ImplicitParameter Param = | ||||
5360 | AMDGPUTargetLowering::QUEUE_PTR; | ||||
5361 | uint64_t Offset = | ||||
5362 | ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); | ||||
5363 | |||||
5364 | Register KernargPtrReg = MRI.createGenericVirtualRegister( | ||||
5365 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
5366 | |||||
5367 | if (!loadInputValue(KernargPtrReg, B, | ||||
5368 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
5369 | return false; | ||||
5370 | |||||
5371 | // TODO: can we be smarter about machine pointer info? | ||||
5372 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
5373 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
5374 | PtrInfo, | ||||
5375 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
5376 | MachineMemOperand::MOInvariant, | ||||
5377 | LLT::scalar(64), commonAlignment(Align(64), Offset)); | ||||
5378 | |||||
5379 | // Pointer address | ||||
5380 | Register LoadAddr = MRI.createGenericVirtualRegister( | ||||
5381 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
5382 | B.buildPtrAdd(LoadAddr, KernargPtrReg, | ||||
5383 | B.buildConstant(LLT::scalar(64), Offset).getReg(0)); | ||||
5384 | // Load address | ||||
5385 | Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); | ||||
5386 | B.buildCopy(SGPR01, Temp); | ||||
5387 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5388 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||
5389 | .addReg(SGPR01, RegState::Implicit); | ||||
5390 | MI.eraseFromParent(); | ||||
5391 | return true; | ||||
5392 | } | ||||
5393 | |||||
5394 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||
5395 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||
5396 | Register LiveIn = | ||||
5397 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
5398 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||
5399 | return false; | ||||
5400 | |||||
5401 | B.buildCopy(SGPR01, LiveIn); | ||||
5402 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5403 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||
5404 | .addReg(SGPR01, RegState::Implicit); | ||||
5405 | |||||
5406 | MI.eraseFromParent(); | ||||
5407 | return true; | ||||
5408 | } | ||||
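For the COV5 path above, the queue pointer is simply loaded from the implicit kernarg block: the load address is the kernarg segment base plus the target-reported offset of the QUEUE_PTR implicit parameter. A trivial sketch of that address arithmetic (queuePtrAddress is a hypothetical name):

#include <cstdint>

// Kernarg segment base + implicit-parameter offset, then loaded as 64 bits.
static uint64_t queuePtrAddress(uint64_t KernargSegmentBase,
                                uint64_t QueuePtrOffset) {
  return KernargSegmentBase + QueuePtrOffset;
}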
5409 | |||||
5410 | bool AMDGPULegalizerInfo::legalizeTrapHsa( | ||||
5411 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5412 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5413 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); | ||||
5414 | MI.eraseFromParent(); | ||||
5415 | return true; | ||||
5416 | } | ||||
5417 | |||||
5418 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||
5419 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
5420 | // If this is a non-HSA path or the trap handler is disabled, report a | ||||
5421 | // warning accordingly. | ||||
5422 | if (!ST.isTrapHandlerEnabled() || | ||||
5423 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | ||||
5424 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||
5425 | "debugtrap handler not supported", | ||||
5426 | MI.getDebugLoc(), DS_Warning); | ||||
5427 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||
5428 | Ctx.diagnose(NoTrap); | ||||
5429 | } else { | ||||
5430 | // Insert debug-trap instruction | ||||
5431 | B.buildInstr(AMDGPU::S_TRAP) | ||||
5432 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | ||||
5433 | } | ||||
5434 | |||||
5435 | MI.eraseFromParent(); | ||||
5436 | return true; | ||||
5437 | } | ||||
5438 | |||||
5439 | bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, | ||||
5440 | MachineIRBuilder &B) const { | ||||
5441 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
5442 | const LLT S16 = LLT::scalar(16); | ||||
5443 | const LLT S32 = LLT::scalar(32); | ||||
5444 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
5445 | const LLT V3S32 = LLT::fixed_vector(3, 32); | ||||
5446 | |||||
5447 | Register DstReg = MI.getOperand(0).getReg(); | ||||
5448 | Register NodePtr = MI.getOperand(2).getReg(); | ||||
5449 | Register RayExtent = MI.getOperand(3).getReg(); | ||||
5450 | Register RayOrigin = MI.getOperand(4).getReg(); | ||||
5451 | Register RayDir = MI.getOperand(5).getReg(); | ||||
5452 | Register RayInvDir = MI.getOperand(6).getReg(); | ||||
5453 | Register TDescr = MI.getOperand(7).getReg(); | ||||
5454 | |||||
5455 | if (!ST.hasGFX10_AEncoding()) { | ||||
5456 | DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), | ||||
5457 | "intrinsic not supported on subtarget", | ||||
5458 | MI.getDebugLoc()); | ||||
5459 | B.getMF().getFunction().getContext().diagnose(BadIntrin); | ||||
5460 | return false; | ||||
5461 | } | ||||
5462 | |||||
5463 | const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); | ||||
5464 | const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; | ||||
5465 | const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; | ||||
5466 | const unsigned NumVDataDwords = 4; | ||||
5467 | const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); | ||||
5468 | const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; | ||||
5469 | const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); | ||||
5470 | const unsigned BaseOpcodes[2][2] = { | ||||
5471 | {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, | ||||
5472 | {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, | ||||
5473 | AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; | ||||
5474 | int Opcode; | ||||
5475 | if (UseNSA) { | ||||
5476 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], | ||||
5477 | IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA | ||||
5478 | : AMDGPU::MIMGEncGfx10NSA, | ||||
5479 | NumVDataDwords, NumVAddrDwords); | ||||
5480 | } else { | ||||
5481 | Opcode = AMDGPU::getMIMGOpcode( | ||||
5482 | BaseOpcodes[Is64][IsA16], | ||||
5483 | IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, | ||||
5484 | NumVDataDwords, NumVAddrDwords); | ||||
5485 | } | ||||
5486 | assert(Opcode != -1); | ||||
5487 | |||||
5488 | SmallVector<Register, 12> Ops; | ||||
5489 | if (UseNSA && IsGFX11Plus) { | ||||
5490 | auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { | ||||
5491 | auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); | ||||
5492 | auto Merged = B.buildMergeLikeInstr( | ||||
5493 | V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); | ||||
5494 | Ops.push_back(Merged.getReg(0)); | ||||
5495 | }; | ||||
5496 | |||||
5497 | Ops.push_back(NodePtr); | ||||
5498 | Ops.push_back(RayExtent); | ||||
5499 | packLanes(RayOrigin); | ||||
5500 | |||||
5501 | if (IsA16) { | ||||
5502 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); | ||||
5503 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); | ||||
5504 | auto MergedDir = B.buildMergeLikeInstr( | ||||
5505 | V3S32, | ||||
5506 | {B.buildBitcast( | ||||
5507 | S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), | ||||
5508 | UnmergeRayDir.getReg(0)})) | ||||
5509 | .getReg(0), | ||||
5510 | B.buildBitcast( | ||||
5511 | S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), | ||||
5512 | UnmergeRayDir.getReg(1)})) | ||||
5513 | .getReg(0), | ||||
5514 | B.buildBitcast( | ||||
5515 | S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), | ||||
5516 | UnmergeRayDir.getReg(2)})) | ||||
5517 | .getReg(0)}); | ||||
5518 | Ops.push_back(MergedDir.getReg(0)); | ||||
5519 | } else { | ||||
5520 | packLanes(RayDir); | ||||
5521 | packLanes(RayInvDir); | ||||
5522 | } | ||||
5523 | } else { | ||||
5524 | if (Is64) { | ||||
5525 | auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); | ||||
5526 | Ops.push_back(Unmerge.getReg(0)); | ||||
5527 | Ops.push_back(Unmerge.getReg(1)); | ||||
5528 | } else { | ||||
5529 | Ops.push_back(NodePtr); | ||||
5530 | } | ||||
5531 | Ops.push_back(RayExtent); | ||||
5532 | |||||
5533 | auto packLanes = [&Ops, &S32, &B](Register Src) { | ||||
5534 | auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); | ||||
5535 | Ops.push_back(Unmerge.getReg(0)); | ||||
5536 | Ops.push_back(Unmerge.getReg(1)); | ||||
5537 | Ops.push_back(Unmerge.getReg(2)); | ||||
5538 | }; | ||||
5539 | |||||
5540 | packLanes(RayOrigin); | ||||
5541 | if (IsA16) { | ||||
5542 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); | ||||
5543 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); | ||||
5544 | Register R1 = MRI.createGenericVirtualRegister(S32); | ||||
5545 | Register R2 = MRI.createGenericVirtualRegister(S32); | ||||
5546 | Register R3 = MRI.createGenericVirtualRegister(S32); | ||||
5547 | B.buildMergeLikeInstr(R1, | ||||
5548 | {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); | ||||
5549 | B.buildMergeLikeInstr( | ||||
5550 | R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); | ||||
5551 | B.buildMergeLikeInstr( | ||||
5552 | R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); | ||||
5553 | Ops.push_back(R1); | ||||
5554 | Ops.push_back(R2); | ||||
5555 | Ops.push_back(R3); | ||||
5556 | } else { | ||||
5557 | packLanes(RayDir); | ||||
5558 | packLanes(RayInvDir); | ||||
5559 | } | ||||
5560 | } | ||||
5561 | |||||
5562 | if (!UseNSA) { | ||||
5563 | // Build a single vector containing all the operands so far prepared. | ||||
5564 | LLT OpTy = LLT::fixed_vector(Ops.size(), 32); | ||||
5565 | Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); | ||||
5566 | Ops.clear(); | ||||
5567 | Ops.push_back(MergedOps); | ||||
5568 | } | ||||
5569 | |||||
5570 | auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) | ||||
5571 | .addDef(DstReg) | ||||
5572 | .addImm(Opcode); | ||||
5573 | |||||
5574 | for (Register R : Ops) { | ||||
5575 | MIB.addUse(R); | ||||
5576 | } | ||||
5577 | |||||
5578 | MIB.addUse(TDescr) | ||||
5579 | .addImm(IsA16 ? 1 : 0) | ||||
5580 | .cloneMemRefs(MI); | ||||
5581 | |||||
5582 | MI.eraseFromParent(); | ||||
5583 | return true; | ||||
5584 | } | ||||
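The vaddr dword counts above follow directly from the ray operand layout; a hypothetical restatement (bvhVAddrDwords is not an in-tree helper): the node pointer takes one or two dwords, the ray extent one, the origin three, and the direction plus inverse direction six dwords at full rate or three when packed as A16.

static unsigned bvhVAddrDwords(bool Is64, bool IsA16) {
  unsigned NodeDwords = Is64 ? 2 : 1;
  unsigned RayDwords = 1 /*extent*/ + 3 /*origin*/ + (IsA16 ? 3 : 6);
  return NodeDwords + RayDwords; // 8, 9, 11 or 12, matching NumVAddrDwords
}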
5585 | |||||
5586 | bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, | ||||
5587 | MachineIRBuilder &B) const { | ||||
5588 | unsigned Opc; | ||||
5589 | int RoundMode = MI.getOperand(2).getImm(); | ||||
5590 | |||||
5591 | if (RoundMode == (int)RoundingMode::TowardPositive) | ||||
5592 | Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; | ||||
5593 | else if (RoundMode == (int)RoundingMode::TowardNegative) | ||||
5594 | Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; | ||||
5595 | else | ||||
5596 | return false; | ||||
5597 | |||||
5598 | B.buildInstr(Opc) | ||||
5599 | .addDef(MI.getOperand(0).getReg()) | ||||
5600 | .addUse(MI.getOperand(1).getReg()); | ||||
5601 | |||||
5602 | MI.eraseFromParent(); | ||||
5603 | |||||
5604 | return true; | ||||
5605 | } | ||||
5606 | |||||
5607 | bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||||
5608 | MachineInstr &MI) const { | ||||
5609 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
5610 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
5611 | |||||
5612 | // Replace the G_BRCOND use with the exec-manipulating and branch pseudos. | ||||
5613 | auto IntrID = MI.getIntrinsicID(); | ||||
5614 | switch (IntrID) { | ||||
5615 | case Intrinsic::amdgcn_if: | ||||
5616 | case Intrinsic::amdgcn_else: { | ||||
5617 | MachineInstr *Br = nullptr; | ||||
5618 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||
5619 | bool Negated = false; | ||||
5620 | if (MachineInstr *BrCond = | ||||
5621 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||
5622 | const SIRegisterInfo *TRI | ||||
5623 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||
5624 | |||||
5625 | Register Def = MI.getOperand(1).getReg(); | ||||
5626 | Register Use = MI.getOperand(3).getReg(); | ||||
5627 | |||||
5628 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||
5629 | |||||
5630 | if (Negated) | ||||
5631 | std::swap(CondBrTarget, UncondBrTarget); | ||||
5632 | |||||
5633 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||
5634 | if (IntrID == Intrinsic::amdgcn_if) { | ||||
5635 | B.buildInstr(AMDGPU::SI_IF) | ||||
5636 | .addDef(Def) | ||||
5637 | .addUse(Use) | ||||
5638 | .addMBB(UncondBrTarget); | ||||
5639 | } else { | ||||
5640 | B.buildInstr(AMDGPU::SI_ELSE) | ||||
5641 | .addDef(Def) | ||||
5642 | .addUse(Use) | ||||
5643 | .addMBB(UncondBrTarget); | ||||
5644 | } | ||||
5645 | |||||
5646 | if (Br) { | ||||
5647 | Br->getOperand(0).setMBB(CondBrTarget); | ||||
5648 | } else { | ||||
5649 | // The IRTranslator skips inserting the G_BR for fallthrough cases, but | ||||
5650 | // since we're swapping branch targets it needs to be reinserted. | ||||
5651 | // FIXME: IRTranslator should probably not do this | ||||
5652 | B.buildBr(*CondBrTarget); | ||||
5653 | } | ||||
5654 | |||||
5655 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||
5656 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||
5657 | MI.eraseFromParent(); | ||||
5658 | BrCond->eraseFromParent(); | ||||
5659 | return true; | ||||
5660 | } | ||||
5661 | |||||
5662 | return false; | ||||
5663 | } | ||||
5664 | case Intrinsic::amdgcn_loop: { | ||||
5665 | MachineInstr *Br = nullptr; | ||||
5666 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||
5667 | bool Negated = false; | ||||
5668 | if (MachineInstr *BrCond = | ||||
5669 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||
5670 | const SIRegisterInfo *TRI | ||||
5671 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||
5672 | |||||
5673 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||
5674 | Register Reg = MI.getOperand(2).getReg(); | ||||
5675 | |||||
5676 | if (Negated) | ||||
5677 | std::swap(CondBrTarget, UncondBrTarget); | ||||
5678 | |||||
5679 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||
5680 | B.buildInstr(AMDGPU::SI_LOOP) | ||||
5681 | .addUse(Reg) | ||||
5682 | .addMBB(UncondBrTarget); | ||||
5683 | |||||
5684 | if (Br) | ||||
5685 | Br->getOperand(0).setMBB(CondBrTarget); | ||||
5686 | else | ||||
5687 | B.buildBr(*CondBrTarget); | ||||
5688 | |||||
5689 | MI.eraseFromParent(); | ||||
5690 | BrCond->eraseFromParent(); | ||||
5691 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||
5692 | return true; | ||||
5693 | } | ||||
5694 | |||||
5695 | return false; | ||||
5696 | } | ||||
5697 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||
5698 | if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { | ||||
5699 | // This only makes sense to call in a kernel, so just lower to null. | ||||
5700 | B.buildConstant(MI.getOperand(0).getReg(), 0); | ||||
5701 | MI.eraseFromParent(); | ||||
5702 | return true; | ||||
5703 | } | ||||
5704 | |||||
5705 | return legalizePreloadedArgIntrin( | ||||
5706 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||
5707 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||
5708 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||
5709 | case Intrinsic::amdgcn_workitem_id_x: | ||||
5710 | return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, | ||||
5711 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||
5712 | case Intrinsic::amdgcn_workitem_id_y: | ||||
5713 | return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, | ||||
5714 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||
5715 | case Intrinsic::amdgcn_workitem_id_z: | ||||
5716 | return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, | ||||
5717 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||
5718 | case Intrinsic::amdgcn_workgroup_id_x: | ||||
5719 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
5720 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||
5721 | case Intrinsic::amdgcn_workgroup_id_y: | ||||
5722 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
5723 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||
5724 | case Intrinsic::amdgcn_workgroup_id_z: | ||||
5725 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
5726 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||
5727 | case Intrinsic::amdgcn_lds_kernel_id: | ||||
5728 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
5729 | AMDGPUFunctionArgInfo::LDS_KERNEL_ID); | ||||
5730 | case Intrinsic::amdgcn_dispatch_ptr: | ||||
5731 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
5732 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||
5733 | case Intrinsic::amdgcn_queue_ptr: | ||||
5734 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
5735 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||
5736 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||
5737 | return legalizePreloadedArgIntrin( | ||||
5738 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||
5739 | case Intrinsic::amdgcn_dispatch_id: | ||||
5740 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
5741 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||
5742 | case Intrinsic::r600_read_ngroups_x: | ||||
5743 | // TODO: Emit error for hsa | ||||
5744 | return legalizeKernargMemParameter(MI, B, | ||||
5745 | SI::KernelInputOffsets::NGROUPS_X); | ||||
5746 | case Intrinsic::r600_read_ngroups_y: | ||||
5747 | return legalizeKernargMemParameter(MI, B, | ||||
5748 | SI::KernelInputOffsets::NGROUPS_Y); | ||||
5749 | case Intrinsic::r600_read_ngroups_z: | ||||
5750 | return legalizeKernargMemParameter(MI, B, | ||||
5751 | SI::KernelInputOffsets::NGROUPS_Z); | ||||
5752 | case Intrinsic::r600_read_local_size_x: | ||||
5753 | // TODO: Could insert G_ASSERT_ZEXT from s16 | ||||
5754 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); | ||||
5755 | case Intrinsic::r600_read_local_size_y: | ||||
5756 | // TODO: Could insert G_ASSERT_ZEXT from s16 | ||||
5757 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); | ||||
5758 | // TODO: Could insert G_ASSERT_ZEXT from s16 | ||||
5759 | case Intrinsic::r600_read_local_size_z: | ||||
5760 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); | ||||
5761 | case Intrinsic::r600_read_global_size_x: | ||||
5762 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); | ||||
5763 | case Intrinsic::r600_read_global_size_y: | ||||
5764 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); | ||||
5765 | case Intrinsic::r600_read_global_size_z: | ||||
5766 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); | ||||
5767 | case Intrinsic::amdgcn_fdiv_fast: | ||||
5768 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||
5769 | case Intrinsic::amdgcn_is_shared: | ||||
5770 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||
5771 | case Intrinsic::amdgcn_is_private: | ||||
5772 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||
5773 | case Intrinsic::amdgcn_wavefrontsize: { | ||||
5774 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||
5775 | MI.eraseFromParent(); | ||||
5776 | return true; | ||||
5777 | } | ||||
5778 | case Intrinsic::amdgcn_s_buffer_load: | ||||
5779 | return legalizeSBufferLoad(Helper, MI); | ||||
5780 | case Intrinsic::amdgcn_raw_buffer_store: | ||||
5781 | case Intrinsic::amdgcn_struct_buffer_store: | ||||
5782 | return legalizeBufferStore(MI, MRI, B, false, false); | ||||
5783 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||
5784 | case Intrinsic::amdgcn_struct_buffer_store_format: | ||||
5785 | return legalizeBufferStore(MI, MRI, B, false, true); | ||||
5786 | case Intrinsic::amdgcn_raw_tbuffer_store: | ||||
5787 | case Intrinsic::amdgcn_struct_tbuffer_store: | ||||
5788 | return legalizeBufferStore(MI, MRI, B, true, true); | ||||
5789 | case Intrinsic::amdgcn_raw_buffer_load: | ||||
5790 | case Intrinsic::amdgcn_struct_buffer_load: | ||||
5791 | return legalizeBufferLoad(MI, MRI, B, false, false); | ||||
5792 | case Intrinsic::amdgcn_raw_buffer_load_format: | ||||
5793 | case Intrinsic::amdgcn_struct_buffer_load_format: | ||||
5794 | return legalizeBufferLoad(MI, MRI, B, true, false); | ||||
5795 | case Intrinsic::amdgcn_raw_tbuffer_load: | ||||
5796 | case Intrinsic::amdgcn_struct_tbuffer_load: | ||||
5797 | return legalizeBufferLoad(MI, MRI, B, true, true); | ||||
5798 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||
5799 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||
5800 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||
5801 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||
5802 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||
5803 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||
5804 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||
5805 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||
5806 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||
5807 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||
5808 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||
5809 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||
5810 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||
5811 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||
5812 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||
5813 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||
5814 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||
5815 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||
5816 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||
5817 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||
5818 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||
5819 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||
5820 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||
5821 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||
5822 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||
5823 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||
5824 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||
5825 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||
5826 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||
5827 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||
5828 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||
5829 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||
5830 | return legalizeBufferAtomic(MI, B, IntrID); | ||||
5831 | case Intrinsic::amdgcn_atomic_inc: | ||||
5832 | return legalizeAtomicIncDec(MI, B, true); | ||||
5833 | case Intrinsic::amdgcn_atomic_dec: | ||||
5834 | return legalizeAtomicIncDec(MI, B, false); | ||||
5835 | case Intrinsic::trap: | ||||
5836 | return legalizeTrapIntrinsic(MI, MRI, B); | ||||
5837 | case Intrinsic::debugtrap: | ||||
5838 | return legalizeDebugTrapIntrinsic(MI, MRI, B); | ||||
5839 | case Intrinsic::amdgcn_rsq_clamp: | ||||
5840 | return legalizeRsqClampIntrinsic(MI, MRI, B); | ||||
5841 | case Intrinsic::amdgcn_ds_fadd: | ||||
5842 | case Intrinsic::amdgcn_ds_fmin: | ||||
5843 | case Intrinsic::amdgcn_ds_fmax: | ||||
5844 | return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); | ||||
5845 | case Intrinsic::amdgcn_image_bvh_intersect_ray: | ||||
5846 | return legalizeBVHIntrinsic(MI, B); | ||||
5847 | default: { | ||||
5848 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||||
5849 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | ||||
5850 | return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); | ||||
5851 | return true; | ||||
5852 | } | ||||
5853 | } | ||||
5854 | |||||
5855 | return true; | ||||
5856 | } |
1 | //===-- llvm/ADT/bit.h - C++20 <bit> ----------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file implements the C++20 <bit> header. |
11 | /// |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_ADT_BIT_H |
15 | #define LLVM_ADT_BIT_H |
16 | |
17 | #include "llvm/Support/Compiler.h" |
18 | #include <cstdint> |
19 | #include <limits> |
20 | #include <type_traits> |
21 | |
22 | #if !__has_builtin(__builtin_bit_cast)
23 | #include <cstring> |
24 | #endif |
25 | |
26 | #if defined(_MSC_VER) && !defined(_DEBUG)
27 | #include <cstdlib> // for _byteswap_{ushort,ulong,uint64} |
28 | #endif |
29 | |
30 | #ifdef _MSC_VER |
31 | // Declare these intrinsics manually rather than including intrin.h. It's very
32 | // expensive, and bit.h is popular via MathExtras.h. |
33 | // #include <intrin.h> |
34 | extern "C" { |
35 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); |
36 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); |
37 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); |
38 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); |
39 | } |
40 | #endif |
41 | |
42 | namespace llvm { |
43 | |
44 | // This implementation of bit_cast is different from the C++20 one in two ways: |
45 | // - It isn't constexpr because that requires compiler support. |
46 | // - It requires trivially-constructible To, to avoid UB in the implementation. |
47 | template < |
48 | typename To, typename From, |
49 | typename = std::enable_if_t<sizeof(To) == sizeof(From)>, |
50 | typename = std::enable_if_t<std::is_trivially_constructible<To>::value>, |
51 | typename = std::enable_if_t<std::is_trivially_copyable<To>::value>, |
52 | typename = std::enable_if_t<std::is_trivially_copyable<From>::value>> |
53 | [[nodiscard]] inline To bit_cast(const From &from) noexcept { |
54 | #if __has_builtin(__builtin_bit_cast)
55 | return __builtin_bit_cast(To, from); |
56 | #else |
57 | To to; |
58 | std::memcpy(&to, &from, sizeof(To)); |
59 | return to; |
60 | #endif |
61 | } |
62 | |
63 | /// Reverses the bytes in the given integer value V. |
64 | template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>> |
65 | [[nodiscard]] constexpr T byteswap(T V) noexcept { |
66 | if constexpr (sizeof(T) == 1) { |
67 | return V; |
68 | } else if constexpr (sizeof(T) == 2) { |
69 | uint16_t UV = V; |
70 | #if defined(_MSC_VER) && !defined(_DEBUG)
71 | // The DLL version of the runtime lacks these functions (bug!?), but in a |
72 | // release build they're replaced with BSWAP instructions anyway. |
73 | return _byteswap_ushort(UV); |
74 | #else |
75 | uint16_t Hi = UV << 8; |
76 | uint16_t Lo = UV >> 8; |
77 | return Hi | Lo; |
78 | #endif |
79 | } else if constexpr (sizeof(T) == 4) { |
80 | uint32_t UV = V; |
81 | #if __has_builtin(__builtin_bswap32)
82 | return __builtin_bswap32(UV); |
83 | #elif defined(_MSC_VER) && !defined(_DEBUG)
84 | return _byteswap_ulong(UV); |
85 | #else |
86 | uint32_t Byte0 = UV & 0x000000FF; |
87 | uint32_t Byte1 = UV & 0x0000FF00; |
88 | uint32_t Byte2 = UV & 0x00FF0000; |
89 | uint32_t Byte3 = UV & 0xFF000000; |
90 | return (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24); |
91 | #endif |
92 | } else if constexpr (sizeof(T) == 8) { |
93 | uint64_t UV = V; |
94 | #if __has_builtin(__builtin_bswap64)
95 | return __builtin_bswap64(UV); |
96 | #elif defined(_MSC_VER) && !defined(_DEBUG)
97 | return _byteswap_uint64(UV); |
98 | #else |
99 | uint64_t Hi = llvm::byteswap<uint32_t>(UV); |
100 | uint32_t Lo = llvm::byteswap<uint32_t>(UV >> 32); |
101 | return (Hi << 32) | Lo; |
102 | #endif |
103 | } else { |
104 | static_assert(!sizeof(T *), "Don't know how to handle the given type."); |
105 | return 0; |
106 | } |
107 | } |
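A short usage sketch for byteswap, assuming this header is reachable as llvm/ADT/bit.h per its banner; the results are identical whichever of the branches above is compiled in.

#include <cassert>
#include <cstdint>
#include "llvm/ADT/bit.h"

inline void byteswapExamples() {
  assert(llvm::byteswap<uint16_t>(0x1234) == 0x3412);
  assert(llvm::byteswap<uint32_t>(0x12345678u) == 0x78563412u);
  assert(llvm::byteswap<uint64_t>(0x0123456789ABCDEFull) == 0xEFCDAB8967452301ull);
}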
108 | |
109 | template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> |
110 | [[nodiscard]] constexpr inline bool has_single_bit(T Value) noexcept { |
111 | return (Value != 0) && ((Value & (Value - 1)) == 0); |
112 | } |
113 | |
114 | namespace detail { |
115 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { |
116 | static unsigned count(T Val) { |
117 | if (!Val) |
118 | return std::numeric_limits<T>::digits; |
119 | if (Val & 0x1) |
120 | return 0; |
121 | |
122 | // Bisection method. |
123 | unsigned ZeroBits = 0; |
124 | T Shift = std::numeric_limits<T>::digits >> 1; |
125 | T Mask = std::numeric_limits<T>::max() >> Shift; |
126 | while (Shift) { |
127 | if ((Val & Mask) == 0) { |
128 | Val >>= Shift; |
129 | ZeroBits |= Shift; |
130 | } |
131 | Shift >>= 1; |
132 | Mask >>= Shift; |
133 | } |
134 | return ZeroBits; |
135 | } |
136 | }; |
137 | |
138 | #if defined(__GNUC__) || defined(_MSC_VER)
139 | template <typename T> struct TrailingZerosCounter<T, 4> { |
140 | static unsigned count(T Val) { |
141 | if (Val == 0) |
142 | return 32; |
143 | |
144 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__)
145 | return __builtin_ctz(Val); |
146 | #elif defined(_MSC_VER) |
147 | unsigned long Index; |
148 | _BitScanForward(&Index, Val); |
149 | return Index; |
150 | #endif |
151 | } |
152 | }; |
153 | |
154 | #if !defined(_MSC_VER) || defined(_M_X64) |
155 | template <typename T> struct TrailingZerosCounter<T, 8> { |
156 | static unsigned count(T Val) { |
157 | if (Val == 0) |
158 | return 64; |
159 | |
160 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
161 | return __builtin_ctzll(Val); |
162 | #elif defined(_MSC_VER) |
163 | unsigned long Index; |
164 | _BitScanForward64(&Index, Val); |
165 | return Index; |
166 | #endif |
167 | } |
168 | }; |
169 | #endif |
170 | #endif |
171 | } // namespace detail |
172 | |
173 | /// Count the number of 0's from the least significant bit to the most
174 | /// significant, stopping at the first 1.
175 | /// |
176 | /// Only unsigned integral types are allowed. |
177 | /// |
178 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
179 | template <typename T> [[nodiscard]] int countr_zero(T Val) { |
180 | static_assert(std::is_unsigned_v<T>, |
181 | "Only unsigned integral types are allowed."); |
182 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val); |
183 | } |
184 | |
185 | namespace detail { |
186 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { |
187 | static unsigned count(T Val) { |
188 | if (!Val) |
189 | return std::numeric_limits<T>::digits; |
190 | |
191 | // Bisection method. |
192 | unsigned ZeroBits = 0; |
193 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { |
194 | T Tmp = Val >> Shift; |
195 | if (Tmp) |
196 | Val = Tmp; |
197 | else |
198 | ZeroBits |= Shift; |
199 | } |
200 | return ZeroBits; |
201 | } |
202 | }; |
203 | |
204 | #if defined(__GNUC__) || defined(_MSC_VER)
205 | template <typename T> struct LeadingZerosCounter<T, 4> { |
206 | static unsigned count(T Val) { |
207 | if (Val == 0) |
208 | return 32; |
209 | |
210 | #if __has_builtin(__builtin_clz) || defined(__GNUC__)
211 | return __builtin_clz(Val); |
212 | #elif defined(_MSC_VER) |
213 | unsigned long Index; |
214 | _BitScanReverse(&Index, Val); |
215 | return Index ^ 31; |
216 | #endif |
217 | } |
218 | }; |
219 | |
220 | #if !defined(_MSC_VER) || defined(_M_X64) |
221 | template <typename T> struct LeadingZerosCounter<T, 8> { |
222 | static unsigned count(T Val) { |
223 | if (Val == 0) |
224 | return 64; |
225 | |
226 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__)
227 | return __builtin_clzll(Val); |
228 | #elif defined(_MSC_VER) |
229 | unsigned long Index; |
230 | _BitScanReverse64(&Index, Val); |
231 | return Index ^ 63; |
232 | #endif |
233 | } |
234 | }; |
235 | #endif |
236 | #endif |
237 | } // namespace detail |
238 | |
239 | /// Count the number of 0's from the most significant bit to the least
240 | /// significant, stopping at the first 1.
241 | /// |
242 | /// Only unsigned integral types are allowed. |
243 | /// |
244 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
245 | template <typename T> [[nodiscard]] int countl_zero(T Val) { |
246 | static_assert(std::is_unsigned_v<T>, |
247 | "Only unsigned integral types are allowed."); |
248 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val); |
249 | } |
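A few spot checks for the counters defined above, again assuming the llvm/ADT/bit.h include path; the values hold for both the builtin paths and the bisection fallback.

#include <cassert>
#include <cstdint>
#include "llvm/ADT/bit.h"

inline void countZeroExamples() {
  assert(llvm::countr_zero(0x8u) == 3);         // 0b1000 has 3 trailing zeros
  assert(llvm::countr_zero(uint16_t{0}) == 16); // zero input -> digits of T
  assert(llvm::countl_zero(0x8u) == 28);        // 32-bit input with bit 3 set
  assert(llvm::countl_zero(uint8_t{1}) == 7);
}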
250 | |
251 | /// Count the number of ones from the most significant bit to the first |
252 | /// zero bit. |
253 | /// |
254 | /// Ex. countl_one(0xFF0FFF00) == 8. |
255 | /// Only unsigned integral types are allowed. |
256 | /// |
257 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
258 | template <typename T> [[nodiscard]] int countl_one(T Value) { |
259 | static_assert(std::is_unsigned_v<T>, |
260 | "Only unsigned integral types are allowed."); |
261 | return llvm::countl_zero<T>(~Value); |
262 | } |
263 | |
264 | /// Count the number of ones from the least significant bit to the first |
265 | /// zero bit. |
266 | /// |
267 | /// Ex. countr_one(0x00FF00FF) == 8. |
268 | /// Only unsigned integral types are allowed. |
269 | /// |
270 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
271 | template <typename T> [[nodiscard]] int countr_one(T Value) { |
272 | static_assert(std::is_unsigned_v<T>, |
273 | "Only unsigned integral types are allowed."); |
274 | return llvm::countr_zero<T>(~Value); |
275 | } |
276 | |
277 | /// Returns the number of bits needed to represent Value if Value is nonzero. |
278 | /// Returns 0 otherwise. |
279 | /// |
280 | /// Ex. bit_width(5) == 3. |
281 | template <typename T> [[nodiscard]] int bit_width(T Value) { |
282 | static_assert(std::is_unsigned_v<T>, |
283 | "Only unsigned integral types are allowed."); |
284 | return std::numeric_limits<T>::digits - llvm::countl_zero(Value); |
285 | } |
286 | |
287 | /// Returns the largest integral power of two no greater than Value if Value is |
288 | /// nonzero. Returns 0 otherwise. |
289 | /// |
290 | /// Ex. bit_floor(5) == 4. |
291 | template <typename T> [[nodiscard]] T bit_floor(T Value) { |
292 | static_assert(std::is_unsigned_v<T>, |
293 | "Only unsigned integral types are allowed."); |
294 | if (!Value) |
295 | return 0; |
296 | return T(1) << (llvm::bit_width(Value) - 1); |
297 | } |
298 | |
299 | /// Returns the smallest integral power of two no smaller than Value if Value is |
300 | /// nonzero. Returns 1 otherwise. |
301 | /// |
302 | /// Ex. bit_ceil(5) == 8. |
303 | /// |
304 | /// The return value is undefined if the input is larger than the largest power |
305 | /// of two representable in T. |
306 | template <typename T> [[nodiscard]] T bit_ceil(T Value) { |
307 | static_assert(std::is_unsigned_v<T>, |
308 | "Only unsigned integral types are allowed."); |
309 | if (Value < 2) |
310 | return 1; |
311 | return T(1) << llvm::bit_width<T>(Value - 1u); |
312 | } |
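A usage sketch tying the three helpers above together, under the same include-path assumption: for nonzero x, bit_floor(x) <= x <= bit_ceil(x), both are powers of two, and bit_width(x) is the number of bits needed to represent x.

#include <cassert>
#include "llvm/ADT/bit.h"

inline void bitWidthExamples() {
  assert(llvm::bit_width(5u) == 3); // 0b101 needs 3 bits
  assert(llvm::bit_floor(5u) == 4); // largest power of two <= 5
  assert(llvm::bit_ceil(5u) == 8);  // smallest power of two >= 5
  assert(llvm::bit_ceil(1u) == 1 && llvm::bit_floor(0u) == 0);
}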
313 | |
314 | namespace detail { |
315 |