File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 126, column 59: Division by zero
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | /// \file | |||
9 | /// This file implements the targeting of the Machinelegalizer class for | |||
10 | /// AMDGPU. | |||
11 | /// \todo This should be generated by TableGen. | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #include "AMDGPULegalizerInfo.h" | |||
15 | ||||
16 | #include "AMDGPU.h" | |||
17 | #include "AMDGPUGlobalISelUtils.h" | |||
18 | #include "AMDGPUInstrInfo.h" | |||
19 | #include "AMDGPUTargetMachine.h" | |||
20 | #include "SIMachineFunctionInfo.h" | |||
21 | #include "Utils/AMDGPUBaseInfo.h" | |||
22 | #include "llvm/ADT/ScopeExit.h" | |||
23 | #include "llvm/BinaryFormat/ELF.h" | |||
24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | |||
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | |||
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | |||
27 | #include "llvm/IR/DiagnosticInfo.h" | |||
28 | #include "llvm/IR/IntrinsicsAMDGPU.h" | |||
29 | ||||
30 | #define DEBUG_TYPE "amdgpu-legalinfo" | |||
31 | ||||
32 | using namespace llvm; | |||
33 | using namespace LegalizeActions; | |||
34 | using namespace LegalizeMutations; | |||
35 | using namespace LegalityPredicates; | |||
36 | using namespace MIPatternMatch; | |||
37 | ||||
38 | // Hack until load/store selection patterns support any tuple of legal types. | |||
39 | static cl::opt<bool> EnableNewLegality( | |||
40 | "amdgpu-global-isel-new-legality", | |||
41 | cl::desc("Use GlobalISel desired legality, rather than try to use " | |||
42 | "rules compatible with selection patterns"), | |||
43 | cl::init(false), | |||
44 | cl::ReallyHidden); | |||
45 | ||||
46 | static constexpr unsigned MaxRegisterSize = 1024; | |||
47 | ||||
48 | // Round the number of elements up to the next power of two | |||
49 | static LLT getPow2VectorType(LLT Ty) { | |||
50 | unsigned NElts = Ty.getNumElements(); | |||
51 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); | |||
52 | return Ty.changeNumElements(Pow2NElts); | |||
53 | } | |||
54 | ||||
55 | // Round the number of bits up to the next power of two | |||
56 | static LLT getPow2ScalarType(LLT Ty) { | |||
57 | unsigned Bits = Ty.getSizeInBits(); | |||
58 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); | |||
59 | return LLT::scalar(Pow2Bits); | |||
60 | } | |||
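// For illustration: getPow2VectorType turns <3 x s16> into <4 x s16>
// (Log2_32_Ceil(3) == 2, so 1 << 2 == 4 elements), and getPow2ScalarType
// turns s48 into s64.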
61 | ||||
62 | /// \returns true if this is an odd-sized vector which should be widened by adding an | |||
63 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This | |||
64 | /// excludes s1 vectors, which should always be scalarized. | |||
65 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | |||
66 | return [=](const LegalityQuery &Query) { | |||
67 | const LLT Ty = Query.Types[TypeIdx]; | |||
68 | if (!Ty.isVector()) | |||
69 | return false; | |||
70 | ||||
71 | const LLT EltTy = Ty.getElementType(); | |||
72 | const unsigned EltSize = EltTy.getSizeInBits(); | |||
73 | return Ty.getNumElements() % 2 != 0 && | |||
74 | EltSize > 1 && EltSize < 32 && | |||
75 | Ty.getSizeInBits() % 32 != 0; | |||
76 | }; | |||
77 | } | |||
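// For instance, <3 x s16> qualifies (odd element count, 16-bit elements,
// and 48 bits is not a multiple of 32), while <5 x s1> does not, since
// s1 vectors are scalarized rather than widened.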
78 | ||||
79 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { | |||
80 | return [=](const LegalityQuery &Query) { | |||
81 | const LLT Ty = Query.Types[TypeIdx]; | |||
82 | return Ty.getSizeInBits() % 32 == 0; | |||
83 | }; | |||
84 | } | |||
85 | ||||
86 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | |||
87 | return [=](const LegalityQuery &Query) { | |||
88 | const LLT Ty = Query.Types[TypeIdx]; | |||
89 | const LLT EltTy = Ty.getScalarType(); | |||
90 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | |||
91 | }; | |||
92 | } | |||
93 | ||||
94 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | |||
95 | return [=](const LegalityQuery &Query) { | |||
96 | const LLT Ty = Query.Types[TypeIdx]; | |||
97 | const LLT EltTy = Ty.getElementType(); | |||
98 | return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); | |||
99 | }; | |||
100 | } | |||
101 | ||||
102 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { | |||
103 | return [=](const LegalityQuery &Query) { | |||
104 | const LLT Ty = Query.Types[TypeIdx]; | |||
105 | const LLT EltTy = Ty.getElementType(); | |||
106 | unsigned Size = Ty.getSizeInBits(); | |||
107 | unsigned Pieces = (Size + 63) / 64; | |||
108 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; | |||
109 | return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); | |||
110 | }; | |||
111 | } | |||
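// For example, a 128-bit <4 x s32> gives Pieces = 2 and
// NewNumElts = (4 + 1) / 2 = 2, i.e. it is broken down into 64-bit
// <2 x s32> pieces.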
112 | ||||
113 | // Increase the number of vector elements until the total size is the next | |||
114 | // multiple of 32 bits. | |||
115 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | |||
116 | return [=](const LegalityQuery &Query) { | |||
117 | const LLT Ty = Query.Types[TypeIdx]; | |||
118 | ||||
119 | const LLT EltTy = Ty.getElementType(); | |||
120 | const int Size = Ty.getSizeInBits(); | |||
121 | const int EltSize = EltTy.getSizeInBits(); | |||
| ||||
122 | const int NextMul32 = (Size + 31) / 32; | |||
123 | ||||
124 | assert(EltSize < 32); | |||
125 | ||||
126 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | |||
| ||||
127 | return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); | |||
128 | }; | |||
129 | } | |||
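// For example, <3 x s16> (48 bits) gives NextMul32 = 2 and
// NewNumElts = (32 * 2 + 15) / 16 = 4, i.e. a 64-bit <4 x s16>.
//
// The '/ EltSize' above is the division flagged by the report (line 126,
// column 59): the assert only bounds EltSize from above, so the analyzer
// presumably cannot rule out a zero-sized element type on some path.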
130 | ||||
131 | static LLT getBitcastRegisterType(const LLT Ty) { | |||
132 | const unsigned Size = Ty.getSizeInBits(); | |||
133 | ||||
134 | LLT CoercedTy; | |||
135 | if (Size <= 32) { | |||
136 | // <2 x s8> -> s16 | |||
137 | // <4 x s8> -> s32 | |||
138 | return LLT::scalar(Size); | |||
139 | } | |||
140 | ||||
141 | return LLT::scalarOrVector(Size / 32, 32); | |||
142 | } | |||
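// Types wider than 32 bits are coerced to vectors of 32-bit elements, e.g.
// <8 x s8> (64 bits) -> <2 x s32> and <6 x s16> (96 bits) -> <3 x s32>.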
143 | ||||
144 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { | |||
145 | return [=](const LegalityQuery &Query) { | |||
146 | const LLT Ty = Query.Types[TypeIdx]; | |||
147 | return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); | |||
148 | }; | |||
149 | } | |||
150 | ||||
151 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { | |||
152 | return [=](const LegalityQuery &Query) { | |||
153 | const LLT Ty = Query.Types[TypeIdx]; | |||
154 | unsigned Size = Ty.getSizeInBits(); | |||
155 | assert(Size % 32 == 0); | |||
156 | return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); | |||
157 | }; | |||
158 | } | |||
159 | ||||
160 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | |||
161 | return [=](const LegalityQuery &Query) { | |||
162 | const LLT QueryTy = Query.Types[TypeIdx]; | |||
163 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | |||
164 | }; | |||
165 | } | |||
166 | ||||
167 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | |||
168 | return [=](const LegalityQuery &Query) { | |||
169 | const LLT QueryTy = Query.Types[TypeIdx]; | |||
170 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | |||
171 | }; | |||
172 | } | |||
173 | ||||
174 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | |||
175 | return [=](const LegalityQuery &Query) { | |||
176 | const LLT QueryTy = Query.Types[TypeIdx]; | |||
177 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | |||
178 | }; | |||
179 | } | |||
180 | ||||
181 | static bool isRegisterSize(unsigned Size) { | |||
182 | return Size % 32 == 0 && Size <= MaxRegisterSize; | |||
183 | } | |||
184 | ||||
185 | static bool isRegisterVectorElementType(LLT EltTy) { | |||
186 | const int EltSize = EltTy.getSizeInBits(); | |||
187 | return EltSize == 16 || EltSize % 32 == 0; | |||
188 | } | |||
189 | ||||
190 | static bool isRegisterVectorType(LLT Ty) { | |||
191 | const int EltSize = Ty.getElementType().getSizeInBits(); | |||
192 | return EltSize == 32 || EltSize == 64 || | |||
193 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | |||
194 | EltSize == 128 || EltSize == 256; | |||
195 | } | |||
196 | ||||
197 | static bool isRegisterType(LLT Ty) { | |||
198 | if (!isRegisterSize(Ty.getSizeInBits())) | |||
199 | return false; | |||
200 | ||||
201 | if (Ty.isVector()) | |||
202 | return isRegisterVectorType(Ty); | |||
203 | ||||
204 | return true; | |||
205 | } | |||
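// For example, <2 x s16>, <4 x s32> and <2 x s64> are register types, while
// <3 x s16> is not (48 bits is not a multiple of 32) and <4 x s8> is not
// (8-bit elements fail isRegisterVectorType).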
206 | ||||
207 | // Any combination of 32 or 64-bit elements up to the maximum register size, and | |||
208 | // multiples of v2s16. | |||
209 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { | |||
210 | return [=](const LegalityQuery &Query) { | |||
211 | return isRegisterType(Query.Types[TypeIdx]); | |||
212 | }; | |||
213 | } | |||
214 | ||||
215 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { | |||
216 | return [=](const LegalityQuery &Query) { | |||
217 | const LLT QueryTy = Query.Types[TypeIdx]; | |||
218 | if (!QueryTy.isVector()) | |||
219 | return false; | |||
220 | const LLT EltTy = QueryTy.getElementType(); | |||
221 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; | |||
222 | }; | |||
223 | } | |||
224 | ||||
225 | // If we have a truncating store or an extending load with a data size larger | |||
226 | // than 32-bits, we need to reduce to a 32-bit type. | |||
227 | static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { | |||
228 | return [=](const LegalityQuery &Query) { | |||
229 | const LLT Ty = Query.Types[TypeIdx]; | |||
230 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | |||
231 | Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); | |||
232 | }; | |||
233 | } | |||
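// For example, an s64 result backed by a 32-bit (or narrower) memory access
// matches and is narrowed to s32; an s128 load of a full 128 bits does not.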
234 | ||||
235 | // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we | |||
236 | // handle some operations by just promoting the register during | |||
237 | // selection. There are also d16 loads on GFX9+ which preserve the high bits. | |||
238 | static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, | |||
239 | bool IsLoad) { | |||
240 | switch (AS) { | |||
241 | case AMDGPUAS::PRIVATE_ADDRESS: | |||
242 | // FIXME: Private element size. | |||
243 | return ST.enableFlatScratch() ? 128 : 32; | |||
244 | case AMDGPUAS::LOCAL_ADDRESS: | |||
245 | return ST.useDS128() ? 128 : 64; | |||
246 | case AMDGPUAS::GLOBAL_ADDRESS: | |||
247 | case AMDGPUAS::CONSTANT_ADDRESS: | |||
248 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: | |||
249 | // Treat constant and global as identical. SMRD loads are sometimes usable for | |||
250 | // global loads (ideally constant address space should be eliminated) | |||
251 | // depending on the context. Legality cannot be context dependent, but | |||
252 | // RegBankSelect can split the load as necessary depending on the pointer | |||
253 | // register bank/uniformity and if the memory is invariant or not written in a | |||
254 | // kernel. | |||
255 | return IsLoad ? 512 : 128; | |||
256 | default: | |||
257 | // Flat addresses may contextually need to be split to 32-bit parts if they | |||
258 | // may alias scratch depending on the subtarget. | |||
259 | return 128; | |||
260 | } | |||
261 | } | |||
262 | ||||
263 | static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, | |||
264 | const LegalityQuery &Query) { | |||
265 | const LLT Ty = Query.Types[0]; | |||
266 | ||||
267 | // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD | |||
268 | const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; | |||
269 | ||||
270 | unsigned RegSize = Ty.getSizeInBits(); | |||
271 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | |||
272 | unsigned AlignBits = Query.MMODescrs[0].AlignInBits; | |||
273 | unsigned AS = Query.Types[1].getAddressSpace(); | |||
274 | ||||
275 | // All of these need to be custom lowered to cast the pointer operand. | |||
276 | if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) | |||
277 | return false; | |||
278 | ||||
279 | // Do not handle extending vector loads. | |||
280 | if (Ty.isVector() && MemSize != RegSize) | |||
281 | return false; | |||
282 | ||||
283 | // TODO: We should be able to widen loads if the alignment is high enough, but | |||
284 | // we also need to modify the memory access size. | |||
285 | #if 0 | |||
286 | // Accept widening loads based on alignment. | |||
287 | if (IsLoad && MemSize < Size) | |||
288 | MemSize = std::max(MemSize, Align); | |||
289 | #endif | |||
290 | ||||
291 | // Only 1-byte and 2-byte to 32-bit extloads are valid. | |||
292 | if (MemSize != RegSize && RegSize != 32) | |||
293 | return false; | |||
294 | ||||
295 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) | |||
296 | return false; | |||
297 | ||||
298 | switch (MemSize) { | |||
299 | case 8: | |||
300 | case 16: | |||
301 | case 32: | |||
302 | case 64: | |||
303 | case 128: | |||
304 | break; | |||
305 | case 96: | |||
306 | if (!ST.hasDwordx3LoadStores()) | |||
307 | return false; | |||
308 | break; | |||
309 | case 256: | |||
310 | case 512: | |||
311 | // These may contextually need to be broken down. | |||
312 | break; | |||
313 | default: | |||
314 | return false; | |||
315 | } | |||
316 | ||||
317 | assert(RegSize >= MemSize); | |||
318 | ||||
319 | if (AlignBits < MemSize) { | |||
320 | const SITargetLowering *TLI = ST.getTargetLowering(); | |||
321 | if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | |||
322 | Align(AlignBits / 8))) | |||
323 | return false; | |||
324 | } | |||
325 | ||||
326 | return true; | |||
327 | } | |||
328 | ||||
329 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so | |||
330 | // work around this. Eventually it should ignore the type for loads and only care | |||
331 | // about the size. Return true in cases where we will work around this for now by | |||
332 | // bitcasting. | |||
333 | static bool loadStoreBitcastWorkaround(const LLT Ty) { | |||
334 | if (EnableNewLegality) | |||
335 | return false; | |||
336 | ||||
337 | const unsigned Size = Ty.getSizeInBits(); | |||
338 | if (Size <= 64) | |||
339 | return false; | |||
340 | if (!Ty.isVector()) | |||
341 | return true; | |||
342 | ||||
343 | LLT EltTy = Ty.getElementType(); | |||
344 | if (EltTy.isPointer()) | |||
345 | return true; | |||
346 | ||||
347 | unsigned EltSize = EltTy.getSizeInBits(); | |||
348 | return EltSize != 32 && EltSize != 64; | |||
349 | } | |||
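// With the default rules (EnableNewLegality off) this returns true for e.g.
// s96 and <8 x s16>, which the current selector cannot handle directly, and
// false for <3 x s32>, whose 32-bit elements match the selection patterns.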
350 | ||||
351 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { | |||
352 | const LLT Ty = Query.Types[0]; | |||
353 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && | |||
354 | !loadStoreBitcastWorkaround(Ty); | |||
355 | } | |||
356 | ||||
357 | /// Return true if a load or store of the type should be lowered with a bitcast | |||
358 | /// to a different type. | |||
359 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, | |||
360 | const unsigned MemSizeInBits) { | |||
361 | const unsigned Size = Ty.getSizeInBits(); | |||
362 | if (Size != MemSizeInBits) | |||
363 | return Size <= 32 && Ty.isVector(); | |||
364 | ||||
365 | if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) | |||
366 | return true; | |||
367 | return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && | |||
368 | !isRegisterVectorElementType(Ty.getElementType()); | |||
369 | } | |||
370 | ||||
371 | /// Return true if we should legalize a load by widening an odd sized memory | |||
372 | /// access up to the alignment. Note that in this case the memory access itself | |||
373 | /// changes, not the size of the result register. | |||
374 | static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, | |||
375 | unsigned AlignInBits, unsigned AddrSpace, | |||
376 | unsigned Opcode) { | |||
377 | // We don't want to widen cases that are naturally legal. | |||
378 | if (isPowerOf2_32(SizeInBits)) | |||
379 | return false; | |||
380 | ||||
381 | // If we have 96-bit memory operations, we shouldn't touch them. Note we may | |||
382 | // end up widening these for a scalar load during RegBankSelect, since there | |||
383 | // aren't 96-bit scalar loads. | |||
384 | if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) | |||
385 | return false; | |||
386 | ||||
387 | if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) | |||
388 | return false; | |||
389 | ||||
390 | // A load is known dereferenceable up to the alignment, so it's legal to widen | |||
391 | // to it. | |||
392 | // | |||
393 | // TODO: Could check dereferenceable for less aligned cases. | |||
394 | unsigned RoundedSize = NextPowerOf2(SizeInBits); | |||
395 | if (AlignInBits < RoundedSize) | |||
396 | return false; | |||
397 | ||||
398 | // Do not widen if it would introduce a slow unaligned load. | |||
399 | const SITargetLowering *TLI = ST.getTargetLowering(); | |||
400 | bool Fast = false; | |||
401 | return TLI->allowsMisalignedMemoryAccessesImpl( | |||
402 | RoundedSize, AddrSpace, Align(AlignInBits / 8), | |||
403 | MachineMemOperand::MOLoad, &Fast) && | |||
404 | Fast; | |||
405 | } | |||
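// For example, a 96-bit load on a subtarget without dwordx3 load/stores is
// widened to NextPowerOf2(96) == 128 bits, but only if the access is known
// to be at least 128-bit aligned and the target reports a 128-bit access at
// that alignment as allowed and fast.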
406 | ||||
407 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, | |||
408 | unsigned Opcode) { | |||
409 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) | |||
410 | return false; | |||
411 | ||||
412 | return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, | |||
413 | Query.MMODescrs[0].AlignInBits, | |||
414 | Query.Types[1].getAddressSpace(), Opcode); | |||
415 | } | |||
416 | ||||
417 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | |||
418 | const GCNTargetMachine &TM) | |||
419 | : ST(ST_) { | |||
420 | using namespace TargetOpcode; | |||
421 | ||||
422 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | |||
423 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | |||
424 | }; | |||
425 | ||||
426 | const LLT S1 = LLT::scalar(1); | |||
427 | const LLT S8 = LLT::scalar(8); | |||
428 | const LLT S16 = LLT::scalar(16); | |||
429 | const LLT S32 = LLT::scalar(32); | |||
430 | const LLT S64 = LLT::scalar(64); | |||
431 | const LLT S128 = LLT::scalar(128); | |||
432 | const LLT S256 = LLT::scalar(256); | |||
433 | const LLT S512 = LLT::scalar(512); | |||
434 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); | |||
435 | ||||
436 | const LLT V2S8 = LLT::vector(2, 8); | |||
437 | const LLT V2S16 = LLT::vector(2, 16); | |||
438 | const LLT V4S16 = LLT::vector(4, 16); | |||
439 | ||||
440 | const LLT V2S32 = LLT::vector(2, 32); | |||
441 | const LLT V3S32 = LLT::vector(3, 32); | |||
442 | const LLT V4S32 = LLT::vector(4, 32); | |||
443 | const LLT V5S32 = LLT::vector(5, 32); | |||
444 | const LLT V6S32 = LLT::vector(6, 32); | |||
445 | const LLT V7S32 = LLT::vector(7, 32); | |||
446 | const LLT V8S32 = LLT::vector(8, 32); | |||
447 | const LLT V9S32 = LLT::vector(9, 32); | |||
448 | const LLT V10S32 = LLT::vector(10, 32); | |||
449 | const LLT V11S32 = LLT::vector(11, 32); | |||
450 | const LLT V12S32 = LLT::vector(12, 32); | |||
451 | const LLT V13S32 = LLT::vector(13, 32); | |||
452 | const LLT V14S32 = LLT::vector(14, 32); | |||
453 | const LLT V15S32 = LLT::vector(15, 32); | |||
454 | const LLT V16S32 = LLT::vector(16, 32); | |||
455 | const LLT V32S32 = LLT::vector(32, 32); | |||
456 | ||||
457 | const LLT V2S64 = LLT::vector(2, 64); | |||
458 | const LLT V3S64 = LLT::vector(3, 64); | |||
459 | const LLT V4S64 = LLT::vector(4, 64); | |||
460 | const LLT V5S64 = LLT::vector(5, 64); | |||
461 | const LLT V6S64 = LLT::vector(6, 64); | |||
462 | const LLT V7S64 = LLT::vector(7, 64); | |||
463 | const LLT V8S64 = LLT::vector(8, 64); | |||
464 | const LLT V16S64 = LLT::vector(16, 64); | |||
465 | ||||
466 | std::initializer_list<LLT> AllS32Vectors = | |||
467 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | |||
468 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | |||
469 | std::initializer_list<LLT> AllS64Vectors = | |||
470 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | |||
471 | ||||
472 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | |||
473 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | |||
474 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | |||
475 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | |||
476 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | |||
477 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | |||
478 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | |||
479 | ||||
480 | const LLT CodePtr = FlatPtr; | |||
481 | ||||
482 | const std::initializer_list<LLT> AddrSpaces64 = { | |||
483 | GlobalPtr, ConstantPtr, FlatPtr | |||
484 | }; | |||
485 | ||||
486 | const std::initializer_list<LLT> AddrSpaces32 = { | |||
487 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | |||
488 | }; | |||
489 | ||||
490 | const std::initializer_list<LLT> FPTypesBase = { | |||
491 | S32, S64 | |||
492 | }; | |||
493 | ||||
494 | const std::initializer_list<LLT> FPTypes16 = { | |||
495 | S32, S64, S16 | |||
496 | }; | |||
497 | ||||
498 | const std::initializer_list<LLT> FPTypesPK16 = { | |||
499 | S32, S64, S16, V2S16 | |||
500 | }; | |||
501 | ||||
502 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; | |||
503 | ||||
504 | auto &LegacyInfo = getLegacyLegalizerInfo(); | |||
505 | LegacyInfo.setAction({G_BRCOND, S1}, | |||
506 | LegacyLegalizeActions::Legal); // VCC branches | |||
507 | LegacyInfo.setAction({G_BRCOND, S32}, | |||
508 | LegacyLegalizeActions::Legal); // SCC branches | |||
509 | ||||
510 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | |||
511 | // elements for v3s16 | |||
512 | getActionDefinitionsBuilder(G_PHI) | |||
513 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) | |||
514 | .legalFor(AllS32Vectors) | |||
515 | .legalFor(AllS64Vectors) | |||
516 | .legalFor(AddrSpaces64) | |||
517 | .legalFor(AddrSpaces32) | |||
518 | .legalIf(isPointer(0)) | |||
519 | .clampScalar(0, S16, S256) | |||
520 | .widenScalarToNextPow2(0, 32) | |||
521 | .clampMaxNumElements(0, S32, 16) | |||
522 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | |||
523 | .scalarize(0); | |||
524 | ||||
525 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { | |||
526 | // Full set of gfx9 features. | |||
527 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | |||
528 | .legalFor({S32, S16, V2S16}) | |||
529 | .clampScalar(0, S16, S32) | |||
530 | .clampMaxNumElements(0, S16, 2) | |||
531 | .scalarize(0) | |||
532 | .widenScalarToNextPow2(0, 32); | |||
533 | ||||
534 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) | |||
535 | .legalFor({S32, S16, V2S16}) // Clamp modifier | |||
536 | .minScalarOrElt(0, S16) | |||
537 | .clampMaxNumElements(0, S16, 2) | |||
538 | .scalarize(0) | |||
539 | .widenScalarToNextPow2(0, 32) | |||
540 | .lower(); | |||
541 | } else if (ST.has16BitInsts()) { | |||
542 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | |||
543 | .legalFor({S32, S16}) | |||
544 | .clampScalar(0, S16, S32) | |||
545 | .scalarize(0) | |||
546 | .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 | |||
547 | ||||
548 | // Technically the saturating operations require clamp bit support, but this | |||
549 | // was introduced at the same time as 16-bit operations. | |||
550 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | |||
551 | .legalFor({S32, S16}) // Clamp modifier | |||
552 | .minScalar(0, S16) | |||
553 | .scalarize(0) | |||
554 | .widenScalarToNextPow2(0, 16) | |||
555 | .lower(); | |||
556 | ||||
557 | // We're just lowering this, but it helps get a better result to try to | |||
558 | // coerce to the desired type first. | |||
559 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | |||
560 | .minScalar(0, S16) | |||
561 | .scalarize(0) | |||
562 | .lower(); | |||
563 | } else { | |||
564 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | |||
565 | .legalFor({S32}) | |||
566 | .clampScalar(0, S32, S32) | |||
567 | .scalarize(0); | |||
568 | ||||
569 | if (ST.hasIntClamp()) { | |||
570 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | |||
571 | .legalFor({S32}) // Clamp modifier. | |||
572 | .scalarize(0) | |||
573 | .minScalarOrElt(0, S32) | |||
574 | .lower(); | |||
575 | } else { | |||
576 | // Clamp bit support was added in VI, along with 16-bit operations. | |||
577 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | |||
578 | .minScalar(0, S32) | |||
579 | .scalarize(0) | |||
580 | .lower(); | |||
581 | } | |||
582 | ||||
583 | // FIXME: DAG expansion gets better results. The widening uses the smaller | |||
584 | // range values and goes for the min/max lowering directly. | |||
585 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | |||
586 | .minScalar(0, S32) | |||
587 | .scalarize(0) | |||
588 | .lower(); | |||
589 | } | |||
590 | ||||
591 | getActionDefinitionsBuilder( | |||
592 | {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) | |||
593 | .customFor({S32, S64}) | |||
594 | .clampScalar(0, S32, S64) | |||
595 | .widenScalarToNextPow2(0, 32) | |||
596 | .scalarize(0); | |||
597 | ||||
598 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | |||
599 | .legalFor({S32}) | |||
600 | .maxScalarOrElt(0, S32); | |||
601 | ||||
602 | if (ST.hasVOP3PInsts()) { | |||
603 | Mulh | |||
604 | .clampMaxNumElements(0, S8, 2) | |||
605 | .lowerFor({V2S8}); | |||
606 | } | |||
607 | ||||
608 | Mulh | |||
609 | .scalarize(0) | |||
610 | .lower(); | |||
611 | ||||
612 | // Report legal for any types we can handle anywhere. For the cases only legal | |||
613 | // on the SALU, RegBankSelect will be able to re-legalize. | |||
614 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) | |||
615 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) | |||
616 | .clampScalar(0, S32, S64) | |||
617 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | |||
618 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) | |||
619 | .widenScalarToNextPow2(0) | |||
620 | .scalarize(0); | |||
621 | ||||
622 | getActionDefinitionsBuilder({G_UADDO, G_USUBO, | |||
623 | G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) | |||
624 | .legalFor({{S32, S1}, {S32, S32}}) | |||
625 | .minScalar(0, S32) | |||
626 | // TODO: .scalarize(0) | |||
627 | .lower(); | |||
628 | ||||
629 | getActionDefinitionsBuilder(G_BITCAST) | |||
630 | // Don't worry about the size constraint. | |||
631 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | |||
632 | .lower(); | |||
633 | ||||
634 | ||||
635 | getActionDefinitionsBuilder(G_CONSTANT) | |||
636 | .legalFor({S1, S32, S64, S16, GlobalPtr, | |||
637 | LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) | |||
638 | .legalIf(isPointer(0)) | |||
639 | .clampScalar(0, S32, S64) | |||
640 | .widenScalarToNextPow2(0); | |||
641 | ||||
642 | getActionDefinitionsBuilder(G_FCONSTANT) | |||
643 | .legalFor({S32, S64, S16}) | |||
644 | .clampScalar(0, S16, S64); | |||
645 | ||||
646 | getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) | |||
647 | .legalIf(isRegisterType(0)) | |||
648 | // s1 and s16 are special cases because they have legal operations on | |||
649 | // them, but don't really occupy registers in the normal way. | |||
650 | .legalFor({S1, S16}) | |||
651 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | |||
652 | .clampScalarOrElt(0, S32, MaxScalar) | |||
653 | .widenScalarToNextPow2(0, 32) | |||
654 | .clampMaxNumElements(0, S32, 16); | |||
655 | ||||
656 | LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr}, | |||
657 | LegacyLegalizeActions::Legal); | |||
658 | ||||
659 | // If the amount is divergent, we have to do a wave reduction to get the | |||
660 | // maximum value, so this is expanded during RegBankSelect. | |||
661 | getActionDefinitionsBuilder(G_DYN_STACKALLOC) | |||
662 | .legalFor({{PrivatePtr, S32}}); | |||
663 | ||||
664 | getActionDefinitionsBuilder(G_GLOBAL_VALUE) | |||
665 | .customIf(typeIsNot(0, PrivatePtr)); | |||
666 | ||||
667 | LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal); | |||
668 | ||||
669 | auto &FPOpActions = getActionDefinitionsBuilder( | |||
670 | { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) | |||
671 | .legalFor({S32, S64}); | |||
672 | auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) | |||
673 | .customFor({S32, S64}); | |||
674 | auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) | |||
675 | .customFor({S32, S64}); | |||
676 | ||||
677 | if (ST.has16BitInsts()) { | |||
678 | if (ST.hasVOP3PInsts()) | |||
679 | FPOpActions.legalFor({S16, V2S16}); | |||
680 | else | |||
681 | FPOpActions.legalFor({S16}); | |||
682 | ||||
683 | TrigActions.customFor({S16}); | |||
684 | FDIVActions.customFor({S16}); | |||
685 | } | |||
686 | ||||
687 | auto &MinNumMaxNum = getActionDefinitionsBuilder({ | |||
688 | G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); | |||
689 | ||||
690 | if (ST.hasVOP3PInsts()) { | |||
691 | MinNumMaxNum.customFor(FPTypesPK16) | |||
692 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | |||
693 | .clampMaxNumElements(0, S16, 2) | |||
694 | .clampScalar(0, S16, S64) | |||
695 | .scalarize(0); | |||
696 | } else if (ST.has16BitInsts()) { | |||
697 | MinNumMaxNum.customFor(FPTypes16) | |||
698 | .clampScalar(0, S16, S64) | |||
699 | .scalarize(0); | |||
700 | } else { | |||
701 | MinNumMaxNum.customFor(FPTypesBase) | |||
702 | .clampScalar(0, S32, S64) | |||
703 | .scalarize(0); | |||
704 | } | |||
705 | ||||
706 | if (ST.hasVOP3PInsts()) | |||
707 | FPOpActions.clampMaxNumElements(0, S16, 2); | |||
708 | ||||
709 | FPOpActions | |||
710 | .scalarize(0) | |||
711 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | |||
712 | ||||
713 | TrigActions | |||
714 | .scalarize(0) | |||
715 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | |||
716 | ||||
717 | FDIVActions | |||
718 | .scalarize(0) | |||
719 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | |||
720 | ||||
721 | getActionDefinitionsBuilder({G_FNEG, G_FABS}) | |||
722 | .legalFor(FPTypesPK16) | |||
723 | .clampMaxNumElements(0, S16, 2) | |||
724 | .scalarize(0) | |||
725 | .clampScalar(0, S16, S64); | |||
726 | ||||
727 | if (ST.has16BitInsts()) { | |||
728 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) | |||
729 | .legalFor({S32, S64, S16}) | |||
730 | .scalarize(0) | |||
731 | .clampScalar(0, S16, S64); | |||
732 | } else { | |||
733 | getActionDefinitionsBuilder(G_FSQRT) | |||
734 | .legalFor({S32, S64}) | |||
735 | .scalarize(0) | |||
736 | .clampScalar(0, S32, S64); | |||
737 | ||||
738 | if (ST.hasFractBug()) { | |||
739 | getActionDefinitionsBuilder(G_FFLOOR) | |||
740 | .customFor({S64}) | |||
741 | .legalFor({S32, S64}) | |||
742 | .scalarize(0) | |||
743 | .clampScalar(0, S32, S64); | |||
744 | } else { | |||
745 | getActionDefinitionsBuilder(G_FFLOOR) | |||
746 | .legalFor({S32, S64}) | |||
747 | .scalarize(0) | |||
748 | .clampScalar(0, S32, S64); | |||
749 | } | |||
750 | } | |||
751 | ||||
752 | getActionDefinitionsBuilder(G_FPTRUNC) | |||
753 | .legalFor({{S32, S64}, {S16, S32}}) | |||
754 | .scalarize(0) | |||
755 | .lower(); | |||
756 | ||||
757 | getActionDefinitionsBuilder(G_FPEXT) | |||
758 | .legalFor({{S64, S32}, {S32, S16}}) | |||
759 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) | |||
760 | .scalarize(0); | |||
761 | ||||
762 | getActionDefinitionsBuilder(G_FSUB) | |||
763 | // Use actual fsub instruction | |||
764 | .legalFor({S32}) | |||
765 | // Must use fadd + fneg | |||
766 | .lowerFor({S64, S16, V2S16}) | |||
767 | .scalarize(0) | |||
768 | .clampScalar(0, S32, S64); | |||
769 | ||||
770 | // Whether this is legal depends on the floating point mode for the function. | |||
771 | auto &FMad = getActionDefinitionsBuilder(G_FMAD); | |||
772 | if (ST.hasMadF16() && ST.hasMadMacF32Insts()) | |||
773 | FMad.customFor({S32, S16}); | |||
774 | else if (ST.hasMadMacF32Insts()) | |||
775 | FMad.customFor({S32}); | |||
776 | else if (ST.hasMadF16()) | |||
777 | FMad.customFor({S16}); | |||
778 | FMad.scalarize(0) | |||
779 | .lower(); | |||
780 | ||||
781 | auto &FRem = getActionDefinitionsBuilder(G_FREM); | |||
782 | if (ST.has16BitInsts()) { | |||
783 | FRem.customFor({S16, S32, S64}); | |||
784 | } else { | |||
785 | FRem.minScalar(0, S32) | |||
786 | .customFor({S32, S64}); | |||
787 | } | |||
788 | FRem.scalarize(0); | |||
789 | ||||
790 | // TODO: Do we need to clamp maximum bitwidth? | |||
791 | getActionDefinitionsBuilder(G_TRUNC) | |||
792 | .legalIf(isScalar(0)) | |||
793 | .legalFor({{V2S16, V2S32}}) | |||
794 | .clampMaxNumElements(0, S16, 2) | |||
795 | // Avoid scalarizing in cases that should be truly illegal. In unresolvable | |||
796 | // situations (like an invalid implicit use), we don't want an infinite loop | |||
797 | // in the legalizer. | |||
798 | .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) | |||
799 | .alwaysLegal(); | |||
800 | ||||
801 | getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) | |||
802 | .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, | |||
803 | {S32, S1}, {S64, S1}, {S16, S1}}) | |||
804 | .scalarize(0) | |||
805 | .clampScalar(0, S32, S64) | |||
806 | .widenScalarToNextPow2(1, 32); | |||
807 | ||||
808 | // TODO: Split s1->s64 during regbankselect for VALU. | |||
809 | auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) | |||
810 | .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) | |||
811 | .lowerFor({{S32, S64}}) | |||
812 | .lowerIf(typeIs(1, S1)) | |||
813 | .customFor({{S64, S64}}); | |||
814 | if (ST.has16BitInsts()) | |||
815 | IToFP.legalFor({{S16, S16}}); | |||
816 | IToFP.clampScalar(1, S32, S64) | |||
817 | .minScalar(0, S32) | |||
818 | .scalarize(0) | |||
819 | .widenScalarToNextPow2(1); | |||
820 | ||||
821 | auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) | |||
822 | .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) | |||
823 | .customFor({{S64, S32}, {S64, S64}}) | |||
824 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); | |||
825 | if (ST.has16BitInsts()) | |||
826 | FPToI.legalFor({{S16, S16}}); | |||
827 | else | |||
828 | FPToI.minScalar(1, S32); | |||
829 | ||||
830 | FPToI.minScalar(0, S32) | |||
831 | .widenScalarToNextPow2(0, 32) | |||
832 | .scalarize(0) | |||
833 | .lower(); | |||
834 | ||||
835 | // Lower roundeven into G_FRINT | |||
836 | getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) | |||
837 | .scalarize(0) | |||
838 | .lower(); | |||
839 | ||||
840 | if (ST.has16BitInsts()) { | |||
841 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | |||
842 | .legalFor({S16, S32, S64}) | |||
843 | .clampScalar(0, S16, S64) | |||
844 | .scalarize(0); | |||
845 | } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { | |||
846 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | |||
847 | .legalFor({S32, S64}) | |||
848 | .clampScalar(0, S32, S64) | |||
849 | .scalarize(0); | |||
850 | } else { | |||
851 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | |||
852 | .legalFor({S32}) | |||
853 | .customFor({S64}) | |||
854 | .clampScalar(0, S32, S64) | |||
855 | .scalarize(0); | |||
856 | } | |||
857 | ||||
858 | getActionDefinitionsBuilder(G_PTR_ADD) | |||
859 | .legalIf(all(isPointer(0), sameSize(0, 1))) | |||
860 | .scalarize(0) | |||
861 | .scalarSameSizeAs(1, 0); | |||
862 | ||||
863 | getActionDefinitionsBuilder(G_PTRMASK) | |||
864 | .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) | |||
865 | .scalarSameSizeAs(1, 0) | |||
866 | .scalarize(0); | |||
867 | ||||
868 | auto &CmpBuilder = | |||
869 | getActionDefinitionsBuilder(G_ICMP) | |||
870 | // The compare output type differs based on the register bank of the output, | |||
871 | // so make both s1 and s32 legal. | |||
872 | // | |||
873 | // Scalar compares producing output in scc will be promoted to s32, as that | |||
874 | // is the allocatable register type that will be needed for the copy from | |||
875 | // scc. This will be promoted during RegBankSelect, and we assume something | |||
876 | // before that won't try to use s32 result types. | |||
877 | // | |||
878 | // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg | |||
879 | // bank. | |||
880 | .legalForCartesianProduct( | |||
881 | {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) | |||
882 | .legalForCartesianProduct( | |||
883 | {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); | |||
884 | if (ST.has16BitInsts()) { | |||
885 | CmpBuilder.legalFor({{S1, S16}}); | |||
886 | } | |||
887 | ||||
888 | CmpBuilder | |||
889 | .widenScalarToNextPow2(1) | |||
890 | .clampScalar(1, S32, S64) | |||
891 | .scalarize(0) | |||
892 | .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); | |||
893 | ||||
894 | getActionDefinitionsBuilder(G_FCMP) | |||
895 | .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) | |||
896 | .widenScalarToNextPow2(1) | |||
897 | .clampScalar(1, S32, S64) | |||
898 | .scalarize(0); | |||
899 | ||||
900 | // FIXME: fpow has a selection pattern that should move to custom lowering. | |||
901 | auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); | |||
902 | if (ST.has16BitInsts()) | |||
903 | Exp2Ops.legalFor({S32, S16}); | |||
904 | else | |||
905 | Exp2Ops.legalFor({S32}); | |||
906 | Exp2Ops.clampScalar(0, MinScalarFPTy, S32); | |||
907 | Exp2Ops.scalarize(0); | |||
908 | ||||
909 | auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); | |||
910 | if (ST.has16BitInsts()) | |||
911 | ExpOps.customFor({{S32}, {S16}}); | |||
912 | else | |||
913 | ExpOps.customFor({S32}); | |||
914 | ExpOps.clampScalar(0, MinScalarFPTy, S32) | |||
915 | .scalarize(0); | |||
916 | ||||
917 | getActionDefinitionsBuilder(G_FPOWI) | |||
918 | .clampScalar(0, MinScalarFPTy, S32) | |||
919 | .lower(); | |||
920 | ||||
921 | // The 64-bit versions produce 32-bit results, but only on the SALU. | |||
922 | getActionDefinitionsBuilder(G_CTPOP) | |||
923 | .legalFor({{S32, S32}, {S32, S64}}) | |||
924 | .clampScalar(0, S32, S32) | |||
925 | .clampScalar(1, S32, S64) | |||
926 | .scalarize(0) | |||
927 | .widenScalarToNextPow2(0, 32) | |||
928 | .widenScalarToNextPow2(1, 32); | |||
929 | ||||
930 | // The hardware instructions return a different result on 0 than the generic | |||
931 | // instructions expect. The hardware produces -1, but these produce the | |||
932 | // bitwidth. | |||
933 | getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) | |||
934 | .scalarize(0) | |||
935 | .clampScalar(0, S32, S32) | |||
936 | .clampScalar(1, S32, S64) | |||
937 | .widenScalarToNextPow2(0, 32) | |||
938 | .widenScalarToNextPow2(1, 32) | |||
939 | .lower(); | |||
940 | ||||
941 | // The 64-bit versions produce 32-bit results, but only on the SALU. | |||
942 | getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) | |||
943 | .legalFor({{S32, S32}, {S32, S64}}) | |||
944 | .clampScalar(0, S32, S32) | |||
945 | .clampScalar(1, S32, S64) | |||
946 | .scalarize(0) | |||
947 | .widenScalarToNextPow2(0, 32) | |||
948 | .widenScalarToNextPow2(1, 32); | |||
949 | ||||
950 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | |||
951 | // RegBankSelect. | |||
952 | getActionDefinitionsBuilder(G_BITREVERSE) | |||
953 | .legalFor({S32, S64}) | |||
954 | .clampScalar(0, S32, S64) | |||
955 | .scalarize(0) | |||
956 | .widenScalarToNextPow2(0); | |||
957 | ||||
958 | if (ST.has16BitInsts()) { | |||
959 | getActionDefinitionsBuilder(G_BSWAP) | |||
960 | .legalFor({S16, S32, V2S16}) | |||
961 | .clampMaxNumElements(0, S16, 2) | |||
962 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | |||
963 | // narrowScalar limitation. | |||
964 | .widenScalarToNextPow2(0) | |||
965 | .clampScalar(0, S16, S32) | |||
966 | .scalarize(0); | |||
967 | ||||
968 | if (ST.hasVOP3PInsts()) { | |||
969 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | |||
970 | .legalFor({S32, S16, V2S16}) | |||
971 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | |||
972 | .clampMaxNumElements(0, S16, 2) | |||
973 | .minScalar(0, S16) | |||
974 | .widenScalarToNextPow2(0) | |||
975 | .scalarize(0) | |||
976 | .lower(); | |||
977 | } else { | |||
978 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | |||
979 | .legalFor({S32, S16}) | |||
980 | .widenScalarToNextPow2(0) | |||
981 | .minScalar(0, S16) | |||
982 | .scalarize(0) | |||
983 | .lower(); | |||
984 | } | |||
985 | } else { | |||
986 | // TODO: Should have same legality without v_perm_b32 | |||
987 | getActionDefinitionsBuilder(G_BSWAP) | |||
988 | .legalFor({S32}) | |||
989 | .lowerIf(scalarNarrowerThan(0, 32)) | |||
990 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | |||
991 | // narrowScalar limitation. | |||
992 | .widenScalarToNextPow2(0) | |||
993 | .maxScalar(0, S32) | |||
994 | .scalarize(0) | |||
995 | .lower(); | |||
996 | ||||
997 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | |||
998 | .legalFor({S32}) | |||
999 | .minScalar(0, S32) | |||
1000 | .widenScalarToNextPow2(0) | |||
1001 | .scalarize(0) | |||
1002 | .lower(); | |||
1003 | } | |||
1004 | ||||
1005 | getActionDefinitionsBuilder(G_INTTOPTR) | |||
1006 | // List the common cases | |||
1007 | .legalForCartesianProduct(AddrSpaces64, {S64}) | |||
1008 | .legalForCartesianProduct(AddrSpaces32, {S32}) | |||
1009 | .scalarize(0) | |||
1010 | // Accept any address space as long as the size matches | |||
1011 | .legalIf(sameSize(0, 1)) | |||
1012 | .widenScalarIf(smallerThan(1, 0), | |||
1013 | [](const LegalityQuery &Query) { | |||
1014 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | |||
1015 | }) | |||
1016 | .narrowScalarIf(largerThan(1, 0), | |||
1017 | [](const LegalityQuery &Query) { | |||
1018 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | |||
1019 | }); | |||
1020 | ||||
1021 | getActionDefinitionsBuilder(G_PTRTOINT) | |||
1022 | // List the common cases | |||
1023 | .legalForCartesianProduct(AddrSpaces64, {S64}) | |||
1024 | .legalForCartesianProduct(AddrSpaces32, {S32}) | |||
1025 | .scalarize(0) | |||
1026 | // Accept any address space as long as the size matches | |||
1027 | .legalIf(sameSize(0, 1)) | |||
1028 | .widenScalarIf(smallerThan(0, 1), | |||
1029 | [](const LegalityQuery &Query) { | |||
1030 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | |||
1031 | }) | |||
1032 | .narrowScalarIf( | |||
1033 | largerThan(0, 1), | |||
1034 | [](const LegalityQuery &Query) { | |||
1035 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | |||
1036 | }); | |||
1037 | ||||
1038 | getActionDefinitionsBuilder(G_ADDRSPACE_CAST) | |||
1039 | .scalarize(0) | |||
1040 | .custom(); | |||
1041 | ||||
1042 | const auto needToSplitMemOp = [=](const LegalityQuery &Query, | |||
1043 | bool IsLoad) -> bool { | |||
1044 | const LLT DstTy = Query.Types[0]; | |||
1045 | ||||
1046 | // Split vector extloads. | |||
1047 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | |||
1048 | unsigned AlignBits = Query.MMODescrs[0].AlignInBits; | |||
1049 | ||||
1050 | if (MemSize < DstTy.getSizeInBits()) | |||
1051 | MemSize = std::max(MemSize, AlignBits); | |||
1052 | ||||
1053 | if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) | |||
1054 | return true; | |||
1055 | ||||
1056 | const LLT PtrTy = Query.Types[1]; | |||
1057 | unsigned AS = PtrTy.getAddressSpace(); | |||
1058 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) | |||
1059 | return true; | |||
1060 | ||||
1061 | // Catch weird sized loads that don't evenly divide into the access sizes | |||
1062 | // TODO: May be able to widen depending on alignment etc. | |||
1063 | unsigned NumRegs = (MemSize + 31) / 32; | |||
1064 | if (NumRegs == 3) { | |||
1065 | if (!ST.hasDwordx3LoadStores()) | |||
1066 | return true; | |||
1067 | } else { | |||
1068 | // If the alignment allows, these should have been widened. | |||
1069 | if (!isPowerOf2_32(NumRegs)) | |||
1070 | return true; | |||
1071 | } | |||
1072 | ||||
1073 | if (AlignBits < MemSize) { | |||
1074 | const SITargetLowering *TLI = ST.getTargetLowering(); | |||
1075 | return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | |||
1076 | Align(AlignBits / 8)); | |||
1077 | } | |||
1078 | ||||
1079 | return false; | |||
1080 | }; | |||
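// For example, a 256-bit store to a global pointer exceeds the 128-bit store
// limit from maxSizeForAddrSpace and must be split, and a 96-bit (3 dword)
// access is split on subtargets without dwordx3 load/stores.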
1081 | ||||
1082 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; | |||
1083 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; | |||
1084 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; | |||
1085 | ||||
1086 | // TODO: Refine based on subtargets which support unaligned access or 128-bit | |||
1087 | // LDS | |||
1088 | // TODO: Unsupported flat for SI. | |||
1089 | ||||
1090 | for (unsigned Op : {G_LOAD, G_STORE}) { | |||
1091 | const bool IsStore = Op == G_STORE; | |||
1092 | ||||
1093 | auto &Actions = getActionDefinitionsBuilder(Op); | |||
1094 | // Explicitly list some common cases. | |||
1095 | // TODO: Does this help compile time at all? | |||
1096 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, | |||
1097 | {V2S32, GlobalPtr, 64, GlobalAlign32}, | |||
1098 | {V4S32, GlobalPtr, 128, GlobalAlign32}, | |||
1099 | {S64, GlobalPtr, 64, GlobalAlign32}, | |||
1100 | {V2S64, GlobalPtr, 128, GlobalAlign32}, | |||
1101 | {V2S16, GlobalPtr, 32, GlobalAlign32}, | |||
1102 | {S32, GlobalPtr, 8, GlobalAlign8}, | |||
1103 | {S32, GlobalPtr, 16, GlobalAlign16}, | |||
1104 | ||||
1105 | {S32, LocalPtr, 32, 32}, | |||
1106 | {S64, LocalPtr, 64, 32}, | |||
1107 | {V2S32, LocalPtr, 64, 32}, | |||
1108 | {S32, LocalPtr, 8, 8}, | |||
1109 | {S32, LocalPtr, 16, 16}, | |||
1110 | {V2S16, LocalPtr, 32, 32}, | |||
1111 | ||||
1112 | {S32, PrivatePtr, 32, 32}, | |||
1113 | {S32, PrivatePtr, 8, 8}, | |||
1114 | {S32, PrivatePtr, 16, 16}, | |||
1115 | {V2S16, PrivatePtr, 32, 32}, | |||
1116 | ||||
1117 | {S32, ConstantPtr, 32, GlobalAlign32}, | |||
1118 | {V2S32, ConstantPtr, 64, GlobalAlign32}, | |||
1119 | {V4S32, ConstantPtr, 128, GlobalAlign32}, | |||
1120 | {S64, ConstantPtr, 64, GlobalAlign32}, | |||
1121 | {V2S32, ConstantPtr, 32, GlobalAlign32}}); | |||
1122 | Actions.legalIf( | |||
1123 | [=](const LegalityQuery &Query) -> bool { | |||
1124 | return isLoadStoreLegal(ST, Query); | |||
1125 | }); | |||
1126 | ||||
1127 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | |||
1128 | // 64-bits. | |||
1129 | // | |||
1130 | // TODO: Should generalize bitcast action into coerce, which will also cover | |||
1131 | // inserting addrspacecasts. | |||
1132 | Actions.customIf(typeIs(1, Constant32Ptr)); | |||
1133 | ||||
1134 | // Turn any illegal element vectors into something easier to deal | |||
1135 | // with. These will ultimately produce 32-bit scalar shifts to extract the | |||
1136 | // parts anyway. | |||
1137 | // | |||
1138 | // For odd 16-bit element vectors, prefer to split those into pieces with | |||
1139 | // 16-bit vector parts. | |||
1140 | Actions.bitcastIf( | |||
1141 | [=](const LegalityQuery &Query) -> bool { | |||
1142 | return shouldBitcastLoadStoreType(ST, Query.Types[0], | |||
1143 | Query.MMODescrs[0].SizeInBits); | |||
1144 | }, bitcastToRegisterType(0)); | |||
1145 | ||||
1146 | if (!IsStore) { | |||
1147 | // Widen suitably aligned loads by loading extra bytes. The standard | |||
1148 | // legalization actions can't properly express widening memory operands. | |||
1149 | Actions.customIf([=](const LegalityQuery &Query) -> bool { | |||
1150 | return shouldWidenLoad(ST, Query, G_LOAD); | |||
1151 | }); | |||
1152 | } | |||
1153 | ||||
1154 | // FIXME: load/store narrowing should be moved to lower action | |||
1155 | Actions | |||
1156 | .narrowScalarIf( | |||
1157 | [=](const LegalityQuery &Query) -> bool { | |||
1158 | return !Query.Types[0].isVector() && | |||
1159 | needToSplitMemOp(Query, Op == G_LOAD); | |||
1160 | }, | |||
1161 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | |||
1162 | const LLT DstTy = Query.Types[0]; | |||
1163 | const LLT PtrTy = Query.Types[1]; | |||
1164 | ||||
1165 | const unsigned DstSize = DstTy.getSizeInBits(); | |||
1166 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | |||
1167 | ||||
1168 | // Split extloads. | |||
1169 | if (DstSize > MemSize) | |||
1170 | return std::make_pair(0, LLT::scalar(MemSize)); | |||
1171 | ||||
1172 | if (!isPowerOf2_32(DstSize)) { | |||
1173 | // We're probably decomposing an odd sized store. Try to split | |||
1174 | // to the widest type. TODO: Account for alignment. As-is it | |||
1175 | // should be OK, since the new parts will be further legalized. | |||
1176 | unsigned FloorSize = PowerOf2Floor(DstSize); | |||
1177 | return std::make_pair(0, LLT::scalar(FloorSize)); | |||
1178 | } | |||
1179 | ||||
1180 | if (DstSize > 32 && (DstSize % 32 != 0)) { | |||
1181 | // FIXME: Need a way to specify non-extload of larger size if | |||
1182 | // suitably aligned. | |||
1183 | return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); | |||
1184 | } | |||
1185 | ||||
1186 | unsigned MaxSize = maxSizeForAddrSpace(ST, | |||
1187 | PtrTy.getAddressSpace(), | |||
1188 | Op == G_LOAD); | |||
1189 | if (MemSize > MaxSize) | |||
1190 | return std::make_pair(0, LLT::scalar(MaxSize)); | |||
1191 | ||||
1192 | unsigned Align = Query.MMODescrs[0].AlignInBits; | |||
1193 | return std::make_pair(0, LLT::scalar(Align)); | |||
1194 | }) | |||
1195 | .fewerElementsIf( | |||
1196 | [=](const LegalityQuery &Query) -> bool { | |||
1197 | return Query.Types[0].isVector() && | |||
1198 | needToSplitMemOp(Query, Op == G_LOAD); | |||
1199 | }, | |||
1200 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | |||
1201 | const LLT DstTy = Query.Types[0]; | |||
1202 | const LLT PtrTy = Query.Types[1]; | |||
1203 | ||||
1204 | LLT EltTy = DstTy.getElementType(); | |||
1205 | unsigned MaxSize = maxSizeForAddrSpace(ST, | |||
1206 | PtrTy.getAddressSpace(), | |||
1207 | Op == G_LOAD); | |||
1208 | ||||
1209 | // FIXME: Handle widened to power of 2 results better. This ends | |||
1210 | // up scalarizing. | |||
1211 | // FIXME: 3 element stores scalarized on SI | |||
1212 | ||||
1213 | // Split if it's too large for the address space. | |||
1214 | if (Query.MMODescrs[0].SizeInBits > MaxSize) { | |||
1215 | unsigned NumElts = DstTy.getNumElements(); | |||
1216 | unsigned EltSize = EltTy.getSizeInBits(); | |||
1217 | ||||
1218 | if (MaxSize % EltSize == 0) { | |||
1219 | return std::make_pair( | |||
1220 | 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); | |||
1221 | } | |||
1222 | ||||
1223 | unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; | |||
1224 | ||||
1225 | // FIXME: Refine when odd breakdowns handled | |||
1226 | // The scalars will need to be re-legalized. | |||
1227 | if (NumPieces == 1 || NumPieces >= NumElts || | |||
1228 | NumElts % NumPieces != 0) | |||
1229 | return std::make_pair(0, EltTy); | |||
1230 | ||||
1231 | return std::make_pair(0, | |||
1232 | LLT::vector(NumElts / NumPieces, EltTy)); | |||
1233 | } | |||
1234 | ||||
1235 | // FIXME: We could probably handle weird extending loads better. | |||
1236 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | |||
1237 | if (DstTy.getSizeInBits() > MemSize) | |||
1238 | return std::make_pair(0, EltTy); | |||
1239 | ||||
1240 | unsigned EltSize = EltTy.getSizeInBits(); | |||
1241 | unsigned DstSize = DstTy.getSizeInBits(); | |||
1242 | if (!isPowerOf2_32(DstSize)) { | |||
1243 | // We're probably decomposing an odd sized store. Try to split | |||
1244 | // to the widest type. TODO: Account for alignment. As-is it | |||
1245 | // should be OK, since the new parts will be further legalized. | |||
1246 | unsigned FloorSize = PowerOf2Floor(DstSize); | |||
1247 | return std::make_pair( | |||
1248 | 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); | |||
1249 | } | |||
1250 | ||||
1251 | // Need to split because of alignment. | |||
1252 | unsigned Align = Query.MMODescrs[0].AlignInBits; | |||
1253 | if (EltSize > Align && | |||
1254 | (EltSize / Align < DstTy.getNumElements())) { | |||
1255 | return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); | |||
1256 | } | |||
1257 | ||||
1258 | // May need relegalization for the scalars. | |||
1259 | return std::make_pair(0, EltTy); | |||
1260 | }) | |||
1261 | .lowerIfMemSizeNotPow2() | |||
1262 | .minScalar(0, S32) | |||
1263 | .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) | |||
1264 | .widenScalarToNextPow2(0) | |||
1265 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) | |||
1266 | .lower(); | |||
1267 | } | |||
1268 | ||||
1269 | auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) | |||
1270 | .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, | |||
1271 | {S32, GlobalPtr, 16, 2 * 8}, | |||
1272 | {S32, LocalPtr, 8, 8}, | |||
1273 | {S32, LocalPtr, 16, 16}, | |||
1274 | {S32, PrivatePtr, 8, 8}, | |||
1275 | {S32, PrivatePtr, 16, 16}, | |||
1276 | {S32, ConstantPtr, 8, 8}, | |||
1277 | {S32, ConstantPtr, 16, 2 * 8}}) | |||
1278 | .legalIf( | |||
1279 | [=](const LegalityQuery &Query) -> bool { | |||
1280 | return isLoadStoreLegal(ST, Query); | |||
1281 | }); | |||
1282 | ||||
1283 | if (ST.hasFlatAddressSpace()) { | |||
1284 | ExtLoads.legalForTypesWithMemDesc( | |||
1285 | {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); | |||
1286 | } | |||
1287 | ||||
1288 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | |||
1289 | // 64-bits. | |||
1290 | // | |||
1291 | // TODO: Should generalize bitcast action into coerce, which will also cover | |||
1292 | // inserting addrspacecasts. | |||
1293 | ExtLoads.customIf(typeIs(1, Constant32Ptr)); | |||
1294 | ||||
1295 | ExtLoads.clampScalar(0, S32, S32) | |||
1296 | .widenScalarToNextPow2(0) | |||
1297 | .unsupportedIfMemSizeNotPow2() | |||
1298 | .lower(); | |||
1299 | ||||
1300 | auto &Atomics = getActionDefinitionsBuilder( | |||
1301 | {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, | |||
1302 | G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, | |||
1303 | G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, | |||
1304 | G_ATOMICRMW_UMIN}) | |||
1305 | .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, | |||
1306 | {S64, GlobalPtr}, {S64, LocalPtr}, | |||
1307 | {S32, RegionPtr}, {S64, RegionPtr}}); | |||
1308 | if (ST.hasFlatAddressSpace()) { | |||
1309 | Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); | |||
1310 | } | |||
1311 | ||||
1312 | auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); | |||
1313 | if (ST.hasLDSFPAtomics()) { | |||
1314 | Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); | |||
1315 | if (ST.hasGFX90AInsts()) | |||
1316 | Atomic.legalFor({{S64, LocalPtr}}); | |||
1317 | } | |||
1318 | if (ST.hasAtomicFaddInsts()) | |||
1319 | Atomic.legalFor({{S32, GlobalPtr}}); | |||
1320 | ||||
1321 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output | |||
1322 | // demarshalling | |||
1323 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) | |||
1324 | .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, | |||
1325 | {S32, FlatPtr}, {S64, FlatPtr}}) | |||
1326 | .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, | |||
1327 | {S32, RegionPtr}, {S64, RegionPtr}}); | |||
1328 | // TODO: Pointer types, any 32-bit or 64-bit vector | |||
1329 | ||||
1330 | // Condition should be s32 for scalar, s1 for vector. | |||
1331 | getActionDefinitionsBuilder(G_SELECT) | |||
1332 | .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, | |||
1333 | GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, | |||
1334 | LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) | |||
1335 | .clampScalar(0, S16, S64) | |||
1336 | .scalarize(1) | |||
1337 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | |||
1338 | .fewerElementsIf(numElementsNotEven(0), scalarize(0)) | |||
1339 | .clampMaxNumElements(0, S32, 2) | |||
1340 | .clampMaxNumElements(0, LocalPtr, 2) | |||
1341 | .clampMaxNumElements(0, PrivatePtr, 2) | |||
1342 | .scalarize(0) | |||
1343 | .widenScalarToNextPow2(0) | |||
1344 | .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); | |||
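// For example, a G_SELECT producing <3 x s16> is widened by one element to
// <4 x s16>, which the cartesian-product rule above already accepts; odd
// element counts that the small-odd-vector rule does not cover instead hit
// numElementsNotEven and are scalarized.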
1345 | ||||
1346 | // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can | |||
1347 | // be more flexible with the shift amount type. | |||
1348 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) | |||
1349 | .legalFor({{S32, S32}, {S64, S32}}); | |||
1350 | if (ST.has16BitInsts()) { | |||
1351 | if (ST.hasVOP3PInsts()) { | |||
1352 | Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) | |||
1353 | .clampMaxNumElements(0, S16, 2); | |||
1354 | } else | |||
1355 | Shifts.legalFor({{S16, S16}}); | |||
1356 | ||||
1357 | // TODO: Support 16-bit shift amounts for all types | |||
1358 | Shifts.widenScalarIf( | |||
1359 | [=](const LegalityQuery &Query) { | |||
1360 | // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a | |||
1361 | // 32-bit amount. | |||
1362 | const LLT ValTy = Query.Types[0]; | |||
1363 | const LLT AmountTy = Query.Types[1]; | |||
1364 | return ValTy.getSizeInBits() <= 16 && | |||
1365 | AmountTy.getSizeInBits() < 16; | |||
1366 | }, changeTo(1, S16)); | |||
1367 | Shifts.maxScalarIf(typeIs(0, S16), 1, S16); | |||
1368 | Shifts.clampScalar(1, S32, S32); | |||
1369 | Shifts.clampScalar(0, S16, S64); | |||
1370 | Shifts.widenScalarToNextPow2(0, 16); | |||
1371 | ||||
1372 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | |||
1373 | .minScalar(0, S16) | |||
1374 | .scalarize(0) | |||
1375 | .lower(); | |||
1376 | } else { | |||
1377 | // Make sure we legalize the shift amount type first, as the general | |||
1378 | // expansion for the shifted type will produce much worse code if it hasn't | |||
1379 | // been truncated already. | |||
1380 | Shifts.clampScalar(1, S32, S32); | |||
1381 | Shifts.clampScalar(0, S32, S64); | |||
1382 | Shifts.widenScalarToNextPow2(0, 32); | |||
1383 | ||||
1384 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | |||
1385 | .minScalar(0, S32) | |||
1386 | .scalarize(0) | |||
1387 | .lower(); | |||
1388 | } | |||
1389 | Shifts.scalarize(0); | |||
1390 | ||||
1391 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | |||
1392 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | |||
1393 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | |||
1394 | unsigned IdxTypeIdx = 2; | |||
1395 | ||||
1396 | getActionDefinitionsBuilder(Op) | |||
1397 | .customIf([=](const LegalityQuery &Query) { | |||
1398 | const LLT EltTy = Query.Types[EltTypeIdx]; | |||
1399 | const LLT VecTy = Query.Types[VecTypeIdx]; | |||
1400 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | |||
1401 | const unsigned EltSize = EltTy.getSizeInBits(); | |||
1402 | return (EltSize == 32 || EltSize == 64) && | |||
1403 | VecTy.getSizeInBits() % 32 == 0 && | |||
1404 | VecTy.getSizeInBits() <= MaxRegisterSize && | |||
1405 | IdxTy.getSizeInBits() == 32; | |||
1406 | }) | |||
1407 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), | |||
1408 | bitcastToVectorElement32(VecTypeIdx)) | |||
1409 | //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) | |||
1410 | .bitcastIf( | |||
1411 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), | |||
1412 | [=](const LegalityQuery &Query) { | |||
1413 | // For > 64-bit element types, try to turn this into a 64-bit | |||
1414 | // element vector since we may be able to do better indexing | |||
1415 | // if this is scalar. If not, fall back to 32. | |||
1416 | const LLT EltTy = Query.Types[EltTypeIdx]; | |||
1417 | const LLT VecTy = Query.Types[VecTypeIdx]; | |||
1418 | const unsigned DstEltSize = EltTy.getSizeInBits(); | |||
1419 | const unsigned VecSize = VecTy.getSizeInBits(); | |||
1420 | ||||
1421 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; | |||
1422 | return std::make_pair( | |||
1423 | VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); | |||
1424 | }) | |||
1425 | .clampScalar(EltTypeIdx, S32, S64) | |||
1426 | .clampScalar(VecTypeIdx, S32, S64) | |||
1427 | .clampScalar(IdxTypeIdx, S32, S32) | |||
1428 | .clampMaxNumElements(VecTypeIdx, S32, 32) | |||
1429 | // TODO: Clamp elements for 64-bit vectors? | |||
1430 | // It should only be necessary with variable indexes. | |||
1431 | // As a last resort, lower to the stack | |||
1432 | .lower(); | |||
1433 | } | |||
1434 | ||||
1435 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | |||
1436 | .unsupportedIf([=](const LegalityQuery &Query) { | |||
1437 | const LLT &EltTy = Query.Types[1].getElementType(); | |||
1438 | return Query.Types[0] != EltTy; | |||
1439 | }); | |||
1440 | ||||
1441 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | |||
1442 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | |||
1443 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | |||
1444 | ||||
1445 | // FIXME: Doesn't handle extract of illegal sizes. | |||
1446 | getActionDefinitionsBuilder(Op) | |||
1447 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | |||
1448 | // FIXME: Multiples of 16 should not be legal. | |||
1449 | .legalIf([=](const LegalityQuery &Query) { | |||
1450 | const LLT BigTy = Query.Types[BigTyIdx]; | |||
1451 | const LLT LitTy = Query.Types[LitTyIdx]; | |||
1452 | return (BigTy.getSizeInBits() % 32 == 0) && | |||
1453 | (LitTy.getSizeInBits() % 16 == 0); | |||
1454 | }) | |||
1455 | .widenScalarIf( | |||
1456 | [=](const LegalityQuery &Query) { | |||
1457 | const LLT BigTy = Query.Types[BigTyIdx]; | |||
1458 | return (BigTy.getScalarSizeInBits() < 16); | |||
1459 | }, | |||
1460 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | |||
1461 | .widenScalarIf( | |||
1462 | [=](const LegalityQuery &Query) { | |||
1463 | const LLT LitTy = Query.Types[LitTyIdx]; | |||
1464 | return (LitTy.getScalarSizeInBits() < 16); | |||
1465 | }, | |||
1466 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | |||
1467 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | |||
1468 | .widenScalarToNextPow2(BigTyIdx, 32); | |||
1469 | ||||
1470 | } | |||
1471 | ||||
1472 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | |||
1473 | .legalForCartesianProduct(AllS32Vectors, {S32}) | |||
1474 | .legalForCartesianProduct(AllS64Vectors, {S64}) | |||
1475 | .clampNumElements(0, V16S32, V32S32) | |||
1476 | .clampNumElements(0, V2S64, V16S64) | |||
1477 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); | |||
1478 | ||||
1479 | if (ST.hasScalarPackInsts()) { | |||
1480 | BuildVector | |||
1481 | // FIXME: Should probably widen s1 vectors straight to s32 | |||
1482 | .minScalarOrElt(0, S16) | |||
1483 | // Widen source elements and produce a G_BUILD_VECTOR_TRUNC | |||
1484 | .minScalar(1, S32); | |||
1485 | ||||
1486 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | |||
1487 | .legalFor({V2S16, S32}) | |||
1488 | .lower(); | |||
1489 | BuildVector.minScalarOrElt(0, S32); | |||
1490 | } else { | |||
1491 | BuildVector.customFor({V2S16, S16}); | |||
1492 | BuildVector.minScalarOrElt(0, S32); | |||
1493 | ||||
1494 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | |||
1495 | .customFor({V2S16, S32}) | |||
1496 | .lower(); | |||
1497 | } | |||
1498 | ||||
1499 | BuildVector.legalIf(isRegisterType(0)); | |||
1500 | ||||
1501 | // FIXME: Clamp maximum size | |||
1502 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | |||
1503 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | |||
1504 | .clampMaxNumElements(0, S32, 32) | |||
1505 | .clampMaxNumElements(1, S16, 2) // TODO: Make 4? | |||
1506 | .clampMaxNumElements(0, S16, 64); | |||
1507 | ||||
1508 | // TODO: Don't fully scalarize v2s16 pieces? Or combine out those | |||
1509 | // pre-legalize. | |||
1510 | if (ST.hasVOP3PInsts()) { | |||
1511 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) | |||
1512 | .customFor({V2S16, V2S16}) | |||
1513 | .lower(); | |||
1514 | } else | |||
1515 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | |||
1516 | ||||
1517 | // Merge/Unmerge | |||
1518 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | |||
1519 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | |||
1520 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | |||
1521 | ||||
1522 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | |||
1523 | const LLT Ty = Query.Types[TypeIdx]; | |||
1524 | if (Ty.isVector()) { | |||
1525 | const LLT &EltTy = Ty.getElementType(); | |||
1526 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) | |||
1527 | return true; | |||
1528 | if (!isPowerOf2_32(EltTy.getSizeInBits())) | |||
1529 | return true; | |||
1530 | } | |||
1531 | return false; | |||
1532 | }; | |||
1533 | ||||
1534 | auto &Builder = getActionDefinitionsBuilder(Op) | |||
1535 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | |||
1536 | .lowerFor({{S16, V2S16}}) | |||
1537 | .lowerIf([=](const LegalityQuery &Query) { | |||
1538 | const LLT BigTy = Query.Types[BigTyIdx]; | |||
1539 | return BigTy.getSizeInBits() == 32; | |||
1540 | }) | |||
1541 | // Try to widen to s16 first for small types. | |||
1542 | // TODO: Only do this on targets with legal s16 shifts | |||
1543 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) | |||
1544 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | |||
1545 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | |||
1546 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | |||
1547 | elementTypeIs(1, S16)), | |||
1548 | changeTo(1, V2S16)) | |||
1549 | // Clamp the little scalar to s8-s256 and make it a power of 2. It's not | |||
1550 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | |||
1551 | // valid. | |||
1552 | .clampScalar(LitTyIdx, S32, S512) | |||
1553 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | |||
1554 | // Break up vectors with weird elements into scalars | |||
1555 | .fewerElementsIf( | |||
1556 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, | |||
1557 | scalarize(0)) | |||
1558 | .fewerElementsIf( | |||
1559 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, | |||
1560 | scalarize(1)) | |||
1561 | .clampScalar(BigTyIdx, S32, MaxScalar); | |||
1562 | ||||
1563 | if (Op == G_MERGE_VALUES) { | |||
1564 | Builder.widenScalarIf( | |||
1565 | // TODO: Use 16-bit shifts if legal for 8-bit values? | |||
1566 | [=](const LegalityQuery &Query) { | |||
1567 | const LLT Ty = Query.Types[LitTyIdx]; | |||
1568 | return Ty.getSizeInBits() < 32; | |||
1569 | }, | |||
1570 | changeTo(LitTyIdx, S32)); | |||
1571 | } | |||
1572 | ||||
1573 | Builder.widenScalarIf( | |||
1574 | [=](const LegalityQuery &Query) { | |||
1575 | const LLT Ty = Query.Types[BigTyIdx]; | |||
1576 | return !isPowerOf2_32(Ty.getSizeInBits()) && | |||
1577 | Ty.getSizeInBits() % 16 != 0; | |||
1578 | }, | |||
1579 | [=](const LegalityQuery &Query) { | |||
1580 | // Pick the next power of 2, or a multiple of 64 over 128. | |||
1581 | // Whichever is smaller. | |||
1582 | const LLT &Ty = Query.Types[BigTyIdx]; | |||
1583 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | |||
1584 | if (NewSizeInBits >= 256) { | |||
1585 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | |||
1586 | if (RoundedTo < NewSizeInBits) | |||
1587 | NewSizeInBits = RoundedTo; | |||
1588 | } | |||
1589 | return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | |||
1590 | }) | |||
1591 | // Any vectors left are the wrong size. Scalarize them. | |||
1592 | .scalarize(0) | |||
1593 | .scalarize(1); | |||
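// Illustrating the widenScalarIf mutation above: a 65-bit big type is widened
// to the next power of two, s128, while a 177-bit type (whose next power of
// two would be 256) is instead rounded up to the smaller multiple of 64,
// s192 = alignTo<64>(178).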
1594 | } | |||
1595 | ||||
1596 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | |||
1597 | // RegBankSelect. | |||
1598 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | |||
1599 | .legalFor({{S32}, {S64}}); | |||
1600 | ||||
1601 | if (ST.hasVOP3PInsts()) { | |||
1602 | SextInReg.lowerFor({{V2S16}}) | |||
1603 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | |||
1604 | // get more vector shift opportunities, since we'll get those when | |||
1605 | // expanded. | |||
1606 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); | |||
1607 | } else if (ST.has16BitInsts()) { | |||
1608 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | |||
1609 | } else { | |||
1610 | // Prefer to promote to s32 before lowering if we don't have 16-bit | |||
1611 | // shifts. This avoids a lot of intermediate truncate and extend operations. | |||
1612 | SextInReg.lowerFor({{S32}, {S64}}); | |||
1613 | } | |||
1614 | ||||
1615 | SextInReg | |||
1616 | .scalarize(0) | |||
1617 | .clampScalar(0, S32, S64) | |||
1618 | .lower(); | |||
1619 | ||||
1620 | // TODO: Only try to form v2s16 with legal packed instructions. | |||
1621 | getActionDefinitionsBuilder(G_FSHR) | |||
1622 | .legalFor({{S32, S32}}) | |||
1623 | .lowerFor({{V2S16, V2S16}}) | |||
1624 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) | |||
1625 | .scalarize(0) | |||
1626 | .lower(); | |||
1627 | ||||
1628 | if (ST.hasVOP3PInsts()) { | |||
1629 | getActionDefinitionsBuilder(G_FSHL) | |||
1630 | .lowerFor({{V2S16, V2S16}}) | |||
1631 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) | |||
1632 | .scalarize(0) | |||
1633 | .lower(); | |||
1634 | } else { | |||
1635 | getActionDefinitionsBuilder(G_FSHL) | |||
1636 | .scalarize(0) | |||
1637 | .lower(); | |||
1638 | } | |||
1639 | ||||
1640 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | |||
1641 | .legalFor({S64}); | |||
1642 | ||||
1643 | getActionDefinitionsBuilder(G_FENCE) | |||
1644 | .alwaysLegal(); | |||
1645 | ||||
1646 | getActionDefinitionsBuilder({G_SMULO, G_UMULO}) | |||
1647 | .scalarize(0) | |||
1648 | .minScalar(0, S32) | |||
1649 | .lower(); | |||
1650 | ||||
1651 | getActionDefinitionsBuilder({ | |||
1652 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | |||
1653 | G_FCOPYSIGN, | |||
1654 | ||||
1655 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | |||
1656 | G_ATOMICRMW_NAND, | |||
1657 | G_ATOMICRMW_FSUB, | |||
1658 | G_READ_REGISTER, | |||
1659 | G_WRITE_REGISTER, | |||
1660 | ||||
1661 | G_SADDO, G_SSUBO, | |||
1662 | ||||
1663 | // TODO: Implement | |||
1664 | G_FMINIMUM, G_FMAXIMUM}).lower(); | |||
1665 | ||||
1666 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | |||
1667 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | |||
1668 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | |||
1669 | .unsupported(); | |||
1670 | ||||
1671 | getLegacyLegalizerInfo().computeTables(); | |||
1672 | verify(*ST.getInstrInfo()); | |||
1673 | } | |||
1674 | ||||
1675 | bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, | |||
1676 | MachineInstr &MI) const { | |||
1677 | MachineIRBuilder &B = Helper.MIRBuilder; | |||
1678 | MachineRegisterInfo &MRI = *B.getMRI(); | |||
1679 | ||||
1680 | switch (MI.getOpcode()) { | |||
1681 | case TargetOpcode::G_ADDRSPACE_CAST: | |||
1682 | return legalizeAddrSpaceCast(MI, MRI, B); | |||
1683 | case TargetOpcode::G_FRINT: | |||
1684 | return legalizeFrint(MI, MRI, B); | |||
1685 | case TargetOpcode::G_FCEIL: | |||
1686 | return legalizeFceil(MI, MRI, B); | |||
1687 | case TargetOpcode::G_FREM: | |||
1688 | return legalizeFrem(MI, MRI, B); | |||
1689 | case TargetOpcode::G_INTRINSIC_TRUNC: | |||
1690 | return legalizeIntrinsicTrunc(MI, MRI, B); | |||
1691 | case TargetOpcode::G_SITOFP: | |||
1692 | return legalizeITOFP(MI, MRI, B, true); | |||
1693 | case TargetOpcode::G_UITOFP: | |||
1694 | return legalizeITOFP(MI, MRI, B, false); | |||
1695 | case TargetOpcode::G_FPTOSI: | |||
1696 | return legalizeFPTOI(MI, MRI, B, true); | |||
1697 | case TargetOpcode::G_FPTOUI: | |||
1698 | return legalizeFPTOI(MI, MRI, B, false); | |||
1699 | case TargetOpcode::G_FMINNUM: | |||
1700 | case TargetOpcode::G_FMAXNUM: | |||
1701 | case TargetOpcode::G_FMINNUM_IEEE: | |||
1702 | case TargetOpcode::G_FMAXNUM_IEEE: | |||
1703 | return legalizeMinNumMaxNum(Helper, MI); | |||
1704 | case TargetOpcode::G_EXTRACT_VECTOR_ELT: | |||
1705 | return legalizeExtractVectorElt(MI, MRI, B); | |||
1706 | case TargetOpcode::G_INSERT_VECTOR_ELT: | |||
1707 | return legalizeInsertVectorElt(MI, MRI, B); | |||
1708 | case TargetOpcode::G_SHUFFLE_VECTOR: | |||
1709 | return legalizeShuffleVector(MI, MRI, B); | |||
1710 | case TargetOpcode::G_FSIN: | |||
1711 | case TargetOpcode::G_FCOS: | |||
1712 | return legalizeSinCos(MI, MRI, B); | |||
1713 | case TargetOpcode::G_GLOBAL_VALUE: | |||
1714 | return legalizeGlobalValue(MI, MRI, B); | |||
1715 | case TargetOpcode::G_LOAD: | |||
1716 | case TargetOpcode::G_SEXTLOAD: | |||
1717 | case TargetOpcode::G_ZEXTLOAD: | |||
1718 | return legalizeLoad(Helper, MI); | |||
1719 | case TargetOpcode::G_FMAD: | |||
1720 | return legalizeFMad(MI, MRI, B); | |||
1721 | case TargetOpcode::G_FDIV: | |||
1722 | return legalizeFDIV(MI, MRI, B); | |||
1723 | case TargetOpcode::G_UDIV: | |||
1724 | case TargetOpcode::G_UREM: | |||
1725 | case TargetOpcode::G_UDIVREM: | |||
1726 | return legalizeUnsignedDIV_REM(MI, MRI, B); | |||
1727 | case TargetOpcode::G_SDIV: | |||
1728 | case TargetOpcode::G_SREM: | |||
1729 | case TargetOpcode::G_SDIVREM: | |||
1730 | return legalizeSignedDIV_REM(MI, MRI, B); | |||
1731 | case TargetOpcode::G_ATOMIC_CMPXCHG: | |||
1732 | return legalizeAtomicCmpXChg(MI, MRI, B); | |||
1733 | case TargetOpcode::G_FLOG: | |||
1734 | return legalizeFlog(MI, B, numbers::ln2f); | |||
1735 | case TargetOpcode::G_FLOG10: | |||
1736 | return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); | |||
1737 | case TargetOpcode::G_FEXP: | |||
1738 | return legalizeFExp(MI, B); | |||
1739 | case TargetOpcode::G_FPOW: | |||
1740 | return legalizeFPow(MI, B); | |||
1741 | case TargetOpcode::G_FFLOOR: | |||
1742 | return legalizeFFloor(MI, MRI, B); | |||
1743 | case TargetOpcode::G_BUILD_VECTOR: | |||
1744 | return legalizeBuildVector(MI, MRI, B); | |||
1745 | default: | |||
1746 | return false; | |||
1747 | } | |||
1748 | ||||
1749 | llvm_unreachable("expected switch to return"); | |||
1750 | } | |||
1751 | ||||
1752 | Register AMDGPULegalizerInfo::getSegmentAperture( | |||
1753 | unsigned AS, | |||
1754 | MachineRegisterInfo &MRI, | |||
1755 | MachineIRBuilder &B) const { | |||
1756 | MachineFunction &MF = B.getMF(); | |||
1757 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
1758 | const LLT S32 = LLT::scalar(32); | |||
1759 | ||||
1760 | assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); | |||
1761 | ||||
1762 | if (ST.hasApertureRegs()) { | |||
1763 | // FIXME: Use inline constants (src_{shared, private}_base) instead of | |||
1764 | // getreg. | |||
1765 | unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? | |||
1766 | AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : | |||
1767 | AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; | |||
1768 | unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? | |||
1769 | AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : | |||
1770 | AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; | |||
1771 | unsigned Encoding = | |||
1772 | AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | | |||
1773 | Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | | |||
1774 | WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; | |||
1775 | ||||
1776 | Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
1777 | ||||
1778 | B.buildInstr(AMDGPU::S_GETREG_B32) | |||
1779 | .addDef(GetReg) | |||
1780 | .addImm(Encoding); | |||
1781 | MRI.setType(GetReg, S32); | |||
1782 | ||||
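// The hwreg field read above holds the upper bits of the aperture base; the
// shift below by the field width (WidthM1 + 1) rebuilds the 32-bit aperture
// value that is later used as the high half of a flat address.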
1783 | auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); | |||
1784 | return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); | |||
1785 | } | |||
1786 | ||||
1787 | Register QueuePtr = MRI.createGenericVirtualRegister( | |||
1788 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | |||
1789 | ||||
1790 | if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | |||
1791 | return Register(); | |||
1792 | ||||
1793 | // Offset into amd_queue_t for group_segment_aperture_base_hi / | |||
1794 | // private_segment_aperture_base_hi. | |||
1795 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; | |||
1796 | ||||
1797 | // TODO: can we be smarter about machine pointer info? | |||
1798 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | |||
1799 | MachineMemOperand *MMO = MF.getMachineMemOperand( | |||
1800 | PtrInfo, | |||
1801 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | |||
1802 | MachineMemOperand::MOInvariant, | |||
1803 | 4, commonAlignment(Align(64), StructOffset)); | |||
1804 | ||||
1805 | Register LoadAddr; | |||
1806 | ||||
1807 | B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); | |||
1808 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | |||
1809 | } | |||
1810 | ||||
1811 | bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( | |||
1812 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
1813 | MachineIRBuilder &B) const { | |||
1814 | MachineFunction &MF = B.getMF(); | |||
1815 | ||||
1816 | const LLT S32 = LLT::scalar(32); | |||
1817 | Register Dst = MI.getOperand(0).getReg(); | |||
1818 | Register Src = MI.getOperand(1).getReg(); | |||
1819 | ||||
1820 | LLT DstTy = MRI.getType(Dst); | |||
1821 | LLT SrcTy = MRI.getType(Src); | |||
1822 | unsigned DestAS = DstTy.getAddressSpace(); | |||
1823 | unsigned SrcAS = SrcTy.getAddressSpace(); | |||
1824 | ||||
1825 | // TODO: Avoid reloading from the queue ptr for each cast, or at least each | |||
1826 | // vector element. | |||
1827 | assert(!DstTy.isVector()); | |||
1828 | ||||
1829 | const AMDGPUTargetMachine &TM | |||
1830 | = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); | |||
1831 | ||||
1832 | if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { | |||
1833 | MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); | |||
1834 | return true; | |||
1835 | } | |||
1836 | ||||
1837 | if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | |||
1838 | // Truncate. | |||
1839 | B.buildExtract(Dst, Src, 0); | |||
1840 | MI.eraseFromParent(); | |||
1841 | return true; | |||
1842 | } | |||
1843 | ||||
1844 | if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | |||
1845 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); | |||
1846 | uint32_t AddrHiVal = Info->get32BitAddressHighBits(); | |||
1847 | ||||
1848 | // FIXME: This is a bit ugly due to creating a merge of 2 pointers to | |||
1849 | // another. Merge operands are required to be the same type, but creating an | |||
1850 | // extra ptrtoint would be kind of pointless. | |||
1851 | auto HighAddr = B.buildConstant( | |||
1852 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); | |||
1853 | B.buildMerge(Dst, {Src, HighAddr}); | |||
1854 | MI.eraseFromParent(); | |||
1855 | return true; | |||
1856 | } | |||
1857 | ||||
1858 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { | |||
1859 | assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || | |||
1860 | DestAS == AMDGPUAS::PRIVATE_ADDRESS); | |||
1861 | unsigned NullVal = TM.getNullPointerValue(DestAS); | |||
1862 | ||||
1863 | auto SegmentNull = B.buildConstant(DstTy, NullVal); | |||
1864 | auto FlatNull = B.buildConstant(SrcTy, 0); | |||
1865 | ||||
1866 | // Extract low 32-bits of the pointer. | |||
1867 | auto PtrLo32 = B.buildExtract(DstTy, Src, 0); | |||
1868 | ||||
1869 | auto CmpRes = | |||
1870 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); | |||
1871 | B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); | |||
1872 | ||||
1873 | MI.eraseFromParent(); | |||
1874 | return true; | |||
1875 | } | |||
1876 | ||||
1877 | if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) | |||
1878 | return false; | |||
1879 | ||||
1880 | if (!ST.hasFlatAddressSpace()) | |||
1881 | return false; | |||
1882 | ||||
1883 | auto SegmentNull = | |||
1884 | B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); | |||
1885 | auto FlatNull = | |||
1886 | B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); | |||
1887 | ||||
1888 | Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); | |||
1889 | if (!ApertureReg.isValid()) | |||
1890 | return false; | |||
1891 | ||||
1892 | auto CmpRes = | |||
1893 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); | |||
1894 | ||||
1895 | // Coerce the type of the low half of the result so we can use merge_values. | |||
1896 | Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); | |||
1897 | ||||
1898 | // TODO: Should we allow mismatched types but matching sizes in merges to | |||
1899 | // avoid the ptrtoint? | |||
1900 | auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); | |||
1901 | B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); | |||
1902 | ||||
1903 | MI.eraseFromParent(); | |||
1904 | return true; | |||
1905 | } | |||
1906 | ||||
1907 | bool AMDGPULegalizerInfo::legalizeFrint( | |||
1908 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
1909 | MachineIRBuilder &B) const { | |||
1910 | Register Src = MI.getOperand(1).getReg(); | |||
1911 | LLT Ty = MRI.getType(Src); | |||
1912 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64); | |||
1913 | ||||
1914 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | |||
1915 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | |||
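// The constants implement the classic round-to-integer trick: adding a
// copysigned 2^52 and subtracting it again discards the fraction bits under
// the default round-to-nearest-even mode, e.g. 2.5 + 2^52 - 2^52 == 2.0 and
// 3.5 + 2^52 - 2^52 == 4.0. Inputs whose magnitude exceeds
// 0x1.fffffffffffffp+51 are already integral and are passed through by the
// final select.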
1916 | ||||
1917 | auto C1 = B.buildFConstant(Ty, C1Val); | |||
1918 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | |||
1919 | ||||
1920 | // TODO: Should this propagate fast-math-flags? | |||
1921 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | |||
1922 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | |||
1923 | ||||
1924 | auto C2 = B.buildFConstant(Ty, C2Val); | |||
1925 | auto Fabs = B.buildFAbs(Ty, Src); | |||
1926 | ||||
1927 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | |||
1928 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | |||
1929 | MI.eraseFromParent(); | |||
1930 | return true; | |||
1931 | } | |||
1932 | ||||
1933 | bool AMDGPULegalizerInfo::legalizeFceil( | |||
1934 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
1935 | MachineIRBuilder &B) const { | |||
1936 | ||||
1937 | const LLT S1 = LLT::scalar(1); | |||
1938 | const LLT S64 = LLT::scalar(64); | |||
1939 | ||||
1940 | Register Src = MI.getOperand(1).getReg(); | |||
1941 | assert(MRI.getType(Src) == S64); | |||
1942 | ||||
1943 | // result = trunc(src) | |||
1944 | // if (src > 0.0 && src != result) | |||
1945 | // result += 1.0 | |||
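// e.g. fceil(2.3):  trunc = 2.0, both conditions hold, result = 3.0
//      fceil(-2.3): trunc = -2.0, src <= 0.0, result stays -2.0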
1946 | ||||
1947 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | |||
1948 | ||||
1949 | const auto Zero = B.buildFConstant(S64, 0.0); | |||
1950 | const auto One = B.buildFConstant(S64, 1.0); | |||
1951 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | |||
1952 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | |||
1953 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | |||
1954 | auto Add = B.buildSelect(S64, And, One, Zero); | |||
1955 | ||||
1956 | // TODO: Should this propagate fast-math-flags? | |||
1957 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | |||
1958 | return true; | |||
1959 | } | |||
1960 | ||||
1961 | bool AMDGPULegalizerInfo::legalizeFrem( | |||
1962 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
1963 | MachineIRBuilder &B) const { | |||
1964 | Register DstReg = MI.getOperand(0).getReg(); | |||
1965 | Register Src0Reg = MI.getOperand(1).getReg(); | |||
1966 | Register Src1Reg = MI.getOperand(2).getReg(); | |||
1967 | auto Flags = MI.getFlags(); | |||
1968 | LLT Ty = MRI.getType(DstReg); | |||
1969 | ||||
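// frem(x, y) is expanded as x - trunc(x / y) * y, computed below with a fused
// multiply-add: fma(-trunc(x / y), y, x). For example frem(7.5, 2.0):
// 7.5 / 2.0 = 3.75, trunc = 3.0, 7.5 - 3.0 * 2.0 = 1.5.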
1970 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); | |||
1971 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); | |||
1972 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); | |||
1973 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); | |||
1974 | MI.eraseFromParent(); | |||
1975 | return true; | |||
1976 | } | |||
1977 | ||||
1978 | static MachineInstrBuilder extractF64Exponent(Register Hi, | |||
1979 | MachineIRBuilder &B) { | |||
1980 | const unsigned FractBits = 52; | |||
1981 | const unsigned ExpBits = 11; | |||
1982 | LLT S32 = LLT::scalar(32); | |||
1983 | ||||
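// IEEE-754 double layout: bit 63 is the sign, bits 62..52 the biased
// exponent, bits 51..0 the fraction. Since only the high 32-bit word is
// passed in, the exponent field starts at bit FractBits - 32 = 20 and is
// ExpBits = 11 bits wide; subtracting 1023 below removes the exponent bias.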
1984 | auto Const0 = B.buildConstant(S32, FractBits - 32); | |||
1985 | auto Const1 = B.buildConstant(S32, ExpBits); | |||
1986 | ||||
1987 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | |||
1988 | .addUse(Hi) | |||
1989 | .addUse(Const0.getReg(0)) | |||
1990 | .addUse(Const1.getReg(0)); | |||
1991 | ||||
1992 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | |||
1993 | } | |||
1994 | ||||
1995 | bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( | |||
1996 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
1997 | MachineIRBuilder &B) const { | |||
1998 | const LLT S1 = LLT::scalar(1); | |||
1999 | const LLT S32 = LLT::scalar(32); | |||
2000 | const LLT S64 = LLT::scalar(64); | |||
2001 | ||||
2002 | Register Src = MI.getOperand(1).getReg(); | |||
2003 | assert(MRI.getType(Src) == S64); | |||
2004 | ||||
2005 | // TODO: Should this use extract since the low half is unused? | |||
2006 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | |||
2007 | Register Hi = Unmerge.getReg(1); | |||
2008 | ||||
2009 | // Extract the upper half, since this is where we will find the sign and | |||
2010 | // exponent. | |||
2011 | auto Exp = extractF64Exponent(Hi, B); | |||
2012 | ||||
2013 | const unsigned FractBits = 52; | |||
2014 | ||||
2015 | // Extract the sign bit. | |||
2016 | const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); | |||
2017 | auto SignBit = B.buildAnd(S32, Hi, SignBitMask); | |||
2018 | ||||
2019 | const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); | |||
2020 | ||||
2021 | const auto Zero32 = B.buildConstant(S32, 0); | |||
2022 | ||||
2023 | // Extend back to 64-bits. | |||
2024 | auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); | |||
2025 | ||||
2026 | auto Shr = B.buildAShr(S64, FractMask, Exp); | |||
2027 | auto Not = B.buildNot(S64, Shr); | |||
2028 | auto Tmp0 = B.buildAnd(S64, Src, Not); | |||
2029 | auto FiftyOne = B.buildConstant(S32, FractBits - 1); | |||
2030 | ||||
2031 | auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); | |||
2032 | auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); | |||
2033 | ||||
2034 | auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); | |||
2035 | B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); | |||
2036 | MI.eraseFromParent(); | |||
2037 | return true; | |||
2038 | } | |||
2039 | ||||
2040 | bool AMDGPULegalizerInfo::legalizeITOFP( | |||
2041 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
2042 | MachineIRBuilder &B, bool Signed) const { | |||
2043 | ||||
2044 | Register Dst = MI.getOperand(0).getReg(); | |||
2045 | Register Src = MI.getOperand(1).getReg(); | |||
2046 | ||||
2047 | const LLT S64 = LLT::scalar(64); | |||
2048 | const LLT S32 = LLT::scalar(32); | |||
2049 | ||||
2050 | assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); | |||
2051 | ||||
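// The conversion splits the 64-bit integer into 32-bit halves and recombines
// them in floating point: result = convert(hi) * 2^32 + uitofp(lo), with the
// scaling by 2^32 done exactly via ldexp. Only the high half carries the
// sign, so the low half is always converted as unsigned.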
2052 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | |||
2053 | ||||
2054 | auto CvtHi = Signed ? | |||
2055 | B.buildSITOFP(S64, Unmerge.getReg(1)) : | |||
2056 | B.buildUITOFP(S64, Unmerge.getReg(1)); | |||
2057 | ||||
2058 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | |||
2059 | ||||
2060 | auto ThirtyTwo = B.buildConstant(S32, 32); | |||
2061 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | |||
2062 | .addUse(CvtHi.getReg(0)) | |||
2063 | .addUse(ThirtyTwo.getReg(0)); | |||
2064 | ||||
2065 | // TODO: Should this propagate fast-math-flags? | |||
2066 | B.buildFAdd(Dst, LdExp, CvtLo); | |||
2067 | MI.eraseFromParent(); | |||
2068 | return true; | |||
2069 | } | |||
2070 | ||||
2071 | // TODO: Copied from DAG implementation. Verify logic and document how this | |||
2072 | // actually works. | |||
2073 | bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, | |||
2074 | MachineRegisterInfo &MRI, | |||
2075 | MachineIRBuilder &B, | |||
2076 | bool Signed) const { | |||
2077 | ||||
2078 | Register Dst = MI.getOperand(0).getReg(); | |||
2079 | Register Src = MI.getOperand(1).getReg(); | |||
2080 | ||||
2081 | const LLT S64 = LLT::scalar(64); | |||
2082 | const LLT S32 = LLT::scalar(32); | |||
2083 | ||||
2084 | const LLT SrcLT = MRI.getType(Src); | |||
2085 | assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); | |||
2086 | ||||
2087 | unsigned Flags = MI.getFlags(); | |||
2088 | ||||
2089 | // The basic idea of converting a floating point number into a pair of 32-bit | |||
2090 | // integers is illustrated as follows: | |||
2091 | // | |||
2092 | // tf := trunc(val); | |||
2093 | // hif := floor(tf * 2^-32); | |||
2094 | // lof := tf - hif * 2^32; // lof is always positive due to floor. | |||
2095 | // hi := fptoi(hif); | |||
2096 | // lo := fptoi(lof); | |||
2097 | // | |||
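// For example, with val = 2^40 + 7 on the S64 path:
//   tf  = 2^40 + 7
//   hif = floor((2^40 + 7) * 2^-32) = 256
//   lof = (2^40 + 7) - 256 * 2^32  = 7
// so hi = 256 and lo = 7, and merging {lo, hi} reassembles 2^40 + 7.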
2098 | auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); | |||
2099 | MachineInstrBuilder Sign; | |||
2100 | if (Signed && SrcLT == S32) { | |||
2101 | // However, a 32-bit floating point number has only a 23-bit mantissa, which | |||
2102 | // is not enough to hold all the significant bits of `lof` if val is | |||
2103 | // negative. To avoid the loss of precision, we need to take the absolute | |||
2104 | // value after truncating and flip the result back based on the original | |||
2105 | // signedness. | |||
2106 | Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); | |||
2107 | Trunc = B.buildFAbs(S32, Trunc, Flags); | |||
2108 | } | |||
2109 | MachineInstrBuilder K0, K1; | |||
2110 | if (SrcLT == S64) { | |||
2111 | K0 = B.buildFConstant(S64, | |||
2112 | BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); | |||
2113 | K1 = B.buildFConstant(S64, | |||
2114 | BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); | |||
2115 | } else { | |||
2116 | K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); | |||
2117 | K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); | |||
2118 | } | |||
2119 | ||||
2120 | auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); | |||
2121 | auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); | |||
2122 | auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); | |||
2123 | ||||
2124 | auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) | |||
2125 | : B.buildFPTOUI(S32, FloorMul); | |||
2126 | auto Lo = B.buildFPTOUI(S32, Fma); | |||
2127 | ||||
2128 | if (Signed && SrcLT == S32) { | |||
2129 | // Flip the result based on the signedness, which is either all 0s or 1s. | |||
2130 | Sign = B.buildMerge(S64, {Sign, Sign}); | |||
2131 | // r := xor({lo, hi}, sign) - sign; | |||
2132 | B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); | |||
2133 | } else | |||
2134 | B.buildMerge(Dst, {Lo, Hi}); | |||
2135 | MI.eraseFromParent(); | |||
2136 | ||||
2137 | return true; | |||
2138 | } | |||
2139 | ||||
2140 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, | |||
2141 | MachineInstr &MI) const { | |||
2142 | MachineFunction &MF = Helper.MIRBuilder.getMF(); | |||
2143 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | |||
2144 | ||||
2145 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | |||
2146 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | |||
2147 | ||||
2148 | // With ieee_mode disabled, the instructions have the correct behavior | |||
2149 | // already for G_FMINNUM/G_FMAXNUM | |||
2150 | if (!MFI->getMode().IEEE) | |||
2151 | return !IsIEEEOp; | |||
2152 | ||||
2153 | if (IsIEEEOp) | |||
2154 | return true; | |||
2155 | ||||
2156 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | |||
2157 | } | |||
2158 | ||||
2159 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | |||
2160 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
2161 | MachineIRBuilder &B) const { | |||
2162 | // TODO: Should move some of this into LegalizerHelper. | |||
2163 | ||||
2164 | // TODO: Promote dynamic indexing of s16 to s32 | |||
2165 | ||||
2166 | // FIXME: Artifact combiner probably should have replaced the truncated | |||
2167 | // constant before this, so we shouldn't need | |||
2168 | // getConstantVRegValWithLookThrough. | |||
2169 | Optional<ValueAndVReg> MaybeIdxVal = | |||
2170 | getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | |||
2171 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | |||
2172 | return true; | |||
2173 | const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | |||
2174 | ||||
2175 | Register Dst = MI.getOperand(0).getReg(); | |||
2176 | Register Vec = MI.getOperand(1).getReg(); | |||
2177 | ||||
2178 | LLT VecTy = MRI.getType(Vec); | |||
2179 | LLT EltTy = VecTy.getElementType(); | |||
2180 | assert(EltTy == MRI.getType(Dst)); | |||
2181 | ||||
2182 | if (IdxVal < VecTy.getNumElements()) | |||
2183 | B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); | |||
2184 | else | |||
2185 | B.buildUndef(Dst); | |||
2186 | ||||
2187 | MI.eraseFromParent(); | |||
2188 | return true; | |||
2189 | } | |||
2190 | ||||
2191 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | |||
2192 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
2193 | MachineIRBuilder &B) const { | |||
2194 | // TODO: Should move some of this into LegalizerHelper. | |||
2195 | ||||
2196 | // TODO: Promote dynamic indexing of s16 to s32 | |||
2197 | ||||
2198 | // FIXME: Artifact combiner probably should have replaced the truncated | |||
2199 | // constant before this, so we shouldn't need | |||
2200 | // getConstantVRegValWithLookThrough. | |||
2201 | Optional<ValueAndVReg> MaybeIdxVal = | |||
2202 | getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); | |||
2203 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | |||
2204 | return true; | |||
2205 | ||||
2206 | int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | |||
2207 | Register Dst = MI.getOperand(0).getReg(); | |||
2208 | Register Vec = MI.getOperand(1).getReg(); | |||
2209 | Register Ins = MI.getOperand(2).getReg(); | |||
2210 | ||||
2211 | LLT VecTy = MRI.getType(Vec); | |||
2212 | LLT EltTy = VecTy.getElementType(); | |||
2213 | assert(EltTy == MRI.getType(Ins)); | |||
2214 | ||||
2215 | if (IdxVal < VecTy.getNumElements()) | |||
2216 | B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); | |||
2217 | else | |||
2218 | B.buildUndef(Dst); | |||
2219 | ||||
2220 | MI.eraseFromParent(); | |||
2221 | return true; | |||
2222 | } | |||
2223 | ||||
2224 | bool AMDGPULegalizerInfo::legalizeShuffleVector( | |||
2225 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
2226 | MachineIRBuilder &B) const { | |||
2227 | const LLT V2S16 = LLT::vector(2, 16); | |||
2228 | ||||
2229 | Register Dst = MI.getOperand(0).getReg(); | |||
2230 | Register Src0 = MI.getOperand(1).getReg(); | |||
2231 | LLT DstTy = MRI.getType(Dst); | |||
2232 | LLT SrcTy = MRI.getType(Src0); | |||
2233 | ||||
2234 | if (SrcTy == V2S16 && DstTy == V2S16 && | |||
2235 | AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) | |||
2236 | return true; | |||
2237 | ||||
2238 | MachineIRBuilder HelperBuilder(MI); | |||
2239 | GISelObserverWrapper DummyObserver; | |||
2240 | LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); | |||
2241 | return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; | |||
2242 | } | |||
2243 | ||||
2244 | bool AMDGPULegalizerInfo::legalizeSinCos( | |||
2245 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
2246 | MachineIRBuilder &B) const { | |||
2247 | ||||
2248 | Register DstReg = MI.getOperand(0).getReg(); | |||
2249 | Register SrcReg = MI.getOperand(1).getReg(); | |||
2250 | LLT Ty = MRI.getType(DstReg); | |||
2251 | unsigned Flags = MI.getFlags(); | |||
2252 | ||||
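// The hardware sin/cos intrinsics take their operand scaled by 1/(2*pi), i.e.
// in fractions of a full turn rather than radians; on subtargets with a
// reduced valid input range the scaled value is additionally wrapped into
// [0, 1) with amdgcn_fract.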
2253 | Register TrigVal; | |||
2254 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); | |||
2255 | if (ST.hasTrigReducedRange()) { | |||
2256 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | |||
2257 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | |||
2258 | .addUse(MulVal.getReg(0)) | |||
2259 | .setMIFlags(Flags).getReg(0); | |||
2260 | } else | |||
2261 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | |||
2262 | ||||
2263 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | |||
2264 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | |||
2265 | B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) | |||
2266 | .addUse(TrigVal) | |||
2267 | .setMIFlags(Flags); | |||
2268 | MI.eraseFromParent(); | |||
2269 | return true; | |||
2270 | } | |||
2271 | ||||
2272 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, | |||
2273 | MachineIRBuilder &B, | |||
2274 | const GlobalValue *GV, | |||
2275 | int64_t Offset, | |||
2276 | unsigned GAFlags) const { | |||
2277 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); | |||
2278 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | |||
2279 | // to the following code sequence: | |||
2280 | // | |||
2281 | // For constant address space: | |||
2282 | // s_getpc_b64 s[0:1] | |||
2283 | // s_add_u32 s0, s0, $symbol | |||
2284 | // s_addc_u32 s1, s1, 0 | |||
2285 | // | |||
2286 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | |||
2287 | // a fixup or relocation is emitted to replace $symbol with a literal | |||
2288 | // constant, which is a pc-relative offset from the encoding of the $symbol | |||
2289 | // operand to the global variable. | |||
2290 | // | |||
2291 | // For global address space: | |||
2292 | // s_getpc_b64 s[0:1] | |||
2293 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | |||
2294 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | |||
2295 | // | |||
2296 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | |||
2297 | // fixups or relocations are emitted to replace $symbol@*@lo and | |||
2298 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | |||
2299 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | |||
2300 | // operand to the global variable. | |||
2301 | // | |||
2302 | // What we want here is an offset from the value returned by s_getpc | |||
2303 | // (which is the address of the s_add_u32 instruction) to the global | |||
2304 | // variable, but since the encoding of $symbol starts 4 bytes after the start | |||
2305 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | |||
2306 | // small. This requires us to add 4 to the global variable offset in order to | |||
2307 | // compute the correct address. Similarly for the s_addc_u32 instruction, the | |||
2308 | // encoding of $symbol starts 12 bytes after the start of the s_add_u32 | |||
2309 | // instruction. | |||
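// Concretely, the SI_PC_ADD_REL_OFFSET pseudo built below carries the symbol
// with addend Offset + 4 for the low fixup and, unless GAFlags is MO_NONE, a
// second symbol operand with addend Offset + 12 for the high fixup,
// accounting for the 4- and 12-byte encoding distances described above.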
2310 | ||||
2311 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | |||
2312 | ||||
2313 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | |||
2314 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | |||
2315 | ||||
2316 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | |||
2317 | .addDef(PCReg); | |||
2318 | ||||
2319 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | |||
2320 | if (GAFlags == SIInstrInfo::MO_NONE) | |||
2321 | MIB.addImm(0); | |||
2322 | else | |||
2323 | MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); | |||
2324 | ||||
2325 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | |||
2326 | ||||
2327 | if (PtrTy.getSizeInBits() == 32) | |||
2328 | B.buildExtract(DstReg, PCReg, 0); | |||
2329 | return true; | |||
2330 | } | |||
2331 | ||||
2332 | bool AMDGPULegalizerInfo::legalizeGlobalValue( | |||
2333 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
2334 | MachineIRBuilder &B) const { | |||
2335 | Register DstReg = MI.getOperand(0).getReg(); | |||
2336 | LLT Ty = MRI.getType(DstReg); | |||
2337 | unsigned AS = Ty.getAddressSpace(); | |||
2338 | ||||
2339 | const GlobalValue *GV = MI.getOperand(1).getGlobal(); | |||
2340 | MachineFunction &MF = B.getMF(); | |||
2341 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | |||
2342 | ||||
2343 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { | |||
2344 | if (!MFI->isModuleEntryFunction() && | |||
2345 | !GV->getName().equals("llvm.amdgcn.module.lds")) { | |||
2346 | const Function &Fn = MF.getFunction(); | |||
2347 | DiagnosticInfoUnsupported BadLDSDecl( | |||
2348 | Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), | |||
2349 | DS_Warning); | |||
2350 | Fn.getContext().diagnose(BadLDSDecl); | |||
2351 | ||||
2352 | // We currently don't have a way to correctly allocate LDS objects that | |||
2353 | // aren't directly associated with a kernel. We do force inlining of | |||
2354 | // functions that use local objects. However, if these dead functions are | |||
2355 | // not eliminated, we don't want a compile time error. Just emit a warning | |||
2356 | // and a trap, since there should be no callable path here. | |||
2357 | B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); | |||
2358 | B.buildUndef(DstReg); | |||
2359 | MI.eraseFromParent(); | |||
2360 | return true; | |||
2361 | } | |||
2362 | ||||
2363 | // TODO: We could emit code to handle the initialization somewhere. | |||
2364 | if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { | |||
2365 | const SITargetLowering *TLI = ST.getTargetLowering(); | |||
2366 | if (!TLI->shouldUseLDSConstAddress(GV)) { | |||
2367 | MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); | |||
2368 | return true; // Leave in place; | |||
2369 | } | |||
2370 | ||||
2371 | if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { | |||
2372 | Type *Ty = GV->getValueType(); | |||
2373 | // HIP uses an unsized array `extern __shared__ T s[]` or similar | |||
2374 | // zero-sized type in other languages to declare the dynamic shared | |||
2375 | // memory whose size is not known at compile time. They will be | |||
2376 | // allocated by the runtime and placed directly after the statically | |||
2377 | // allocated ones. They all share the same offset. | |||
2378 | if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { | |||
2379 | // Adjust alignment for that dynamic shared memory array. | |||
2380 | MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); | |||
2381 | LLT S32 = LLT::scalar(32); | |||
2382 | auto Sz = | |||
2383 | B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); | |||
2384 | B.buildIntToPtr(DstReg, Sz); | |||
2385 | MI.eraseFromParent(); | |||
2386 | return true; | |||
2387 | } | |||
2388 | } | |||
2389 | ||||
2390 | B.buildConstant( | |||
2391 | DstReg, | |||
2392 | MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); | |||
2393 | MI.eraseFromParent(); | |||
2394 | return true; | |||
2395 | } | |||
2396 | ||||
2397 | const Function &Fn = MF.getFunction(); | |||
2398 | DiagnosticInfoUnsupported BadInit( | |||
2399 | Fn, "unsupported initializer for address space", MI.getDebugLoc()); | |||
2400 | Fn.getContext().diagnose(BadInit); | |||
2401 | return true; | |||
2402 | } | |||
2403 | ||||
2404 | const SITargetLowering *TLI = ST.getTargetLowering(); | |||
2405 | ||||
2406 | if (TLI->shouldEmitFixup(GV)) { | |||
2407 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); | |||
2408 | MI.eraseFromParent(); | |||
2409 | return true; | |||
2410 | } | |||
2411 | ||||
2412 | if (TLI->shouldEmitPCReloc(GV)) { | |||
2413 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); | |||
2414 | MI.eraseFromParent(); | |||
2415 | return true; | |||
2416 | } | |||
2417 | ||||
2418 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | |||
2419 | Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); | |||
2420 | ||||
2421 | MachineMemOperand *GOTMMO = MF.getMachineMemOperand( | |||
2422 | MachinePointerInfo::getGOT(MF), | |||
2423 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | |||
2424 | MachineMemOperand::MOInvariant, | |||
2425 | 8 /*Size*/, Align(8)); | |||
2426 | ||||
2427 | buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); | |||
2428 | ||||
2429 | if (Ty.getSizeInBits() == 32) { | |||
2430 | // Truncate if this is a 32-bit constant address. | |||
2431 | auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); | |||
2432 | B.buildExtract(DstReg, Load, 0); | |||
2433 | } else | |||
2434 | B.buildLoad(DstReg, GOTAddr, *GOTMMO); | |||
2435 | ||||
2436 | MI.eraseFromParent(); | |||
2437 | return true; | |||
2438 | } | |||
2439 | ||||
2440 | static LLT widenToNextPowerOf2(LLT Ty) { | |||
2441 | if (Ty.isVector()) | |||
2442 | return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); | |||
2443 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | |||
2444 | } | |||
2445 | ||||
2446 | bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, | |||
2447 | MachineInstr &MI) const { | |||
2448 | MachineIRBuilder &B = Helper.MIRBuilder; | |||
2449 | MachineRegisterInfo &MRI = *B.getMRI(); | |||
2450 | GISelChangeObserver &Observer = Helper.Observer; | |||
2451 | ||||
2452 | Register PtrReg = MI.getOperand(1).getReg(); | |||
2453 | LLT PtrTy = MRI.getType(PtrReg); | |||
2454 | unsigned AddrSpace = PtrTy.getAddressSpace(); | |||
2455 | ||||
2456 | if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | |||
2457 | LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | |||
2458 | auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); | |||
2459 | Observer.changingInstr(MI); | |||
2460 | MI.getOperand(1).setReg(Cast.getReg(0)); | |||
2461 | Observer.changedInstr(MI); | |||
2462 | return true; | |||
2463 | } | |||
2464 | ||||
2465 | if (MI.getOpcode() != AMDGPU::G_LOAD) | |||
2466 | return false; | |||
2467 | ||||
2468 | Register ValReg = MI.getOperand(0).getReg(); | |||
2469 | LLT ValTy = MRI.getType(ValReg); | |||
2470 | ||||
2471 | MachineMemOperand *MMO = *MI.memoperands_begin(); | |||
2472 | const unsigned ValSize = ValTy.getSizeInBits(); | |||
2473 | const unsigned MemSize = 8 * MMO->getSize(); | |||
2474 | const Align MemAlign = MMO->getAlign(); | |||
2475 | const unsigned AlignInBits = 8 * MemAlign.value(); | |||
2476 | ||||
2477 | // Widen non-power-of-2 loads to the alignment if needed | |||
2478 | if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { | |||
2479 | const unsigned WideMemSize = PowerOf2Ceil(MemSize); | |||
2480 | ||||
2481 | // This was already the correct extending load result type, so just adjust | |||
2482 | // the memory type. | |||
2483 | if (WideMemSize == ValSize) { | |||
2484 | MachineFunction &MF = B.getMF(); | |||
2485 | ||||
2486 | MachineMemOperand *WideMMO = | |||
2487 | MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); | |||
2488 | Observer.changingInstr(MI); | |||
2489 | MI.setMemRefs(MF, {WideMMO}); | |||
2490 | Observer.changedInstr(MI); | |||
2491 | return true; | |||
2492 | } | |||
2493 | ||||
2494 | // Don't bother handling an edge case that should probably never be produced. | |||
2495 | if (ValSize > WideMemSize) | |||
2496 | return false; | |||
2497 | ||||
2498 | LLT WideTy = widenToNextPowerOf2(ValTy); | |||
2499 | ||||
2500 | Register WideLoad; | |||
2501 | if (!WideTy.isVector()) { | |||
2502 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | |||
2503 | B.buildTrunc(ValReg, WideLoad).getReg(0); | |||
2504 | } else { | |||
2505 | // Extract the subvector. | |||
2506 | ||||
2507 | if (isRegisterType(ValTy)) { | |||
2508 | // If this a case where G_EXTRACT is legal, use it. | |||
2509 | // (e.g. <3 x s32> -> <4 x s32>) | |||
2510 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | |||
2511 | B.buildExtract(ValReg, WideLoad, 0); | |||
2512 | } else { | |||
2513 | // For cases where the widened type isn't a nice register value, unmerge | |||
2514 | // from a widened register (e.g. <3 x s16> -> <4 x s16>) | |||
2515 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | |||
2516 | WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); | |||
2517 | B.setInsertPt(B.getMBB(), MI.getIterator()); | |||
2518 | B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); | |||
2519 | } | |||
2520 | } | |||
2521 | ||||
2522 | MI.eraseFromParent(); | |||
2523 | return true; | |||
2524 | } | |||
2525 | ||||
2526 | return false; | |||
2527 | } | |||
2528 | ||||
2529 | bool AMDGPULegalizerInfo::legalizeFMad( | |||
2530 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
2531 | MachineIRBuilder &B) const { | |||
2532 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | |||
2533 | assert(Ty.isScalar()); | |||
2534 | ||||
2535 | MachineFunction &MF = B.getMF(); | |||
2536 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | |||
2537 | ||||
2538 | // TODO: Always legal with future ftz flag. | |||
2539 | // FIXME: Do we need just output? | |||
2540 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | |||
2541 | return true; | |||
2542 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | |||
2543 | return true; | |||
2544 | ||||
2545 | MachineIRBuilder HelperBuilder(MI); | |||
2546 | GISelObserverWrapper DummyObserver; | |||
2547 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | |||
2548 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | |||
2549 | } | |||
2550 | ||||
2551 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | |||
2552 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | |||
2553 | Register DstReg = MI.getOperand(0).getReg(); | |||
2554 | Register PtrReg = MI.getOperand(1).getReg(); | |||
2555 | Register CmpVal = MI.getOperand(2).getReg(); | |||
2556 | Register NewVal = MI.getOperand(3).getReg(); | |||
2557 | ||||
2558 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && | |||
2559 | "this should not have been custom lowered"); | |||
2560 | ||||
2561 | LLT ValTy = MRI.getType(CmpVal); | |||
2562 | LLT VecTy = LLT::vector(2, ValTy); | |||
2563 | ||||
2564 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | |||
2565 | ||||
2566 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | |||
2567 | .addDef(DstReg) | |||
2568 | .addUse(PtrReg) | |||
2569 | .addUse(PackedVal) | |||
2570 | .setMemRefs(MI.memoperands()); | |||
2571 | ||||
2572 | MI.eraseFromParent(); | |||
2573 | return true; | |||
2574 | } | |||
2575 | ||||
2576 | bool AMDGPULegalizerInfo::legalizeFlog( | |||
2577 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | |||
2578 | Register Dst = MI.getOperand(0).getReg(); | |||
2579 | Register Src = MI.getOperand(1).getReg(); | |||
2580 | LLT Ty = B.getMRI()->getType(Dst); | |||
2581 | unsigned Flags = MI.getFlags(); | |||
2582 | ||||
2583 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | |||
2584 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | |||
2585 | ||||
2586 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | |||
2587 | MI.eraseFromParent(); | |||
2588 | return true; | |||
2589 | } | |||
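The expansion above uses the identity log_b(x) = log2(x) * (1 / log2(b)), with the caller supplying 1 / log2(b) as Log2BaseInverted. For example, a base-10 logarithm would pass 1 / log2(10) = log10(2) ~= 0.30103 and a natural logarithm would pass 1 / log2(e) = ln(2) ~= 0.693147; the exact constants used by the callers are outside this excerpt.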
2590 | ||||
2591 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | |||
2592 | MachineIRBuilder &B) const { | |||
2593 | Register Dst = MI.getOperand(0).getReg(); | |||
2594 | Register Src = MI.getOperand(1).getReg(); | |||
2595 | unsigned Flags = MI.getFlags(); | |||
2596 | LLT Ty = B.getMRI()->getType(Dst); | |||
2597 | ||||
2598 | auto K = B.buildFConstant(Ty, numbers::log2e); | |||
2599 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | |||
2600 | B.buildFExp2(Dst, Mul, Flags); | |||
2601 | MI.eraseFromParent(); | |||
2602 | return true; | |||
2603 | } | |||
2604 | ||||
2605 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | |||
2606 | MachineIRBuilder &B) const { | |||
2607 | Register Dst = MI.getOperand(0).getReg(); | |||
2608 | Register Src0 = MI.getOperand(1).getReg(); | |||
2609 | Register Src1 = MI.getOperand(2).getReg(); | |||
2610 | unsigned Flags = MI.getFlags(); | |||
2611 | LLT Ty = B.getMRI()->getType(Dst); | |||
2612 | const LLT S16 = LLT::scalar(16); | |||
2613 | const LLT S32 = LLT::scalar(32); | |||
2614 | ||||
2615 | if (Ty == S32) { | |||
2616 | auto Log = B.buildFLog2(S32, Src0, Flags); | |||
2617 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | |||
2618 | .addUse(Log.getReg(0)) | |||
2619 | .addUse(Src1) | |||
2620 | .setMIFlags(Flags); | |||
2621 | B.buildFExp2(Dst, Mul, Flags); | |||
2622 | } else if (Ty == S16) { | |||
2623 | // There's no f16 fmul_legacy, so we need to convert for it. | |||
2624 | auto Log = B.buildFLog2(S16, Src0, Flags); | |||
2625 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | |||
2626 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | |||
2627 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | |||
2628 | .addUse(Ext0.getReg(0)) | |||
2629 | .addUse(Ext1.getReg(0)) | |||
2630 | .setMIFlags(Flags); | |||
2631 | ||||
2632 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | |||
2633 | } else | |||
2634 | return false; | |||
2635 | ||||
2636 | MI.eraseFromParent(); | |||
2637 | return true; | |||
2638 | } | |||
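This is the usual pow(x, y) = exp2(y * log2(x)) expansion. The multiply uses amdgcn_fmul_legacy, whose "0 times anything is 0" semantics keep cases like y == 0 with an infinite log2(x) well behaved; the f16 path widens to f32 only because, as the comment above says, there is no f16 fmul_legacy.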
2639 | ||||
2640 | // Find a source register, ignoring any possible source modifiers. | |||
2641 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | |||
2642 | Register ModSrc = OrigSrc; | |||
2643 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | |||
2644 | ModSrc = SrcFNeg->getOperand(1).getReg(); | |||
2645 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | |||
2646 | ModSrc = SrcFAbs->getOperand(1).getReg(); | |||
2647 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | |||
2648 | ModSrc = SrcFAbs->getOperand(1).getReg(); | |||
2649 | return ModSrc; | |||
2650 | } | |||
2651 | ||||
2652 | bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, | |||
2653 | MachineRegisterInfo &MRI, | |||
2654 | MachineIRBuilder &B) const { | |||
2655 | ||||
2656 | const LLT S1 = LLT::scalar(1); | |||
2657 | const LLT S64 = LLT::scalar(64); | |||
2658 | Register Dst = MI.getOperand(0).getReg(); | |||
2659 | Register OrigSrc = MI.getOperand(1).getReg(); | |||
2660 | unsigned Flags = MI.getFlags(); | |||
2661 | assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && | |||
2662 | "this should not have been custom lowered"); | |||
2663 | ||||
2664 | // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) | |||
2665 | // is used instead. However, SI doesn't have V_FLOOR_F64, so the most | |||
2666 | // efficient way to implement it is using V_FRACT_F64. The workaround for the | |||
2667 | // V_FRACT bug is: | |||
2668 | // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) | |||
2669 | // | |||
2670 | // Convert floor(x) to (x - fract(x)) | |||
2671 | ||||
2672 | auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) | |||
2673 | .addUse(OrigSrc) | |||
2674 | .setMIFlags(Flags); | |||
2675 | ||||
2676 | // Give source modifier matching some assistance before obscuring a foldable | |||
2677 | // pattern. | |||
2678 | ||||
2679 | // TODO: We can avoid the neg on the fract? The input sign to fract | |||
2680 | // shouldn't matter? | |||
2681 | Register ModSrc = stripAnySourceMods(OrigSrc, MRI); | |||
2682 | ||||
2683 | auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); | |||
2684 | ||||
2685 | Register Min = MRI.createGenericVirtualRegister(S64); | |||
2686 | ||||
2687 | // We don't need to concern ourselves with the snan handling difference, so | |||
2688 | // use the one which will directly select. | |||
2689 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | |||
2690 | if (MFI->getMode().IEEE) | |||
2691 | B.buildFMinNumIEEE(Min, Fract, Const, Flags); | |||
2692 | else | |||
2693 | B.buildFMinNum(Min, Fract, Const, Flags); | |||
2694 | ||||
2695 | Register CorrectedFract = Min; | |||
2696 | if (!MI.getFlag(MachineInstr::FmNoNans)) { | |||
2697 | auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); | |||
2698 | CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); | |||
2699 | } | |||
2700 | ||||
2701 | auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); | |||
2702 | B.buildFAdd(Dst, OrigSrc, NegFract, Flags); | |||
2703 | ||||
2704 | MI.eraseFromParent(); | |||
2705 | return true; | |||
2706 | } | |||
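Putting the pieces together, the emitted sequence computes floor(x) as x - min(fract(x), 0x3fefffffffffffff), with the nan select guarding the min unless the nnan flag is present. 0x3fefffffffffffff is the largest double strictly below 1.0 (roughly 0.9999999999999999), which is exactly the clamp described in the V_FRACT workaround comment above.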
2707 | ||||
2708 | // Turn an illegal packed v2s16 build vector into bit operations. | |||
2709 | // TODO: This should probably be a bitcast action in LegalizerHelper. | |||
2710 | bool AMDGPULegalizerInfo::legalizeBuildVector( | |||
2711 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | |||
2712 | Register Dst = MI.getOperand(0).getReg(); | |||
2713 | const LLT S32 = LLT::scalar(32); | |||
2714 | assert(MRI.getType(Dst) == LLT::vector(2, 16)); | |||
2715 | ||||
2716 | Register Src0 = MI.getOperand(1).getReg(); | |||
2717 | Register Src1 = MI.getOperand(2).getReg(); | |||
2718 | assert(MRI.getType(Src0) == LLT::scalar(16)); | |||
2719 | ||||
2720 | auto Merge = B.buildMerge(S32, {Src0, Src1}); | |||
2721 | B.buildBitcast(Dst, Merge); | |||
2722 | ||||
2723 | MI.eraseFromParent(); | |||
2724 | return true; | |||
2725 | } | |||
2726 | ||||
2727 | // Check that this is a G_XOR x, -1 | |||
2728 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | |||
2729 | if (MI.getOpcode() != TargetOpcode::G_XOR) | |||
2730 | return false; | |||
2731 | auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | |||
2732 | return ConstVal && *ConstVal == -1; | |||
2733 | } | |||
2734 | ||||
2735 | // Return the use branch instruction, or null if the usage is invalid. | |||
2736 | static MachineInstr * | |||
2737 | verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, | |||
2738 | MachineBasicBlock *&UncondBrTarget, bool &Negated) { | |||
2739 | Register CondDef = MI.getOperand(0).getReg(); | |||
2740 | if (!MRI.hasOneNonDBGUse(CondDef)) | |||
2741 | return nullptr; | |||
2742 | ||||
2743 | MachineBasicBlock *Parent = MI.getParent(); | |||
2744 | MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); | |||
2745 | ||||
2746 | if (isNot(MRI, *UseMI)) { | |||
2747 | Register NegatedCond = UseMI->getOperand(0).getReg(); | |||
2748 | if (!MRI.hasOneNonDBGUse(NegatedCond)) | |||
2749 | return nullptr; | |||
2750 | ||||
2751 | // We're deleting the def of this value, so we need to remove it. | |||
2752 | UseMI->eraseFromParent(); | |||
2753 | ||||
2754 | UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); | |||
2755 | Negated = true; | |||
2756 | } | |||
2757 | ||||
2758 | if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) | |||
2759 | return nullptr; | |||
2760 | ||||
2761 | // Make sure the cond br is followed by a G_BR, or is the last instruction. | |||
2762 | MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); | |||
2763 | if (Next == Parent->end()) { | |||
2764 | MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); | |||
2765 | if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. | |||
2766 | return nullptr; | |||
2767 | UncondBrTarget = &*NextMBB; | |||
2768 | } else { | |||
2769 | if (Next->getOpcode() != AMDGPU::G_BR) | |||
2770 | return nullptr; | |||
2771 | Br = &*Next; | |||
2772 | UncondBrTarget = Br->getOperand(0).getMBB(); | |||
2773 | } | |||
2774 | ||||
2775 | return UseMI; | |||
2776 | } | |||
2777 | ||||
2778 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | |||
2779 | const ArgDescriptor *Arg, | |||
2780 | const TargetRegisterClass *ArgRC, | |||
2781 | LLT ArgTy) const { | |||
2782 | MCRegister SrcReg = Arg->getRegister(); | |||
2783 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); | |||
2784 | assert(DstReg.isVirtual() && "Virtual register expected"); | |||
2785 | ||||
2786 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, | |||
2787 | ArgTy); | |||
2788 | if (Arg->isMasked()) { | |||
2789 | // TODO: Should we try to emit this once in the entry block? | |||
2790 | const LLT S32 = LLT::scalar(32); | |||
2791 | const unsigned Mask = Arg->getMask(); | |||
2792 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); | |||
2793 | ||||
2794 | Register AndMaskSrc = LiveIn; | |||
2795 | ||||
2796 | if (Shift != 0) { | |||
2797 | auto ShiftAmt = B.buildConstant(S32, Shift); | |||
2798 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | |||
2799 | } | |||
2800 | ||||
2801 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | |||
2802 | } else { | |||
2803 | B.buildCopy(DstReg, LiveIn); | |||
2804 | } | |||
2805 | ||||
2806 | return true; | |||
2807 | } | |||
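For a masked argument this extracts a bitfield from the preloaded register: DstReg = (LiveIn >> countTrailingZeros(Mask)) & (Mask >> Shift). As a purely illustrative example, Mask = 0x0000ffc0 gives Shift = 6 and a 10-bit field, i.e. (LiveIn >> 6) & 0x3ff.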
2808 | ||||
2809 | bool AMDGPULegalizerInfo::loadInputValue( | |||
2810 | Register DstReg, MachineIRBuilder &B, | |||
2811 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | |||
2812 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | |||
2813 | const ArgDescriptor *Arg; | |||
2814 | const TargetRegisterClass *ArgRC; | |||
2815 | LLT ArgTy; | |||
2816 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | |||
2817 | ||||
2818 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | |||
2819 | return false; // TODO: Handle these | |||
2820 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | |||
2821 | } | |||
2822 | ||||
2823 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | |||
2824 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | |||
2825 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | |||
2826 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | |||
2827 | return false; | |||
2828 | ||||
2829 | MI.eraseFromParent(); | |||
2830 | return true; | |||
2831 | } | |||
2832 | ||||
2833 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | |||
2834 | MachineRegisterInfo &MRI, | |||
2835 | MachineIRBuilder &B) const { | |||
2836 | Register Dst = MI.getOperand(0).getReg(); | |||
2837 | LLT DstTy = MRI.getType(Dst); | |||
2838 | LLT S16 = LLT::scalar(16); | |||
2839 | LLT S32 = LLT::scalar(32); | |||
2840 | LLT S64 = LLT::scalar(64); | |||
2841 | ||||
2842 | if (DstTy == S16) | |||
2843 | return legalizeFDIV16(MI, MRI, B); | |||
2844 | if (DstTy == S32) | |||
2845 | return legalizeFDIV32(MI, MRI, B); | |||
2846 | if (DstTy == S64) | |||
2847 | return legalizeFDIV64(MI, MRI, B); | |||
2848 | ||||
2849 | return false; | |||
2850 | } | |||
2851 | ||||
2852 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, | |||
2853 | Register DstDivReg, | |||
2854 | Register DstRemReg, | |||
2855 | Register X, | |||
2856 | Register Y) const { | |||
2857 | const LLT S1 = LLT::scalar(1); | |||
2858 | const LLT S32 = LLT::scalar(32); | |||
2859 | ||||
2860 | // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the | |||
2861 | // algorithm used here. | |||
2862 | ||||
2863 | // Initial estimate of inv(y). | |||
2864 | auto FloatY = B.buildUITOFP(S32, Y); | |||
2865 | auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); | |||
2866 | auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); | |||
2867 | auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); | |||
2868 | auto Z = B.buildFPTOUI(S32, ScaledY); | |||
2869 | ||||
2870 | // One round of UNR (unsigned Newton-Raphson refinement). | |||
2871 | auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); | |||
2872 | auto NegYZ = B.buildMul(S32, NegY, Z); | |||
2873 | Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); | |||
2874 | ||||
2875 | // Quotient/remainder estimate. | |||
2876 | auto Q = B.buildUMulH(S32, X, Z); | |||
2877 | auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); | |||
2878 | ||||
2879 | // First quotient/remainder refinement. | |||
2880 | auto One = B.buildConstant(S32, 1); | |||
2881 | auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | |||
2882 | if (DstDivReg) | |||
2883 | Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); | |||
2884 | R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); | |||
2885 | ||||
2886 | // Second quotient/remainder refinement. | |||
2887 | Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | |||
2888 | if (DstDivReg) | |||
2889 | B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); | |||
2890 | ||||
2891 | if (DstRemReg) | |||
2892 | B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); | |||
2893 | } | |||
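A minimal standalone sketch of the sequence built above, in plain C++ for reference. The helper name is made up, and 1.0f / y stands in for the hardware V_RCP_IFLAG_F32 approximation, so this illustrates the math rather than the exact lowering:

  #include <cstdint>

  // Computes X / Y and X % Y the way the expansion above does (requires Y != 0).
  static void udivrem32(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
    float RcpY = 1.0f / (float)Y;                   // stands in for G_AMDGPU_RCP_IFLAG
    uint32_t Z = (uint32_t)(RcpY * 4294966784.0f);  // scale by 0x4f7ffffe, just below 2^32
    uint32_t NegYZ = (0u - Y) * Z;                  // low 32 bits of -Y * Z
    Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);   // one Newton-Raphson step
    Q = (uint32_t)(((uint64_t)X * Z) >> 32);        // quotient estimate
    R = X - Q * Y;                                  // remainder estimate
    if (R >= Y) { ++Q; R -= Y; }                    // first refinement
    if (R >= Y) { ++Q; R -= Y; }                    // second refinement
  }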
2894 | ||||
2895 | // Build integer reciprocal sequence around V_RCP_IFLAG_F32 | |||
2896 | // | |||
2897 | // Return lo, hi of result | |||
2898 | // | |||
2899 | // %cvt.lo = G_UITOFP Val.lo | |||
2900 | // %cvt.hi = G_UITOFP Val.hi | |||
2901 | // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo | |||
2902 | // %rcp = G_AMDGPU_RCP_IFLAG %mad | |||
2903 | // %mul1 = G_FMUL %rcp, 0x5f7ffffc | |||
2904 | // %mul2 = G_FMUL %mul1, 2**(-32) | |||
2905 | // %trunc = G_INTRINSIC_TRUNC %mul2 | |||
2906 | // %mad2 = G_FMAD %trunc, -(2**32), %mul1 | |||
2907 | // return {G_FPTOUI %mad2, G_FPTOUI %trunc} | |||
2908 | static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, | |||
2909 | Register Val) { | |||
2910 | const LLT S32 = LLT::scalar(32); | |||
2911 | auto Unmerge = B.buildUnmerge(S32, Val); | |||
2912 | ||||
2913 | auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); | |||
2914 | auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); | |||
2915 | ||||
2916 | auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 | |||
2917 | B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); | |||
2918 | ||||
2919 | auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); | |||
2920 | auto Mul1 = | |||
2921 | B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); | |||
2922 | ||||
2923 | // 2**(-32) | |||
2924 | auto Mul2 = | |||
2925 | B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); | |||
2926 | auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); | |||
2927 | ||||
2928 | // -(2**32) | |||
2929 | auto Mad2 = B.buildFMAD(S32, Trunc, | |||
2930 | B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); | |||
2931 | ||||
2932 | auto ResultLo = B.buildFPTOUI(S32, Mad2); | |||
2933 | auto ResultHi = B.buildFPTOUI(S32, Trunc); | |||
2934 | ||||
2935 | return {ResultLo.getReg(0), ResultHi.getReg(0)}; | |||
2936 | } | |||
2937 | ||||
2938 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, | |||
2939 | Register DstDivReg, | |||
2940 | Register DstRemReg, | |||
2941 | Register Numer, | |||
2942 | Register Denom) const { | |||
2943 | const LLT S32 = LLT::scalar(32); | |||
2944 | const LLT S64 = LLT::scalar(64); | |||
2945 | const LLT S1 = LLT::scalar(1); | |||
2946 | Register RcpLo, RcpHi; | |||
2947 | ||||
2948 | std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); | |||
2949 | ||||
2950 | auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); | |||
2951 | ||||
2952 | auto Zero64 = B.buildConstant(S64, 0); | |||
2953 | auto NegDenom = B.buildSub(S64, Zero64, Denom); | |||
2954 | ||||
2955 | auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); | |||
2956 | auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); | |||
2957 | ||||
2958 | auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); | |||
2959 | Register MulHi1_Lo = UnmergeMulHi1.getReg(0); | |||
2960 | Register MulHi1_Hi = UnmergeMulHi1.getReg(1); | |||
2961 | ||||
2962 | auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); | |||
2963 | auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); | |||
2964 | auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); | |||
2965 | auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); | |||
2966 | ||||
2967 | auto MulLo2 = B.buildMul(S64, NegDenom, Add1); | |||
2968 | auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); | |||
2969 | auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); | |||
2970 | Register MulHi2_Lo = UnmergeMulHi2.getReg(0); | |||
2971 | Register MulHi2_Hi = UnmergeMulHi2.getReg(1); | |||
2972 | ||||
2973 | auto Zero32 = B.buildConstant(S32, 0); | |||
2974 | auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); | |||
2975 | auto Add2_HiC = | |||
2976 | B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); | |||
2977 | auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); | |||
2978 | auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); | |||
2979 | ||||
2980 | auto UnmergeNumer = B.buildUnmerge(S32, Numer); | |||
2981 | Register NumerLo = UnmergeNumer.getReg(0); | |||
2982 | Register NumerHi = UnmergeNumer.getReg(1); | |||
2983 | ||||
2984 | auto MulHi3 = B.buildUMulH(S64, Numer, Add2); | |||
2985 | auto Mul3 = B.buildMul(S64, Denom, MulHi3); | |||
2986 | auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); | |||
2987 | Register Mul3_Lo = UnmergeMul3.getReg(0); | |||
2988 | Register Mul3_Hi = UnmergeMul3.getReg(1); | |||
2989 | auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); | |||
2990 | auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); | |||
2991 | auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); | |||
2992 | auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); | |||
2993 | ||||
2994 | auto UnmergeDenom = B.buildUnmerge(S32, Denom); | |||
2995 | Register DenomLo = UnmergeDenom.getReg(0); | |||
2996 | Register DenomHi = UnmergeDenom.getReg(1); | |||
2997 | ||||
2998 | auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); | |||
2999 | auto C1 = B.buildSExt(S32, CmpHi); | |||
3000 | ||||
3001 | auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); | |||
3002 | auto C2 = B.buildSExt(S32, CmpLo); | |||
3003 | ||||
3004 | auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); | |||
3005 | auto C3 = B.buildSelect(S32, CmpEq, C2, C1); | |||
3006 | ||||
3007 | // TODO: Here and in the code below, the corrections could be enclosed in | |||
3008 | // if/endif blocks. Currently control flow is unconditional and we use 4 selects | |||
3009 | // after the potential endif in place of PHIs. | |||
3010 | ||||
3011 | // if C3 != 0 ... | |||
3012 | auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); | |||
3013 | auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); | |||
3014 | auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); | |||
3015 | auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); | |||
3016 | ||||
3017 | auto One64 = B.buildConstant(S64, 1); | |||
3018 | auto Add3 = B.buildAdd(S64, MulHi3, One64); | |||
3019 | ||||
3020 | auto C4 = | |||
3021 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); | |||
3022 | auto C5 = | |||
3023 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); | |||
3024 | auto C6 = B.buildSelect( | |||
3025 | S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); | |||
3026 | ||||
3027 | // if (C6 != 0) | |||
3028 | auto Add4 = B.buildAdd(S64, Add3, One64); | |||
3029 | auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); | |||
3030 | ||||
3031 | auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); | |||
3032 | auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); | |||
3033 | auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); | |||
3034 | ||||
3035 | // endif C6 | |||
3036 | // endif C3 | |||
3037 | ||||
3038 | if (DstDivReg) { | |||
3039 | auto Sel1 = B.buildSelect( | |||
3040 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); | |||
3041 | B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | |||
3042 | Sel1, MulHi3); | |||
3043 | } | |||
3044 | ||||
3045 | if (DstRemReg) { | |||
3046 | auto Sel2 = B.buildSelect( | |||
3047 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); | |||
3048 | B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | |||
3049 | Sel2, Sub1); | |||
3050 | } | |||
3051 | } | |||
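In outline, the 64-bit path mirrors the 32-bit one: emitReciprocalU64 gives an initial fixed-point estimate of 2^64 / Denom, each of the two refinement rounds adds umulh(Rcp, -Denom * Rcp) using 32-bit halves with explicit carries, the quotient estimate is umulh(Numer, Rcp), the remainder estimate is Numer - Denom * quotient, and the two conditional corrections are written as the select chains guarded by C3 and C6 (the TODO above notes these could become real control flow).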
3052 | ||||
3053 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, | |||
3054 | MachineRegisterInfo &MRI, | |||
3055 | MachineIRBuilder &B) const { | |||
3056 | Register DstDivReg, DstRemReg; | |||
3057 | switch (MI.getOpcode()) { | |||
3058 | default: | |||
3059 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3059); | |||
3060 | case AMDGPU::G_UDIV: { | |||
3061 | DstDivReg = MI.getOperand(0).getReg(); | |||
3062 | break; | |||
3063 | } | |||
3064 | case AMDGPU::G_UREM: { | |||
3065 | DstRemReg = MI.getOperand(0).getReg(); | |||
3066 | break; | |||
3067 | } | |||
3068 | case AMDGPU::G_UDIVREM: { | |||
3069 | DstDivReg = MI.getOperand(0).getReg(); | |||
3070 | DstRemReg = MI.getOperand(1).getReg(); | |||
3071 | break; | |||
3072 | } | |||
3073 | } | |||
3074 | ||||
3075 | const LLT S64 = LLT::scalar(64); | |||
3076 | const LLT S32 = LLT::scalar(32); | |||
3077 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | |||
3078 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); | |||
3079 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | |||
3080 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | |||
3081 | ||||
3082 | if (Ty == S32) | |||
3083 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); | |||
3084 | else if (Ty == S64) | |||
3085 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); | |||
3086 | else | |||
3087 | return false; | |||
3088 | ||||
3089 | MI.eraseFromParent(); | |||
3090 | return true; | |||
3091 | } | |||
3092 | ||||
3093 | bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, | |||
3094 | MachineRegisterInfo &MRI, | |||
3095 | MachineIRBuilder &B) const { | |||
3096 | const LLT S64 = LLT::scalar(64); | |||
3097 | const LLT S32 = LLT::scalar(32); | |||
3098 | ||||
3099 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | |||
3100 | if (Ty != S32 && Ty != S64) | |||
3101 | return false; | |||
3102 | ||||
3103 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | |||
3104 | Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); | |||
3105 | Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | |||
3106 | ||||
3107 | auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); | |||
3108 | auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); | |||
3109 | auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); | |||
3110 | ||||
3111 | LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); | |||
3112 | RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); | |||
3113 | ||||
3114 | LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); | |||
3115 | RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); | |||
3116 | ||||
3117 | Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; | |||
3118 | switch (MI.getOpcode()) { | |||
3119 | default: | |||
3120 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3120); | |||
3121 | case AMDGPU::G_SDIV: { | |||
3122 | DstDivReg = MI.getOperand(0).getReg(); | |||
3123 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | |||
3124 | break; | |||
3125 | } | |||
3126 | case AMDGPU::G_SREM: { | |||
3127 | DstRemReg = MI.getOperand(0).getReg(); | |||
3128 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | |||
3129 | break; | |||
3130 | } | |||
3131 | case AMDGPU::G_SDIVREM: { | |||
3132 | DstDivReg = MI.getOperand(0).getReg(); | |||
3133 | DstRemReg = MI.getOperand(1).getReg(); | |||
3134 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | |||
3135 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | |||
3136 | break; | |||
3137 | } | |||
3138 | } | |||
3139 | ||||
3140 | if (Ty == S32) | |||
3141 | legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | |||
3142 | else | |||
3143 | legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | |||
3144 | ||||
3145 | if (DstDivReg) { | |||
3146 | auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); | |||
3147 | auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); | |||
3148 | B.buildSub(DstDivReg, SignXor, Sign); | |||
3149 | } | |||
3150 | ||||
3151 | if (DstRemReg) { | |||
3152 | auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS | |||
3153 | auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); | |||
3154 | B.buildSub(DstRemReg, SignXor, Sign); | |||
3155 | } | |||
3156 | ||||
3157 | MI.eraseFromParent(); | |||
3158 | return true; | |||
3159 | } | |||
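A compact sketch of the sign handling above for the 32-bit case. Names are illustrative, and plain / and % stand in for the unsigned expansion the real code emits:

  #include <cstdint>

  // Signed div/rem via the unsigned path, as in the lowering above (requires RHS != 0).
  static void sdivrem32(int32_t LHS, int32_t RHS, int32_t &Q, int32_t &R) {
    uint32_t LSign = (uint32_t)(LHS >> 31);         // 0 or 0xffffffff
    uint32_t RSign = (uint32_t)(RHS >> 31);
    uint32_t UL = ((uint32_t)LHS + LSign) ^ LSign;  // |LHS|
    uint32_t UR = ((uint32_t)RHS + RSign) ^ RSign;  // |RHS|
    uint32_t UQ = UL / UR, URem = UL % UR;          // unsigned div/rem
    uint32_t QSign = LSign ^ RSign;                 // quotient sign
    Q = (int32_t)((UQ ^ QSign) - QSign);
    R = (int32_t)((URem ^ LSign) - LSign);          // remainder takes LHS's sign
  }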
3160 | ||||
3161 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | |||
3162 | MachineRegisterInfo &MRI, | |||
3163 | MachineIRBuilder &B) const { | |||
3164 | Register Res = MI.getOperand(0).getReg(); | |||
3165 | Register LHS = MI.getOperand(1).getReg(); | |||
3166 | Register RHS = MI.getOperand(2).getReg(); | |||
3167 | uint16_t Flags = MI.getFlags(); | |||
3168 | LLT ResTy = MRI.getType(Res); | |||
3169 | ||||
3170 | const MachineFunction &MF = B.getMF(); | |||
3171 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | |||
3172 | MI.getFlag(MachineInstr::FmAfn); | |||
3173 | ||||
3174 | if (!AllowInaccurateRcp) | |||
3175 | return false; | |||
3176 | ||||
3177 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | |||
3178 | // 1 / x -> RCP(x) | |||
3179 | if (CLHS->isExactlyValue(1.0)) { | |||
3180 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | |||
3181 | .addUse(RHS) | |||
3182 | .setMIFlags(Flags); | |||
3183 | ||||
3184 | MI.eraseFromParent(); | |||
3185 | return true; | |||
3186 | } | |||
3187 | ||||
3188 | // -1 / x -> RCP( FNEG(x) ) | |||
3189 | if (CLHS->isExactlyValue(-1.0)) { | |||
3190 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | |||
3191 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | |||
3192 | .addUse(FNeg.getReg(0)) | |||
3193 | .setMIFlags(Flags); | |||
3194 | ||||
3195 | MI.eraseFromParent(); | |||
3196 | return true; | |||
3197 | } | |||
3198 | } | |||
3199 | ||||
3200 | // x / y -> x * (1.0 / y) | |||
3201 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | |||
3202 | .addUse(RHS) | |||
3203 | .setMIFlags(Flags); | |||
3204 | B.buildFMul(Res, LHS, RCP, Flags); | |||
3205 | ||||
3206 | MI.eraseFromParent(); | |||
3207 | return true; | |||
3208 | } | |||
3209 | ||||
3210 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, | |||
3211 | MachineRegisterInfo &MRI, | |||
3212 | MachineIRBuilder &B) const { | |||
3213 | Register Res = MI.getOperand(0).getReg(); | |||
3214 | Register X = MI.getOperand(1).getReg(); | |||
3215 | Register Y = MI.getOperand(2).getReg(); | |||
3216 | uint16_t Flags = MI.getFlags(); | |||
3217 | LLT ResTy = MRI.getType(Res); | |||
3218 | ||||
3219 | const MachineFunction &MF = B.getMF(); | |||
3220 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | |||
3221 | MI.getFlag(MachineInstr::FmAfn); | |||
3222 | ||||
3223 | if (!AllowInaccurateRcp) | |||
3224 | return false; | |||
3225 | ||||
3226 | auto NegY = B.buildFNeg(ResTy, Y); | |||
3227 | auto One = B.buildFConstant(ResTy, 1.0); | |||
3228 | ||||
3229 | auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | |||
3230 | .addUse(Y) | |||
3231 | .setMIFlags(Flags); | |||
3232 | ||||
3233 | auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); | |||
3234 | R = B.buildFMA(ResTy, Tmp0, R, R); | |||
3235 | ||||
3236 | auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); | |||
3237 | R = B.buildFMA(ResTy, Tmp1, R, R); | |||
3238 | ||||
3239 | auto Ret = B.buildFMul(ResTy, X, R); | |||
3240 | auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); | |||
3241 | ||||
3242 | B.buildFMA(Res, Tmp2, R, Ret); | |||
3243 | MI.eraseFromParent(); | |||
3244 | return true; | |||
3245 | } | |||
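The body above is two Newton-Raphson refinements of the reciprocal, r' = r + r * (1 - y * r), each written as fma(fma(-y, r, 1), r, r), followed by ret = x * r and a final fma(fma(-y, ret, x), r, ret) that folds the residual x - y * ret back into the result.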
3246 | ||||
3247 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | |||
3248 | MachineRegisterInfo &MRI, | |||
3249 | MachineIRBuilder &B) const { | |||
3250 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | |||
3251 | return true; | |||
3252 | ||||
3253 | Register Res = MI.getOperand(0).getReg(); | |||
3254 | Register LHS = MI.getOperand(1).getReg(); | |||
3255 | Register RHS = MI.getOperand(2).getReg(); | |||
3256 | ||||
3257 | uint16_t Flags = MI.getFlags(); | |||
3258 | ||||
3259 | LLT S16 = LLT::scalar(16); | |||
3260 | LLT S32 = LLT::scalar(32); | |||
3261 | ||||
3262 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | |||
3263 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | |||
3264 | ||||
3265 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | |||
3266 | .addUse(RHSExt.getReg(0)) | |||
3267 | .setMIFlags(Flags); | |||
3268 | ||||
3269 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | |||
3270 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | |||
3271 | ||||
3272 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | |||
3273 | .addUse(RDst.getReg(0)) | |||
3274 | .addUse(RHS) | |||
3275 | .addUse(LHS) | |||
3276 | .setMIFlags(Flags); | |||
3277 | ||||
3278 | MI.eraseFromParent(); | |||
3279 | return true; | |||
3280 | } | |||
3281 | ||||
3282 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | |||
3283 | // to enable denorm mode. When 'Enable' is false, disable denorm mode. | |||
3284 | static void toggleSPDenormMode(bool Enable, | |||
3285 | MachineIRBuilder &B, | |||
3286 | const GCNSubtarget &ST, | |||
3287 | AMDGPU::SIModeRegisterDefaults Mode) { | |||
3288 | // Set SP denorm mode to this value. | |||
3289 | unsigned SPDenormMode = | |||
3290 | Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); | |||
3291 | ||||
3292 | if (ST.hasDenormModeInst()) { | |||
3293 | // Preserve default FP64FP16 denorm mode while updating FP32 mode. | |||
3294 | uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); | |||
3295 | ||||
3296 | uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | |||
3297 | B.buildInstr(AMDGPU::S_DENORM_MODE) | |||
3298 | .addImm(NewDenormModeValue); | |||
3299 | ||||
3300 | } else { | |||
3301 | // Select FP32 bit field in mode register. | |||
3302 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | |||
3303 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | |||
3304 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | |||
3305 | ||||
3306 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | |||
3307 | .addImm(SPDenormMode) | |||
3308 | .addImm(SPDenormModeBitField); | |||
3309 | } | |||
3310 | } | |||
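In the S_SETREG path, the field descriptor selects hwreg ID_MODE with offset 4 and width 2 (WIDTH_M1 = 1), i.e. only the two FP32 denorm bits of the MODE register are written; the value is FP_DENORM_FLUSH_NONE (3) when enabling denormals, or the function's default SP denorm value when restoring. The S_DENORM_MODE path writes the same bits along with the preserved FP64/FP16 denorm bits shifted up by two.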
3311 | ||||
3312 | bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, | |||
3313 | MachineRegisterInfo &MRI, | |||
3314 | MachineIRBuilder &B) const { | |||
3315 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | |||
3316 | return true; | |||
3317 | ||||
3318 | Register Res = MI.getOperand(0).getReg(); | |||
3319 | Register LHS = MI.getOperand(1).getReg(); | |||
3320 | Register RHS = MI.getOperand(2).getReg(); | |||
3321 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | |||
3322 | AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); | |||
3323 | ||||
3324 | uint16_t Flags = MI.getFlags(); | |||
3325 | ||||
3326 | LLT S32 = LLT::scalar(32); | |||
3327 | LLT S1 = LLT::scalar(1); | |||
3328 | ||||
3329 | auto One = B.buildFConstant(S32, 1.0f); | |||
3330 | ||||
3331 | auto DenominatorScaled = | |||
3332 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | |||
3333 | .addUse(LHS) | |||
3334 | .addUse(RHS) | |||
3335 | .addImm(0) | |||
3336 | .setMIFlags(Flags); | |||
3337 | auto NumeratorScaled = | |||
3338 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | |||
3339 | .addUse(LHS) | |||
3340 | .addUse(RHS) | |||
3341 | .addImm(1) | |||
3342 | .setMIFlags(Flags); | |||
3343 | ||||
3344 | auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | |||
3345 | .addUse(DenominatorScaled.getReg(0)) | |||
3346 | .setMIFlags(Flags); | |||
3347 | auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); | |||
3348 | ||||
3349 | // FIXME: Doesn't correctly model the FP mode switch, and the FP operations | |||
3350 | // aren't modeled as reading it. | |||
3351 | if (!Mode.allFP32Denormals()) | |||
3352 | toggleSPDenormMode(true, B, ST, Mode); | |||
3353 | ||||
3354 | auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); | |||
3355 | auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); | |||
3356 | auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); | |||
3357 | auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); | |||
3358 | auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); | |||
3359 | auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); | |||
3360 | ||||
3361 | if (!Mode.allFP32Denormals()) | |||
3362 | toggleSPDenormMode(false, B, ST, Mode); | |||
3363 | ||||
3364 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) | |||
3365 | .addUse(Fma4.getReg(0)) | |||
3366 | .addUse(Fma1.getReg(0)) | |||
3367 | .addUse(Fma3.getReg(0)) | |||
3368 | .addUse(NumeratorScaled.getReg(1)) | |||
3369 | .setMIFlags(Flags); | |||
3370 | ||||
3371 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | |||
3372 | .addUse(Fmas.getReg(0)) | |||
3373 | .addUse(RHS) | |||
3374 | .addUse(LHS) | |||
3375 | .setMIFlags(Flags); | |||
3376 | ||||
3377 | MI.eraseFromParent(); | |||
3378 | return true; | |||
3379 | } | |||
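Taken together this is the div_scale / div_fmas / div_fixup expansion: both operands are pre-scaled, the reciprocal of the scaled denominator is refined through the FMA chain, div_fmas applies the scale adjustment using the div_scale condition output, and div_fixup handles the special cases using the original operands. FP32 denormals are temporarily enabled around the FMA chain when the function's default mode flushes them, so the intermediates aren't flushed; the FIXME above notes that this mode switch isn't modeled as a dependency of those operations.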
3380 | ||||
3381 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | |||
3382 | MachineRegisterInfo &MRI, | |||
3383 | MachineIRBuilder &B) const { | |||
3384 | if (legalizeFastUnsafeFDIV64(MI, MRI, B)) | |||
3385 | return true; | |||
3386 | ||||
3387 | Register Res = MI.getOperand(0).getReg(); | |||
3388 | Register LHS = MI.getOperand(1).getReg(); | |||
3389 | Register RHS = MI.getOperand(2).getReg(); | |||
3390 | ||||
3391 | uint16_t Flags = MI.getFlags(); | |||
3392 | ||||
3393 | LLT S64 = LLT::scalar(64); | |||
3394 | LLT S1 = LLT::scalar(1); | |||
3395 | ||||
3396 | auto One = B.buildFConstant(S64, 1.0); | |||
3397 | ||||
3398 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | |||
3399 | .addUse(LHS) | |||
3400 | .addUse(RHS) | |||
3401 | .addImm(0) | |||
3402 | .setMIFlags(Flags); | |||
3403 | ||||
3404 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | |||
3405 | ||||
3406 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | |||
3407 | .addUse(DivScale0.getReg(0)) | |||
3408 | .setMIFlags(Flags); | |||
3409 | ||||
3410 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | |||
3411 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | |||
3412 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | |||
3413 | ||||
3414 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | |||
3415 | .addUse(LHS) | |||
3416 | .addUse(RHS) | |||
3417 | .addImm(1) | |||
3418 | .setMIFlags(Flags); | |||
3419 | ||||
3420 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | |||
3421 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | |||
3422 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | |||
3423 | ||||
3424 | Register Scale; | |||
3425 | if (!ST.hasUsableDivScaleConditionOutput()) { | |||
3426 | // Work around a hardware bug on SI where the condition output from div_scale | |||
3427 | // is not usable. | |||
3428 | ||||
3429 | LLT S32 = LLT::scalar(32); | |||
3430 | ||||
3431 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | |||
3432 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | |||
3433 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | |||
3434 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | |||
3435 | ||||
3436 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | |||
3437 | Scale1Unmerge.getReg(1)); | |||
3438 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | |||
3439 | Scale0Unmerge.getReg(1)); | |||
3440 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | |||
3441 | } else { | |||
3442 | Scale = DivScale1.getReg(1); | |||
3443 | } | |||
3444 | ||||
3445 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | |||
3446 | .addUse(Fma4.getReg(0)) | |||
3447 | .addUse(Fma3.getReg(0)) | |||
3448 | .addUse(Mul.getReg(0)) | |||
3449 | .addUse(Scale) | |||
3450 | .setMIFlags(Flags); | |||
3451 | ||||
3452 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) | |||
3453 | .addUse(Fmas.getReg(0)) | |||
3454 | .addUse(RHS) | |||
3455 | .addUse(LHS) | |||
3456 | .setMIFlags(Flags); | |||
3457 | ||||
3458 | MI.eraseFromParent(); | |||
3459 | return true; | |||
3460 | } | |||
3461 | ||||
3462 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | |||
3463 | MachineRegisterInfo &MRI, | |||
3464 | MachineIRBuilder &B) const { | |||
3465 | Register Res = MI.getOperand(0).getReg(); | |||
3466 | Register LHS = MI.getOperand(2).getReg(); | |||
3467 | Register RHS = MI.getOperand(3).getReg(); | |||
3468 | uint16_t Flags = MI.getFlags(); | |||
3469 | ||||
3470 | LLT S32 = LLT::scalar(32); | |||
3471 | LLT S1 = LLT::scalar(1); | |||
3472 | ||||
3473 | auto Abs = B.buildFAbs(S32, RHS, Flags); | |||
3474 | const APFloat C0Val(1.0f); | |||
3475 | ||||
3476 | auto C0 = B.buildConstant(S32, 0x6f800000); | |||
3477 | auto C1 = B.buildConstant(S32, 0x2f800000); | |||
3478 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); | |||
3479 | ||||
3480 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | |||
3481 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | |||
3482 | ||||
3483 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | |||
3484 | ||||
3485 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | |||
3486 | .addUse(Mul0.getReg(0)) | |||
3487 | .setMIFlags(Flags); | |||
3488 | ||||
3489 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | |||
3490 | ||||
3491 | B.buildFMul(Res, Sel, Mul1, Flags); | |||
3492 | ||||
3493 | MI.eraseFromParent(); | |||
3494 | return true; | |||
3495 | } | |||
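The constants here implement a range fix-up for the approximate divide: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32, so when |RHS| exceeds 2^96 the denominator is pre-multiplied by 2^-32 before the rcp to keep it in range, and the final multiply by the same Sel factor cancels the scaling: Sel * LHS * rcp(RHS * Sel) ~= LHS / RHS.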
3496 | ||||
3497 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | |||
3498 | // FIXME: Why do we handle this one but not other removed instructions? | |||
3499 | // | |||
3500 | // Reciprocal square root. The clamp prevents infinite results, clamping | |||
3501 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | |||
3502 | // +-max_float. | |||
3503 | bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, | |||
3504 | MachineRegisterInfo &MRI, | |||
3505 | MachineIRBuilder &B) const { | |||
3506 | if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) | |||
3507 | return true; | |||
3508 | ||||
3509 | Register Dst = MI.getOperand(0).getReg(); | |||
3510 | Register Src = MI.getOperand(2).getReg(); | |||
3511 | auto Flags = MI.getFlags(); | |||
3512 | ||||
3513 | LLT Ty = MRI.getType(Dst); | |||
3514 | ||||
3515 | const fltSemantics *FltSemantics; | |||
3516 | if (Ty == LLT::scalar(32)) | |||
3517 | FltSemantics = &APFloat::IEEEsingle(); | |||
3518 | else if (Ty == LLT::scalar(64)) | |||
3519 | FltSemantics = &APFloat::IEEEdouble(); | |||
3520 | else | |||
3521 | return false; | |||
3522 | ||||
3523 | auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) | |||
3524 | .addUse(Src) | |||
3525 | .setMIFlags(Flags); | |||
3526 | ||||
3527 | // We don't need to concern ourselves with the snan handling difference, since | |||
3528 | // the rsq has already quieted the input (or not); use the variant which will directly select. | |||
3529 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | |||
3530 | const bool UseIEEE = MFI->getMode().IEEE; | |||
3531 | ||||
3532 | auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); | |||
3533 | auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : | |||
3534 | B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); | |||
3535 | ||||
3536 | auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); | |||
3537 | ||||
3538 | if (UseIEEE) | |||
3539 | B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); | |||
3540 | else | |||
3541 | B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); | |||
3542 | MI.eraseFromParent(); | |||
3543 | return true; | |||
3544 | } | |||
3545 | ||||
3546 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | |||
3547 | switch (IID) { | |||
3548 | case Intrinsic::amdgcn_ds_fadd: | |||
3549 | return AMDGPU::G_ATOMICRMW_FADD; | |||
3550 | case Intrinsic::amdgcn_ds_fmin: | |||
3551 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | |||
3552 | case Intrinsic::amdgcn_ds_fmax: | |||
3553 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | |||
3554 | default: | |||
3555 | llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3555); | |||
3556 | } | |||
3557 | } | |||
3558 | ||||
3559 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | |||
3560 | MachineInstr &MI, | |||
3561 | Intrinsic::ID IID) const { | |||
3562 | GISelChangeObserver &Observer = Helper.Observer; | |||
3563 | Observer.changingInstr(MI); | |||
3564 | ||||
3565 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | |||
3566 | ||||
3567 | // The remaining operands were used to set fields in the MemOperand on | |||
3568 | // construction. | |||
3569 | for (int I = 6; I > 3; --I) | |||
3570 | MI.RemoveOperand(I); | |||
3571 | ||||
3572 | MI.RemoveOperand(1); // Remove the intrinsic ID. | |||
3573 | Observer.changedInstr(MI); | |||
3574 | return true; | |||
3575 | } | |||
3576 | ||||
3577 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | |||
3578 | MachineRegisterInfo &MRI, | |||
3579 | MachineIRBuilder &B) const { | |||
3580 | uint64_t Offset = | |||
3581 | ST.getTargetLowering()->getImplicitParameterOffset( | |||
3582 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | |||
3583 | LLT DstTy = MRI.getType(DstReg); | |||
3584 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | |||
3585 | ||||
3586 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | |||
3587 | if (!loadInputValue(KernargPtrReg, B, | |||
3588 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | |||
3589 | return false; | |||
3590 | ||||
3591 | // FIXME: This should be nuw | |||
3592 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | |||
3593 | return true; | |||
3594 | } | |||
3595 | ||||
3596 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | |||
3597 | MachineRegisterInfo &MRI, | |||
3598 | MachineIRBuilder &B) const { | |||
3599 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | |||
3600 | if (!MFI->isEntryFunction()) { | |||
3601 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
3602 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | |||
3603 | } | |||
3604 | ||||
3605 | Register DstReg = MI.getOperand(0).getReg(); | |||
3606 | if (!getImplicitArgPtr(DstReg, MRI, B)) | |||
3607 | return false; | |||
3608 | ||||
3609 | MI.eraseFromParent(); | |||
3610 | return true; | |||
3611 | } | |||
3612 | ||||
3613 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | |||
3614 | MachineRegisterInfo &MRI, | |||
3615 | MachineIRBuilder &B, | |||
3616 | unsigned AddrSpace) const { | |||
3617 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | |||
3618 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | |||
3619 | Register Hi32 = Unmerge.getReg(1); | |||
3620 | ||||
3621 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | |||
3622 | MI.eraseFromParent(); | |||
3623 | return true; | |||
3624 | } | |||
3625 | ||||
3626 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | |||
3627 | // offset (the offset that is included in bounds checking and swizzling, to be | |||
3628 | // split between the instruction's voffset and immoffset fields) and soffset | |||
3629 | // (the offset that is excluded from bounds checking and swizzling, to go in | |||
3630 | // the instruction's soffset field). This function takes the first kind of | |||
3631 | // offset and figures out how to split it between voffset and immoffset. | |||
3632 | std::tuple<Register, unsigned, unsigned> | |||
3633 | AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, | |||
3634 | Register OrigOffset) const { | |||
3635 | const unsigned MaxImm = 4095; | |||
3636 | Register BaseReg; | |||
3637 | unsigned TotalConstOffset; | |||
3638 | const LLT S32 = LLT::scalar(32); | |||
3639 | MachineRegisterInfo &MRI = *B.getMRI(); | |||
3640 | ||||
3641 | std::tie(BaseReg, TotalConstOffset) = | |||
3642 | AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); | |||
3643 | ||||
3644 | unsigned ImmOffset = TotalConstOffset; | |||
3645 | ||||
3646 | // If BaseReg is a pointer, convert it to int. | |||
3647 | if (MRI.getType(BaseReg).isPointer()) | |||
3648 | BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); | |||
3649 | ||||
3650 | // If the immediate value is too big for the immoffset field, put the value | |||
3651 | // and -4096 into the immoffset field so that the value that is copied/added | |||
3652 | // for the voffset field is a multiple of 4096, and it stands a better chance | |||
3653 | // of being CSEd with the copy/add for another similar load/store. | |||
3654 | // However, do not do that rounding down to a multiple of 4096 if that is a | |||
3655 | // negative number, as it appears to be illegal to have a negative offset | |||
3656 | // in the vgpr, even if adding the immediate offset makes it positive. | |||
3657 | unsigned Overflow = ImmOffset & ~MaxImm; | |||
3658 | ImmOffset -= Overflow; | |||
3659 | if ((int32_t)Overflow < 0) { | |||
3660 | Overflow += ImmOffset; | |||
3661 | ImmOffset = 0; | |||
3662 | } | |||
3663 | ||||
3664 | if (Overflow != 0) { | |||
3665 | if (!BaseReg) { | |||
3666 | BaseReg = B.buildConstant(S32, Overflow).getReg(0); | |||
3667 | } else { | |||
3668 | auto OverflowVal = B.buildConstant(S32, Overflow); | |||
3669 | BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); | |||
3670 | } | |||
3671 | } | |||
3672 | ||||
3673 | if (!BaseReg) | |||
3674 | BaseReg = B.buildConstant(S32, 0).getReg(0); | |||
3675 | ||||
3676 | return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); | |||
3677 | } | |||
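A self-contained sketch of the constant-offset split described above, ignoring the base-register and pointer handling the real code also does (the function name is made up for illustration):

  #include <cstdint>
  #include <utility>

  // Splits a constant buffer offset into {amount moved to voffset, immoffset}.
  static std::pair<uint32_t, uint32_t> splitConstOffset(uint32_t ConstOffset) {
    const uint32_t MaxImm = 4095;
    uint32_t ImmOffset = ConstOffset;
    uint32_t Overflow = ImmOffset & ~MaxImm;  // multiple of 4096 destined for the voffset
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {              // don't leave a negative voffset
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    return {Overflow, ImmOffset};
  }

  // e.g. splitConstOffset(5000) == {4096, 904}: the voffset gets 4096, the immoffset 904.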
3678 | ||||
3679 | /// Handle register layout difference for f16 images for some subtargets. | |||
3680 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | |||
3681 | MachineRegisterInfo &MRI, | |||
3682 | Register Reg, | |||
3683 | bool ImageStore) const { | |||
3684 | const LLT S16 = LLT::scalar(16); | |||
3685 | const LLT S32 = LLT::scalar(32); | |||
3686 | LLT StoreVT = MRI.getType(Reg); | |||
3687 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16); | |||
3688 | ||||
3689 | if (ST.hasUnpackedD16VMem()) { | |||
3690 | auto Unmerge = B.buildUnmerge(S16, Reg); | |||
3691 | ||||
3692 | SmallVector<Register, 4> WideRegs; | |||
3693 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | |||
3694 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | |||
3695 | ||||
3696 | int NumElts = StoreVT.getNumElements(); | |||
3697 | ||||
3698 | return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); | |||
3699 | } | |||
3700 | ||||
3701 | if (ImageStore && ST.hasImageStoreD16Bug()) { | |||
3702 | if (StoreVT.getNumElements() == 2) { | |||
3703 | SmallVector<Register, 4> PackedRegs; | |||
3704 | Reg = B.buildBitcast(S32, Reg).getReg(0); | |||
3705 | PackedRegs.push_back(Reg); | |||
3706 | PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); | |||
3707 | return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); | |||
3708 | } | |||
3709 | ||||
3710 | if (StoreVT.getNumElements() == 3) { | |||
3711 | SmallVector<Register, 4> PackedRegs; | |||
3712 | auto Unmerge = B.buildUnmerge(S16, Reg); | |||
3713 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | |||
3714 | PackedRegs.push_back(Unmerge.getReg(I)); | |||
3715 | PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); | |||
3716 | Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); | |||
3717 | return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); | |||
3718 | } | |||
3719 | ||||
3720 | if (StoreVT.getNumElements() == 4) { | |||
3721 | SmallVector<Register, 4> PackedRegs; | |||
3722 | Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); | |||
3723 | auto Unmerge = B.buildUnmerge(S32, Reg); | |||
3724 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | |||
3725 | PackedRegs.push_back(Unmerge.getReg(I)); | |||
3726 | PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); | |||
3727 | return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); | |||
3728 | } | |||
3729 | ||||
3730 | llvm_unreachable("invalid data type")::llvm::llvm_unreachable_internal("invalid data type", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3730); | |||
3731 | } | |||
3732 | ||||
3733 | return Reg; | |||
3734 | } | |||
3735 | ||||
3736 | Register AMDGPULegalizerInfo::fixStoreSourceType( | |||
3737 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | |||
3738 | MachineRegisterInfo *MRI = B.getMRI(); | |||
3739 | LLT Ty = MRI->getType(VData); | |||
3740 | ||||
3741 | const LLT S16 = LLT::scalar(16); | |||
3742 | ||||
3743 | // Fixup illegal register types for i8 stores. | |||
3744 | if (Ty == LLT::scalar(8) || Ty == S16) { | |||
3745 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | |||
3746 | return AnyExt; | |||
3747 | } | |||
3748 | ||||
3749 | if (Ty.isVector()) { | |||
3750 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | |||
3751 | if (IsFormat) | |||
3752 | return handleD16VData(B, *MRI, VData); | |||
3753 | } | |||
3754 | } | |||
3755 | ||||
3756 | return VData; | |||
3757 | } | |||
3758 | ||||
3759 | bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, | |||
3760 | MachineRegisterInfo &MRI, | |||
3761 | MachineIRBuilder &B, | |||
3762 | bool IsTyped, | |||
3763 | bool IsFormat) const { | |||
3764 | Register VData = MI.getOperand(1).getReg(); | |||
3765 | LLT Ty = MRI.getType(VData); | |||
3766 | LLT EltTy = Ty.getScalarType(); | |||
3767 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | |||
3768 | const LLT S32 = LLT::scalar(32); | |||
3769 | ||||
3770 | VData = fixStoreSourceType(B, VData, IsFormat); | |||
3771 | Register RSrc = MI.getOperand(2).getReg(); | |||
3772 | ||||
3773 | MachineMemOperand *MMO = *MI.memoperands_begin(); | |||
3774 | const int MemSize = MMO->getSize(); | |||
3775 | ||||
3776 | unsigned ImmOffset; | |||
3777 | unsigned TotalOffset; | |||
3778 | ||||
3779 | // The typed intrinsics add an immediate after the registers. | |||
3780 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | |||
3781 | ||||
3782 | // The struct intrinsic variants add one additional operand over raw. | |||
3783 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | |||
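// Illustrative note (added; the operand layout below is an assumption based on
// how these intrinsics are built, not taken from this file): a store has no
// defs, so operand 0 is the intrinsic ID. For example:
//   raw.buffer.store:    ID, vdata, rsrc, voffset, soffset, aux           (6)
//   struct.buffer.store: ID, vdata, rsrc, vindex, voffset, soffset, aux   (7)
// which is why an operand count equal to NumVIndexOps implies the struct form.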
3784 | Register VIndex; | |||
3785 | int OpOffset = 0; | |||
3786 | if (HasVIndex) { | |||
3787 | VIndex = MI.getOperand(3).getReg(); | |||
3788 | OpOffset = 1; | |||
3789 | } | |||
3790 | ||||
3791 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | |||
3792 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | |||
3793 | ||||
3794 | unsigned Format = 0; | |||
3795 | if (IsTyped) { | |||
3796 | Format = MI.getOperand(5 + OpOffset).getImm(); | |||
3797 | ++OpOffset; | |||
3798 | } | |||
3799 | ||||
3800 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | |||
3801 | ||||
3802 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | |||
3803 | if (TotalOffset != 0) | |||
3804 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | |||
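// Illustrative note (added; splitBufferOffsets is defined elsewhere, so this
// describes assumed behaviour): a constant offset such as 4100 is typically
// split into ImmOffset = 4 with 4096 folded into the voffset register, while
// TotalOffset = 4100 is applied to the MMO so the memory operand still
// describes the correct address.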
3805 | ||||
3806 | unsigned Opc; | |||
3807 | if (IsTyped) { | |||
3808 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : | |||
3809 | AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; | |||
3810 | } else if (IsFormat) { | |||
3811 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : | |||
3812 | AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; | |||
3813 | } else { | |||
3814 | switch (MemSize) { | |||
3815 | case 1: | |||
3816 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; | |||
3817 | break; | |||
3818 | case 2: | |||
3819 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; | |||
3820 | break; | |||
3821 | default: | |||
3822 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; | |||
3823 | break; | |||
3824 | } | |||
3825 | } | |||
3826 | ||||
3827 | if (!VIndex) | |||
3828 | VIndex = B.buildConstant(S32, 0).getReg(0); | |||
3829 | ||||
3830 | auto MIB = B.buildInstr(Opc) | |||
3831 | .addUse(VData) // vdata | |||
3832 | .addUse(RSrc) // rsrc | |||
3833 | .addUse(VIndex) // vindex | |||
3834 | .addUse(VOffset) // voffset | |||
3835 | .addUse(SOffset) // soffset | |||
3836 | .addImm(ImmOffset); // offset(imm) | |||
3837 | ||||
3838 | if (IsTyped) | |||
3839 | MIB.addImm(Format); | |||
3840 | ||||
3841 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | |||
3842 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | |||
3843 | .addMemOperand(MMO); | |||
3844 | ||||
3845 | MI.eraseFromParent(); | |||
3846 | return true; | |||
3847 | } | |||
3848 | ||||
3849 | bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, | |||
3850 | MachineRegisterInfo &MRI, | |||
3851 | MachineIRBuilder &B, | |||
3852 | bool IsFormat, | |||
3853 | bool IsTyped) const { | |||
3854 | // FIXME: Verifier should enforce 1 MMO for these intrinsics. | |||
3855 | MachineMemOperand *MMO = *MI.memoperands_begin(); | |||
3856 | const int MemSize = MMO->getSize(); | |||
3857 | const LLT S32 = LLT::scalar(32); | |||
3858 | ||||
3859 | Register Dst = MI.getOperand(0).getReg(); | |||
3860 | Register RSrc = MI.getOperand(2).getReg(); | |||
3861 | ||||
3862 | // The typed intrinsics add an immediate after the registers. | |||
3863 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | |||
3864 | ||||
3865 | // The struct intrinsic variants add one additional operand over raw. | |||
3866 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | |||
3867 | Register VIndex; | |||
3868 | int OpOffset = 0; | |||
3869 | if (HasVIndex) { | |||
3870 | VIndex = MI.getOperand(3).getReg(); | |||
3871 | OpOffset = 1; | |||
3872 | } | |||
3873 | ||||
3874 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | |||
3875 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | |||
3876 | ||||
3877 | unsigned Format = 0; | |||
3878 | if (IsTyped) { | |||
3879 | Format = MI.getOperand(5 + OpOffset).getImm(); | |||
3880 | ++OpOffset; | |||
3881 | } | |||
3882 | ||||
3883 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | |||
3884 | unsigned ImmOffset; | |||
3885 | unsigned TotalOffset; | |||
3886 | ||||
3887 | LLT Ty = MRI.getType(Dst); | |||
3888 | LLT EltTy = Ty.getScalarType(); | |||
3889 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | |||
3890 | const bool Unpacked = ST.hasUnpackedD16VMem(); | |||
3891 | ||||
3892 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | |||
3893 | if (TotalOffset != 0) | |||
3894 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | |||
3895 | ||||
3896 | unsigned Opc; | |||
3897 | ||||
3898 | if (IsTyped) { | |||
3899 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : | |||
3900 | AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; | |||
3901 | } else if (IsFormat) { | |||
3902 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : | |||
3903 | AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; | |||
3904 | } else { | |||
3905 | switch (MemSize) { | |||
3906 | case 1: | |||
3907 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; | |||
3908 | break; | |||
3909 | case 2: | |||
3910 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; | |||
3911 | break; | |||
3912 | default: | |||
3913 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; | |||
3914 | break; | |||
3915 | } | |||
3916 | } | |||
3917 | ||||
3918 | Register LoadDstReg; | |||
3919 | ||||
3920 | bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); | |||
3921 | LLT UnpackedTy = Ty.changeElementSize(32); | |||
3922 | ||||
3923 | if (IsExtLoad) | |||
3924 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); | |||
3925 | else if (Unpacked && IsD16 && Ty.isVector()) | |||
3926 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); | |||
3927 | else | |||
3928 | LoadDstReg = Dst; | |||
3929 | ||||
3930 | if (!VIndex) | |||
3931 | VIndex = B.buildConstant(S32, 0).getReg(0); | |||
3932 | ||||
3933 | auto MIB = B.buildInstr(Opc) | |||
3934 | .addDef(LoadDstReg) // vdata | |||
3935 | .addUse(RSrc) // rsrc | |||
3936 | .addUse(VIndex) // vindex | |||
3937 | .addUse(VOffset) // voffset | |||
3938 | .addUse(SOffset) // soffset | |||
3939 | .addImm(ImmOffset); // offset(imm) | |||
3940 | ||||
3941 | if (IsTyped) | |||
3942 | MIB.addImm(Format); | |||
3943 | ||||
3944 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | |||
3945 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | |||
3946 | .addMemOperand(MMO); | |||
3947 | ||||
3948 | if (LoadDstReg != Dst) { | |||
3949 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | |||
3950 | ||||
3951 | // The result for an extending load was widened to s32; truncate it back. | |||
3952 | if (IsExtLoad) | |||
3953 | B.buildTrunc(Dst, LoadDstReg); | |||
3954 | else { | |||
3955 | // Repack to original 16-bit vector result | |||
3956 | // FIXME: G_TRUNC should work, but legalization currently fails | |||
3957 | auto Unmerge = B.buildUnmerge(S32, LoadDstReg); | |||
3958 | SmallVector<Register, 4> Repack; | |||
3959 | for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) | |||
3960 | Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); | |||
3961 | B.buildMerge(Dst, Repack); | |||
3962 | } | |||
3963 | } | |||
3964 | ||||
3965 | MI.eraseFromParent(); | |||
3966 | return true; | |||
3967 | } | |||
3968 | ||||
3969 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | |||
3970 | MachineIRBuilder &B, | |||
3971 | bool IsInc) const { | |||
3972 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : | |||
3973 | AMDGPU::G_AMDGPU_ATOMIC_DEC; | |||
3974 | B.buildInstr(Opc) | |||
3975 | .addDef(MI.getOperand(0).getReg()) | |||
3976 | .addUse(MI.getOperand(2).getReg()) | |||
3977 | .addUse(MI.getOperand(3).getReg()) | |||
3978 | .cloneMemRefs(MI); | |||
3979 | MI.eraseFromParent(); | |||
3980 | return true; | |||
3981 | } | |||
3982 | ||||
3983 | static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { | |||
3984 | switch (IntrID) { | |||
3985 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | |||
3986 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | |||
3987 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; | |||
3988 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | |||
3989 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | |||
3990 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; | |||
3991 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | |||
3992 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | |||
3993 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; | |||
3994 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | |||
3995 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | |||
3996 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; | |||
3997 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | |||
3998 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | |||
3999 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; | |||
4000 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | |||
4001 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | |||
4002 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; | |||
4003 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | |||
4004 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | |||
4005 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; | |||
4006 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | |||
4007 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | |||
4008 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; | |||
4009 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | |||
4010 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | |||
4011 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; | |||
4012 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | |||
4013 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | |||
4014 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; | |||
4015 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | |||
4016 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | |||
4017 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; | |||
4018 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | |||
4019 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | |||
4020 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; | |||
4021 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | |||
4022 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | |||
4023 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; | |||
4024 | case Intrinsic::amdgcn_buffer_atomic_fadd: | |||
4025 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | |||
4026 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | |||
4027 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; | |||
4028 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | |||
4029 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | |||
4030 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; | |||
4031 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | |||
4032 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | |||
4033 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; | |||
4034 | default: | |||
4035 | llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4035); | |||
4036 | } | |||
4037 | } | |||
4038 | ||||
4039 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | |||
4040 | MachineIRBuilder &B, | |||
4041 | Intrinsic::ID IID) const { | |||
4042 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | |||
4043 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | |||
4044 | const bool HasReturn = MI.getNumExplicitDefs() != 0; | |||
4045 | ||||
4046 | Register Dst; | |||
4047 | ||||
4048 | int OpOffset = 0; | |||
4049 | if (HasReturn) { | |||
4050 | // A few FP atomics do not support return values. | |||
4051 | Dst = MI.getOperand(0).getReg(); | |||
4052 | } else { | |||
4053 | OpOffset = -1; | |||
4054 | } | |||
4055 | ||||
4056 | Register VData = MI.getOperand(2 + OpOffset).getReg(); | |||
4057 | Register CmpVal; | |||
4058 | ||||
4059 | if (IsCmpSwap) { | |||
4060 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | |||
4061 | ++OpOffset; | |||
4062 | } | |||
4063 | ||||
4064 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | |||
4065 | const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; | |||
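// Illustrative note (added; assumed operand layout, not taken from this file):
// a returning struct.buffer.atomic.add has operands
//   dst, ID, vdata, rsrc, vindex, voffset, soffset, aux   (8 = 7 + HasReturn)
// while the raw form has no vindex (7), and cmpswap inserts one extra operand
// for the compare value.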
4066 | ||||
4067 | // The struct intrinsic variants add one additional operand over raw. | |||
4068 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | |||
4069 | Register VIndex; | |||
4070 | if (HasVIndex) { | |||
4071 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | |||
4072 | ++OpOffset; | |||
4073 | } | |||
4074 | ||||
4075 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | |||
4076 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | |||
4077 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | |||
4078 | ||||
4079 | MachineMemOperand *MMO = *MI.memoperands_begin(); | |||
4080 | ||||
4081 | unsigned ImmOffset; | |||
4082 | unsigned TotalOffset; | |||
4083 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | |||
4084 | if (TotalOffset != 0) | |||
4085 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); | |||
4086 | ||||
4087 | if (!VIndex) | |||
4088 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | |||
4089 | ||||
4090 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); | |||
4091 | ||||
4092 | if (HasReturn) | |||
4093 | MIB.addDef(Dst); | |||
4094 | ||||
4095 | MIB.addUse(VData); // vdata | |||
4096 | ||||
4097 | if (IsCmpSwap) | |||
4098 | MIB.addReg(CmpVal); | |||
4099 | ||||
4100 | MIB.addUse(RSrc) // rsrc | |||
4101 | .addUse(VIndex) // vindex | |||
4102 | .addUse(VOffset) // voffset | |||
4103 | .addUse(SOffset) // soffset | |||
4104 | .addImm(ImmOffset) // offset(imm) | |||
4105 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | |||
4106 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | |||
4107 | .addMemOperand(MMO); | |||
4108 | ||||
4109 | MI.eraseFromParent(); | |||
4110 | return true; | |||
4111 | } | |||
4112 | ||||
4113 | /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized | |||
4114 | /// vector with s16 typed elements. | |||
4115 | static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, | |||
4116 | SmallVectorImpl<Register> &PackedAddrs, | |||
4117 | unsigned ArgOffset, | |||
4118 | const AMDGPU::ImageDimIntrinsicInfo *Intr, | |||
4119 | bool IsA16, bool IsG16) { | |||
4120 | const LLT S16 = LLT::scalar(16); | |||
4121 | const LLT V2S16 = LLT::vector(2, 16); | |||
4122 | auto EndIdx = Intr->VAddrEnd; | |||
4123 | ||||
4124 | for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { | |||
4125 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | |||
4126 | if (!SrcOp.isReg()) | |||
4127 | continue; // _L to _LZ may have eliminated this. | |||
4128 | ||||
4129 | Register AddrReg = SrcOp.getReg(); | |||
4130 | ||||
4131 | if (I < Intr->GradientStart) { | |||
4132 | AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); | |||
4133 | PackedAddrs.push_back(AddrReg); | |||
4134 | } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || | |||
4135 | (I >= Intr->CoordStart && !IsA16)) { | |||
4136 | // Handle any gradient or coordinate operands that should not be packed | |||
4137 | PackedAddrs.push_back(AddrReg); | |||
4138 | } else { | |||
4139 | // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, | |||
4140 | // derivatives dx/dh and dx/dv are packed with undef. | |||
4141 | if (((I + 1) >= EndIdx) || | |||
4142 | ((Intr->NumGradients / 2) % 2 == 1 && | |||
4143 | (I == static_cast<unsigned>(Intr->GradientStart + | |||
4144 | (Intr->NumGradients / 2) - 1) || | |||
4145 | I == static_cast<unsigned>(Intr->GradientStart + | |||
4146 | Intr->NumGradients - 1))) || | |||
4147 | // Check for _L to _LZ optimization | |||
4148 | !MI.getOperand(ArgOffset + I + 1).isReg()) { | |||
4149 | PackedAddrs.push_back( | |||
4150 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | |||
4151 | .getReg(0)); | |||
4152 | } else { | |||
4153 | PackedAddrs.push_back( | |||
4154 | B.buildBuildVector( | |||
4155 | V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) | |||
4156 | .getReg(0)); | |||
4157 | ++I; | |||
4158 | } | |||
4159 | } | |||
4160 | } | |||
4161 | } | |||
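// Illustrative note (added, not in the original source): for a 2D sample with
// 16-bit addresses, the s16 coordinates (u, v) are packed into one <2 x s16>
// register; an unpaired trailing coordinate (for example r in a 3D sample with
// an odd coordinate count) is packed with undef in its high half.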
4162 | ||||
4163 | /// Convert from separate vaddr components to a single vector address register, | |||
4164 | /// and replace the remaining operands with $noreg. | |||
4165 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | |||
4166 | int DimIdx, int NumVAddrs) { | |||
4167 | const LLT S32 = LLT::scalar(32); | |||
4168 | ||||
4169 | SmallVector<Register, 8> AddrRegs; | |||
4170 | for (int I = 0; I != NumVAddrs; ++I) { | |||
4171 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | |||
4172 | if (SrcOp.isReg()) { | |||
4173 | AddrRegs.push_back(SrcOp.getReg()); | |||
4174 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32); | |||
4175 | } | |||
4176 | } | |||
4177 | ||||
4178 | int NumAddrRegs = AddrRegs.size(); | |||
4179 | if (NumAddrRegs != 1) { | |||
4180 | // Round up to 8 elements for v5-v7 | |||
4181 | // FIXME: Missing intermediate sized register classes and instructions. | |||
4182 | if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { | |||
4183 | const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); | |||
4184 | auto Undef = B.buildUndef(S32); | |||
4185 | AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); | |||
4186 | NumAddrRegs = RoundedNumRegs; | |||
4187 | } | |||
4188 | ||||
4189 | auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); | |||
4190 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | |||
4191 | } | |||
4192 | ||||
4193 | for (int I = 1; I != NumVAddrs; ++I) { | |||
4194 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | |||
4195 | if (SrcOp.isReg()) | |||
4196 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | |||
4197 | } | |||
4198 | } | |||
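// Illustrative note (added, not in the original source): given five s32
// address components and no NSA encoding, the list is padded with undef to
// eight elements and merged into a single <8 x s32> vaddr operand, and the
// remaining address operands are set to $noreg.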
4199 | ||||
4200 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | |||
4201 | /// | |||
4202 | /// Depending on the subtarget, load/store with 16-bit element data need to be | |||
4203 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | |||
4204 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | |||
4205 | /// registers. | |||
4206 | /// | |||
4207 | /// We don't want to directly select image instructions just yet, but also want | |||
4208 | /// to expose all register repacking to the legalizer/combiners. We also don't | |||
4209 | /// want a selected instruction entering RegBankSelect. In order to avoid | |||
4210 | /// defining a multitude of intermediate image instructions, directly hack on | |||
4211 | /// the intrinsic's arguments. In cases like a16 addresses, this requires padding | |||
4212 | /// now unnecessary arguments with $noreg. | |||
4213 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | |||
4214 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | |||
4215 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | |||
4216 | ||||
4217 | const unsigned NumDefs = MI.getNumExplicitDefs(); | |||
4218 | const unsigned ArgOffset = NumDefs + 1; | |||
4219 | bool IsTFE = NumDefs == 2; | |||
4220 | // We are only processing the operands of d16 image operations on subtargets | |||
4221 | // that use the unpacked register layout, or need to repack the TFE result. | |||
4222 | ||||
4223 | // TODO: Do we need to guard against already legalized intrinsics? | |||
4224 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | |||
4225 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | |||
4226 | ||||
4227 | MachineRegisterInfo *MRI = B.getMRI(); | |||
4228 | const LLT S32 = LLT::scalar(32); | |||
4229 | const LLT S16 = LLT::scalar(16); | |||
4230 | const LLT V2S16 = LLT::vector(2, 16); | |||
4231 | ||||
4232 | unsigned DMask = 0; | |||
4233 | ||||
4234 | // Check for 16 bit addresses and pack if true. | |||
4235 | LLT GradTy = | |||
4236 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | |||
4237 | LLT AddrTy = | |||
4238 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | |||
4239 | const bool IsG16 = GradTy == S16; | |||
4240 | const bool IsA16 = AddrTy == S16; | |||
4241 | ||||
4242 | int DMaskLanes = 0; | |||
4243 | if (!BaseOpcode->Atomic) { | |||
4244 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | |||
4245 | if (BaseOpcode->Gather4) { | |||
4246 | DMaskLanes = 4; | |||
4247 | } else if (DMask != 0) { | |||
4248 | DMaskLanes = countPopulation(DMask); | |||
4249 | } else if (!IsTFE && !BaseOpcode->Store) { | |||
4250 | // If dmask is 0, this is a no-op load. This can be eliminated. | |||
4251 | B.buildUndef(MI.getOperand(0)); | |||
4252 | MI.eraseFromParent(); | |||
4253 | return true; | |||
4254 | } | |||
4255 | } | |||
4256 | ||||
4257 | Observer.changingInstr(MI); | |||
4258 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | |||
4259 | ||||
4260 | unsigned NewOpcode = NumDefs == 0 ? | |||
4261 | AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | |||
4262 | ||||
4263 | // Track that we legalized this | |||
4264 | MI.setDesc(B.getTII().get(NewOpcode)); | |||
4265 | ||||
4266 | // Expecting to get an error flag since TFE is on and dmask is 0. Force | |||
4267 | // dmask to be at least 1, otherwise the instruction will fail. | |||
4268 | if (IsTFE && DMask == 0) { | |||
4269 | DMask = 0x1; | |||
4270 | DMaskLanes = 1; | |||
4271 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | |||
4272 | } | |||
4273 | ||||
4274 | if (BaseOpcode->Atomic) { | |||
4275 | Register VData0 = MI.getOperand(2).getReg(); | |||
4276 | LLT Ty = MRI->getType(VData0); | |||
4277 | ||||
4278 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | |||
4279 | if (Ty.isVector()) | |||
4280 | return false; | |||
4281 | ||||
4282 | if (BaseOpcode->AtomicX2) { | |||
4283 | Register VData1 = MI.getOperand(3).getReg(); | |||
4284 | // The two values are packed in one register. | |||
4285 | LLT PackedTy = LLT::vector(2, Ty); | |||
4286 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | |||
4287 | MI.getOperand(2).setReg(Concat.getReg(0)); | |||
4288 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | |||
4289 | } | |||
4290 | } | |||
4291 | ||||
4292 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | |||
4293 | ||||
4294 | // Optimize _L to _LZ when _L is zero | |||
4295 | if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = | |||
4296 | AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { | |||
4297 | const ConstantFP *ConstantLod; | |||
4298 | ||||
4299 | if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, | |||
4300 | m_GFCst(ConstantLod))) { | |||
4301 | if (ConstantLod->isZero() || ConstantLod->isNegative()) { | |||
4302 | // Set new opcode to _lz variant of _l, and change the intrinsic ID. | |||
4303 | const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = | |||
4304 | AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, | |||
4305 | Intr->Dim); | |||
4306 | ||||
4307 | // The starting indexes should remain in the same place. | |||
4308 | --CorrectedNumVAddrs; | |||
4309 | ||||
4310 | MI.getOperand(MI.getNumExplicitDefs()) | |||
4311 | .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); | |||
4312 | MI.RemoveOperand(ArgOffset + Intr->LodIndex); | |||
4313 | Intr = NewImageDimIntr; | |||
4314 | } | |||
4315 | } | |||
4316 | } | |||
4317 | ||||
4318 | // Optimize _mip away, when 'lod' is zero | |||
4319 | if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { | |||
4320 | int64_t ConstantLod; | |||
4321 | if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, | |||
4322 | m_ICst(ConstantLod))) { | |||
4323 | if (ConstantLod == 0) { | |||
4324 | // TODO: Change the intrinsic opcode and remove the operand instead of | |||
4325 | // replacing it with 0, as the _L to _LZ handling is done above. | |||
4326 | MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); | |||
4327 | --CorrectedNumVAddrs; | |||
4328 | } | |||
4329 | } | |||
4330 | } | |||
4331 | ||||
4332 | // Rewrite the addressing register layout before doing anything else. | |||
4333 | if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { | |||
4334 | // 16 bit gradients are supported, but are tied to the A16 control | |||
4335 | // so both gradients and addresses must be 16 bit | |||
4336 | return false; | |||
4337 | } | |||
4338 | ||||
4339 | if (IsA16 && !ST.hasA16()) { | |||
4340 | // A16 not supported | |||
4341 | return false; | |||
4342 | } | |||
4343 | ||||
4344 | if (IsA16 || IsG16) { | |||
4345 | if (Intr->NumVAddrs > 1) { | |||
4346 | SmallVector<Register, 4> PackedRegs; | |||
4347 | ||||
4348 | packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, | |||
4349 | IsG16); | |||
4350 | ||||
4351 | // See also below in the non-a16 branch | |||
4352 | const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); | |||
4353 | ||||
4354 | if (!UseNSA && PackedRegs.size() > 1) { | |||
4355 | LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); | |||
4356 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | |||
4357 | PackedRegs[0] = Concat.getReg(0); | |||
4358 | PackedRegs.resize(1); | |||
4359 | } | |||
4360 | ||||
4361 | const unsigned NumPacked = PackedRegs.size(); | |||
4362 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | |||
4363 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | |||
4364 | if (!SrcOp.isReg()) { | |||
4365 | assert(SrcOp.isImm() && SrcOp.getImm() == 0); | |||
4366 | continue; | |||
4367 | } | |||
4368 | ||||
4369 | assert(SrcOp.getReg() != AMDGPU::NoRegister); | |||
4370 | ||||
4371 | if (I - Intr->VAddrStart < NumPacked) | |||
4372 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | |||
4373 | else | |||
4374 | SrcOp.setReg(AMDGPU::NoRegister); | |||
4375 | } | |||
4376 | } | |||
4377 | } else { | |||
4378 | // If the register allocator cannot place the address registers contiguously | |||
4379 | // without introducing moves, then using the non-sequential address encoding | |||
4380 | // is always preferable, since it saves VALU instructions and is usually a | |||
4381 | // wash in terms of code size or even better. | |||
4382 | // | |||
4383 | // However, we currently have no way of hinting to the register allocator | |||
4384 | // that MIMG addresses should be placed contiguously when it is possible to | |||
4385 | // do so, so force non-NSA for the common 2-address case as a heuristic. | |||
4386 | // | |||
4387 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | |||
4388 | // allocation when possible. | |||
4389 | const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); | |||
4390 | ||||
4391 | if (!UseNSA && Intr->NumVAddrs > 1) | |||
4392 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | |||
4393 | Intr->NumVAddrs); | |||
4394 | } | |||
4395 | ||||
4396 | int Flags = 0; | |||
4397 | if (IsA16) | |||
4398 | Flags |= 1; | |||
4399 | if (IsG16) | |||
4400 | Flags |= 2; | |||
4401 | MI.addOperand(MachineOperand::CreateImm(Flags)); | |||
4402 | ||||
4403 | if (BaseOpcode->Store) { // No TFE for stores? | |||
4404 | // TODO: Handle dmask trim | |||
4405 | Register VData = MI.getOperand(1).getReg(); | |||
4406 | LLT Ty = MRI->getType(VData); | |||
4407 | if (!Ty.isVector() || Ty.getElementType() != S16) | |||
4408 | return true; | |||
4409 | ||||
4410 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | |||
4411 | if (RepackedReg != VData) { | |||
4412 | MI.getOperand(1).setReg(RepackedReg); | |||
4413 | } | |||
4414 | ||||
4415 | return true; | |||
4416 | } | |||
4417 | ||||
4418 | Register DstReg = MI.getOperand(0).getReg(); | |||
4419 | LLT Ty = MRI->getType(DstReg); | |||
4420 | const LLT EltTy = Ty.getScalarType(); | |||
4421 | const bool IsD16 = Ty.getScalarType() == S16; | |||
4422 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | |||
4423 | ||||
4424 | // Confirm that the return type is large enough for the dmask specified | |||
4425 | if (NumElts < DMaskLanes) | |||
4426 | return false; | |||
4427 | ||||
4428 | if (NumElts > 4 || DMaskLanes > 4) | |||
4429 | return false; | |||
4430 | ||||
4431 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; | |||
4432 | const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); | |||
4433 | ||||
4434 | // The raw dword aligned data component of the load. The only legal cases | |||
4435 | // where this matters should be when using the packed D16 format, for | |||
4436 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. | |||
4437 | LLT RoundedTy; | |||
4438 | ||||
4439 | // S32 vector to cover all data, plus the TFE result element. | |||
4440 | LLT TFETy; | |||
4441 | ||||
4442 | // Register type to use for each loaded component. Will be S32 or V2S16. | |||
4443 | LLT RegTy; | |||
4444 | ||||
4445 | if (IsD16 && ST.hasUnpackedD16VMem()) { | |||
4446 | RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); | |||
4447 | TFETy = LLT::vector(AdjustedNumElts + 1, 32); | |||
4448 | RegTy = S32; | |||
4449 | } else { | |||
4450 | unsigned EltSize = EltTy.getSizeInBits(); | |||
4451 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | |||
4452 | unsigned RoundedSize = 32 * RoundedElts; | |||
4453 | RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); | |||
4454 | TFETy = LLT::vector(RoundedSize / 32 + 1, S32); | |||
4455 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | |||
4456 | } | |||
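// Illustrative note (added, worked example for the packed-D16 case above): a
// <3 x s16> result gives AdjustedTy = <3 x s16>, RoundedTy = <4 x s16> (two
// dwords), TFETy = <3 x s32> (two data dwords plus the TFE status dword), and
// RegTy = <2 x s16> without TFE or s32 with TFE.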
4457 | ||||
4458 | // The return type does not need adjustment. | |||
4459 | // TODO: Should we change s16 case to s32 or <2 x s16>? | |||
4460 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | |||
4461 | return true; | |||
4462 | ||||
4463 | Register Dst1Reg; | |||
4464 | ||||
4465 | // Insert after the instruction. | |||
4466 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | |||
4467 | ||||
4468 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | |||
4469 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | |||
4470 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | |||
4471 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | |||
4472 | ||||
4473 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | |||
4474 | ||||
4475 | MI.getOperand(0).setReg(NewResultReg); | |||
4476 | ||||
4477 | // In the IR, TFE is supposed to be used with a 2 element struct return | |||
4478 | // type. The instruction really returns these two values in one contiguous | |||
4479 | // register, with one additional dword beyond the loaded data. Rewrite the | |||
4480 | // return type to use a single register result. | |||
4481 | ||||
4482 | if (IsTFE) { | |||
4483 | Dst1Reg = MI.getOperand(1).getReg(); | |||
4484 | if (MRI->getType(Dst1Reg) != S32) | |||
4485 | return false; | |||
4486 | ||||
4487 | // TODO: Make sure the TFE operand bit is set. | |||
4488 | MI.RemoveOperand(1); | |||
4489 | ||||
4490 | // Handle the easy case that requires no repack instructions. | |||
4491 | if (Ty == S32) { | |||
4492 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | |||
4493 | return true; | |||
4494 | } | |||
4495 | } | |||
4496 | ||||
4497 | // Now figure out how to copy the new result register back into the old | |||
4498 | // result. | |||
4499 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | |||
4500 | ||||
4501 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | |||
4502 | ||||
4503 | if (ResultNumRegs == 1) { | |||
4504 | assert(!IsTFE); | |||
4505 | ResultRegs[0] = NewResultReg; | |||
4506 | } else { | |||
4507 | // We have to repack into a new vector of some kind. | |||
4508 | for (int I = 0; I != NumDataRegs; ++I) | |||
4509 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | |||
4510 | B.buildUnmerge(ResultRegs, NewResultReg); | |||
4511 | ||||
4512 | // Drop the final TFE element to get the data part. The TFE result is | |||
4513 | // directly written to the right place already. | |||
4514 | if (IsTFE) | |||
4515 | ResultRegs.resize(NumDataRegs); | |||
4516 | } | |||
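// Illustrative note (added, not in the original source): for a TFE load with
// a <4 x s32> result, NewResultReg is <5 x s32>; the unmerge above writes the
// four data dwords into fresh registers and the fifth (status) dword directly
// into Dst1Reg, so only the data registers remain in ResultRegs here.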
4517 | ||||
4518 | // For an s16 scalar result, we form an s32 result with a truncate regardless | |||
4519 | // of packed vs. unpacked. | |||
4520 | if (IsD16 && !Ty.isVector()) { | |||
4521 | B.buildTrunc(DstReg, ResultRegs[0]); | |||
4522 | return true; | |||
4523 | } | |||
4524 | ||||
4525 | // Avoid a build/concat_vector of 1 entry. | |||
4526 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | |||
4527 | B.buildBitcast(DstReg, ResultRegs[0]); | |||
4528 | return true; | |||
4529 | } | |||
4530 | ||||
4531 | assert(Ty.isVector()); | |||
4532 | ||||
4533 | if (IsD16) { | |||
4534 | // For packed D16 results with TFE enabled, all the data components are | |||
4535 | // S32. Cast back to the expected type. | |||
4536 | // | |||
4537 | // TODO: We don't really need to load s32 elements. We would only need one | |||
4538 | // cast for the TFE result if a multiple of v2s16 was used. | |||
4539 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | |||
4540 | for (Register &Reg : ResultRegs) | |||
4541 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | |||
4542 | } else if (ST.hasUnpackedD16VMem()) { | |||
4543 | for (Register &Reg : ResultRegs) | |||
4544 | Reg = B.buildTrunc(S16, Reg).getReg(0); | |||
4545 | } | |||
4546 | } | |||
4547 | ||||
4548 | auto padWithUndef = [&](LLT Ty, int NumElts) { | |||
4549 | if (NumElts == 0) | |||
4550 | return; | |||
4551 | Register Undef = B.buildUndef(Ty).getReg(0); | |||
4552 | for (int I = 0; I != NumElts; ++I) | |||
4553 | ResultRegs.push_back(Undef); | |||
4554 | }; | |||
4555 | ||||
4556 | // Pad out any elements eliminated due to the dmask. | |||
4557 | LLT ResTy = MRI->getType(ResultRegs[0]); | |||
4558 | if (!ResTy.isVector()) { | |||
4559 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | |||
4560 | B.buildBuildVector(DstReg, ResultRegs); | |||
4561 | return true; | |||
4562 | } | |||
4563 | ||||
4564 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); | |||
4565 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | |||
4566 | ||||
4567 | // Deal with the one annoying legal case. | |||
4568 | const LLT V3S16 = LLT::vector(3, 16); | |||
4569 | if (Ty == V3S16) { | |||
4570 | padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); | |||
4571 | auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); | |||
4572 | B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); | |||
4573 | return true; | |||
4574 | } | |||
4575 | ||||
4576 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | |||
4577 | B.buildConcatVectors(DstReg, ResultRegs); | |||
4578 | return true; | |||
4579 | } | |||
4580 | ||||
4581 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | |||
4582 | LegalizerHelper &Helper, MachineInstr &MI) const { | |||
4583 | MachineIRBuilder &B = Helper.MIRBuilder; | |||
4584 | GISelChangeObserver &Observer = Helper.Observer; | |||
4585 | ||||
4586 | Register Dst = MI.getOperand(0).getReg(); | |||
4587 | LLT Ty = B.getMRI()->getType(Dst); | |||
4588 | unsigned Size = Ty.getSizeInBits(); | |||
4589 | MachineFunction &MF = B.getMF(); | |||
4590 | ||||
4591 | Observer.changingInstr(MI); | |||
4592 | ||||
4593 | if (shouldBitcastLoadStoreType(ST, Ty, Size)) { | |||
4594 | Ty = getBitcastRegisterType(Ty); | |||
4595 | Helper.bitcastDst(MI, Ty, 0); | |||
4596 | Dst = MI.getOperand(0).getReg(); | |||
4597 | B.setInsertPt(B.getMBB(), MI); | |||
4598 | } | |||
4599 | ||||
4600 | // FIXME: We don't really need this intermediate instruction. The intrinsic | |||
4601 | // should be fixed to have a memory operand. Since it's readnone, we're not | |||
4602 | // allowed to add one. | |||
4603 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | |||
4604 | MI.RemoveOperand(1); // Remove intrinsic ID | |||
4605 | ||||
4606 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | |||
4607 | // TODO: Should this use datalayout alignment? | |||
4608 | const unsigned MemSize = (Size + 7) / 8; | |||
4609 | const Align MemAlign(4); | |||
4610 | MachineMemOperand *MMO = MF.getMachineMemOperand( | |||
4611 | MachinePointerInfo(), | |||
4612 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | |||
4613 | MachineMemOperand::MOInvariant, | |||
4614 | MemSize, MemAlign); | |||
4615 | MI.addMemOperand(MF, MMO); | |||
4616 | ||||
4617 | // There are no 96-bit result scalar loads, but widening to 128-bit should | |||
4618 | // always be legal. We may need to restore this to a 96-bit result if it turns | |||
4619 | // out this needs to be converted to a vector load during RegBankSelect. | |||
4620 | if (!isPowerOf2_32(Size)) { | |||
4621 | if (Ty.isVector()) | |||
4622 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | |||
4623 | else | |||
4624 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | |||
4625 | } | |||
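// Illustrative note (added, not in the original source): a 96-bit result is
// widened here, e.g. s96 -> s128 or <3 x s32> -> <4 x s32>, since only
// power-of-two-sized scalar buffer loads exist.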
4626 | ||||
4627 | Observer.changedInstr(MI); | |||
4628 | return true; | |||
4629 | } | |||
4630 | ||||
4631 | // TODO: Move to selection | |||
4632 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | |||
4633 | MachineRegisterInfo &MRI, | |||
4634 | MachineIRBuilder &B) const { | |||
4635 | if (!ST.isTrapHandlerEnabled() || | |||
4636 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | |||
4637 | return legalizeTrapEndpgm(MI, MRI, B); | |||
4638 | ||||
4639 | if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { | |||
4640 | switch (*HsaAbiVer) { | |||
4641 | case ELF::ELFABIVERSION_AMDGPU_HSA_V2: | |||
4642 | case ELF::ELFABIVERSION_AMDGPU_HSA_V3: | |||
4643 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | |||
4644 | case ELF::ELFABIVERSION_AMDGPU_HSA_V4: | |||
4645 | return ST.supportsGetDoorbellID() ? | |||
4646 | legalizeTrapHsa(MI, MRI, B) : | |||
4647 | legalizeTrapHsaQueuePtr(MI, MRI, B); | |||
4648 | } | |||
4649 | } | |||
4650 | ||||
4651 | llvm_unreachable("Unknown trap handler")::llvm::llvm_unreachable_internal("Unknown trap handler", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4651); | |||
4652 | } | |||
4653 | ||||
4654 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( | |||
4655 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | |||
4656 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | |||
4657 | MI.eraseFromParent(); | |||
4658 | return true; | |||
4659 | } | |||
4660 | ||||
4661 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | |||
4662 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | |||
4663 | // Pass queue pointer to trap handler as input, and insert trap instruction | |||
4664 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | |||
4665 | Register LiveIn = | |||
4666 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | |||
4667 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | |||
4668 | return false; | |||
4669 | ||||
4670 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | |||
4671 | B.buildCopy(SGPR01, LiveIn); | |||
4672 | B.buildInstr(AMDGPU::S_TRAP) | |||
4673 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | |||
4674 | .addReg(SGPR01, RegState::Implicit); | |||
4675 | ||||
4676 | MI.eraseFromParent(); | |||
4677 | return true; | |||
4678 | } | |||
4679 | ||||
4680 | bool AMDGPULegalizerInfo::legalizeTrapHsa( | |||
4681 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | |||
4682 | B.buildInstr(AMDGPU::S_TRAP) | |||
4683 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); | |||
4684 | MI.eraseFromParent(); | |||
4685 | return true; | |||
4686 | } | |||
4687 | ||||
4688 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | |||
4689 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | |||
4690 | // If this is a non-HSA path or the trap handler is disabled, report a | |||
4691 | // warning accordingly. | |||
4692 | if (!ST.isTrapHandlerEnabled() || | |||
4693 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | |||
4694 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | |||
4695 | "debugtrap handler not supported", | |||
4696 | MI.getDebugLoc(), DS_Warning); | |||
4697 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | |||
4698 | Ctx.diagnose(NoTrap); | |||
4699 | } else { | |||
4700 | // Insert debug-trap instruction | |||
4701 | B.buildInstr(AMDGPU::S_TRAP) | |||
4702 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | |||
4703 | } | |||
4704 | ||||
4705 | MI.eraseFromParent(); | |||
4706 | return true; | |||
4707 | } | |||
4708 | ||||
4709 | bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, | |||
4710 | MachineIRBuilder &B) const { | |||
4711 | MachineRegisterInfo &MRI = *B.getMRI(); | |||
4712 | const LLT S16 = LLT::scalar(16); | |||
4713 | const LLT S32 = LLT::scalar(32); | |||
4714 | ||||
4715 | Register DstReg = MI.getOperand(0).getReg(); | |||
4716 | Register NodePtr = MI.getOperand(2).getReg(); | |||
4717 | Register RayExtent = MI.getOperand(3).getReg(); | |||
4718 | Register RayOrigin = MI.getOperand(4).getReg(); | |||
4719 | Register RayDir = MI.getOperand(5).getReg(); | |||
4720 | Register RayInvDir = MI.getOperand(6).getReg(); | |||
4721 | Register TDescr = MI.getOperand(7).getReg(); | |||
4722 | ||||
4723 | if (!ST.hasGFX10_AEncoding()) { | |||
4724 | DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), | |||
4725 | "intrinsic not supported on subtarget", | |||
4726 | MI.getDebugLoc()); | |||
4727 | B.getMF().getFunction().getContext().diagnose(BadIntrin); | |||
4728 | return false; | |||
4729 | } | |||
4730 | ||||
4731 | bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; | |||
4732 | bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; | |||
4733 | unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa | |||
4734 | : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa | |||
4735 | : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa | |||
4736 | : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; | |||
4737 | ||||
4738 | SmallVector<Register, 12> Ops; | |||
4739 | if (Is64) { | |||
4740 | auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); | |||
4741 | Ops.push_back(Unmerge.getReg(0)); | |||
4742 | Ops.push_back(Unmerge.getReg(1)); | |||
4743 | } else { | |||
4744 | Ops.push_back(NodePtr); | |||
4745 | } | |||
4746 | Ops.push_back(RayExtent); | |||
4747 | ||||
4748 | auto packLanes = [&Ops, &S32, &B] (Register Src) { | |||
4749 | auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); | |||
4750 | Ops.push_back(Unmerge.getReg(0)); | |||
4751 | Ops.push_back(Unmerge.getReg(1)); | |||
4752 | Ops.push_back(Unmerge.getReg(2)); | |||
4753 | }; | |||
4754 | ||||
4755 | packLanes(RayOrigin); | |||
4756 | if (IsA16) { | |||
4757 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); | |||
4758 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); | |||
4759 | Register R1 = MRI.createGenericVirtualRegister(S32); | |||
4760 | Register R2 = MRI.createGenericVirtualRegister(S32); | |||
4761 | Register R3 = MRI.createGenericVirtualRegister(S32); | |||
4762 | B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); | |||
4763 | B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); | |||
4764 | B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); | |||
4765 | Ops.push_back(R1); | |||
4766 | Ops.push_back(R2); | |||
4767 | Ops.push_back(R3); | |||
4768 | } else { | |||
4769 | packLanes(RayDir); | |||
4770 | packLanes(RayInvDir); | |||
4771 | } | |||
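// Illustrative note (added, not in the original source): with 16-bit ray data
// the six half components (dir.xyz and inv_dir.xyz) are repacked above into
// three s32 registers as (dir.x, dir.y), (dir.z, inv.x), (inv.y, inv.z).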
4772 | ||||
4773 | auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) | |||
4774 | .addDef(DstReg) | |||
4775 | .addImm(Opcode); | |||
4776 | ||||
4777 | for (Register R : Ops) { | |||
4778 | MIB.addUse(R); | |||
4779 | } | |||
4780 | ||||
4781 | MIB.addUse(TDescr) | |||
4782 | .addImm(IsA16 ? 1 : 0) | |||
4783 | .cloneMemRefs(MI); | |||
4784 | ||||
4785 | MI.eraseFromParent(); | |||
4786 | return true; | |||
4787 | } | |||
4788 | ||||
4789 | bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | |||
4790 | MachineInstr &MI) const { | |||
4791 | MachineIRBuilder &B = Helper.MIRBuilder; | |||
4792 | MachineRegisterInfo &MRI = *B.getMRI(); | |||
4793 | ||||
4794 | // Replace the G_BRCOND use with the exec-manipulating branch pseudos. | |||
4795 | auto IntrID = MI.getIntrinsicID(); | |||
4796 | switch (IntrID) { | |||
4797 | case Intrinsic::amdgcn_if: | |||
4798 | case Intrinsic::amdgcn_else: { | |||
4799 | MachineInstr *Br = nullptr; | |||
4800 | MachineBasicBlock *UncondBrTarget = nullptr; | |||
4801 | bool Negated = false; | |||
4802 | if (MachineInstr *BrCond = | |||
4803 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | |||
4804 | const SIRegisterInfo *TRI | |||
4805 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | |||
4806 | ||||
4807 | Register Def = MI.getOperand(1).getReg(); | |||
4808 | Register Use = MI.getOperand(3).getReg(); | |||
4809 | ||||
4810 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | |||
4811 | ||||
4812 | if (Negated) | |||
4813 | std::swap(CondBrTarget, UncondBrTarget); | |||
4814 | ||||
4815 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | |||
4816 | if (IntrID == Intrinsic::amdgcn_if) { | |||
4817 | B.buildInstr(AMDGPU::SI_IF) | |||
4818 | .addDef(Def) | |||
4819 | .addUse(Use) | |||
4820 | .addMBB(UncondBrTarget); | |||
4821 | } else { | |||
4822 | B.buildInstr(AMDGPU::SI_ELSE) | |||
4823 | .addDef(Def) | |||
4824 | .addUse(Use) | |||
4825 | .addMBB(UncondBrTarget); | |||
4826 | } | |||
4827 | ||||
4828 | if (Br) { | |||
4829 | Br->getOperand(0).setMBB(CondBrTarget); | |||
4830 | } else { | |||
4831 | // The IRTranslator skips inserting the G_BR for fallthrough cases, but | |||
4832 | // since we're swapping branch targets it needs to be reinserted. | |||
4833 | // FIXME: IRTranslator should probably not do this | |||
4834 | B.buildBr(*CondBrTarget); | |||
4835 | } | |||
4836 | ||||
4837 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | |||
4838 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | |||
4839 | MI.eraseFromParent(); | |||
4840 | BrCond->eraseFromParent(); | |||
4841 | return true; | |||
4842 | } | |||
4843 | ||||
4844 | return false; | |||
4845 | } | |||
4846 | case Intrinsic::amdgcn_loop: { | |||
4847 | MachineInstr *Br = nullptr; | |||
4848 | MachineBasicBlock *UncondBrTarget = nullptr; | |||
4849 | bool Negated = false; | |||
4850 | if (MachineInstr *BrCond = | |||
4851 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | |||
4852 | const SIRegisterInfo *TRI | |||
4853 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | |||
4854 | ||||
4855 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | |||
4856 | Register Reg = MI.getOperand(2).getReg(); | |||
4857 | ||||
4858 | if (Negated) | |||
4859 | std::swap(CondBrTarget, UncondBrTarget); | |||
4860 | ||||
4861 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | |||
4862 | B.buildInstr(AMDGPU::SI_LOOP) | |||
4863 | .addUse(Reg) | |||
4864 | .addMBB(UncondBrTarget); | |||
4865 | ||||
4866 | if (Br) | |||
4867 | Br->getOperand(0).setMBB(CondBrTarget); | |||
4868 | else | |||
4869 | B.buildBr(*CondBrTarget); | |||
4870 | ||||
4871 | MI.eraseFromParent(); | |||
4872 | BrCond->eraseFromParent(); | |||
4873 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | |||
4874 | return true; | |||
4875 | } | |||
4876 | ||||
4877 | return false; | |||
4878 | } | |||
4879 | case Intrinsic::amdgcn_kernarg_segment_ptr: | |||
4880 | if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { | |||
4881 | // This only makes sense to call in a kernel, so just lower to null. | |||
4882 | B.buildConstant(MI.getOperand(0).getReg(), 0); | |||
4883 | MI.eraseFromParent(); | |||
4884 | return true; | |||
4885 | } | |||
4886 | ||||
4887 | return legalizePreloadedArgIntrin( | |||
4888 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | |||
4889 | case Intrinsic::amdgcn_implicitarg_ptr: | |||
4890 | return legalizeImplicitArgPtr(MI, MRI, B); | |||
4891 | case Intrinsic::amdgcn_workitem_id_x: | |||
4892 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4893 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | |||
4894 | case Intrinsic::amdgcn_workitem_id_y: | |||
4895 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4896 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | |||
4897 | case Intrinsic::amdgcn_workitem_id_z: | |||
4898 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4899 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | |||
4900 | case Intrinsic::amdgcn_workgroup_id_x: | |||
4901 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4902 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | |||
4903 | case Intrinsic::amdgcn_workgroup_id_y: | |||
4904 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4905 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | |||
4906 | case Intrinsic::amdgcn_workgroup_id_z: | |||
4907 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4908 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | |||
4909 | case Intrinsic::amdgcn_dispatch_ptr: | |||
4910 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4911 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | |||
4912 | case Intrinsic::amdgcn_queue_ptr: | |||
4913 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4914 | AMDGPUFunctionArgInfo::QUEUE_PTR); | |||
4915 | case Intrinsic::amdgcn_implicit_buffer_ptr: | |||
4916 | return legalizePreloadedArgIntrin( | |||
4917 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | |||
4918 | case Intrinsic::amdgcn_dispatch_id: | |||
4919 | return legalizePreloadedArgIntrin(MI, MRI, B, | |||
4920 | AMDGPUFunctionArgInfo::DISPATCH_ID); | |||
4921 | case Intrinsic::amdgcn_fdiv_fast: | |||
4922 | return legalizeFDIVFastIntrin(MI, MRI, B); | |||
4923 | case Intrinsic::amdgcn_is_shared: | |||
4924 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | |||
4925 | case Intrinsic::amdgcn_is_private: | |||
4926 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | |||
4927 | case Intrinsic::amdgcn_wavefrontsize: { | |||
4928 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | |||
4929 | MI.eraseFromParent(); | |||
4930 | return true; | |||
4931 | } | |||
4932 | case Intrinsic::amdgcn_s_buffer_load: | |||
4933 | return legalizeSBufferLoad(Helper, MI); | |||
4934 | case Intrinsic::amdgcn_raw_buffer_store: | |||
4935 | case Intrinsic::amdgcn_struct_buffer_store: | |||
4936 | return legalizeBufferStore(MI, MRI, B, false, false); | |||
4937 | case Intrinsic::amdgcn_raw_buffer_store_format: | |||
4938 | case Intrinsic::amdgcn_struct_buffer_store_format: | |||
4939 | return legalizeBufferStore(MI, MRI, B, false, true); | |||
4940 | case Intrinsic::amdgcn_raw_tbuffer_store: | |||
4941 | case Intrinsic::amdgcn_struct_tbuffer_store: | |||
4942 | return legalizeBufferStore(MI, MRI, B, true, true); | |||
4943 | case Intrinsic::amdgcn_raw_buffer_load: | |||
4944 | case Intrinsic::amdgcn_struct_buffer_load: | |||
4945 | return legalizeBufferLoad(MI, MRI, B, false, false); | |||
4946 | case Intrinsic::amdgcn_raw_buffer_load_format: | |||
4947 | case Intrinsic::amdgcn_struct_buffer_load_format: | |||
4948 | return legalizeBufferLoad(MI, MRI, B, true, false); | |||
4949 | case Intrinsic::amdgcn_raw_tbuffer_load: | |||
4950 | case Intrinsic::amdgcn_struct_tbuffer_load: | |||
4951 | return legalizeBufferLoad(MI, MRI, B, true, true); | |||
4952 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | |||
4953 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | |||
4954 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | |||
4955 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | |||
4956 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | |||
4957 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | |||
4958 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | |||
4959 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | |||
4960 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | |||
4961 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | |||
4962 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | |||
4963 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | |||
4964 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | |||
4965 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | |||
4966 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | |||
4967 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | |||
4968 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | |||
4969 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | |||
4970 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | |||
4971 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | |||
4972 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | |||
4973 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | |||
4974 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | |||
4975 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | |||
4976 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | |||
4977 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | |||
4978 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | |||
4979 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | |||
4980 | case Intrinsic::amdgcn_buffer_atomic_fadd: | |||
4981 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | |||
4982 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | |||
4983 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | |||
4984 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | |||
4985 | return legalizeBufferAtomic(MI, B, IntrID); | |||
4986 | case Intrinsic::amdgcn_atomic_inc: | |||
4987 | return legalizeAtomicIncDec(MI, B, true); | |||
4988 | case Intrinsic::amdgcn_atomic_dec: | |||
4989 | return legalizeAtomicIncDec(MI, B, false); | |||
4990 | case Intrinsic::trap: | |||
4991 | return legalizeTrapIntrinsic(MI, MRI, B); | |||
4992 | case Intrinsic::debugtrap: | |||
4993 | return legalizeDebugTrapIntrinsic(MI, MRI, B); | |||
4994 | case Intrinsic::amdgcn_rsq_clamp: | |||
4995 | return legalizeRsqClampIntrinsic(MI, MRI, B); | |||
4996 | case Intrinsic::amdgcn_ds_fadd: | |||
4997 | case Intrinsic::amdgcn_ds_fmin: | |||
4998 | case Intrinsic::amdgcn_ds_fmax: | |||
4999 | return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); | |||
5000 | case Intrinsic::amdgcn_image_bvh_intersect_ray: | |||
5001 | return legalizeBVHIntrinsic(MI, B); | |||
5002 | default: { | |||
5003 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | |||
5004 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | |||
5005 | return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); | |||
5006 | return true; | |||
5007 | } | |||
5008 | } | |||
5009 | ||||
5010 | return true; | |||
5011 | } |
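The switch above fans many intrinsic IDs out to a handful of legalize* helpers, using trailing booleans to pick the buffer-op variant, and the default case falls back to the generated image-dimension table. A minimal, self-contained sketch of the same dispatch idiom; every name here (FakeIntrID, lowerBufLoad, lowerFromTable, legalizeSketch) is hypothetical and stands in for, rather than reproduces, the real AMDGPU helpers:

  #include <cstdio>

  enum class FakeIntrID { BufLoad, BufLoadFormat, TBufLoad, Other };

  // Hypothetical helper: flags choose which lowering flavor to emit, mirroring
  // how one helper above serves several related intrinsics.
  static bool lowerBufLoad(bool IsFormat, bool IsTyped) {
    std::printf("buffer load: format=%d typed=%d\n", IsFormat, IsTyped);
    return true;
  }

  // Hypothetical fallback standing in for the table lookup in the default case.
  static bool lowerFromTable(FakeIntrID ID) { return ID == FakeIntrID::Other; }

  static bool legalizeSketch(FakeIntrID ID) {
    switch (ID) {
    case FakeIntrID::BufLoad:       return lowerBufLoad(false, false);
    case FakeIntrID::BufLoadFormat: return lowerBufLoad(true, false);
    case FakeIntrID::TBufLoad:      return lowerBufLoad(true, true);
    default:                        return lowerFromTable(ID);
    }
  }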
1 | //== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// Implement a low-level type suitable for MachineInstr level instruction |
10 | /// selection. |
11 | /// |
12 | /// For a type attached to a MachineInstr, we only care about 2 details: total |
13 | /// size and the number of vector lanes (if any). Accordingly, there are 4 |
14 | /// possible valid type-kinds: |
15 | /// |
16 | /// * `sN` for scalars and aggregates |
17 | /// * `<N x sM>` for vectors of scalars and `<N x pM>` for vectors of pointers, which must have at least 2 elements.
18 | /// * `pN` for pointers |
19 | /// |
20 | /// Other information required for correct selection is expected to be carried |
21 | /// by the opcode, or non-type flags. For example the distinction between G_ADD |
22 | /// and G_FADD for int/float or fast-math flags. |
23 | /// |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |
27 | #define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |
28 | |
29 | #include "llvm/ADT/DenseMapInfo.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include "llvm/Support/MachineValueType.h" |
32 | #include <cassert> |
33 | |
34 | namespace llvm { |
35 | |
36 | class DataLayout; |
37 | class Type; |
38 | class raw_ostream; |
39 | |
40 | class LLT { |
41 | public: |
42 | /// Get a low-level scalar or aggregate "bag of bits". |
43 | static LLT scalar(unsigned SizeInBits) { |
44 | assert(SizeInBits > 0 && "invalid scalar size");
45 | return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0, |
46 | SizeInBits, /*AddressSpace=*/0}; |
47 | } |
48 | |
49 | /// Get a low-level pointer in the given address space. |
50 | static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) { |
51 | assert(SizeInBits > 0 && "invalid pointer size");
52 | return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0, |
53 | SizeInBits, AddressSpace}; |
54 | } |
55 | |
56 | /// Get a low-level vector of some number of elements and element width. |
57 | /// \p NumElements must be at least 2. |
58 | static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) { |
59 | assert(NumElements > 1 && "invalid number of vector elements");
60 | assert(ScalarSizeInBits > 0 && "invalid vector element size");
61 | return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements, |
62 | ScalarSizeInBits, /*AddressSpace=*/0}; |
63 | } |
64 | |
65 | /// Get a low-level vector of some number of elements and element type. |
66 | static LLT vector(uint16_t NumElements, LLT ScalarTy) { |
67 | assert(NumElements > 1 && "invalid number of vector elements");
68 | assert(!ScalarTy.isVector() && "invalid vector element type");
69 | return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements, |
70 | ScalarTy.getSizeInBits(), |
71 | ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0}; |
72 | } |
73 | |
74 | static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) { |
75 | return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy); |
76 | } |
77 | |
78 | static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) { |
79 | return scalarOrVector(NumElements, LLT::scalar(ScalarSize)); |
80 | } |
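A brief usage sketch of the factory functions above, relying only on the declarations shown in this header (the variable names are illustrative):

  LLT S32       = LLT::scalar(32);                          // s32
  LLT P3        = LLT::pointer(3, 32);                      // p3 (32-bit, addrspace 3)
  LLT V4S16     = LLT::vector(4, 16);                       // <4 x s16>
  LLT Collapsed = LLT::scalarOrVector(1, LLT::scalar(64));  // s64, since NumElements == 1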
81 | |
82 | explicit LLT(bool isPointer, bool isVector, uint16_t NumElements, |
83 | unsigned SizeInBits, unsigned AddressSpace) { |
84 | init(isPointer, isVector, NumElements, SizeInBits, AddressSpace); |
85 | } |
86 | explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {} |
87 | |
88 | explicit LLT(MVT VT); |
89 | |
90 | bool isValid() const { return RawData != 0; } |
91 | |
92 | bool isScalar() const { return isValid() && !IsPointer && !IsVector; } |
93 | |
94 | bool isPointer() const { return isValid() && IsPointer && !IsVector; } |
95 | |
96 | bool isVector() const { return isValid() && IsVector; } |
97 | |
98 | /// Returns the number of elements in a vector LLT. Must only be called on |
99 | /// vector types. |
100 | uint16_t getNumElements() const { |
101 | assert(IsVector && "cannot get number of elements on scalar/aggregate");
102 | if (!IsPointer) |
103 | return getFieldValue(VectorElementsFieldInfo); |
104 | else |
105 | return getFieldValue(PointerVectorElementsFieldInfo); |
106 | } |
107 | |
108 | /// Returns the total size of the type. Must only be called on sized types. |
109 | unsigned getSizeInBits() const { |
110 | if (isPointer() || isScalar()) |
111 | return getScalarSizeInBits(); |
112 | return getScalarSizeInBits() * getNumElements(); |
113 | } |
114 | |
115 | /// Returns the total size of the type in bytes, i.e. number of whole bytes |
116 | /// needed to represent the size in bits. Must only be called on sized types. |
117 | unsigned getSizeInBytes() const { |
118 | return (getSizeInBits() + 7) / 8; |
119 | } |
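Worked example for the two size queries above, derived directly from the code: for <4 x s16>, getSizeInBits() is 16 * 4 = 64 and getSizeInBytes() is (64 + 7) / 8 = 8; for a non-byte-sized s20 scalar, getSizeInBytes() rounds up to (20 + 7) / 8 = 3.

  assert(LLT::vector(4, 16).getSizeInBits() == 64);
  assert(LLT::vector(4, 16).getSizeInBytes() == 8);
  assert(LLT::scalar(20).getSizeInBytes() == 3);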
120 | |
121 | LLT getScalarType() const { |
122 | return isVector() ? getElementType() : *this; |
123 | } |
124 | |
125 | /// If this type is a vector, return a vector with the same number of elements |
126 | /// but the new element type. Otherwise, return the new element type. |
127 | LLT changeElementType(LLT NewEltTy) const { |
128 | return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy; |
129 | } |
130 | |
131 | /// If this type is a vector, return a vector with the same number of elements |
132 | /// but the new element size. Otherwise, return a scalar with the new size.
133 | /// Invalid for pointer types; for pointers, use changeElementType.
134 | LLT changeElementSize(unsigned NewEltSize) const { |
135 | assert(!getScalarType().isPointer() &&
136 | "invalid to directly change element size for pointers");
137 | return isVector() ? LLT::vector(getNumElements(), NewEltSize) |
138 | : LLT::scalar(NewEltSize); |
139 | } |
140 | |
141 | /// Return a vector or scalar with the same element type and the new number of |
142 | /// elements. |
143 | LLT changeNumElements(unsigned NewNumElts) const { |
144 | return LLT::scalarOrVector(NewNumElts, getScalarType()); |
145 | } |
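A short sketch of how the three change* helpers above behave, using only this header's API (illustrative):

  LLT V4S16    = LLT::vector(4, 16);                        // <4 x s16>
  LLT Widened  = V4S16.changeElementType(LLT::scalar(32));  // <4 x s32>
  LLT Narrowed = V4S16.changeElementSize(8);                // <4 x s8>
  LLT Single   = V4S16.changeNumElements(1);                // s16, collapsed by scalarOrVector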
146 | |
147 | /// Return a type that is \p Factor times smaller. Reduces the number of |
148 | /// elements if this is a vector, or the bitwidth for scalar/pointers. Does |
149 | /// not attempt to handle cases that aren't evenly divisible. |
150 | LLT divide(int Factor) const { |
151 | assert(Factor != 1);
152 | if (isVector()) { |
153 | assert(getNumElements() % Factor == 0);
154 | return scalarOrVector(getNumElements() / Factor, getElementType()); |
155 | } |
156 | |
157 | assert(getSizeInBits() % Factor == 0);
158 | return scalar(getSizeInBits() / Factor); |
159 | } |
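Illustrative uses of divide(), which (per the asserts) is only defined when the factor divides evenly:

  LLT::vector(8, 32).divide(2);  // <4 x s32>: element count is halved
  LLT::vector(8, 32).divide(8);  // s32: collapses to a scalar via scalarOrVector
  LLT::scalar(64).divide(4);     // s16: bit width is divided for scalars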
160 | |
161 | bool isByteSized() const { return (getSizeInBits() & 7) == 0; } |
162 | |
163 | unsigned getScalarSizeInBits() const { |
164 | assert(RawData != 0 && "Invalid Type");
165 | if (!IsVector) { |
166 | if (!IsPointer) |
167 | return getFieldValue(ScalarSizeFieldInfo); |
168 | else |
169 | return getFieldValue(PointerSizeFieldInfo); |
170 | } else { |
171 | if (!IsPointer) |
172 | return getFieldValue(VectorSizeFieldInfo); |
173 | else |
174 | return getFieldValue(PointerVectorSizeFieldInfo); |
175 | } |
176 | } |
177 | |
178 | unsigned getAddressSpace() const { |
179 | assert(RawData != 0 && "Invalid Type");
180 | assert(IsPointer && "cannot get address space of non-pointer type");
181 | if (!IsVector) |
182 | return getFieldValue(PointerAddressSpaceFieldInfo); |
183 | else |
184 | return getFieldValue(PointerVectorAddressSpaceFieldInfo); |
185 | } |
186 | |
187 | /// Returns the vector's element type. Only valid for vector types. |
188 | LLT getElementType() const { |
189 | assert(isVector() && "cannot get element type of scalar/aggregate");
190 | if (IsPointer) |
191 | return pointer(getAddressSpace(), getScalarSizeInBits()); |
192 | else |
193 | return scalar(getScalarSizeInBits()); |
194 | } |
195 | |
196 | void print(raw_ostream &OS) const; |
197 | |
198 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
199 | LLVM_DUMP_METHOD void dump() const {
200 | print(dbgs()); |
201 | dbgs() << '\n'; |
202 | } |
203 | #endif |
204 | |
205 | bool operator==(const LLT &RHS) const { |
206 | return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector && |
207 | RHS.RawData == RawData; |
208 | } |
209 | |
210 | bool operator!=(const LLT &RHS) const { return !(*this == RHS); } |
211 | |
212 | friend struct DenseMapInfo<LLT>; |
213 | friend class GISelInstProfileBuilder; |
214 | |
215 | private: |
216 | /// LLT is packed into 64 bits as follows: |
217 | /// isPointer : 1 |
218 | /// isVector : 1 |
219 | /// with 62 bits remaining for Kind-specific data, packed in bitfields |
220 | /// as described below. As there isn't a simple portable way to pack bits |
221 | /// into bitfields, the different fields in the packed structure are
222 | /// described in static const *Field variables. Each of these variables |
223 | /// is a 2-element array, with the first element describing the bitfield size |
224 | /// and the second element describing the bitfield offset. |
225 | typedef int BitFieldInfo[2]; |
226 | /// |
227 | /// This is how the bitfields are packed per Kind: |
228 | /// * Invalid: |
229 | /// gets encoded as RawData == 0, as that is an invalid encoding, since for |
230 | /// valid encodings, SizeInBits/SizeOfElement must be larger than 0. |
231 | /// * Non-pointer scalar (isPointer == 0 && isVector == 0): |
232 | /// SizeInBits: 32; |
233 | static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0}; |
234 | /// * Pointer (isPointer == 1 && isVector == 0): |
235 | /// SizeInBits: 16; |
236 | /// AddressSpace: 24; |
237 | static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0}; |
238 | static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{ |
239 | 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]}; |
240 | /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1): |
241 | /// NumElements: 16; |
242 | /// SizeOfElement: 32; |
243 | static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0}; |
244 | static const constexpr BitFieldInfo VectorSizeFieldInfo{ |
245 | 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]}; |
246 | /// * Vector-of-pointer (isPointer == 1 && isVector == 1): |
247 | /// NumElements: 16; |
248 | /// SizeOfElement: 16; |
249 | /// AddressSpace: 24; |
250 | static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0}; |
251 | static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{ |
252 | 16, |
253 | PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]}; |
254 | static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{ |
255 | 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]}; |
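Worked example of the packing these field tables describe: for the non-pointer vector <4 x s16>, init() places NumElements = 4 in bits [0,16) and SizeOfElement = 16 in bits [16,48), so RawData == 4 | (16 << 16) == 0x100004 with IsVector == 1 and IsPointer == 0; a plain s32 scalar stores only its size, giving RawData == 32.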
256 | |
257 | uint64_t IsPointer : 1; |
258 | uint64_t IsVector : 1; |
259 | uint64_t RawData : 62; |
260 | |
261 | static uint64_t getMask(const BitFieldInfo FieldInfo) { |
262 | const int FieldSizeInBits = FieldInfo[0]; |
263 | return (((uint64_t)1) << FieldSizeInBits) - 1; |
264 | } |
265 | static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) { |
266 | assert(Val <= Mask && "Value too large for field");
267 | return (Val & Mask) << Shift; |
268 | } |
269 | static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) { |
270 | return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]); |
271 | } |
272 | uint64_t getFieldValue(const BitFieldInfo FieldInfo) const { |
273 | return getMask(FieldInfo) & (RawData >> FieldInfo[1]); |
274 | } |
275 | |
276 | void init(bool IsPointer, bool IsVector, uint16_t NumElements, |
277 | unsigned SizeInBits, unsigned AddressSpace) { |
278 | this->IsPointer = IsPointer; |
279 | this->IsVector = IsVector; |
280 | if (!IsVector) { |
281 | if (!IsPointer) |
282 | RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo); |
283 | else |
284 | RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) | |
285 | maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo); |
286 | } else { |
287 | assert(NumElements > 1 && "invalid number of vector elements");
288 | if (!IsPointer) |
289 | RawData = maskAndShift(NumElements, VectorElementsFieldInfo) | |
290 | maskAndShift(SizeInBits, VectorSizeFieldInfo); |
291 | else |
292 | RawData = |
293 | maskAndShift(NumElements, PointerVectorElementsFieldInfo) | |
294 | maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) | |
295 | maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo); |
296 | } |
297 | } |
298 | |
299 | uint64_t getUniqueRAWLLTData() const { |
300 | return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 | |
301 | ((uint64_t)IsVector); |
302 | } |
303 | }; |
304 | |
305 | inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) { |
306 | Ty.print(OS); |
307 | return OS; |
308 | } |
309 | |
310 | template<> struct DenseMapInfo<LLT> { |
311 | static inline LLT getEmptyKey() { |
312 | LLT Invalid; |
313 | Invalid.IsPointer = true; |
314 | return Invalid; |
315 | } |
316 | static inline LLT getTombstoneKey() { |
317 | LLT Invalid; |
318 | Invalid.IsVector = true; |
319 | return Invalid; |
320 | } |
321 | static inline unsigned getHashValue(const LLT &Ty) { |
322 | uint64_t Val = Ty.getUniqueRAWLLTData(); |
323 | return DenseMapInfo<uint64_t>::getHashValue(Val); |
324 | } |
325 | static bool isEqual(const LLT &LHS, const LLT &RHS) { |
326 | return LHS == RHS; |
327 | } |
328 | }; |
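Because this specialization supplies empty/tombstone keys and a hash, LLT can be used directly as a DenseMap key. A minimal usage sketch (it assumes llvm/ADT/DenseMap.h, which this header does not itself include; the map name is hypothetical):

  #include "llvm/ADT/DenseMap.h"

  llvm::DenseMap<llvm::LLT, unsigned> NumUsesOfType;
  NumUsesOfType[llvm::LLT::scalar(32)] += 1;
  NumUsesOfType[llvm::LLT::vector(2, 32)] += 1;
  bool Seen = NumUsesOfType.count(llvm::LLT::scalar(32)); // true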
329 | |
330 | } |
331 | |
332 | #endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |