File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 4351, column 49: Division by zero
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 | //
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 | // See https://llvm.org/LICENSE.txt for license information.
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 | //
7 | //===----------------------------------------------------------------------===//
8 | /// \file
9 | /// This file implements the targeting of the MachineLegalizer class for
10 | /// AMDGPU.
11 | /// \todo This should be generated by TableGen.
12 | //===----------------------------------------------------------------------===//
13 |
14 | #include "AMDGPULegalizerInfo.h"
15 |
16 | #include "AMDGPU.h"
17 | #include "AMDGPUGlobalISelUtils.h"
18 | #include "AMDGPUInstrInfo.h"
19 | #include "AMDGPUTargetMachine.h"
20 | #include "SIMachineFunctionInfo.h"
21 | #include "Utils/AMDGPUBaseInfo.h"
22 | #include "llvm/ADT/ScopeExit.h"
23 | #include "llvm/BinaryFormat/ELF.h"
24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 | #include "llvm/IR/DiagnosticInfo.h"
28 | #include "llvm/IR/IntrinsicsAMDGPU.h"
29 |
30 | #define DEBUG_TYPE "amdgpu-legalinfo"
31 |
32 | using namespace llvm;
33 | using namespace LegalizeActions;
34 | using namespace LegalizeMutations;
35 | using namespace LegalityPredicates;
36 | using namespace MIPatternMatch;
37 |
38 | // Hack until load/store selection patterns support any tuple of legal types.
39 | static cl::opt<bool> EnableNewLegality(
40 |     "amdgpu-global-isel-new-legality",
41 |     cl::desc("Use GlobalISel desired legality, rather than try to use "
42 |              "rules compatible with selection patterns"),
43 |     cl::init(false),
44 |     cl::ReallyHidden);
45 |
46 | static constexpr unsigned MaxRegisterSize = 1024;
47 |
48 | // Round the number of elements up to the next power of two.
49 | static LLT getPow2VectorType(LLT Ty) {
50 |   unsigned NElts = Ty.getNumElements();
51 |   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 |   return Ty.changeNumElements(Pow2NElts);
53 | }
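// Editorial note — a worked example, not in the original source: for
// getPow2VectorType(LLT::vector(3, 16)), NElts == 3, Log2_32_Ceil(3) == 2,
// so Pow2NElts == 4 and the result is <4 x s16>.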
54 |
55 | // Round the number of bits up to the next power of two.
56 | static LLT getPow2ScalarType(LLT Ty) {
57 |   unsigned Bits = Ty.getSizeInBits();
58 |   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 |   return LLT::scalar(Pow2Bits);
60 | }
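// Editorial note — illustrative, not in the original source:
// getPow2ScalarType(LLT::scalar(48)) yields s64, since Log2_32_Ceil(48) == 6
// and 1 << 6 == 64 bits.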
61 |
62 | /// \returns true if this is an odd sized vector which should widen by adding an
63 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 | /// excludes s1 vectors, which should always be scalarized.
65 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 |   return [=](const LegalityQuery &Query) {
67 |     const LLT Ty = Query.Types[TypeIdx];
68 |     if (!Ty.isVector())
69 |       return false;
70 |
71 |     const LLT EltTy = Ty.getElementType();
72 |     const unsigned EltSize = EltTy.getSizeInBits();
73 |     return Ty.getNumElements() % 2 != 0 &&
74 |            EltSize > 1 && EltSize < 32 &&
75 |            Ty.getSizeInBits() % 32 != 0;
76 |   };
77 | }
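// Editorial note — illustrative, not in the original source: <3 x s16>
// satisfies all three conditions (odd count, 1 < EltSize < 32, 48 % 32 != 0)
// and so widens to <4 x s16>; <3 x s32> fails EltSize < 32, and <2 x s16>
// fails the odd-count check, so both are left alone by this predicate.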
78 |
79 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 |   return [=](const LegalityQuery &Query) {
81 |     const LLT Ty = Query.Types[TypeIdx];
82 |     return Ty.getSizeInBits() % 32 == 0;
83 |   };
84 | }
85 |
86 | static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 |   return [=](const LegalityQuery &Query) {
88 |     const LLT Ty = Query.Types[TypeIdx];
89 |     const LLT EltTy = Ty.getScalarType();
90 |     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 |   };
92 | }
93 |
94 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 |   return [=](const LegalityQuery &Query) {
96 |     const LLT Ty = Query.Types[TypeIdx];
97 |     const LLT EltTy = Ty.getElementType();
98 |     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99 |   };
100 | }
101 |
102 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103 |   return [=](const LegalityQuery &Query) {
104 |     const LLT Ty = Query.Types[TypeIdx];
105 |     const LLT EltTy = Ty.getElementType();
106 |     unsigned Size = Ty.getSizeInBits();
107 |     unsigned Pieces = (Size + 63) / 64;
108 |     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109 |     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110 |   };
111 | }
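// Editorial note — illustrative, not in the original source: for <4 x s32>,
// Size == 128, Pieces == 2 and NewNumElts == (4 + 1) / 2 == 2, so the mutation
// produces <2 x s32>; a <3 x s32> (96 bits) also maps to <2 x s32>, with the
// remainder handled by further legalization of the split pieces.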
112 |
113 | // Increase the number of vector elements so the total size reaches the next
114 | // multiple of 32 bits.
115 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116 |   return [=](const LegalityQuery &Query) {
117 |     const LLT Ty = Query.Types[TypeIdx];
118 |
119 |     const LLT EltTy = Ty.getElementType();
120 |     const int Size = Ty.getSizeInBits();
121 |     const int EltSize = EltTy.getSizeInBits();
122 |     const int NextMul32 = (Size + 31) / 32;
123 |
124 |     assert(EltSize < 32);
125 |
126 |     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127 |     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128 |   };
129 | }
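// Editorial note — illustrative, not in the original source: for <3 x s16>,
// Size == 48, so NextMul32 == 2 and NewNumElts == (64 + 15) / 16 == 4, giving
// <4 x s16> (64 bits, the next multiple of 32).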
130 |
131 | static LLT getBitcastRegisterType(const LLT Ty) {
132 |   const unsigned Size = Ty.getSizeInBits();
133 |
134 |   LLT CoercedTy;
135 |   if (Size <= 32) {
136 |     // <2 x s8> -> s16
137 |     // <4 x s8> -> s32
138 |     return LLT::scalar(Size);
139 |   }
140 |
141 |   return LLT::scalarOrVector(Size / 32, 32);
142 | }
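// Editorial note — illustrative, not in the original source: beyond the
// commented <2 x s8> -> s16 and <4 x s8> -> s32 cases, a 96-bit <6 x s16>
// coerces to <3 x s32>, and an s64 scalar would map to <2 x s32> via
// LLT::scalarOrVector(64 / 32, 32).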
143 |
144 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145 |   return [=](const LegalityQuery &Query) {
146 |     const LLT Ty = Query.Types[TypeIdx];
147 |     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148 |   };
149 | }
150 |
151 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152 |   return [=](const LegalityQuery &Query) {
153 |     const LLT Ty = Query.Types[TypeIdx];
154 |     unsigned Size = Ty.getSizeInBits();
155 |     assert(Size % 32 == 0);
156 |     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157 |   };
158 | }
159 |
160 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161 |   return [=](const LegalityQuery &Query) {
162 |     const LLT QueryTy = Query.Types[TypeIdx];
163 |     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164 |   };
165 | }
166 |
167 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168 |   return [=](const LegalityQuery &Query) {
169 |     const LLT QueryTy = Query.Types[TypeIdx];
170 |     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171 |   };
172 | }
173 |
174 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175 |   return [=](const LegalityQuery &Query) {
176 |     const LLT QueryTy = Query.Types[TypeIdx];
177 |     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178 |   };
179 | }
180 |
181 | static bool isRegisterSize(unsigned Size) {
182 |   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 | }
184 |
185 | static bool isRegisterVectorElementType(LLT EltTy) {
186 |   const int EltSize = EltTy.getSizeInBits();
187 |   return EltSize == 16 || EltSize % 32 == 0;
188 | }
189 |
190 | static bool isRegisterVectorType(LLT Ty) {
191 |   const int EltSize = Ty.getElementType().getSizeInBits();
192 |   return EltSize == 32 || EltSize == 64 ||
193 |          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194 |          EltSize == 128 || EltSize == 256;
195 | }
196 |
197 | static bool isRegisterType(LLT Ty) {
198 |   if (!isRegisterSize(Ty.getSizeInBits()))
199 |     return false;
200 |
201 |   if (Ty.isVector())
202 |     return isRegisterVectorType(Ty);
203 |
204 |   return true;
205 | }
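// Editorial note — illustrative, not in the original source: s96 is a register
// type (96 is a multiple of 32 and <= 1024); <3 x s16> is not (48 bits total);
// <4 x s16> is (64 bits, an even count of 16-bit elements).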
206 |
207 | // Any combination of 32 or 64-bit elements up to the maximum register size,
208 | // and multiples of v2s16.
209 | static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210 |   return [=](const LegalityQuery &Query) {
211 |     return isRegisterType(Query.Types[TypeIdx]);
212 |   };
213 | }
214 |
215 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216 |   return [=](const LegalityQuery &Query) {
217 |     const LLT QueryTy = Query.Types[TypeIdx];
218 |     if (!QueryTy.isVector())
219 |       return false;
220 |     const LLT EltTy = QueryTy.getElementType();
221 |     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222 |   };
223 | }
224 |
225 | static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226 |   return [=](const LegalityQuery &Query) {
227 |     const LLT Ty = Query.Types[TypeIdx];
228 |     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229 |            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230 |   };
231 | }
232 |
233 | // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 | // handle some operations by just promoting the register during
235 | // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 | static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237 |                                     bool IsLoad) {
238 |   switch (AS) {
239 |   case AMDGPUAS::PRIVATE_ADDRESS:
240 |     // FIXME: Private element size.
241 |     return ST.enableFlatScratch() ? 128 : 32;
242 |   case AMDGPUAS::LOCAL_ADDRESS:
243 |     return ST.useDS128() ? 128 : 64;
244 |   case AMDGPUAS::GLOBAL_ADDRESS:
245 |   case AMDGPUAS::CONSTANT_ADDRESS:
246 |   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247 |     // Treat constant and global as identical. SMRD loads are sometimes usable for
248 |     // global loads (ideally constant address space should be eliminated)
249 |     // depending on the context. Legality cannot be context dependent, but
250 |     // RegBankSelect can split the load as necessary depending on the pointer
251 |     // register bank/uniformity and if the memory is invariant or not written in a
252 |     // kernel.
253 |     return IsLoad ? 512 : 128;
254 |   default:
255 |     // Flat addresses may contextually need to be split to 32-bit parts if they
256 |     // may alias scratch depending on the subtarget.
257 |     return 128;
258 |   }
259 | }
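// Editorial note — illustrative, not in the original source: on a subtarget
// where useDS128() is true, a local (LDS) access may be up to 128 bits,
// otherwise 64; a global load may be up to 512 bits while a global store is
// capped at 128, matching the IsLoad ? 512 : 128 case above.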
260 |
261 | static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262 |                                  const LegalityQuery &Query,
263 |                                  unsigned Opcode) {
264 |   const LLT Ty = Query.Types[0];
265 |
266 |   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267 |   const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 |
269 |   unsigned RegSize = Ty.getSizeInBits();
270 |   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271 |   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272 |   unsigned AS = Query.Types[1].getAddressSpace();
273 |
274 |   // All of these need to be custom lowered to cast the pointer operand.
275 |   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276 |     return false;
277 |
278 |   // TODO: We should be able to widen loads if the alignment is high enough, but
279 |   // we also need to modify the memory access size.
280 | #if 0
281 |   // Accept widening loads based on alignment.
282 |   if (IsLoad && MemSize < Size)
283 |     MemSize = std::max(MemSize, Align);
284 | #endif
285 |
286 |   // Only 1-byte and 2-byte to 32-bit extloads are valid.
287 |   if (MemSize != RegSize && RegSize != 32)
288 |     return false;
289 |
290 |   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291 |     return false;
292 |
293 |   switch (MemSize) {
294 |   case 8:
295 |   case 16:
296 |   case 32:
297 |   case 64:
298 |   case 128:
299 |     break;
300 |   case 96:
301 |     if (!ST.hasDwordx3LoadStores())
302 |       return false;
303 |     break;
304 |   case 256:
305 |   case 512:
306 |     // These may contextually need to be broken down.
307 |     break;
308 |   default:
309 |     return false;
310 |   }
311 |
312 |   assert(RegSize >= MemSize);
313 |
314 |   if (AlignBits < MemSize) {
315 |     const SITargetLowering *TLI = ST.getTargetLowering();
316 |     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317 |                                                  Align(AlignBits / 8)))
318 |       return false;
319 |   }
320 |
321 |   return true;
322 | }
323 |
324 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
325 | // work around this. Eventually it should ignore the type for loads and only care
326 | // about the size. Return true in cases where we will work around this for now by
327 | // bitcasting.
328 | static bool loadStoreBitcastWorkaround(const LLT Ty) {
329 |   if (EnableNewLegality)
330 |     return false;
331 |
332 |   const unsigned Size = Ty.getSizeInBits();
333 |   if (Size <= 64)
334 |     return false;
335 |   if (!Ty.isVector())
336 |     return true;
337 |
338 |   LLT EltTy = Ty.getElementType();
339 |   if (EltTy.isPointer())
340 |     return true;
341 |
342 |   unsigned EltSize = EltTy.getSizeInBits();
343 |   return EltSize != 32 && EltSize != 64;
344 | }
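// Editorial note — illustrative, not in the original source: s96 hits the
// scalar-wider-than-64 path and returns true; <8 x s16> returns true because
// its 16-bit elements are neither 32 nor 64 bits wide; <2 x s64> and plain s64
// both return false.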
345 |
346 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347 |                              unsigned Opcode) {
348 |   const LLT Ty = Query.Types[0];
349 |   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350 |          !loadStoreBitcastWorkaround(Ty);
351 | }
352 |
353 | /// Return true if a load or store of the type should be lowered with a bitcast
354 | /// to a different type.
355 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356 |                                        const unsigned MemSizeInBits) {
357 |   const unsigned Size = Ty.getSizeInBits();
358 |   if (Size != MemSizeInBits)
359 |     return Size <= 32 && Ty.isVector();
360 |
361 |   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362 |     return true;
363 |   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364 |          !isRegisterVectorElementType(Ty.getElementType());
365 | }
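// Editorial note — illustrative, not in the original source: a non-extending
// <4 x s8> load (type and memory size both 32 bits) returns true here and is
// then bitcast to s32 by bitcastToRegisterType(0); <4 x s32> returns false
// since 32-bit elements already satisfy isRegisterVectorElementType.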
366 |
367 | /// Return true if we should legalize a load by widening an odd sized memory
368 | /// access up to the alignment. Note this is the case when the memory access itself
369 | /// changes, not the size of the result register.
370 | static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371 |                             unsigned AlignInBits, unsigned AddrSpace,
372 |                             unsigned Opcode) {
373 |   // We don't want to widen cases that are naturally legal.
374 |   if (isPowerOf2_32(SizeInBits))
375 |     return false;
376 |
377 |   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378 |   // end up widening these for a scalar load during RegBankSelect, since there
379 |   // aren't 96-bit scalar loads.
380 |   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381 |     return false;
382 |
383 |   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384 |     return false;
385 |
386 |   // A load is known dereferenceable up to the alignment, so it's legal to widen
387 |   // to it.
388 |   //
389 |   // TODO: Could check dereferenceable for less aligned cases.
390 |   unsigned RoundedSize = NextPowerOf2(SizeInBits);
391 |   if (AlignInBits < RoundedSize)
392 |     return false;
393 |
394 |   // Do not widen if it would introduce a slow unaligned load.
395 |   const SITargetLowering *TLI = ST.getTargetLowering();
396 |   bool Fast = false;
397 |   return TLI->allowsMisalignedMemoryAccessesImpl(
398 |              RoundedSize, AddrSpace, Align(AlignInBits / 8),
399 |              MachineMemOperand::MOLoad, &Fast) &&
400 |          Fast;
401 | }
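// Editorial note — illustrative, not in the original source: an 80-bit load
// known aligned to 128 bits rounds up to NextPowerOf2(80) == 128 and widens if
// the target reports a fast 128-bit access; a 96-bit load is skipped on
// subtargets with dwordx3 load/stores by the early check above.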
402 |
403 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404 |                             unsigned Opcode) {
405 |   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406 |     return false;
407 |
408 |   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409 |                          Query.MMODescrs[0].AlignInBits,
410 |                          Query.Types[1].getAddressSpace(), Opcode);
411 | }
412 |
413 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414 |                                          const GCNTargetMachine &TM)
415 |     : ST(ST_) {
416 |   using namespace TargetOpcode;
417 |
418 |   auto GetAddrSpacePtr = [&TM](unsigned AS) {
419 |     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420 |   };
421 |
422 |   const LLT S1 = LLT::scalar(1);
423 |   const LLT S8 = LLT::scalar(8);
424 |   const LLT S16 = LLT::scalar(16);
425 |   const LLT S32 = LLT::scalar(32);
426 |   const LLT S64 = LLT::scalar(64);
427 |   const LLT S128 = LLT::scalar(128);
428 |   const LLT S256 = LLT::scalar(256);
429 |   const LLT S512 = LLT::scalar(512);
430 |   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 |
432 |   const LLT V2S8 = LLT::vector(2, 8);
433 |   const LLT V2S16 = LLT::vector(2, 16);
434 |   const LLT V4S16 = LLT::vector(4, 16);
435 |
436 |   const LLT V2S32 = LLT::vector(2, 32);
437 |   const LLT V3S32 = LLT::vector(3, 32);
438 |   const LLT V4S32 = LLT::vector(4, 32);
439 |   const LLT V5S32 = LLT::vector(5, 32);
440 |   const LLT V6S32 = LLT::vector(6, 32);
441 |   const LLT V7S32 = LLT::vector(7, 32);
442 |   const LLT V8S32 = LLT::vector(8, 32);
443 |   const LLT V9S32 = LLT::vector(9, 32);
444 |   const LLT V10S32 = LLT::vector(10, 32);
445 |   const LLT V11S32 = LLT::vector(11, 32);
446 |   const LLT V12S32 = LLT::vector(12, 32);
447 |   const LLT V13S32 = LLT::vector(13, 32);
448 |   const LLT V14S32 = LLT::vector(14, 32);
449 |   const LLT V15S32 = LLT::vector(15, 32);
450 |   const LLT V16S32 = LLT::vector(16, 32);
451 |   const LLT V32S32 = LLT::vector(32, 32);
452 |
453 |   const LLT V2S64 = LLT::vector(2, 64);
454 |   const LLT V3S64 = LLT::vector(3, 64);
455 |   const LLT V4S64 = LLT::vector(4, 64);
456 |   const LLT V5S64 = LLT::vector(5, 64);
457 |   const LLT V6S64 = LLT::vector(6, 64);
458 |   const LLT V7S64 = LLT::vector(7, 64);
459 |   const LLT V8S64 = LLT::vector(8, 64);
460 |   const LLT V16S64 = LLT::vector(16, 64);
461 |
462 |   std::initializer_list<LLT> AllS32Vectors =
463 |       {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464 |        V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465 |   std::initializer_list<LLT> AllS64Vectors =
466 |       {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 |
468 |   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469 |   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470 |   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471 |   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472 |   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473 |   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474 |   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 |
476 |   const LLT CodePtr = FlatPtr;
477 |
478 |   const std::initializer_list<LLT> AddrSpaces64 = {
479 |     GlobalPtr, ConstantPtr, FlatPtr
480 |   };
481 |
482 |   const std::initializer_list<LLT> AddrSpaces32 = {
483 |     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484 |   };
485 |
486 |   const std::initializer_list<LLT> FPTypesBase = {
487 |     S32, S64
488 |   };
489 |
490 |   const std::initializer_list<LLT> FPTypes16 = {
491 |     S32, S64, S16
492 |   };
493 |
494 |   const std::initializer_list<LLT> FPTypesPK16 = {
495 |     S32, S64, S16, V2S16
496 |   };
497 |
498 |   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 |
500 |   setAction({G_BRCOND, S1}, Legal); // VCC branches
501 |   setAction({G_BRCOND, S32}, Legal); // SCC branches
502 |
503 |   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504 |   // elements for v3s16
505 |   getActionDefinitionsBuilder(G_PHI)
506 |     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507 |     .legalFor(AllS32Vectors)
508 |     .legalFor(AllS64Vectors)
509 |     .legalFor(AddrSpaces64)
510 |     .legalFor(AddrSpaces32)
511 |     .legalIf(isPointer(0))
512 |     .clampScalar(0, S16, S256)
513 |     .widenScalarToNextPow2(0, 32)
514 |     .clampMaxNumElements(0, S32, 16)
515 |     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516 |     .scalarize(0);
517 |
518 |   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519 |     // Full set of gfx9 features.
520 |     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521 |       .legalFor({S32, S16, V2S16})
522 |       .clampScalar(0, S16, S32)
523 |       .clampMaxNumElements(0, S16, 2)
524 |       .scalarize(0)
525 |       .widenScalarToNextPow2(0, 32);
526 |
527 |     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528 |       .legalFor({S32, S16, V2S16}) // Clamp modifier
529 |       .minScalarOrElt(0, S16)
530 |       .clampMaxNumElements(0, S16, 2)
531 |       .scalarize(0)
532 |       .widenScalarToNextPow2(0, 32)
533 |       .lower();
534 |   } else if (ST.has16BitInsts()) {
535 |     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536 |       .legalFor({S32, S16})
537 |       .clampScalar(0, S16, S32)
538 |       .scalarize(0)
539 |       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 |
541 |     // Technically the saturating operations require clamp bit support, but this
542 |     // was introduced at the same time as 16-bit operations.
543 |     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544 |       .legalFor({S32, S16}) // Clamp modifier
545 |       .minScalar(0, S16)
546 |       .scalarize(0)
547 |       .widenScalarToNextPow2(0, 16)
548 |       .lower();
549 |
550 |     // We're just lowering this, but it helps get a better result to try to
551 |     // coerce to the desired type first.
552 |     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553 |       .minScalar(0, S16)
554 |       .scalarize(0)
555 |       .lower();
556 |   } else {
557 |     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558 |       .legalFor({S32})
559 |       .clampScalar(0, S32, S32)
560 |       .scalarize(0);
561 |
562 |     if (ST.hasIntClamp()) {
563 |       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564 |         .legalFor({S32}) // Clamp modifier.
565 |         .scalarize(0)
566 |         .minScalarOrElt(0, S32)
567 |         .lower();
568 |     } else {
569 |       // Clamp bit support was added in VI, along with 16-bit operations.
570 |       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571 |         .minScalar(0, S32)
572 |         .scalarize(0)
573 |         .lower();
574 |     }
575 |
576 |     // FIXME: DAG expansion gets better results. The widening uses the smaller
577 |     // range values and goes for the min/max lowering directly.
578 |     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579 |       .minScalar(0, S32)
580 |       .scalarize(0)
581 |       .lower();
582 |   }
583 |
584 |   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585 |     .customFor({S32, S64})
586 |     .clampScalar(0, S32, S64)
587 |     .widenScalarToNextPow2(0, 32)
588 |     .scalarize(0);
589 |
590 |   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591 |                    .legalFor({S32})
592 |                    .maxScalarOrElt(0, S32);
593 |
594 |   if (ST.hasVOP3PInsts()) {
595 |     Mulh
596 |       .clampMaxNumElements(0, S8, 2)
597 |       .lowerFor({V2S8});
598 |   }
599 |
600 |   Mulh
601 |     .scalarize(0)
602 |     .lower();
603 |
604 |   // Report legal for any types we can handle anywhere. For the cases only legal
605 |   // on the SALU, RegBankSelect will be able to re-legalize.
606 |   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607 |     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608 |     .clampScalar(0, S32, S64)
609 |     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610 |     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611 |     .widenScalarToNextPow2(0)
612 |     .scalarize(0);
613 |
614 |   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615 |                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616 |     .legalFor({{S32, S1}, {S32, S32}})
617 |     .minScalar(0, S32)
618 |     // TODO: .scalarize(0)
619 |     .lower();
620 |
621 |   getActionDefinitionsBuilder(G_BITCAST)
622 |     // Don't worry about the size constraint.
623 |     .legalIf(all(isRegisterType(0), isRegisterType(1)))
624 |     .lower();
625 |
626 |
627 |   getActionDefinitionsBuilder(G_CONSTANT)
628 |     .legalFor({S1, S32, S64, S16, GlobalPtr,
629 |                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
630 |     .legalIf(isPointer(0))
631 |     .clampScalar(0, S32, S64)
632 |     .widenScalarToNextPow2(0);
633 |
634 |   getActionDefinitionsBuilder(G_FCONSTANT)
635 |     .legalFor({S32, S64, S16})
636 |     .clampScalar(0, S16, S64);
637 |
638 |   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639 |     .legalIf(isRegisterType(0))
640 |     // s1 and s16 are special cases because they have legal operations on
641 |     // them, but don't really occupy registers in the normal way.
642 |     .legalFor({S1, S16})
643 |     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644 |     .clampScalarOrElt(0, S32, MaxScalar)
645 |     .widenScalarToNextPow2(0, 32)
646 |     .clampMaxNumElements(0, S32, 16);
647 |
648 |   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649 |
650 |   // If the amount is divergent, we have to do a wave reduction to get the
651 |   // maximum value, so this is expanded during RegBankSelect.
652 |   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653 |     .legalFor({{PrivatePtr, S32}});
654 |
655 |   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656 |     .customIf(typeIsNot(0, PrivatePtr));
657 |
658 |   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659 |
660 |   auto &FPOpActions = getActionDefinitionsBuilder(
661 |     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662 |     .legalFor({S32, S64});
663 |   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664 |     .customFor({S32, S64});
665 |   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666 |     .customFor({S32, S64});
667 |
668 |   if (ST.has16BitInsts()) {
669 |     if (ST.hasVOP3PInsts())
670 |       FPOpActions.legalFor({S16, V2S16});
671 |     else
672 |       FPOpActions.legalFor({S16});
673 |
674 |     TrigActions.customFor({S16});
675 |     FDIVActions.customFor({S16});
676 |   }
677 |
678 |   auto &MinNumMaxNum = getActionDefinitionsBuilder({
679 |     G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680 |
681 |   if (ST.hasVOP3PInsts()) {
682 |     MinNumMaxNum.customFor(FPTypesPK16)
683 |       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684 |       .clampMaxNumElements(0, S16, 2)
685 |       .clampScalar(0, S16, S64)
686 |       .scalarize(0);
687 |   } else if (ST.has16BitInsts()) {
688 |     MinNumMaxNum.customFor(FPTypes16)
689 |       .clampScalar(0, S16, S64)
690 |       .scalarize(0);
691 |   } else {
692 |     MinNumMaxNum.customFor(FPTypesBase)
693 |       .clampScalar(0, S32, S64)
694 |       .scalarize(0);
695 |   }
696 |
697 |   if (ST.hasVOP3PInsts())
698 |     FPOpActions.clampMaxNumElements(0, S16, 2);
699 |
700 |   FPOpActions
701 |     .scalarize(0)
702 |     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703 |
704 |   TrigActions
705 |     .scalarize(0)
706 |     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707 |
708 |   FDIVActions
709 |     .scalarize(0)
710 |     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 |
712 |   getActionDefinitionsBuilder({G_FNEG, G_FABS})
713 |     .legalFor(FPTypesPK16)
714 |     .clampMaxNumElements(0, S16, 2)
715 |     .scalarize(0)
716 |     .clampScalar(0, S16, S64);
717 |
718 |   if (ST.has16BitInsts()) {
719 |     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720 |       .legalFor({S32, S64, S16})
721 |       .scalarize(0)
722 |       .clampScalar(0, S16, S64);
723 |   } else {
724 |     getActionDefinitionsBuilder(G_FSQRT)
725 |       .legalFor({S32, S64})
726 |       .scalarize(0)
727 |       .clampScalar(0, S32, S64);
728 |
729 |     if (ST.hasFractBug()) {
730 |       getActionDefinitionsBuilder(G_FFLOOR)
731 |         .customFor({S64})
732 |         .legalFor({S32, S64})
733 |         .scalarize(0)
734 |         .clampScalar(0, S32, S64);
735 |     } else {
736 |       getActionDefinitionsBuilder(G_FFLOOR)
737 |         .legalFor({S32, S64})
738 |         .scalarize(0)
739 |         .clampScalar(0, S32, S64);
740 |     }
741 |   }
742 |
743 |   getActionDefinitionsBuilder(G_FPTRUNC)
744 |     .legalFor({{S32, S64}, {S16, S32}})
745 |     .scalarize(0)
746 |     .lower();
747 |
748 |   getActionDefinitionsBuilder(G_FPEXT)
749 |     .legalFor({{S64, S32}, {S32, S16}})
750 |     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751 |     .scalarize(0);
752 |
753 |   getActionDefinitionsBuilder(G_FSUB)
754 |     // Use actual fsub instruction
755 |     .legalFor({S32})
756 |     // Must use fadd + fneg
757 |     .lowerFor({S64, S16, V2S16})
758 |     .scalarize(0)
759 |     .clampScalar(0, S32, S64);
760 |
761 |   // Whether this is legal depends on the floating point mode for the function.
762 |   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763 |   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764 |     FMad.customFor({S32, S16});
765 |   else if (ST.hasMadMacF32Insts())
766 |     FMad.customFor({S32});
767 |   else if (ST.hasMadF16())
768 |     FMad.customFor({S16});
769 |   FMad.scalarize(0)
770 |       .lower();
771 |
772 |   auto &FRem = getActionDefinitionsBuilder(G_FREM);
773 |   if (ST.has16BitInsts()) {
774 |     FRem.customFor({S16, S32, S64});
775 |   } else {
776 |     FRem.minScalar(0, S32)
777 |         .customFor({S32, S64});
778 |   }
779 |   FRem.scalarize(0);
780 |
781 |   // TODO: Do we need to clamp maximum bitwidth?
782 |   getActionDefinitionsBuilder(G_TRUNC)
783 |     .legalIf(isScalar(0))
784 |     .legalFor({{V2S16, V2S32}})
785 |     .clampMaxNumElements(0, S16, 2)
786 |     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787 |     // situations (like an invalid implicit use), we don't want to infinite loop
788 |     // in the legalizer.
789 |     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790 |     .alwaysLegal();
791 |
792 |   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793 |     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794 |                {S32, S1}, {S64, S1}, {S16, S1}})
795 |     .scalarize(0)
796 |     .clampScalar(0, S32, S64)
797 |     .widenScalarToNextPow2(1, 32);
798 |
799 |   // TODO: Split s1->s64 during regbankselect for VALU.
800 |   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801 |                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802 |                     .lowerFor({{S32, S64}})
803 |                     .lowerIf(typeIs(1, S1))
804 |                     .customFor({{S64, S64}});
805 |   if (ST.has16BitInsts())
806 |     IToFP.legalFor({{S16, S16}});
807 |   IToFP.clampScalar(1, S32, S64)
808 |        .minScalar(0, S32)
809 |        .scalarize(0)
810 |        .widenScalarToNextPow2(1);
811 |
812 |   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813 |                     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814 |                     .customFor({{S64, S64}})
815 |                     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816 |   if (ST.has16BitInsts())
817 |     FPToI.legalFor({{S16, S16}});
818 |   else
819 |     FPToI.minScalar(1, S32);
820 |
821 |   FPToI.minScalar(0, S32)
822 |        .widenScalarToNextPow2(0, 32)
823 |        .scalarize(0)
824 |        .lower();
825 |
826 |   // Lower roundeven into G_FRINT
827 |   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
828 |     .scalarize(0)
829 |     .lower();
830 |
831 |   if (ST.has16BitInsts()) {
832 |     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
833 |       .legalFor({S16, S32, S64})
834 |       .clampScalar(0, S16, S64)
835 |       .scalarize(0);
836 |   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
837 |     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838 |       .legalFor({S32, S64})
839 |       .clampScalar(0, S32, S64)
840 |       .scalarize(0);
841 |   } else {
842 |     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843 |       .legalFor({S32})
844 |       .customFor({S64})
845 |       .clampScalar(0, S32, S64)
846 |       .scalarize(0);
847 |   }
848 |
849 |   getActionDefinitionsBuilder(G_PTR_ADD)
850 |     .legalIf(all(isPointer(0), sameSize(0, 1)))
851 |     .scalarize(0)
852 |     .scalarSameSizeAs(1, 0);
853 |
854 |   getActionDefinitionsBuilder(G_PTRMASK)
855 |     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
856 |     .scalarSameSizeAs(1, 0)
857 |     .scalarize(0);
858 |
859 |   auto &CmpBuilder =
860 |     getActionDefinitionsBuilder(G_ICMP)
861 |     // The compare output type differs based on the register bank of the output,
862 |     // so make both s1 and s32 legal.
863 |     //
864 |     // Scalar compares producing output in scc will be promoted to s32, as that
865 |     // is the allocatable register type that will be needed for the copy from
866 |     // scc. This will be promoted during RegBankSelect, and we assume something
867 |     // before that won't try to use s32 result types.
868 |     //
869 |     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
870 |     // bank.
871 |     .legalForCartesianProduct(
872 |       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
873 |     .legalForCartesianProduct(
874 |       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
875 |   if (ST.has16BitInsts()) {
876 |     CmpBuilder.legalFor({{S1, S16}});
877 |   }
878 |
879 |   CmpBuilder
880 |     .widenScalarToNextPow2(1)
881 |     .clampScalar(1, S32, S64)
882 |     .scalarize(0)
883 |     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
884 |
885 |   getActionDefinitionsBuilder(G_FCMP)
886 |     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
887 |     .widenScalarToNextPow2(1)
888 |     .clampScalar(1, S32, S64)
889 |     .scalarize(0);
890 |
891 |   // FIXME: fpow has a selection pattern that should move to custom lowering.
892 |   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
893 |   if (ST.has16BitInsts())
894 |     Exp2Ops.legalFor({S32, S16});
895 |   else
896 |     Exp2Ops.legalFor({S32});
897 |   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
898 |   Exp2Ops.scalarize(0);
899 |
900 |   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
901 |   if (ST.has16BitInsts())
902 |     ExpOps.customFor({{S32}, {S16}});
903 |   else
904 |     ExpOps.customFor({S32});
905 |   ExpOps.clampScalar(0, MinScalarFPTy, S32)
906 |         .scalarize(0);
907 |
908 |   getActionDefinitionsBuilder(G_FPOWI)
909 |     .clampScalar(0, MinScalarFPTy, S32)
910 |     .lower();
911 |
912 |   // The 64-bit versions produce 32-bit results, but only on the SALU.
913 |   getActionDefinitionsBuilder(G_CTPOP)
914 |     .legalFor({{S32, S32}, {S32, S64}})
915 |     .clampScalar(0, S32, S32)
916 |     .clampScalar(1, S32, S64)
917 |     .scalarize(0)
918 |     .widenScalarToNextPow2(0, 32)
919 |     .widenScalarToNextPow2(1, 32);
920 |
921 |   // The hardware instructions return a different result on 0 than the generic
922 |   // instructions expect. The hardware produces -1, but these produce the
923 |   // bitwidth.
924 |   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
925 |     .scalarize(0)
926 |     .clampScalar(0, S32, S32)
927 |     .clampScalar(1, S32, S64)
928 |     .widenScalarToNextPow2(0, 32)
929 |     .widenScalarToNextPow2(1, 32)
930 |     .lower();
931 |
932 |   // The 64-bit versions produce 32-bit results, but only on the SALU.
933 |   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
934 |     .legalFor({{S32, S32}, {S32, S64}})
935 |     .clampScalar(0, S32, S32)
936 |     .clampScalar(1, S32, S64)
937 |     .scalarize(0)
938 |     .widenScalarToNextPow2(0, 32)
939 |     .widenScalarToNextPow2(1, 32);
940 |
941 |   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
942 |   // RegBankSelect.
943 |   getActionDefinitionsBuilder(G_BITREVERSE)
944 |     .legalFor({S32, S64})
945 |     .clampScalar(0, S32, S64)
946 |     .scalarize(0)
947 |     .widenScalarToNextPow2(0);
948 |
949 |   if (ST.has16BitInsts()) {
950 |     getActionDefinitionsBuilder(G_BSWAP)
951 |       .legalFor({S16, S32, V2S16})
952 |       .clampMaxNumElements(0, S16, 2)
953 |       // FIXME: Fixing non-power-of-2 before clamp is workaround for
954 |       // narrowScalar limitation.
955 |       .widenScalarToNextPow2(0)
956 |       .clampScalar(0, S16, S32)
957 |       .scalarize(0);
958 |
959 |     if (ST.hasVOP3PInsts()) {
960 |       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
961 |         .legalFor({S32, S16, V2S16})
962 |         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
963 |         .clampMaxNumElements(0, S16, 2)
964 |         .minScalar(0, S16)
965 |         .widenScalarToNextPow2(0)
966 |         .scalarize(0)
967 |         .lower();
968 |     } else {
969 |       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970 |         .legalFor({S32, S16})
971 |         .widenScalarToNextPow2(0)
972 |         .minScalar(0, S16)
973 |         .scalarize(0)
974 |         .lower();
975 |     }
976 |   } else {
977 |     // TODO: Should have same legality without v_perm_b32
978 |     getActionDefinitionsBuilder(G_BSWAP)
979 |       .legalFor({S32})
980 |       .lowerIf(scalarNarrowerThan(0, 32))
981 |       // FIXME: Fixing non-power-of-2 before clamp is workaround for
982 |       // narrowScalar limitation.
983 |       .widenScalarToNextPow2(0)
984 |       .maxScalar(0, S32)
985 |       .scalarize(0)
986 |       .lower();
987 |
988 |     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
989 |       .legalFor({S32})
990 |       .minScalar(0, S32)
991 |       .widenScalarToNextPow2(0)
992 |       .scalarize(0)
993 |       .lower();
994 |   }
995 |
996 |   getActionDefinitionsBuilder(G_INTTOPTR)
997 |     // List the common cases
998 |     .legalForCartesianProduct(AddrSpaces64, {S64})
999 |     .legalForCartesianProduct(AddrSpaces32, {S32})
1000 |     .scalarize(0)
1001 |     // Accept any address space as long as the size matches
1002 |     .legalIf(sameSize(0, 1))
1003 |     .widenScalarIf(smallerThan(1, 0),
1004 |       [](const LegalityQuery &Query) {
1005 |         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1006 |       })
1007 |     .narrowScalarIf(largerThan(1, 0),
1008 |       [](const LegalityQuery &Query) {
1009 |         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1010 |       });
1011 |
1012 |   getActionDefinitionsBuilder(G_PTRTOINT)
1013 |     // List the common cases
1014 |     .legalForCartesianProduct(AddrSpaces64, {S64})
1015 |     .legalForCartesianProduct(AddrSpaces32, {S32})
1016 |     .scalarize(0)
1017 |     // Accept any address space as long as the size matches
1018 |     .legalIf(sameSize(0, 1))
1019 |     .widenScalarIf(smallerThan(0, 1),
1020 |       [](const LegalityQuery &Query) {
1021 |         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1022 |       })
1023 |     .narrowScalarIf(
1024 |       largerThan(0, 1),
1025 |       [](const LegalityQuery &Query) {
1026 |         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027 |       });
1028 |
1029 |   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1030 |     .scalarize(0)
1031 |     .custom();
1032 |
1033 |   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1034 |                                     bool IsLoad) -> bool {
1035 |     const LLT DstTy = Query.Types[0];
1036 |
1037 |     // Split vector extloads.
1038 |     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1039 |     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1040 |
1041 |     if (MemSize < DstTy.getSizeInBits())
1042 |       MemSize = std::max(MemSize, AlignBits);
1043 |
1044 |     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1045 |       return true;
1046 |
1047 |     const LLT PtrTy = Query.Types[1];
1048 |     unsigned AS = PtrTy.getAddressSpace();
1049 |     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1050 |       return true;
1051 |
1052 |     // Catch weird sized loads that don't evenly divide into the access sizes
1053 |     // TODO: May be able to widen depending on alignment etc.
1054 |     unsigned NumRegs = (MemSize + 31) / 32;
1055 |     if (NumRegs == 3) {
1056 |       if (!ST.hasDwordx3LoadStores())
1057 |         return true;
1058 |     } else {
1059 |       // If the alignment allows, these should have been widened.
1060 |       if (!isPowerOf2_32(NumRegs))
1061 |         return true;
1062 |     }
1063 |
1064 |     if (AlignBits < MemSize) {
1065 |       const SITargetLowering *TLI = ST.getTargetLowering();
1066 |       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1067 |                                                       Align(AlignBits / 8));
1068 |     }
1069 |
1070 |     return false;
1071 |   };
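// Editorial note — illustrative, not in the original source: a 96-bit access
// (NumRegs == 3) must split when the subtarget lacks dwordx3 load/stores, and
// a vector whose register size exceeds the alignment-adjusted memory size
// (a vector extload) is always split by the first check above.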
1072 | |||||||||
1073 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; | ||||||||
1074 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; | ||||||||
1075 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; | ||||||||
1076 | |||||||||
1077 | // TODO: Refine based on subtargets which support unaligned access or 128-bit | ||||||||
1078 | // LDS | ||||||||
1079 | // TODO: Unsupported flat for SI. | ||||||||
1080 | |||||||||
1081 | for (unsigned Op : {G_LOAD, G_STORE}) { | ||||||||
1082 | const bool IsStore = Op == G_STORE; | ||||||||
1083 | |||||||||
1084 | auto &Actions = getActionDefinitionsBuilder(Op); | ||||||||
1085 | // Explicitly list some common cases. | ||||||||
1086 | // TODO: Does this help compile time at all? | ||||||||
1087 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, | ||||||||
1088 | {V2S32, GlobalPtr, 64, GlobalAlign32}, | ||||||||
1089 | {V4S32, GlobalPtr, 128, GlobalAlign32}, | ||||||||
1090 | {S64, GlobalPtr, 64, GlobalAlign32}, | ||||||||
1091 | {V2S64, GlobalPtr, 128, GlobalAlign32}, | ||||||||
1092 | {V2S16, GlobalPtr, 32, GlobalAlign32}, | ||||||||
1093 | {S32, GlobalPtr, 8, GlobalAlign8}, | ||||||||
1094 | {S32, GlobalPtr, 16, GlobalAlign16}, | ||||||||
1095 | |||||||||
1096 | {S32, LocalPtr, 32, 32}, | ||||||||
1097 | {S64, LocalPtr, 64, 32}, | ||||||||
1098 | {V2S32, LocalPtr, 64, 32}, | ||||||||
1099 | {S32, LocalPtr, 8, 8}, | ||||||||
1100 | {S32, LocalPtr, 16, 16}, | ||||||||
1101 | {V2S16, LocalPtr, 32, 32}, | ||||||||
1102 | |||||||||
1103 | {S32, PrivatePtr, 32, 32}, | ||||||||
1104 | {S32, PrivatePtr, 8, 8}, | ||||||||
1105 | {S32, PrivatePtr, 16, 16}, | ||||||||
1106 | {V2S16, PrivatePtr, 32, 32}, | ||||||||
1107 | |||||||||
1108 | {S32, ConstantPtr, 32, GlobalAlign32}, | ||||||||
1109 | {V2S32, ConstantPtr, 64, GlobalAlign32}, | ||||||||
1110 | {V4S32, ConstantPtr, 128, GlobalAlign32}, | ||||||||
1111 | {S64, ConstantPtr, 64, GlobalAlign32}, | ||||||||
1112 | {V2S32, ConstantPtr, 32, GlobalAlign32}}); | ||||||||
1113 | Actions.legalIf( | ||||||||
1114 | [=](const LegalityQuery &Query) -> bool { | ||||||||
1115 | return isLoadStoreLegal(ST, Query, Op); | ||||||||
1116 | }); | ||||||||
1117 | |||||||||
1118 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||||||
1119 | // 64-bits. | ||||||||
1120 | // | ||||||||
1121 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||||||
1122 | // inserting addrspacecasts. | ||||||||
1123 | Actions.customIf(typeIs(1, Constant32Ptr)); | ||||||||
1124 | |||||||||
1125 | // Turn any illegal element vectors into something easier to deal | ||||||||
1126 | // with. These will ultimately produce 32-bit scalar shifts to extract the | ||||||||
1127 | // parts anyway. | ||||||||
1128 | // | ||||||||
1129 | // For odd 16-bit element vectors, prefer to split those into pieces with | ||||||||
1130 | // 16-bit vector parts. | ||||||||
1131 | Actions.bitcastIf( | ||||||||
1132 | [=](const LegalityQuery &Query) -> bool { | ||||||||
1133 | return shouldBitcastLoadStoreType(ST, Query.Types[0], | ||||||||
1134 | Query.MMODescrs[0].SizeInBits); | ||||||||
1135 | }, bitcastToRegisterType(0)); | ||||||||
1136 | |||||||||
1137 | if (!IsStore) { | ||||||||
1138 | // Widen suitably aligned loads by loading extra bytes. The standard | ||||||||
1139 | // legalization actions can't properly express widening memory operands. | ||||||||
1140 | Actions.customIf([=](const LegalityQuery &Query) -> bool { | ||||||||
1141 | return shouldWidenLoad(ST, Query, G_LOAD); | ||||||||
1142 | }); | ||||||||
1143 | } | ||||||||
1144 | |||||||||
1145 | // FIXME: load/store narrowing should be moved to lower action | ||||||||
1146 | Actions | ||||||||
1147 | .narrowScalarIf( | ||||||||
1148 | [=](const LegalityQuery &Query) -> bool { | ||||||||
1149 | return !Query.Types[0].isVector() && | ||||||||
1150 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||||
1151 | }, | ||||||||
1152 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||||
1153 | const LLT DstTy = Query.Types[0]; | ||||||||
1154 | const LLT PtrTy = Query.Types[1]; | ||||||||
1155 | |||||||||
1156 | const unsigned DstSize = DstTy.getSizeInBits(); | ||||||||
1157 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||||
1158 | |||||||||
1159 | // Split extloads. | ||||||||
1160 | if (DstSize > MemSize) | ||||||||
1161 | return std::make_pair(0, LLT::scalar(MemSize)); | ||||||||
1162 | |||||||||
1163 | if (!isPowerOf2_32(DstSize)) { | ||||||||
1164 | // We're probably decomposing an odd sized store. Try to split | ||||||||
1165 | // to the widest type. TODO: Account for alignment. As-is it | ||||||||
1166 | // should be OK, since the new parts will be further legalized. | ||||||||
1167 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||||
1168 | return std::make_pair(0, LLT::scalar(FloorSize)); | ||||||||
1169 | } | ||||||||
1170 | |||||||||
1171 | if (DstSize > 32 && (DstSize % 32 != 0)) { | ||||||||
1172 | // FIXME: Need a way to specify non-extload of larger size if | ||||||||
1173 | // suitably aligned. | ||||||||
1174 | return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); | ||||||||
1175 | } | ||||||||
1176 | |||||||||
1177 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||||
1178 | PtrTy.getAddressSpace(), | ||||||||
1179 | Op == G_LOAD); | ||||||||
1180 | if (MemSize > MaxSize) | ||||||||
1181 | return std::make_pair(0, LLT::scalar(MaxSize)); | ||||||||
1182 | |||||||||
1183 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||||
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This
              // ends up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to
                // split to the widest type. TODO: Account for alignment.
                // As-is it should be OK, since the new parts will be
                // further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();
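  // A sketch of these rules in practice: a G_SEXTLOAD of an s8 value into an
  // s64 result is first clamped to an s32 result (matching the legal rows
  // above), and the remaining s32 -> s64 extension is rebuilt separately;
  // non-power-of-2 memory sizes are rejected as unsupported.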

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
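  // Worked example of the ordering above: a G_SELECT producing <3 x s16> is
  // a small odd vector, so oneMoreElement widens it to the legal <4 x s16>;
  // a <3 x s32> select (whose total size is already a multiple of 32) falls
  // through to the numElementsNotEven rule instead and is scalarized into
  // three s32 selects.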

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
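
    // Net effect (a sketch, ignoring intermediate steps): an s8 shift by an
    // s8 amount ends up in the legal {s16, s16} form on these subtargets,
    // with both the value and the amount widened to 16 bits rather than 32.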

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }
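
  // For instance (a sketch of one path): a G_EXTRACT_VECTOR_ELT of an s16
  // element from <4 x s16> misses the customIf check (element size 16), so
  // the first bitcastIf rewrites the source as <2 x s32>; the extraction is
  // then redone in terms of 32-bit elements plus shifting, which the
  // custom/lowering steps can handle.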

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384
      // are not valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
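
  // To make the rounding rule above concrete: an s65 big type widens to s128
  // (the next power of 2), while an s257 widens to s320, because past 256
  // bits the next multiple of 64 (320) is smaller than the next power of 2
  // (512).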

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering,
      // to get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();
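  // Background for the choices above: the generic G_SEXT_INREG lowering
  // expands to a shift pair, shl by (BitWidth - B) followed by ashr by the
  // same amount, which is why the availability of 16-bit shifts drives
  // whether we lower directly or promote to s32 first.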

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
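    // The s_getreg_b32 immediate packs three fields: the hwreg id, the
    // starting bit offset, and the field width minus one. The aperture is
    // read as a 16-bit field here, so shifting it left by WidthM1 + 1
    // (i.e. 16, below) in effect forms the 32-bit value used as the high
    // half of flat pointers in this segment.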

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
    B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
    B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
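  // This is the classic round-to-nearest trick: doubles with magnitude
  // >= 2^52 have no fractional bits, so adding and then subtracting
  // copysign(2^52, Src) rounds Src to an integer. Inputs whose magnitude
  // already exceeds C2 (the largest double below 2^52) are integral and are
  // returned unchanged by the final select.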

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                   .addUse(Hi)
                   .addUse(Const0.getReg(0))
                   .addUse(Const1.getReg(0));
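  // amdgcn.ubfe(Hi, 20, 11) pulls the 11 exponent bits out of the high
  // dword (they start at bit 52 - 32 = 20 once the low dword is dropped);
  // subtracting the IEEE-754 double bias of 1023 then yields the unbiased
  // exponent.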

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
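  // Shr/Not/Tmp0 implement the in-range truncation: shifting the 52-bit
  // fraction mask right by the exponent leaves set bits exactly where
  // fractional significand bits remain, so ANDing with the inverse clears
  // the fraction while preserving the sign, exponent, and integral
  // significand bits. The selects below handle exponents outside [0, 51].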
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                 .addUse(CvtHi.getReg(0))
                 .addUse(ThirtyTwo.getReg(0));
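  // The conversion is done half-at-a-time: the high 32 bits (signed or
  // unsigned as requested) convert exactly to double and are scaled by 2^32
  // with ldexp, then the always-unsigned low half is added in. All steps
  // before the final fadd are exact, so the result rounds only once.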

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
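  // K0 is the bit pattern of 2^-32 and K1 of -(2^32). FloorMul is therefore
  // floor(trunc(Src) / 2^32), i.e. the high dword of the result, and the
  // fma computes trunc(Src) - hi * 2^32, the remainder that becomes the low
  // dword.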

  auto Hi = Signed ?
    B.buildFPTOSI(S32, FloorMul) :
    B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  B.buildMerge(Dst, { Lo, Hi });
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
    getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  if (IdxVal < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
    getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  if (IdxVal < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
                .addUse(MulVal.getReg(0))
                .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
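  // The hardware sin/cos ops take their argument pre-scaled by 1/(2*pi),
  // i.e. in revolutions rather than radians, hence the multiply above.
  // Subtargets with a reduced valid input range additionally need
  // amdgcn.fract to reduce the argument into [0, 1).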

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the
  //   $symbol operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with the lower 32 bits and higher 32 bits of a literal
  //   constant, which is a 64-bit pc-relative offset from the encoding of
  //   the $symbol operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc (which
  // is the address of the s_add_u32 instruction) to the global variable, but
  // since the encoding of $symbol starts 4 bytes after the start of the
  // s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order
  // to compute the correct address. Similarly for the s_addc_u32 instruction,
  // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
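  // Assumption worth noting: GAFlags + 1 relies on the @lo and @hi operand
  // flags being defined as adjacent values, so incrementing the @lo flag
  // names its @hi counterpart for the second relocation added above.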
2269 | |||||||||
2270 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||||||
2271 | |||||||||
2272 | if (PtrTy.getSizeInBits() == 32) | ||||||||
2273 | B.buildExtract(DstReg, PCReg, 0); | ||||||||
2274 | return true; | ||||||||
2275 | } | ||||||||

bool AMDGPULegalizerInfo::legalizeGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(), DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions
      // are not eliminated, we don't want a compile time error. Just emit a
      // warning and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
        // zero-sized type in other languages) to declare dynamic shared
        // memory whose size is not known at compile time. It is allocated by
        // the runtime and placed directly after the statically allocated
        // ones. All such arrays share the same offset.
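        // For example (an illustrative HIP declaration, not from this file):
        //   extern __shared__ float dynShared[];
        // lowers to a zero-sized external global in the local address space;
        // its address is simply the total statically allocated LDS size,
        // which is what the amdgcn_groupstaticsize intrinsic returns below.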
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
        DstReg,
        MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
      MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

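// E.g. s48 -> s64 and <3 x s32> -> <4 x s32>; the element type is unchanged.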
static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const unsigned MemSize = 8 * MMO->getSize();
  const Align MemAlign = MMO->getAlign();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
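  // For example, an s96 load from a sufficiently aligned pointer can be
  // widened to a 128-bit load, with the extra 32 bits dropped again by a
  // G_EXTRACT or G_TRUNC of the wide result.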
  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
        MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value,
        // unmerge from a widened register (e.g. <3 x s16> -> <4 x s16>)
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFMad(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

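  // For example (a MIR sketch; register names are illustrative):
  //   %old:_(s32) = G_ATOMIC_CMPXCHG %ptr(p1), %cmp(s32), %new(s32)
  // becomes
  //   %pack:_(<2 x s32>) = G_BUILD_VECTOR %new(s32), %cmp(s32)
  //   %old:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %ptr(p1), %pack(<2 x s32>)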
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

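// Lower G_FLOG/G_FLOG10 via the identity log_b(x) = log2(x) * (1 / log2(b)).
// Log2BaseInverted is the precomputed constant 1 / log2(b) (e.g. ln(2) when
// b = e).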
bool AMDGPULegalizerInfo::legalizeFlog(
    MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}

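// Lower G_FEXP via the identity exp(x) = exp2(x * log2(e)).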
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}

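// Lower G_FPOW via the identity pow(x, y) = exp2(y * log2(x)). The multiply
// uses amdgcn_fmul_legacy (where 0 * anything == 0), so pow(x, 0) still
// evaluates to exp2(0) = 1 even when log2(x) is an infinity.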
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
      .addUse(Src1)
      .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}

bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for
  // the V_FRACT bug is:
  //   fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
    .addUse(OrigSrc)
    .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

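  // 0x3fefffffffffffff is the largest double strictly less than 1.0
  // (0.99999999999999989), matching the clamp constant in the formula above.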
  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // Per the formula above, pass the source through unchanged when it is a
    // NaN; FCMP_UNO x, x is true exactly when x is a NaN.
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
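// For example (a MIR sketch; names are illustrative):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m(s32)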
bool AMDGPULegalizerInfo::legalizeBuildVector(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                  MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget,
                  bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    UseMI->eraseFromParent();

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);
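    // For example, a mask of 0x3ff0 gives Shift = 4, and the extracted field
    // value becomes (LiveIn >> 4) & 0x3ff.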

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

bool AMDGPULegalizerInfo::loadInputValue(
    Register DstReg, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register X,
                                                  Register Y,
                                                  bool IsDiv) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
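  // 0x4f7ffffe is 2^32 - 512, i.e. just below 2^32; scaling the reciprocal
  // by it rather than by exactly 2^32 keeps the u32 conversion of the
  // estimate from overflowing.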
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}
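
// For reference, the same expansion written as plain C (a sketch only; it
// mirrors the builder sequence above and, like it, assumes y != 0):
//
//   uint32_t udiv32(uint32_t x, uint32_t y) {
//     // Initial estimate of 2^32 / y, plus one Newton-Raphson refinement.
//     uint32_t z = (uint32_t)((1.0f / (float)y) * 0x1.fffffcp+31f);
//     uint32_t negYZ = (0u - y) * z;
//     z += (uint32_t)(((uint64_t)z * negYZ) >> 32);     // umulh(z, negYZ)
//     // Quotient/remainder estimate plus two refinement steps.
//     uint32_t q = (uint32_t)(((uint64_t)x * z) >> 32); // umulh(x, z)
//     uint32_t r = x - q * y;
//     if (r >= y) { q++; r -= y; }
//     if (r >= y) { q++; r -= y; }
//     return q; // r holds x % y
//   }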

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
    B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
    B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
    B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
    S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(DstReg);

  if (Ty == S32)
    legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
  else if (Ty == S64)
    legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
  else
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  const LLT Ty = MRI.getType(DstReg);
  if (Ty != S32 && Ty != S64)
    return false;

  const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;

  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

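  // Reduce to the unsigned expansion via a branchless absolute value:
  //   sign(v) = v >> (bits - 1)   (all ones if negative, zero otherwise)
  //   |v|     = (v + sign(v)) ^ sign(v)
  // The result's sign (quotient: sign(lhs) ^ sign(rhs); remainder:
  // sign(lhs)) is restored the same way afterwards.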
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(Ty);
  if (Ty == S32)
    legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
  else
    legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);

  Register Sign;
  if (IsDiv)
    Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
  else
    Sign = LHSign.getReg(0); // Remainder sign is the same as LHS

  UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
  B.buildSub(DstReg, UDivRem, Sign);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(RHS)
    .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

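  // Refine r ~= rcp(y) with two Newton-Raphson steps, each of which squares
  // the error:
  //   r' = fma(fma(-y, r, 1), r, r) = r * (2 - y * r)
  // Then form the quotient ret = x * r and apply one residual correction:
  //   res = fma(fma(-y, ret, x), r, ret)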
  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(Y)
    .setMIFlags(Flags);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
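    // That is, hwreg(HW_REG_MODE, 4, 2): the two FP32 denorm bits sit at
    // offset 4 of the MODE register, and WIDTH_M1 encodes width - 1.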

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

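  // This follows the hardware div_scale / div_fmas / div_fixup recipe: scale
  // the operands into range, refine rcp(denominator) with FMA-based
  // Newton-Raphson steps, multiply by the scaled numerator, and let
  // div_fixup restore signs and handle the special cases.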
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from
    // div_scale is not usable.

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

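  // C0 is the bit pattern of 2^+96 and C1 of 2^-32. When |RHS| exceeds 2^96,
  // the denominator is pre-scaled by 2^-32 before the rcp so the
  // intermediates stay well inside the normal float range, and the quotient
  // is multiplied by the same 2^-32 afterwards to compensate (note that Sel
  // feeds both products).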
  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
3394 | |||||||||
3395 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | ||||||||
3396 | // FIXME: Why do we handle this one but not other removed instructions? | ||||||||
3397 | // | ||||||||
3398 | // Reciprocal square root. The clamp prevents infinite results, clamping | ||||||||
3399 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | ||||||||
3400 | // +-max_float. | ||||||||
3401 | bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, | ||||||||
3402 | MachineRegisterInfo &MRI, | ||||||||
3403 | MachineIRBuilder &B) const { | ||||||||
3404 | if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) | ||||||||
3405 | return true; | ||||||||
3406 | |||||||||
3407 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
3408 | Register Src = MI.getOperand(2).getReg(); | ||||||||
3409 | auto Flags = MI.getFlags(); | ||||||||
3410 | |||||||||
3411 | LLT Ty = MRI.getType(Dst); | ||||||||
3412 | |||||||||
3413 | const fltSemantics *FltSemantics; | ||||||||
3414 | if (Ty == LLT::scalar(32)) | ||||||||
3415 | FltSemantics = &APFloat::IEEEsingle(); | ||||||||
3416 | else if (Ty == LLT::scalar(64)) | ||||||||
3417 | FltSemantics = &APFloat::IEEEdouble(); | ||||||||
3418 | else | ||||||||
3419 | return false; | ||||||||
3420 | |||||||||
3421 | auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) | ||||||||
3422 | .addUse(Src) | ||||||||
3423 | .setMIFlags(Flags); | ||||||||
3424 | |||||||||
3425 | // We don't need to concern ourselves with the snan handling difference: the | ||||||||
3426 | // rsq quiets (or does not) either way, so use the variant that will directly select. | ||||||||
3427 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||||
3428 | const bool UseIEEE = MFI->getMode().IEEE; | ||||||||
3429 | |||||||||
3430 | auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); | ||||||||
3431 | auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : | ||||||||
3432 | B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); | ||||||||
3433 | |||||||||
3434 | auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); | ||||||||
3435 | |||||||||
3436 | if (UseIEEE) | ||||||||
3437 | B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); | ||||||||
3438 | else | ||||||||
3439 | B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); | ||||||||
3440 | MI.eraseFromParent(); | ||||||||
3441 | return true; | ||||||||
3442 | } | ||||||||
3443 | |||||||||
3444 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||||||
3445 | switch (IID) { | ||||||||
3446 | case Intrinsic::amdgcn_ds_fadd: | ||||||||
3447 | return AMDGPU::G_ATOMICRMW_FADD; | ||||||||
3448 | case Intrinsic::amdgcn_ds_fmin: | ||||||||
3449 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||||||
3450 | case Intrinsic::amdgcn_ds_fmax: | ||||||||
3451 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||||||
3452 | default: | ||||||||
3453 | llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3453); | ||||||||
3454 | } | ||||||||
3455 | } | ||||||||
3456 | |||||||||
3457 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | ||||||||
3458 | MachineInstr &MI, | ||||||||
3459 | Intrinsic::ID IID) const { | ||||||||
3460 | GISelChangeObserver &Observer = Helper.Observer; | ||||||||
3461 | Observer.changingInstr(MI); | ||||||||
3462 | |||||||||
3463 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | ||||||||
3464 | |||||||||
3465 | // The remaining operands were used to set fields in the MemOperand on | ||||||||
3466 | // construction. | ||||||||
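| // Assuming the ds intrinsic signature (ptr, val, ordering, scope, isVolatile), | ||||||||
| // operands 4-6 are the ordering/scope/volatile arguments, which were already | ||||||||
| // folded into the MMO when the intrinsic was built. | ||||||||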
3467 | for (int I = 6; I > 3; --I) | ||||||||
3468 | MI.RemoveOperand(I); | ||||||||
3469 | |||||||||
3470 | MI.RemoveOperand(1); // Remove the intrinsic ID. | ||||||||
3471 | Observer.changedInstr(MI); | ||||||||
3472 | return true; | ||||||||
3473 | } | ||||||||
3474 | |||||||||
3475 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | ||||||||
3476 | MachineRegisterInfo &MRI, | ||||||||
3477 | MachineIRBuilder &B) const { | ||||||||
3478 | uint64_t Offset = | ||||||||
3479 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||||||
3480 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||||||
3481 | LLT DstTy = MRI.getType(DstReg); | ||||||||
3482 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||||||
3483 | |||||||||
3484 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||||||
3485 | if (!loadInputValue(KernargPtrReg, B, | ||||||||
3486 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||||||
3487 | return false; | ||||||||
3488 | |||||||||
3489 | // FIXME: This should be nuw | ||||||||
3490 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||||||
3491 | return true; | ||||||||
3492 | } | ||||||||
3493 | |||||||||
3494 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||||||
3495 | MachineRegisterInfo &MRI, | ||||||||
3496 | MachineIRBuilder &B) const { | ||||||||
3497 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||||
3498 | if (!MFI->isEntryFunction()) { | ||||||||
3499 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
3500 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||||||
3501 | } | ||||||||
3502 | |||||||||
3503 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
3504 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||||||
3505 | return false; | ||||||||
3506 | |||||||||
3507 | MI.eraseFromParent(); | ||||||||
3508 | return true; | ||||||||
3509 | } | ||||||||
3510 | |||||||||
3511 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||||||
3512 | MachineRegisterInfo &MRI, | ||||||||
3513 | MachineIRBuilder &B, | ||||||||
3514 | unsigned AddrSpace) const { | ||||||||
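| // A flat pointer lies in the queried segment iff the high 32 bits of its | ||||||||
| // 64-bit value equal that segment's aperture base. | ||||||||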
3515 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||||||
3516 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||||||
3517 | Register Hi32 = Unmerge.getReg(1); | ||||||||
3518 | |||||||||
3519 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||||||
3520 | MI.eraseFromParent(); | ||||||||
3521 | return true; | ||||||||
3522 | } | ||||||||
3523 | |||||||||
3524 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||||||
3525 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||||||
3526 | // split between the instruction's voffset and immoffset fields) and soffset | ||||||||
3527 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||||||
3528 | // the instruction's soffset field). This function takes the first kind of | ||||||||
3529 | // offset and figures out how to split it between voffset and immoffset. | ||||||||
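| // e.g. a constant offset of 8200 splits into Overflow = 8192 (added into the | ||||||||
| // voffset register) and ImmOffset = 8, keeping the register part a multiple | ||||||||
| // of 4096 so it can be CSEd across neighboring accesses. | ||||||||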
3530 | std::tuple<Register, unsigned, unsigned> | ||||||||
3531 | AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, | ||||||||
3532 | Register OrigOffset) const { | ||||||||
3533 | const unsigned MaxImm = 4095; | ||||||||
3534 | Register BaseReg; | ||||||||
3535 | unsigned TotalConstOffset; | ||||||||
3536 | const LLT S32 = LLT::scalar(32); | ||||||||
3537 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||||
3538 | |||||||||
3539 | std::tie(BaseReg, TotalConstOffset) = | ||||||||
3540 | AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); | ||||||||
3541 | |||||||||
3542 | unsigned ImmOffset = TotalConstOffset; | ||||||||
3543 | |||||||||
3544 | // If BaseReg is a pointer, convert it to int. | ||||||||
3545 | if (MRI.getType(BaseReg).isPointer()) | ||||||||
3546 | BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); | ||||||||
3547 | |||||||||
3548 | // If the immediate value is too big for the immoffset field, put the value | ||||||||
3549 | // and -4096 into the immoffset field so that the value that is copied/added | ||||||||
3550 | // for the voffset field is a multiple of 4096, and it stands more chance | ||||||||
3551 | // of being CSEd with the copy/add for another similar load/store. | ||||||||
3552 | // However, do not do that rounding down to a multiple of 4096 if that is a | ||||||||
3553 | // negative number, as it appears to be illegal to have a negative offset | ||||||||
3554 | // in the vgpr, even if adding the immediate offset makes it positive. | ||||||||
3555 | unsigned Overflow = ImmOffset & ~MaxImm; | ||||||||
3556 | ImmOffset -= Overflow; | ||||||||
3557 | if ((int32_t)Overflow < 0) { | ||||||||
3558 | Overflow += ImmOffset; | ||||||||
3559 | ImmOffset = 0; | ||||||||
3560 | } | ||||||||
3561 | |||||||||
3562 | if (Overflow != 0) { | ||||||||
3563 | if (!BaseReg) { | ||||||||
3564 | BaseReg = B.buildConstant(S32, Overflow).getReg(0); | ||||||||
3565 | } else { | ||||||||
3566 | auto OverflowVal = B.buildConstant(S32, Overflow); | ||||||||
3567 | BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); | ||||||||
3568 | } | ||||||||
3569 | } | ||||||||
3570 | |||||||||
3571 | if (!BaseReg) | ||||||||
3572 | BaseReg = B.buildConstant(S32, 0).getReg(0); | ||||||||
3573 | |||||||||
3574 | return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); | ||||||||
3575 | } | ||||||||
3576 | |||||||||
3577 | /// Handle register layout difference for f16 images for some subtargets. | ||||||||
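/// e.g. on unpacked-d16 subtargets a <4 x s16> source becomes <4 x s32>, with | ||||||||
/// each element any-extended into the low half of its own 32-bit register. | ||||||||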
3578 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||||||
3579 | MachineRegisterInfo &MRI, | ||||||||
3580 | Register Reg, | ||||||||
3581 | bool ImageStore) const { | ||||||||
3582 | const LLT S16 = LLT::scalar(16); | ||||||||
3583 | const LLT S32 = LLT::scalar(32); | ||||||||
3584 | LLT StoreVT = MRI.getType(Reg); | ||||||||
3585 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16)(static_cast <bool> (StoreVT.isVector() && StoreVT .getElementType() == S16) ? void (0) : __assert_fail ("StoreVT.isVector() && StoreVT.getElementType() == S16" , "/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3585, __extension__ __PRETTY_FUNCTION__)); | ||||||||
3586 | |||||||||
3587 | if (ST.hasUnpackedD16VMem()) { | ||||||||
3588 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||||
3589 | |||||||||
3590 | SmallVector<Register, 4> WideRegs; | ||||||||
3591 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||||
3592 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||||||
3593 | |||||||||
3594 | int NumElts = StoreVT.getNumElements(); | ||||||||
3595 | |||||||||
3596 | return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); | ||||||||
3597 | } | ||||||||
3598 | |||||||||
3599 | if (ImageStore && ST.hasImageStoreD16Bug()) { | ||||||||
3600 | if (StoreVT.getNumElements() == 2) { | ||||||||
3601 | SmallVector<Register, 4> PackedRegs; | ||||||||
3602 | Reg = B.buildBitcast(S32, Reg).getReg(0); | ||||||||
3603 | PackedRegs.push_back(Reg); | ||||||||
3604 | PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); | ||||||||
3605 | return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); | ||||||||
3606 | } | ||||||||
3607 | |||||||||
3608 | if (StoreVT.getNumElements() == 3) { | ||||||||
3609 | SmallVector<Register, 4> PackedRegs; | ||||||||
3610 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||||
3611 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||||
3612 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||||
3613 | PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); | ||||||||
3614 | Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); | ||||||||
3615 | return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); | ||||||||
3616 | } | ||||||||
3617 | |||||||||
3618 | if (StoreVT.getNumElements() == 4) { | ||||||||
3619 | SmallVector<Register, 4> PackedRegs; | ||||||||
3620 | Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); | ||||||||
3621 | auto Unmerge = B.buildUnmerge(S32, Reg); | ||||||||
3622 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||||
3623 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||||
3624 | PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); | ||||||||
3625 | return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); | ||||||||
3626 | } | ||||||||
3627 | |||||||||
3628 | llvm_unreachable("invalid data type")::llvm::llvm_unreachable_internal("invalid data type", "/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3628); | ||||||||
3629 | } | ||||||||
3630 | |||||||||
3631 | return Reg; | ||||||||
3632 | } | ||||||||
3633 | |||||||||
3634 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||||||
3635 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||||||
3636 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||||
3637 | LLT Ty = MRI->getType(VData); | ||||||||
3638 | |||||||||
3639 | const LLT S16 = LLT::scalar(16); | ||||||||
3640 | |||||||||
3641 | // Fixup illegal register types for i8 and i16 stores. | ||||||||
3642 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||||||
3643 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||||||
3644 | return AnyExt; | ||||||||
3645 | } | ||||||||
3646 | |||||||||
3647 | if (Ty.isVector()) { | ||||||||
3648 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||||||
3649 | if (IsFormat) | ||||||||
3650 | return handleD16VData(B, *MRI, VData); | ||||||||
3651 | } | ||||||||
3652 | } | ||||||||
3653 | |||||||||
3654 | return VData; | ||||||||
3655 | } | ||||||||
3656 | |||||||||
3657 | bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, | ||||||||
3658 | MachineRegisterInfo &MRI, | ||||||||
3659 | MachineIRBuilder &B, | ||||||||
3660 | bool IsTyped, | ||||||||
3661 | bool IsFormat) const { | ||||||||
3662 | Register VData = MI.getOperand(1).getReg(); | ||||||||
3663 | LLT Ty = MRI.getType(VData); | ||||||||
3664 | LLT EltTy = Ty.getScalarType(); | ||||||||
3665 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||||
3666 | const LLT S32 = LLT::scalar(32); | ||||||||
3667 | |||||||||
3668 | VData = fixStoreSourceType(B, VData, IsFormat); | ||||||||
3669 | Register RSrc = MI.getOperand(2).getReg(); | ||||||||
3670 | |||||||||
3671 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||||
3672 | const int MemSize = MMO->getSize(); | ||||||||
3673 | |||||||||
3674 | unsigned ImmOffset; | ||||||||
3675 | unsigned TotalOffset; | ||||||||
3676 | |||||||||
3677 | // The typed intrinsics add an immediate after the registers. | ||||||||
3678 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||||
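| // e.g. struct.tbuffer.store carries (ID, vdata, rsrc, vindex, voffset, | ||||||||
| // soffset, format, aux) = 8 operands; the raw form drops vindex, so an | ||||||||
| // operand count equal to NumVIndexOps identifies the struct variant below. | ||||||||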
3679 | |||||||||
3680 | // The struct intrinsic variants add one additional operand over raw. | ||||||||
3681 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||||
3682 | Register VIndex; | ||||||||
3683 | int OpOffset = 0; | ||||||||
3684 | if (HasVIndex) { | ||||||||
3685 | VIndex = MI.getOperand(3).getReg(); | ||||||||
3686 | OpOffset = 1; | ||||||||
3687 | } | ||||||||
3688 | |||||||||
3689 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||||
3690 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||||
3691 | |||||||||
3692 | unsigned Format = 0; | ||||||||
3693 | if (IsTyped) { | ||||||||
3694 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||||
3695 | ++OpOffset; | ||||||||
3696 | } | ||||||||
3697 | |||||||||
3698 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||||
3699 | |||||||||
3700 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||||
3701 | if (TotalOffset != 0) | ||||||||
3702 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | ||||||||
3703 | |||||||||
3704 | unsigned Opc; | ||||||||
3705 | if (IsTyped) { | ||||||||
3706 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : | ||||||||
3707 | AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; | ||||||||
3708 | } else if (IsFormat) { | ||||||||
3709 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : | ||||||||
3710 | AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; | ||||||||
3711 | } else { | ||||||||
3712 | switch (MemSize) { | ||||||||
3713 | case 1: | ||||||||
3714 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; | ||||||||
3715 | break; | ||||||||
3716 | case 2: | ||||||||
3717 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; | ||||||||
3718 | break; | ||||||||
3719 | default: | ||||||||
3720 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; | ||||||||
3721 | break; | ||||||||
3722 | } | ||||||||
3723 | } | ||||||||
3724 | |||||||||
3725 | if (!VIndex) | ||||||||
3726 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||||
3727 | |||||||||
3728 | auto MIB = B.buildInstr(Opc) | ||||||||
3729 | .addUse(VData) // vdata | ||||||||
3730 | .addUse(RSrc) // rsrc | ||||||||
3731 | .addUse(VIndex) // vindex | ||||||||
3732 | .addUse(VOffset) // voffset | ||||||||
3733 | .addUse(SOffset) // soffset | ||||||||
3734 | .addImm(ImmOffset); // offset(imm) | ||||||||
3735 | |||||||||
3736 | if (IsTyped) | ||||||||
3737 | MIB.addImm(Format); | ||||||||
3738 | |||||||||
3739 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||||
3740 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||||
3741 | .addMemOperand(MMO); | ||||||||
3742 | |||||||||
3743 | MI.eraseFromParent(); | ||||||||
3744 | return true; | ||||||||
3745 | } | ||||||||
3746 | |||||||||
3747 | bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, | ||||||||
3748 | MachineRegisterInfo &MRI, | ||||||||
3749 | MachineIRBuilder &B, | ||||||||
3750 | bool IsFormat, | ||||||||
3751 | bool IsTyped) const { | ||||||||
3752 | // FIXME: Verifier should enforce 1 MMO for these intrinsics. | ||||||||
3753 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||||
3754 | const int MemSize = MMO->getSize(); | ||||||||
3755 | const LLT S32 = LLT::scalar(32); | ||||||||
3756 | |||||||||
3757 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
3758 | Register RSrc = MI.getOperand(2).getReg(); | ||||||||
3759 | |||||||||
3760 | // The typed intrinsics add an immediate after the registers. | ||||||||
3761 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||||
3762 | |||||||||
3763 | // The struct intrinsic variants add one additional operand over raw. | ||||||||
3764 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||||
3765 | Register VIndex; | ||||||||
3766 | int OpOffset = 0; | ||||||||
3767 | if (HasVIndex) { | ||||||||
3768 | VIndex = MI.getOperand(3).getReg(); | ||||||||
3769 | OpOffset = 1; | ||||||||
3770 | } | ||||||||
3771 | |||||||||
3772 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||||
3773 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||||
3774 | |||||||||
3775 | unsigned Format = 0; | ||||||||
3776 | if (IsTyped) { | ||||||||
3777 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||||
3778 | ++OpOffset; | ||||||||
3779 | } | ||||||||
3780 | |||||||||
3781 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||||
3782 | unsigned ImmOffset; | ||||||||
3783 | unsigned TotalOffset; | ||||||||
3784 | |||||||||
3785 | LLT Ty = MRI.getType(Dst); | ||||||||
3786 | LLT EltTy = Ty.getScalarType(); | ||||||||
3787 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||||
3788 | const bool Unpacked = ST.hasUnpackedD16VMem(); | ||||||||
3789 | |||||||||
3790 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||||
3791 | if (TotalOffset != 0) | ||||||||
3792 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | ||||||||
3793 | |||||||||
3794 | unsigned Opc; | ||||||||
3795 | |||||||||
3796 | if (IsTyped) { | ||||||||
3797 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : | ||||||||
3798 | AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; | ||||||||
3799 | } else if (IsFormat) { | ||||||||
3800 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : | ||||||||
3801 | AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; | ||||||||
3802 | } else { | ||||||||
3803 | switch (MemSize) { | ||||||||
3804 | case 1: | ||||||||
3805 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; | ||||||||
3806 | break; | ||||||||
3807 | case 2: | ||||||||
3808 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; | ||||||||
3809 | break; | ||||||||
3810 | default: | ||||||||
3811 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; | ||||||||
3812 | break; | ||||||||
3813 | } | ||||||||
3814 | } | ||||||||
3815 | |||||||||
3816 | Register LoadDstReg; | ||||||||
3817 | |||||||||
3818 | bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); | ||||||||
3819 | LLT UnpackedTy = Ty.changeElementSize(32); | ||||||||
3820 | |||||||||
3821 | if (IsExtLoad) | ||||||||
3822 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); | ||||||||
3823 | else if (Unpacked && IsD16 && Ty.isVector()) | ||||||||
3824 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); | ||||||||
3825 | else | ||||||||
3826 | LoadDstReg = Dst; | ||||||||
3827 | |||||||||
3828 | if (!VIndex) | ||||||||
3829 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||||
3830 | |||||||||
3831 | auto MIB = B.buildInstr(Opc) | ||||||||
3832 | .addDef(LoadDstReg) // vdata | ||||||||
3833 | .addUse(RSrc) // rsrc | ||||||||
3834 | .addUse(VIndex) // vindex | ||||||||
3835 | .addUse(VOffset) // voffset | ||||||||
3836 | .addUse(SOffset) // soffset | ||||||||
3837 | .addImm(ImmOffset); // offset(imm) | ||||||||
3838 | |||||||||
3839 | if (IsTyped) | ||||||||
3840 | MIB.addImm(Format); | ||||||||
3841 | |||||||||
3842 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||||
3843 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||||
3844 | .addMemOperand(MMO); | ||||||||
3845 | |||||||||
3846 | if (LoadDstReg != Dst) { | ||||||||
3847 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||||||
3848 | |||||||||
3849 | // The result of an extending load was widened; truncate it back down. | ||||||||
3850 | if (IsExtLoad) | ||||||||
3851 | B.buildTrunc(Dst, LoadDstReg); | ||||||||
3852 | else { | ||||||||
3853 | // Repack to original 16-bit vector result | ||||||||
3854 | // FIXME: G_TRUNC should work, but legalization currently fails | ||||||||
3855 | auto Unmerge = B.buildUnmerge(S32, LoadDstReg); | ||||||||
3856 | SmallVector<Register, 4> Repack; | ||||||||
3857 | for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) | ||||||||
3858 | Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); | ||||||||
3859 | B.buildMerge(Dst, Repack); | ||||||||
3860 | } | ||||||||
3861 | } | ||||||||
3862 | |||||||||
3863 | MI.eraseFromParent(); | ||||||||
3864 | return true; | ||||||||
3865 | } | ||||||||
3866 | |||||||||
3867 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||||||
3868 | MachineIRBuilder &B, | ||||||||
3869 | bool IsInc) const { | ||||||||
3870 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : | ||||||||
3871 | AMDGPU::G_AMDGPU_ATOMIC_DEC; | ||||||||
3872 | B.buildInstr(Opc) | ||||||||
3873 | .addDef(MI.getOperand(0).getReg()) | ||||||||
3874 | .addUse(MI.getOperand(2).getReg()) | ||||||||
3875 | .addUse(MI.getOperand(3).getReg()) | ||||||||
3876 | .cloneMemRefs(MI); | ||||||||
3877 | MI.eraseFromParent(); | ||||||||
3878 | return true; | ||||||||
3879 | } | ||||||||
3880 | |||||||||
3881 | static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { | ||||||||
3882 | switch (IntrID) { | ||||||||
3883 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||||
3884 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||||
3885 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; | ||||||||
3886 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||||
3887 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||||
3888 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; | ||||||||
3889 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||||
3890 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||||
3891 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; | ||||||||
3892 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||||
3893 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||||
3894 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; | ||||||||
3895 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||||
3896 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||||
3897 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; | ||||||||
3898 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||||
3899 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||||
3900 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; | ||||||||
3901 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||||
3902 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||||
3903 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; | ||||||||
3904 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||||
3905 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||||
3906 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; | ||||||||
3907 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||||
3908 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||||
3909 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; | ||||||||
3910 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||||
3911 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||||
3912 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; | ||||||||
3913 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||||
3914 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||||
3915 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; | ||||||||
3916 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||||
3917 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||||
3918 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; | ||||||||
3919 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||||
3920 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||||
3921 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; | ||||||||
3922 | case Intrinsic::amdgcn_buffer_atomic_fadd: | ||||||||
3923 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||||||
3924 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||||||
3925 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; | ||||||||
3926 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||||||
3927 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||||||
3928 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; | ||||||||
3929 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||||||
3930 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||||||
3931 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; | ||||||||
3932 | default: | ||||||||
3933 | llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3933); | ||||||||
3934 | } | ||||||||
3935 | } | ||||||||
3936 | |||||||||
3937 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | ||||||||
3938 | MachineIRBuilder &B, | ||||||||
3939 | Intrinsic::ID IID) const { | ||||||||
3940 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | ||||||||
3941 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | ||||||||
3942 | const bool HasReturn = MI.getNumExplicitDefs() != 0; | ||||||||
3943 | |||||||||
3944 | Register Dst; | ||||||||
3945 | |||||||||
3946 | int OpOffset = 0; | ||||||||
3947 | if (HasReturn) { | ||||||||
3948 | // A few FP atomics do not support return values. | ||||||||
3949 | Dst = MI.getOperand(0).getReg(); | ||||||||
3950 | } else { | ||||||||
3951 | OpOffset = -1; | ||||||||
3952 | } | ||||||||
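| // From here on, OpOffset compensates for the missing def: the no-return FP | ||||||||
| // atomics have every source operand shifted left by one. | ||||||||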
3953 | |||||||||
3954 | Register VData = MI.getOperand(2 + OpOffset).getReg(); | ||||||||
3955 | Register CmpVal; | ||||||||
3956 | |||||||||
3957 | if (IsCmpSwap) { | ||||||||
3958 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | ||||||||
3959 | ++OpOffset; | ||||||||
3960 | } | ||||||||
3961 | |||||||||
3962 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | ||||||||
3963 | const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; | ||||||||
3964 | |||||||||
3965 | // The struct intrinsic variants add one additional operand over raw. | ||||||||
3966 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||||
3967 | Register VIndex; | ||||||||
3968 | if (HasVIndex) { | ||||||||
3969 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | ||||||||
3970 | ++OpOffset; | ||||||||
3971 | } | ||||||||
3972 | |||||||||
3973 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||||
3974 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | ||||||||
3975 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | ||||||||
3976 | |||||||||
3977 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||||
3978 | |||||||||
3979 | unsigned ImmOffset; | ||||||||
3980 | unsigned TotalOffset; | ||||||||
3981 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||||
3982 | if (TotalOffset != 0) | ||||||||
3983 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); | ||||||||
3984 | |||||||||
3985 | if (!VIndex) | ||||||||
3986 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | ||||||||
3987 | |||||||||
3988 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); | ||||||||
3989 | |||||||||
3990 | if (HasReturn) | ||||||||
3991 | MIB.addDef(Dst); | ||||||||
3992 | |||||||||
3993 | MIB.addUse(VData); // vdata | ||||||||
3994 | |||||||||
3995 | if (IsCmpSwap) | ||||||||
3996 | MIB.addReg(CmpVal); | ||||||||
3997 | |||||||||
3998 | MIB.addUse(RSrc) // rsrc | ||||||||
3999 | .addUse(VIndex) // vindex | ||||||||
4000 | .addUse(VOffset) // voffset | ||||||||
4001 | .addUse(SOffset) // soffset | ||||||||
4002 | .addImm(ImmOffset) // offset(imm) | ||||||||
4003 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||||
4004 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||||
4005 | .addMemOperand(MMO); | ||||||||
4006 | |||||||||
4007 | MI.eraseFromParent(); | ||||||||
4008 | return true; | ||||||||
4009 | } | ||||||||
4010 | |||||||||
4011 | /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized | ||||||||
4012 | /// vector with s16 typed elements. | ||||||||
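/// e.g. 16-bit coordinates (u, v, r) are packed as <2 x s16>(u, v) followed | ||||||||
/// by <2 x s16>(r, undef), since a trailing odd element pairs with undef. | ||||||||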
4013 | static void packImageA16AddressToDwords( | ||||||||
4014 | MachineIRBuilder &B, MachineInstr &MI, | ||||||||
4015 | SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, | ||||||||
4016 | const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { | ||||||||
4017 | const LLT S16 = LLT::scalar(16); | ||||||||
4018 | const LLT V2S16 = LLT::vector(2, 16); | ||||||||
4019 | |||||||||
4020 | for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { | ||||||||
4021 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||||
4022 | if (!SrcOp.isReg()) | ||||||||
4023 | continue; // _L to _LZ may have eliminated this. | ||||||||
4024 | |||||||||
4025 | Register AddrReg = SrcOp.getReg(); | ||||||||
4026 | |||||||||
4027 | if (I < Intr->GradientStart) { | ||||||||
4028 | AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); | ||||||||
4029 | PackedAddrs.push_back(AddrReg); | ||||||||
4030 | } else { | ||||||||
4031 | // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, | ||||||||
4032 | // derivatives dx/dh and dx/dv are packed with undef. | ||||||||
4033 | if (((I + 1) >= EndIdx) || | ||||||||
4034 | ((Intr->NumGradients / 2) % 2 == 1 && | ||||||||
4035 | (I == static_cast<unsigned>(Intr->GradientStart + | ||||||||
4036 | (Intr->NumGradients / 2) - 1) || | ||||||||
4037 | I == static_cast<unsigned>(Intr->GradientStart + | ||||||||
4038 | Intr->NumGradients - 1))) || | ||||||||
4039 | // Check for _L to _LZ optimization | ||||||||
4040 | !MI.getOperand(ArgOffset + I + 1).isReg()) { | ||||||||
4041 | PackedAddrs.push_back( | ||||||||
4042 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||||||
4043 | .getReg(0)); | ||||||||
4044 | } else { | ||||||||
4045 | PackedAddrs.push_back( | ||||||||
4046 | B.buildBuildVector( | ||||||||
4047 | V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) | ||||||||
4048 | .getReg(0)); | ||||||||
4049 | ++I; | ||||||||
4050 | } | ||||||||
4051 | } | ||||||||
4052 | } | ||||||||
4053 | } | ||||||||
4054 | |||||||||
4055 | /// Convert from separate vaddr components to a single vector address register, | ||||||||
4056 | /// and replace the remaining operands with $noreg. | ||||||||
4057 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||||||
4058 | int DimIdx, int NumVAddrs) { | ||||||||
4059 | const LLT S32 = LLT::scalar(32); | ||||||||
4060 | |||||||||
4061 | SmallVector<Register, 8> AddrRegs; | ||||||||
4062 | for (int I = 0; I != NumVAddrs; ++I) { | ||||||||
4063 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||||
4064 | if (SrcOp.isReg()) { | ||||||||
4065 | AddrRegs.push_back(SrcOp.getReg()); | ||||||||
4066 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32)(static_cast <bool> (B.getMRI()->getType(SrcOp.getReg ()) == S32) ? void (0) : __assert_fail ("B.getMRI()->getType(SrcOp.getReg()) == S32" , "/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4066, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4067 | } | ||||||||
4068 | } | ||||||||
4069 | |||||||||
4070 | int NumAddrRegs = AddrRegs.size(); | ||||||||
4071 | if (NumAddrRegs != 1) { | ||||||||
4072 | // Round up to 8 elements for v5-v7 | ||||||||
4073 | // FIXME: Missing intermediate sized register classes and instructions. | ||||||||
4074 | if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { | ||||||||
4075 | const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); | ||||||||
4076 | auto Undef = B.buildUndef(S32); | ||||||||
4077 | AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); | ||||||||
4078 | NumAddrRegs = RoundedNumRegs; | ||||||||
4079 | } | ||||||||
4080 | |||||||||
4081 | auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); | ||||||||
4082 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||||||
4083 | } | ||||||||
4084 | |||||||||
4085 | for (int I = 1; I != NumVAddrs; ++I) { | ||||||||
4086 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||||
4087 | if (SrcOp.isReg()) | ||||||||
4088 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||||||
4089 | } | ||||||||
4090 | } | ||||||||
4091 | |||||||||
4092 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | ||||||||
4093 | /// | ||||||||
4094 | /// Depending on the subtarget, load/store with 16-bit element data need to be | ||||||||
4095 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | ||||||||
4096 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | ||||||||
4097 | /// registers. | ||||||||
4098 | /// | ||||||||
4099 | /// We don't want to directly select image instructions just yet, but also want | ||||||||
4100 | /// to expose all register repacking to the legalizer/combiners. We also don't | ||||||||
4101 | /// want a selected instruction entering RegBankSelect. In order to avoid | ||||||||
4102 | /// defining a multitude of intermediate image instructions, directly hack on | ||||||||
4103 | /// the intrinsic's arguments. In cases like a16 addresses, this requires padding | ||||||||
4104 | /// now unnecessary arguments with $noreg. | ||||||||
4105 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||||||
4106 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | ||||||||
4107 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | ||||||||
4108 | |||||||||
4109 | const unsigned NumDefs = MI.getNumExplicitDefs(); | ||||||||
4110 | const unsigned ArgOffset = NumDefs + 1; | ||||||||
4111 | bool IsTFE = NumDefs == 2; | ||||||||
4112 | // We are only processing the operands of d16 image operations on subtargets | ||||||||
4113 | // that use the unpacked register layout, or need to repack the TFE result. | ||||||||
4114 | |||||||||
4115 | // TODO: Do we need to guard against already legalized intrinsics? | ||||||||
4116 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||||||
4117 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | ||||||||
4118 | |||||||||
4119 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||||
4120 | const LLT S32 = LLT::scalar(32); | ||||||||
4121 | const LLT S16 = LLT::scalar(16); | ||||||||
4122 | const LLT V2S16 = LLT::vector(2, 16); | ||||||||
4123 | |||||||||
4124 | unsigned DMask = 0; | ||||||||
4125 | |||||||||
4126 | // Check for 16-bit addresses and pack them if so. | ||||||||
4127 | LLT GradTy = | ||||||||
4128 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | ||||||||
4129 | LLT AddrTy = | ||||||||
4130 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | ||||||||
4131 | const bool IsG16 = GradTy == S16; | ||||||||
4132 | const bool IsA16 = AddrTy == S16; | ||||||||
4133 | |||||||||
4134 | int DMaskLanes = 0; | ||||||||
4135 | if (!BaseOpcode->Atomic) { | ||||||||
4136 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | ||||||||
4137 | if (BaseOpcode->Gather4) { | ||||||||
4138 | DMaskLanes = 4; | ||||||||
4139 | } else if (DMask != 0) { | ||||||||
4140 | DMaskLanes = countPopulation(DMask); | ||||||||
4141 | } else if (!IsTFE && !BaseOpcode->Store) { | ||||||||
4142 | // If dmask is 0, this is a no-op load. This can be eliminated. | ||||||||
4143 | B.buildUndef(MI.getOperand(0)); | ||||||||
4144 | MI.eraseFromParent(); | ||||||||
4145 | return true; | ||||||||
4146 | } | ||||||||
4147 | } | ||||||||
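| // e.g. a non-atomic load with dmask = 0b0101 returns two components, so | ||||||||
| // DMaskLanes = countPopulation(0b0101) = 2. | ||||||||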
4148 | |||||||||
4149 | Observer.changingInstr(MI); | ||||||||
4150 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | ||||||||
4151 | |||||||||
4152 | unsigned NewOpcode = NumDefs == 0 ? | ||||||||
4153 | AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | ||||||||
4154 | |||||||||
4155 | // Track that we legalized this | ||||||||
4156 | MI.setDesc(B.getTII().get(NewOpcode)); | ||||||||
4157 | |||||||||
4158 | // Expecting to get an error flag since TFC is on - and dmask is 0. Force | ||||||||
4159 | // dmask to be at least 1, otherwise the instruction will fail. | ||||||||
4160 | if (IsTFE && DMask == 0) { | ||||||||
4161 | DMask = 0x1; | ||||||||
4162 | DMaskLanes = 1; | ||||||||
4163 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | ||||||||
4164 | } | ||||||||
4165 | |||||||||
4166 | if (BaseOpcode->Atomic
| ||||||||
4167 | Register VData0 = MI.getOperand(2).getReg(); | ||||||||
4168 | LLT Ty = MRI->getType(VData0); | ||||||||
4169 | |||||||||
4170 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | ||||||||
4171 | if (Ty.isVector()) | ||||||||
4172 | return false; | ||||||||
4173 | |||||||||
4174 | if (BaseOpcode->AtomicX2) { | ||||||||
4175 | Register VData1 = MI.getOperand(3).getReg(); | ||||||||
4176 | // The two values are packed in one register. | ||||||||
4177 | LLT PackedTy = LLT::vector(2, Ty); | ||||||||
4178 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | ||||||||
4179 | MI.getOperand(2).setReg(Concat.getReg(0)); | ||||||||
4180 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | ||||||||
4181 | } | ||||||||
4182 | } | ||||||||
4183 | |||||||||
4184 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | ||||||||
4185 | |||||||||
4186 | // Optimize _L to _LZ when _L is zero | ||||||||
4187 | if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = | ||||||||
4188 | AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { | ||||||||
4189 | const ConstantFP *ConstantLod; | ||||||||
4190 | |||||||||
4191 | if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, | ||||||||
4192 | m_GFCst(ConstantLod))) { | ||||||||
4193 | if (ConstantLod->isZero() || ConstantLod->isNegative()) { | ||||||||
4194 | // Set new opcode to _lz variant of _l, and change the intrinsic ID. | ||||||||
4195 | const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = | ||||||||
4196 | AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, | ||||||||
4197 | Intr->Dim); | ||||||||
4198 | |||||||||
4199 | // The starting indexes should remain in the same place. | ||||||||
4200 | --CorrectedNumVAddrs; | ||||||||
4201 | |||||||||
4202 | MI.getOperand(MI.getNumExplicitDefs()) | ||||||||
4203 | .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); | ||||||||
4204 | MI.RemoveOperand(ArgOffset + Intr->LodIndex); | ||||||||
4205 | Intr = NewImageDimIntr; | ||||||||
4206 | } | ||||||||
4207 | } | ||||||||
4208 | } | ||||||||
4209 | |||||||||
4210 | // Optimize _mip away when 'lod' is zero | ||||||||
4211 | if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { | ||||||||
4212 | int64_t ConstantLod; | ||||||||
4213 | if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, | ||||||||
4214 | m_ICst(ConstantLod))) { | ||||||||
4215 | if (ConstantLod == 0) { | ||||||||
4216 | // TODO: Change intrinsic opcode and remove operand instead of replacing | ||||||||
4217 | // it with 0, as the _L to _LZ handling is done above. | ||||||||
4218 | MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); | ||||||||
4219 | --CorrectedNumVAddrs; | ||||||||
4220 | } | ||||||||
4221 | } | ||||||||
4222 | } | ||||||||
4223 | |||||||||
4224 | // Rewrite the addressing register layout before doing anything else. | ||||||||
4225 | if (IsA16 || IsG16) { | ||||||||
4226 | if (IsA16) { | ||||||||
4227 | // Target must support the feature and gradients need to be 16 bit too | ||||||||
4228 | if (!ST.hasA16() || !IsG16) | ||||||||
4229 | return false; | ||||||||
4230 | } else if (!ST.hasG16()) | ||||||||
4231 | return false; | ||||||||
4232 | |||||||||
4233 | if (Intr->NumVAddrs > 1) { | ||||||||
4234 | SmallVector<Register, 4> PackedRegs; | ||||||||
4235 | // Don't compress addresses for G16 | ||||||||
4236 | const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; | ||||||||
4237 | packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, | ||||||||
4238 | PackEndIdx); | ||||||||
4239 | |||||||||
4240 | if (!IsA16) { | ||||||||
4241 | // Add uncompressed address | ||||||||
4242 | for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { | ||||||||
4243 | int AddrReg = MI.getOperand(ArgOffset + I).getReg(); | ||||||||
4244 | assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32))(static_cast <bool> (B.getMRI()->getType(AddrReg) == LLT::scalar(32)) ? void (0) : __assert_fail ("B.getMRI()->getType(AddrReg) == LLT::scalar(32)" , "/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4244, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4245 | PackedRegs.push_back(AddrReg); | ||||||||
4246 | } | ||||||||
4247 | } | ||||||||
4248 | |||||||||
4249 | // See also below in the non-a16 branch | ||||||||
4250 | const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); | ||||||||
4251 | |||||||||
4252 | if (!UseNSA && PackedRegs.size() > 1) { | ||||||||
4253 | LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); | ||||||||
4254 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | ||||||||
4255 | PackedRegs[0] = Concat.getReg(0); | ||||||||
4256 | PackedRegs.resize(1); | ||||||||
4257 | } | ||||||||
4258 | |||||||||
4259 | const unsigned NumPacked = PackedRegs.size(); | ||||||||
4260 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | ||||||||
4261 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||||
4262 | if (!SrcOp.isReg()) { | ||||||||
4263 | assert(SrcOp.isImm() && SrcOp.getImm() == 0)(static_cast <bool> (SrcOp.isImm() && SrcOp.getImm () == 0) ? void (0) : __assert_fail ("SrcOp.isImm() && SrcOp.getImm() == 0" , "/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4263, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4264 | continue; | ||||||||
4265 | } | ||||||||
4266 | |||||||||
4267 | assert(SrcOp.getReg() != AMDGPU::NoRegister); | ||||||||
4268 | |||||||||
4269 | if (I - Intr->VAddrStart < NumPacked) | ||||||||
4270 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | ||||||||
4271 | else | ||||||||
4272 | SrcOp.setReg(AMDGPU::NoRegister); | ||||||||
4273 | } | ||||||||
4274 | } | ||||||||
4275 | } else { | ||||||||
4276 | // If the register allocator cannot place the address registers contiguously | ||||||||
4277 | // without introducing moves, then using the non-sequential address encoding | ||||||||
4278 | // is always preferable, since it saves VALU instructions and is usually a | ||||||||
4279 | // wash in terms of code size or even better. | ||||||||
4280 | // | ||||||||
4281 | // However, we currently have no way of hinting to the register allocator | ||||||||
4282 | // that MIMG addresses should be placed contiguously when it is possible to | ||||||||
4283 | // do so, so force non-NSA for the common 2-address case as a heuristic. | ||||||||
4284 | // | ||||||||
4285 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | ||||||||
4286 | // allocation when possible. | ||||||||
4287 | const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); | ||||||||
4288 | |||||||||
4289 | if (!UseNSA && Intr->NumVAddrs > 1) | ||||||||
4290 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | ||||||||
4291 | Intr->NumVAddrs); | ||||||||
4292 | } | ||||||||
4293 | |||||||||
4294 | int Flags = 0; | ||||||||
4295 | if (IsA16) | ||||||||
4296 | Flags |= 1; | ||||||||
4297 | if (IsG16) | ||||||||
4298 | Flags |= 2; | ||||||||
4299 | MI.addOperand(MachineOperand::CreateImm(Flags)); | ||||||||
4300 | |||||||||
4301 | if (BaseOpcode->Store) { // No TFE for stores? | ||||||||
4302 | // TODO: Handle dmask trim | ||||||||
4303 | Register VData = MI.getOperand(1).getReg(); | ||||||||
4304 | LLT Ty = MRI->getType(VData); | ||||||||
4305 | if (!Ty.isVector() || Ty.getElementType() != S16) | ||||||||
4306 | return true; | ||||||||
4307 | |||||||||
4308 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | ||||||||
4309 | if (RepackedReg != VData) { | ||||||||
4310 | MI.getOperand(1).setReg(RepackedReg); | ||||||||
4311 | } | ||||||||
4312 | |||||||||
4313 | return true; | ||||||||
4314 | } | ||||||||
4315 | |||||||||
4316 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
4317 | LLT Ty = MRI->getType(DstReg); | ||||||||
4318 | const LLT EltTy = Ty.getScalarType(); | ||||||||
4319 | const bool IsD16 = Ty.getScalarType() == S16; | ||||||||
4320 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||||||
4321 | |||||||||
4322 | // Confirm that the return type is large enough for the dmask specified | ||||||||
4323 | if (NumElts < DMaskLanes) | ||||||||
4324 | return false; | ||||||||
4325 | |||||||||
4326 | if (NumElts > 4 || DMaskLanes
| ||||||||
4327 | return false; | ||||||||
4328 | |||||||||
4329 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; | ||||||||
4330 | const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); | ||||||||
4331 | |||||||||
4332 | // The raw dword aligned data component of the load. The only legal cases | ||||||||
4333 | // where this matters should be when using the packed D16 format, for | ||||||||
4334 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, | ||||||||
4335 | LLT RoundedTy; | ||||||||
4336 | |||||||||
4337 | // S32 vector to cover all data, plus TFE result element. | ||||||||
4338 | LLT TFETy; | ||||||||
4339 | |||||||||
4340 | // Register type to use for each loaded component. Will be S32 or V2S16. | ||||||||
4341 | LLT RegTy; | ||||||||
4342 | |||||||||
4343 | if (IsD16 && ST.hasUnpackedD16VMem()) { | ||||||||
4344 | RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); | ||||||||
4345 | TFETy = LLT::vector(AdjustedNumElts + 1, 32); | ||||||||
4346 | RegTy = S32; | ||||||||
4347 | } else { | ||||||||
4348 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||||
4349 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | ||||||||
4350 | unsigned RoundedSize = 32 * RoundedElts; | ||||||||
4351 | RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); | ||||||||
4352 | TFETy = LLT::vector(RoundedSize / 32 + 1, S32); | ||||||||
4353 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | ||||||||
4354 | } | ||||||||
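| // e.g. a <3 x s16> result rounds up to RoundedTy = <4 x s16> (RoundedElts = 2 | ||||||||
| // dwords), with TFETy = <3 x s32> to carry the extra TFE status dword. | ||||||||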
4355 | |||||||||
4356 | // The return type does not need adjustment. | ||||||||
4357 | // TODO: Should we change s16 case to s32 or <2 x s16>? | ||||||||
4358 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | ||||||||
4359 | return true; | ||||||||
4360 | |||||||||
4361 | Register Dst1Reg; | ||||||||
4362 | |||||||||
4363 | // Insert after the instruction. | ||||||||
4364 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||||||
4365 | |||||||||
4366 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | ||||||||
4367 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | ||||||||
4368 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | ||||||||
4369 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | ||||||||
4370 | |||||||||
4371 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | ||||||||
4372 | |||||||||
4373 | MI.getOperand(0).setReg(NewResultReg); | ||||||||
4374 | |||||||||
4375 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||||||
4376 | // type. The instruction really returns these two values in one contiguous | ||||||||
4377 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||||||
4378 | // return type to use a single register result. | ||||||||
4379 | |||||||||
4380 | if (IsTFE) { | ||||||||
4381 | Dst1Reg = MI.getOperand(1).getReg(); | ||||||||
4382 | if (MRI->getType(Dst1Reg) != S32) | ||||||||
4383 | return false; | ||||||||
4384 | |||||||||
4385 | // TODO: Make sure the TFE operand bit is set. | ||||||||
4386 | MI.RemoveOperand(1); | ||||||||
4387 | |||||||||
4388 | // Handle the easy case that requires no repack instructions. | ||||||||
4389 | if (Ty == S32) { | ||||||||
4390 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | ||||||||
4391 | return true; | ||||||||
4392 | } | ||||||||
4393 | } | ||||||||
4394 | |||||||||
4395 | // Now figure out how to copy the new result register back into the old | ||||||||
4396 | // result. | ||||||||
4397 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | ||||||||
4398 | |||||||||
4399 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | ||||||||
4400 | |||||||||
4401 | if (ResultNumRegs == 1) { | ||||||||
4402 | assert(!IsTFE); | ||||||||
4403 | ResultRegs[0] = NewResultReg; | ||||||||
4404 | } else { | ||||||||
4405 | // We have to repack into a new vector of some kind. | ||||||||
4406 | for (int I = 0; I != NumDataRegs; ++I) | ||||||||
4407 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | ||||||||
4408 | B.buildUnmerge(ResultRegs, NewResultReg); | ||||||||
4409 | |||||||||
4410 | // Drop the final TFE element to get the data part. The TFE result is | ||||||||
4411 | // directly written to the right place already. | ||||||||
4412 | if (IsTFE) | ||||||||
4413 | ResultRegs.resize(NumDataRegs); | ||||||||
4414 | } | ||||||||
4415 | |||||||||
4416 | // For an s16 scalar result, we form an s32 result with a truncate regardless | ||||||||
4417 | // of packed vs. unpacked. | ||||||||
4418 | if (IsD16 && !Ty.isVector()) { | ||||||||
4419 | B.buildTrunc(DstReg, ResultRegs[0]); | ||||||||
4420 | return true; | ||||||||
4421 | } | ||||||||
4422 | |||||||||
4423 | // Avoid a build/concat_vector of 1 entry. | ||||||||
4424 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | ||||||||
4425 | B.buildBitcast(DstReg, ResultRegs[0]); | ||||||||
4426 | return true; | ||||||||
4427 | } | ||||||||
4428 | |||||||||
4429 | assert(Ty.isVector()); | ||||||||
4430 | |||||||||
4431 | if (IsD16) { | ||||||||
4432 | // For packed D16 results with TFE enabled, all the data components are | ||||||||
4433 | // S32. Cast back to the expected type. | ||||||||
4434 | // | ||||||||
4435 | // TODO: We don't really need to load s32 elements. We would only need one | ||||||||
4436 | // cast for the TFE result if a multiple of v2s16 was used. | ||||||||
4437 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | ||||||||
4438 | for (Register &Reg : ResultRegs) | ||||||||
4439 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | ||||||||
4440 | } else if (ST.hasUnpackedD16VMem()) { | ||||||||
4441 | for (Register &Reg : ResultRegs) | ||||||||
4442 | Reg = B.buildTrunc(S16, Reg).getReg(0); | ||||||||
4443 | } | ||||||||
4444 | } | ||||||||
4445 | |||||||||
4446 | auto padWithUndef = [&](LLT Ty, int NumElts) { | ||||||||
4447 | if (NumElts == 0) | ||||||||
4448 | return; | ||||||||
4449 | Register Undef = B.buildUndef(Ty).getReg(0); | ||||||||
4450 | for (int I = 0; I != NumElts; ++I) | ||||||||
4451 | ResultRegs.push_back(Undef); | ||||||||
4452 | }; | ||||||||
4453 | |||||||||
4454 | // Pad out any elements eliminated due to the dmask. | ||||||||
4455 | LLT ResTy = MRI->getType(ResultRegs[0]); | ||||||||
4456 | if (!ResTy.isVector()) { | ||||||||
4457 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | ||||||||
4458 | B.buildBuildVector(DstReg, ResultRegs); | ||||||||
4459 | return true; | ||||||||
4460 | } | ||||||||
4461 | |||||||||
4462 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); | ||||||||
4463 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | ||||||||
4464 | |||||||||
4465 | // Deal with the one annoying legal case. | ||||||||
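| // A <3 x s16> destination is padded to three v2s16 registers, concatenated | ||||||||
| // to <6 x s16>, and unmerged into the v3s16 result plus an unused v3s16. | ||||||||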
4466 | const LLT V3S16 = LLT::vector(3, 16); | ||||||||
4467 | if (Ty == V3S16) { | ||||||||
4468 | padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); | ||||||||
4469 | auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); | ||||||||
4470 | B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); | ||||||||
4471 | return true; | ||||||||
4472 | } | ||||||||
4473 | |||||||||
4474 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | ||||||||
4475 | B.buildConcatVectors(DstReg, ResultRegs); | ||||||||
4476 | return true; | ||||||||
4477 | } | ||||||||
4478 | |||||||||
4479 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||||||
4480 | LegalizerHelper &Helper, MachineInstr &MI) const { | ||||||||
4481 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||||
4482 | GISelChangeObserver &Observer = Helper.Observer; | ||||||||
4483 | |||||||||
4484 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
4485 | LLT Ty = B.getMRI()->getType(Dst); | ||||||||
4486 | unsigned Size = Ty.getSizeInBits(); | ||||||||
4487 | MachineFunction &MF = B.getMF(); | ||||||||
4488 | |||||||||
4489 | Observer.changingInstr(MI); | ||||||||
4490 | |||||||||
4491 | if (shouldBitcastLoadStoreType(ST, Ty, Size)) { | ||||||||
4492 | Ty = getBitcastRegisterType(Ty); | ||||||||
4493 | Helper.bitcastDst(MI, Ty, 0); | ||||||||
4494 | Dst = MI.getOperand(0).getReg(); | ||||||||
4495 | B.setInsertPt(B.getMBB(), MI); | ||||||||
4496 | } | ||||||||
4497 | |||||||||
4498 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||||||
4499 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||||||
4500 | // allowed to add one. | ||||||||
4501 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||||||
4502 | MI.RemoveOperand(1); // Remove intrinsic ID | ||||||||
4503 | |||||||||
4504 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||||||
4505 | // TODO: Should this use datalayout alignment? | ||||||||
4506 | const unsigned MemSize = (Size + 7) / 8; | ||||||||
4507 | const Align MemAlign(4); | ||||||||
4508 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||||
4509 | MachinePointerInfo(), | ||||||||
4510 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||||
4511 | MachineMemOperand::MOInvariant, | ||||||||
4512 | MemSize, MemAlign); | ||||||||
4513 | MI.addMemOperand(MF, MMO); | ||||||||
4514 | |||||||||
4515 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||||||
4516 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||||||
4517 | // out this needs to be converted to a vector load during RegBankSelect. | ||||||||
4518 | if (!isPowerOf2_32(Size)) { | ||||||||
4519 | if (Ty.isVector()) | ||||||||
4520 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||||||
4521 | else | ||||||||
4522 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||||||
4523 | } | ||||||||
4524 | |||||||||
4525 | Observer.changedInstr(MI); | ||||||||
4526 | return true; | ||||||||
4527 | } | ||||||||
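The widening step at the end of legalizeSBufferLoad rounds odd result sizes up to the next power of two, so an s96 result becomes s128 (and a <3 x s32> becomes <4 x s32> via getPow2VectorType). A loop-based stand-in for that rounding, runnable on its own (nextPow2 is a hypothetical name used only in this sketch):

    #include <cstdint>
    #include <cstdio>

    // Round a bit width up to the next power of two, as the scalar
    // widening path above does for non-power-of-2 load results.
    static uint32_t nextPow2(uint32_t Bits) {
      uint32_t P = 1;
      while (P < Bits)
        P <<= 1;
      return P;
    }

    int main() {
      std::printf("s96 widens to s%u\n", nextPow2(96)); // s128
      std::printf("s48 widens to s%u\n", nextPow2(48)); // s64
      return 0;
    }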
4528 | |||||||||
4529 | // TODO: Move to selection | ||||||||
4530 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||||||
4531 | MachineRegisterInfo &MRI, | ||||||||
4532 | MachineIRBuilder &B) const { | ||||||||
4533 | if (!ST.isTrapHandlerEnabled() || | ||||||||
4534 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | ||||||||
4535 | return legalizeTrapEndpgm(MI, MRI, B); | ||||||||
4536 | |||||||||
4537 | if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { | ||||||||
4538 | switch (*HsaAbiVer) { | ||||||||
4539 | case ELF::ELFABIVERSION_AMDGPU_HSA_V2: | ||||||||
4540 | case ELF::ELFABIVERSION_AMDGPU_HSA_V3: | ||||||||
4541 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||||
4542 | case ELF::ELFABIVERSION_AMDGPU_HSA_V4: | ||||||||
4543 | return ST.supportsGetDoorbellID() ? | ||||||||
4544 | legalizeTrapHsa(MI, MRI, B) : | ||||||||
4545 | legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||||
4546 | } | ||||||||
4547 | } | ||||||||
4548 | |||||||||
4549 | llvm_unreachable("Unknown trap handler"); | ||||||||
4550 | } | ||||||||
4551 | |||||||||
4552 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( | ||||||||
4553 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
4554 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | ||||||||
4555 | MI.eraseFromParent(); | ||||||||
4556 | return true; | ||||||||
4557 | } | ||||||||
4558 | |||||||||
4559 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | ||||||||
4560 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
4561 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||||||
4562 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||||||
4563 | Register LiveIn = | ||||||||
4564 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||||
4565 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||||||
4566 | return false; | ||||||||
4567 | |||||||||
4568 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||||||
4569 | B.buildCopy(SGPR01, LiveIn); | ||||||||
4570 | B.buildInstr(AMDGPU::S_TRAP) | ||||||||
4571 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||||||
4572 | .addReg(SGPR01, RegState::Implicit); | ||||||||
4573 | |||||||||
4574 | MI.eraseFromParent(); | ||||||||
4575 | return true; | ||||||||
4576 | } | ||||||||
4577 | |||||||||
4578 | bool AMDGPULegalizerInfo::legalizeTrapHsa( | ||||||||
4579 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
4580 | B.buildInstr(AMDGPU::S_TRAP) | ||||||||
4581 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); | ||||||||
4582 | MI.eraseFromParent(); | ||||||||
4583 | return true; | ||||||||
4584 | } | ||||||||
4585 | |||||||||
4586 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||||||
4587 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
4588 | // If this is a non-HSA target or the trap handler is disabled, report a | ||||||||
4589 | // warning accordingly. | ||||||||
4590 | if (!ST.isTrapHandlerEnabled() || | ||||||||
4591 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | ||||||||
4592 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||||||
4593 | "debugtrap handler not supported", | ||||||||
4594 | MI.getDebugLoc(), DS_Warning); | ||||||||
4595 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||||||
4596 | Ctx.diagnose(NoTrap); | ||||||||
4597 | } else { | ||||||||
4598 | // Insert debug-trap instruction | ||||||||
4599 | B.buildInstr(AMDGPU::S_TRAP) | ||||||||
4600 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | ||||||||
4601 | } | ||||||||
4602 | |||||||||
4603 | MI.eraseFromParent(); | ||||||||
4604 | return true; | ||||||||
4605 | } | ||||||||
4606 | |||||||||
4607 | bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, | ||||||||
4608 | MachineIRBuilder &B) const { | ||||||||
4609 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||||
4610 | const LLT S16 = LLT::scalar(16); | ||||||||
4611 | const LLT S32 = LLT::scalar(32); | ||||||||
4612 | |||||||||
4613 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
4614 | Register NodePtr = MI.getOperand(2).getReg(); | ||||||||
4615 | Register RayExtent = MI.getOperand(3).getReg(); | ||||||||
4616 | Register RayOrigin = MI.getOperand(4).getReg(); | ||||||||
4617 | Register RayDir = MI.getOperand(5).getReg(); | ||||||||
4618 | Register RayInvDir = MI.getOperand(6).getReg(); | ||||||||
4619 | Register TDescr = MI.getOperand(7).getReg(); | ||||||||
4620 | |||||||||
4621 | bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; | ||||||||
4622 | bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; | ||||||||
4623 | unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa | ||||||||
4624 | : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa | ||||||||
4625 | : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa | ||||||||
4626 | : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; | ||||||||
4627 | |||||||||
4628 | SmallVector<Register, 12> Ops; | ||||||||
4629 | if (Is64) { | ||||||||
4630 | auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); | ||||||||
4631 | Ops.push_back(Unmerge.getReg(0)); | ||||||||
4632 | Ops.push_back(Unmerge.getReg(1)); | ||||||||
4633 | } else { | ||||||||
4634 | Ops.push_back(NodePtr); | ||||||||
4635 | } | ||||||||
4636 | Ops.push_back(RayExtent); | ||||||||
4637 | |||||||||
4638 | auto packLanes = [&Ops, &S32, &B] (Register Src) { | ||||||||
4639 | auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); | ||||||||
4640 | Ops.push_back(Unmerge.getReg(0)); | ||||||||
4641 | Ops.push_back(Unmerge.getReg(1)); | ||||||||
4642 | Ops.push_back(Unmerge.getReg(2)); | ||||||||
4643 | }; | ||||||||
4644 | |||||||||
4645 | packLanes(RayOrigin); | ||||||||
4646 | if (IsA16) { | ||||||||
4647 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); | ||||||||
4648 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); | ||||||||
4649 | Register R1 = MRI.createGenericVirtualRegister(S32); | ||||||||
4650 | Register R2 = MRI.createGenericVirtualRegister(S32); | ||||||||
4651 | Register R3 = MRI.createGenericVirtualRegister(S32); | ||||||||
4652 | B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); | ||||||||
4653 | B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); | ||||||||
4654 | B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); | ||||||||
4655 | Ops.push_back(R1); | ||||||||
4656 | Ops.push_back(R2); | ||||||||
4657 | Ops.push_back(R3); | ||||||||
4658 | } else { | ||||||||
4659 | packLanes(RayDir); | ||||||||
4660 | packLanes(RayInvDir); | ||||||||
4661 | } | ||||||||
4662 | |||||||||
4663 | auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) | ||||||||
4664 | .addDef(DstReg) | ||||||||
4665 | .addImm(Opcode); | ||||||||
4666 | |||||||||
4667 | for (Register R : Ops) { | ||||||||
4668 | MIB.addUse(R); | ||||||||
4669 | } | ||||||||
4670 | |||||||||
4671 | MIB.addUse(TDescr) | ||||||||
4672 | .addImm(IsA16 ? 1 : 0) | ||||||||
4673 | .cloneMemRefs(MI); | ||||||||
4674 | |||||||||
4675 | MI.eraseFromParent(); | ||||||||
4676 | return true; | ||||||||
4677 | } | ||||||||
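In the IsA16 path above, six 16-bit components are repacked into three 32-bit values with the layout R1 = (dir.x, dir.y), R2 = (dir.z, invDir.x), R3 = (invDir.y, invDir.z). A standalone sketch of that packing (pack16 is a hypothetical helper; the low half of each merge comes first, matching the operand order of the buildMerge calls):

    #include <cstdint>
    #include <cstdio>

    // Place Lo in bits [0,16) and Hi in bits [16,32), as G_MERGE_VALUES
    // does when combining two s16 pieces into an s32.
    static uint32_t pack16(uint16_t Lo, uint16_t Hi) {
      return uint32_t(Lo) | (uint32_t(Hi) << 16);
    }

    int main() {
      uint16_t Dir[3] = {0x3C00, 0x4000, 0x4200}; // sample f16 bit patterns
      uint16_t Inv[3] = {0x3C00, 0x3800, 0x3400};
      uint32_t R1 = pack16(Dir[0], Dir[1]);
      uint32_t R2 = pack16(Dir[2], Inv[0]);
      uint32_t R3 = pack16(Inv[1], Inv[2]);
      std::printf("R1=%08x R2=%08x R3=%08x\n", R1, R2, R3);
      return 0;
    }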
4678 | |||||||||
4679 | bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||||||||
4680 | MachineInstr &MI) const { | ||||||||
4681 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||||
4682 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||||
4683 | |||||||||
4684 | // Replace the G_BRCOND use with the exec-manipulating branch pseudos. | ||||||||
4685 | auto IntrID = MI.getIntrinsicID(); | ||||||||
4686 | switch (IntrID) { | ||||||||
4687 | case Intrinsic::amdgcn_if: | ||||||||
4688 | case Intrinsic::amdgcn_else: { | ||||||||
4689 | MachineInstr *Br = nullptr; | ||||||||
4690 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||||
4691 | bool Negated = false; | ||||||||
4692 | if (MachineInstr *BrCond = | ||||||||
4693 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||||
4694 | const SIRegisterInfo *TRI | ||||||||
4695 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||||
4696 | |||||||||
4697 | Register Def = MI.getOperand(1).getReg(); | ||||||||
4698 | Register Use = MI.getOperand(3).getReg(); | ||||||||
4699 | |||||||||
4700 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||||
4701 | |||||||||
4702 | if (Negated) | ||||||||
4703 | std::swap(CondBrTarget, UncondBrTarget); | ||||||||
4704 | |||||||||
4705 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||||
4706 | if (IntrID == Intrinsic::amdgcn_if) { | ||||||||
4707 | B.buildInstr(AMDGPU::SI_IF) | ||||||||
4708 | .addDef(Def) | ||||||||
4709 | .addUse(Use) | ||||||||
4710 | .addMBB(UncondBrTarget); | ||||||||
4711 | } else { | ||||||||
4712 | B.buildInstr(AMDGPU::SI_ELSE) | ||||||||
4713 | .addDef(Def) | ||||||||
4714 | .addUse(Use) | ||||||||
4715 | .addMBB(UncondBrTarget); | ||||||||
4716 | } | ||||||||
4717 | |||||||||
4718 | if (Br) { | ||||||||
4719 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||||
4720 | } else { | ||||||||
4721 | // The IRTranslator skips inserting the G_BR for fallthrough cases, but | ||||||||
4722 | // since we're swapping branch targets it needs to be reinserted. | ||||||||
4723 | // FIXME: IRTranslator should probably not do this | ||||||||
4724 | B.buildBr(*CondBrTarget); | ||||||||
4725 | } | ||||||||
4726 | |||||||||
4727 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||||||
4728 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||||||
4729 | MI.eraseFromParent(); | ||||||||
4730 | BrCond->eraseFromParent(); | ||||||||
4731 | return true; | ||||||||
4732 | } | ||||||||
4733 | |||||||||
4734 | return false; | ||||||||
4735 | } | ||||||||
4736 | case Intrinsic::amdgcn_loop: { | ||||||||
4737 | MachineInstr *Br = nullptr; | ||||||||
4738 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||||
4739 | bool Negated = false; | ||||||||
4740 | if (MachineInstr *BrCond = | ||||||||
4741 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||||
4742 | const SIRegisterInfo *TRI | ||||||||
4743 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||||
4744 | |||||||||
4745 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||||
4746 | Register Reg = MI.getOperand(2).getReg(); | ||||||||
4747 | |||||||||
4748 | if (Negated) | ||||||||
4749 | std::swap(CondBrTarget, UncondBrTarget); | ||||||||
4750 | |||||||||
4751 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||||
4752 | B.buildInstr(AMDGPU::SI_LOOP) | ||||||||
4753 | .addUse(Reg) | ||||||||
4754 | .addMBB(UncondBrTarget); | ||||||||
4755 | |||||||||
4756 | if (Br) | ||||||||
4757 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||||
4758 | else | ||||||||
4759 | B.buildBr(*CondBrTarget); | ||||||||
4760 | |||||||||
4761 | MI.eraseFromParent(); | ||||||||
4762 | BrCond->eraseFromParent(); | ||||||||
4763 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||||||
4764 | return true; | ||||||||
4765 | } | ||||||||
4766 | |||||||||
4767 | return false; | ||||||||
4768 | } | ||||||||
4769 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||||||
4770 | if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { | ||||||||
4771 | // This only makes sense to call in a kernel, so just lower to null. | ||||||||
4772 | B.buildConstant(MI.getOperand(0).getReg(), 0); | ||||||||
4773 | MI.eraseFromParent(); | ||||||||
4774 | return true; | ||||||||
4775 | } | ||||||||
4776 | |||||||||
4777 | return legalizePreloadedArgIntrin( | ||||||||
4778 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||||
4779 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||||||
4780 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||||||
4781 | case Intrinsic::amdgcn_workitem_id_x: | ||||||||
4782 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4783 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||||||
4784 | case Intrinsic::amdgcn_workitem_id_y: | ||||||||
4785 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4786 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||||||
4787 | case Intrinsic::amdgcn_workitem_id_z: | ||||||||
4788 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4789 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||||||
4790 | case Intrinsic::amdgcn_workgroup_id_x: | ||||||||
4791 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4792 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||||||
4793 | case Intrinsic::amdgcn_workgroup_id_y: | ||||||||
4794 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4795 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||||||
4796 | case Intrinsic::amdgcn_workgroup_id_z: | ||||||||
4797 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4798 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||||||
4799 | case Intrinsic::amdgcn_dispatch_ptr: | ||||||||
4800 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4801 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||||||
4802 | case Intrinsic::amdgcn_queue_ptr: | ||||||||
4803 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4804 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||||||
4805 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||||||
4806 | return legalizePreloadedArgIntrin( | ||||||||
4807 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||||||
4808 | case Intrinsic::amdgcn_dispatch_id: | ||||||||
4809 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
4810 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||||||
4811 | case Intrinsic::amdgcn_fdiv_fast: | ||||||||
4812 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||||||
4813 | case Intrinsic::amdgcn_is_shared: | ||||||||
4814 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||||||
4815 | case Intrinsic::amdgcn_is_private: | ||||||||
4816 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||||||
4817 | case Intrinsic::amdgcn_wavefrontsize: { | ||||||||
4818 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||||||
4819 | MI.eraseFromParent(); | ||||||||
4820 | return true; | ||||||||
4821 | } | ||||||||
4822 | case Intrinsic::amdgcn_s_buffer_load: | ||||||||
4823 | return legalizeSBufferLoad(Helper, MI); | ||||||||
4824 | case Intrinsic::amdgcn_raw_buffer_store: | ||||||||
4825 | case Intrinsic::amdgcn_struct_buffer_store: | ||||||||
4826 | return legalizeBufferStore(MI, MRI, B, false, false); | ||||||||
4827 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||||||
4828 | case Intrinsic::amdgcn_struct_buffer_store_format: | ||||||||
4829 | return legalizeBufferStore(MI, MRI, B, false, true); | ||||||||
4830 | case Intrinsic::amdgcn_raw_tbuffer_store: | ||||||||
4831 | case Intrinsic::amdgcn_struct_tbuffer_store: | ||||||||
4832 | return legalizeBufferStore(MI, MRI, B, true, true); | ||||||||
4833 | case Intrinsic::amdgcn_raw_buffer_load: | ||||||||
4834 | case Intrinsic::amdgcn_struct_buffer_load: | ||||||||
4835 | return legalizeBufferLoad(MI, MRI, B, false, false); | ||||||||
4836 | case Intrinsic::amdgcn_raw_buffer_load_format: | ||||||||
4837 | case Intrinsic::amdgcn_struct_buffer_load_format: | ||||||||
4838 | return legalizeBufferLoad(MI, MRI, B, true, false); | ||||||||
4839 | case Intrinsic::amdgcn_raw_tbuffer_load: | ||||||||
4840 | case Intrinsic::amdgcn_struct_tbuffer_load: | ||||||||
4841 | return legalizeBufferLoad(MI, MRI, B, true, true); | ||||||||
4842 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||||
4843 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||||
4844 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||||
4845 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||||
4846 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||||
4847 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||||
4848 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||||
4849 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||||
4850 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||||
4851 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||||
4852 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||||
4853 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||||
4854 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||||
4855 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||||
4856 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||||
4857 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||||
4858 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||||
4859 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||||
4860 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||||
4861 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||||
4862 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||||
4863 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||||
4864 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||||
4865 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||||
4866 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||||||
4867 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||||||
4868 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||||
4869 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||||
4870 | case Intrinsic::amdgcn_buffer_atomic_fadd: | ||||||||
4871 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||||||
4872 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||||||
4873 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||||||
4874 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||||||
4875 | return legalizeBufferAtomic(MI, B, IntrID); | ||||||||
4876 | case Intrinsic::amdgcn_atomic_inc: | ||||||||
4877 | return legalizeAtomicIncDec(MI, B, true); | ||||||||
4878 | case Intrinsic::amdgcn_atomic_dec: | ||||||||
4879 | return legalizeAtomicIncDec(MI, B, false); | ||||||||
4880 | case Intrinsic::trap: | ||||||||
4881 | return legalizeTrapIntrinsic(MI, MRI, B); | ||||||||
4882 | case Intrinsic::debugtrap: | ||||||||
4883 | return legalizeDebugTrapIntrinsic(MI, MRI, B); | ||||||||
4884 | case Intrinsic::amdgcn_rsq_clamp: | ||||||||
4885 | return legalizeRsqClampIntrinsic(MI, MRI, B); | ||||||||
4886 | case Intrinsic::amdgcn_ds_fadd: | ||||||||
4887 | case Intrinsic::amdgcn_ds_fmin: | ||||||||
4888 | case Intrinsic::amdgcn_ds_fmax: | ||||||||
4889 | return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); | ||||||||
4890 | case Intrinsic::amdgcn_image_bvh_intersect_ray: | ||||||||
4891 | return legalizeBVHIntrinsic(MI, B); | ||||||||
4892 | default: { | ||||||||
4893 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||||||||
4894 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | ||||||||
4895 | return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); | ||||||||
4896 | return true; | ||||||||
4897 | } | ||||||||
4898 | } | ||||||||
4899 | |||||||||
4900 | return true; | ||||||||
4901 | } |
File: | llvm/include/llvm/Support/LowLevelTypeImpl.h |
1 | //== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==// | ||||
2 | // | ||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
6 | // | ||||
7 | //===----------------------------------------------------------------------===// | ||||
8 | /// \file | ||||
9 | /// Implement a low-level type suitable for MachineInstr level instruction | ||||
10 | /// selection. | ||||
11 | /// | ||||
12 | /// For a type attached to a MachineInstr, we only care about 2 details: total | ||||
13 | /// size and the number of vector lanes (if any). Accordingly, there are 4 | ||||
14 | /// possible valid type-kinds: | ||||
15 | /// | ||||
16 | /// * `sN` for scalars and aggregates | ||||
17 | /// * `<N x sM>` for vectors, which must have at least 2 elements. | ||||
18 | /// * `pN` for pointers (and `<N x pM>` for vectors of pointers) | ||||
19 | /// | ||||
20 | /// Other information required for correct selection is expected to be carried | ||||
21 | /// by the opcode or non-type flags. For example, the distinction between G_ADD | ||||
22 | /// and G_FADD for int/float, or fast-math flags. | ||||
23 | /// | ||||
24 | //===----------------------------------------------------------------------===// | ||||
25 | |||||
26 | #ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H | ||||
27 | #define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H | ||||
28 | |||||
29 | #include "llvm/ADT/DenseMapInfo.h" | ||||
30 | #include "llvm/Support/Debug.h" | ||||
31 | #include "llvm/Support/MachineValueType.h" | ||||
32 | #include <cassert> | ||||
33 | |||||
34 | namespace llvm { | ||||
35 | |||||
36 | class DataLayout; | ||||
37 | class Type; | ||||
38 | class raw_ostream; | ||||
39 | |||||
40 | class LLT { | ||||
41 | public: | ||||
42 | /// Get a low-level scalar or aggregate "bag of bits". | ||||
43 | static LLT scalar(unsigned SizeInBits) { | ||||
44 | assert(SizeInBits > 0 && "invalid scalar size"); | ||||
45 | return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0, | ||||
46 | SizeInBits, /*AddressSpace=*/0}; | ||||
47 | } | ||||
48 | |||||
49 | /// Get a low-level pointer in the given address space. | ||||
50 | static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) { | ||||
51 | assert(SizeInBits > 0 && "invalid pointer size"); | ||||
52 | return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0, | ||||
53 | SizeInBits, AddressSpace}; | ||||
54 | } | ||||
55 | |||||
56 | /// Get a low-level vector of some number of elements and element width. | ||||
57 | /// \p NumElements must be at least 2. | ||||
58 | static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) { | ||||
59 | assert(NumElements > 1 && "invalid number of vector elements"); | ||||
60 | assert(ScalarSizeInBits > 0 && "invalid vector element size"); | ||||
61 | return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements, | ||||
62 | ScalarSizeInBits, /*AddressSpace=*/0}; | ||||
63 | } | ||||
64 | |||||
65 | /// Get a low-level vector of some number of elements and element type. | ||||
66 | static LLT vector(uint16_t NumElements, LLT ScalarTy) { | ||||
67 | assert(NumElements > 1 && "invalid number of vector elements"); | ||||
68 | assert(!ScalarTy.isVector() && "invalid vector element type"); | ||||
69 | return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements, | ||||
70 | ScalarTy.getSizeInBits(), | ||||
71 | ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0}; | ||||
72 | } | ||||
73 | |||||
74 | static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) { | ||||
75 | return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy); | ||||
76 | } | ||||
77 | |||||
78 | static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) { | ||||
79 | return scalarOrVector(NumElements, LLT::scalar(ScalarSize)); | ||||
80 | } | ||||
81 | |||||
82 | explicit LLT(bool isPointer, bool isVector, uint16_t NumElements, | ||||
83 | unsigned SizeInBits, unsigned AddressSpace) { | ||||
84 | init(isPointer, isVector, NumElements, SizeInBits, AddressSpace); | ||||
85 | } | ||||
86 | explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {} | ||||
87 | |||||
88 | explicit LLT(MVT VT); | ||||
89 | |||||
90 | bool isValid() const { return RawData != 0; } | ||||
91 | |||||
92 | bool isScalar() const { return isValid() && !IsPointer && !IsVector; } | ||||
93 | |||||
94 | bool isPointer() const { return isValid() && IsPointer && !IsVector; } | ||||
95 | |||||
96 | bool isVector() const { return isValid() && IsVector; } | ||||
97 | |||||
98 | /// Returns the number of elements in a vector LLT. Must only be called on | ||||
99 | /// vector types. | ||||
100 | uint16_t getNumElements() const { | ||||
101 | assert(IsVector && "cannot get number of elements on scalar/aggregate"); | ||||
102 | if (!IsPointer) | ||||
103 | return getFieldValue(VectorElementsFieldInfo); | ||||
104 | else | ||||
105 | return getFieldValue(PointerVectorElementsFieldInfo); | ||||
106 | } | ||||
107 | |||||
108 | /// Returns the total size of the type. Must only be called on sized types. | ||||
109 | unsigned getSizeInBits() const { | ||||
110 | if (isPointer() || isScalar()) | ||||
111 | return getScalarSizeInBits(); | ||||
112 | return getScalarSizeInBits() * getNumElements(); | ||||
113 | } | ||||
114 | |||||
115 | /// Returns the total size of the type in bytes, i.e. number of whole bytes | ||||
116 | /// needed to represent the size in bits. Must only be called on sized types. | ||||
117 | unsigned getSizeInBytes() const { | ||||
118 | return (getSizeInBits() + 7) / 8; | ||||
119 | } | ||||
120 | |||||
121 | LLT getScalarType() const { | ||||
122 | return isVector() ? getElementType() : *this; | ||||
123 | } | ||||
124 | |||||
125 | /// If this type is a vector, return a vector with the same number of elements | ||||
126 | /// but the new element type. Otherwise, return the new element type. | ||||
127 | LLT changeElementType(LLT NewEltTy) const { | ||||
128 | return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy; | ||||
129 | } | ||||
130 | |||||
131 | /// If this type is a vector, return a vector with the same number of elements | ||||
132 | /// but the new element size. Otherwise, return a scalar with the new size. | ||||
133 | /// Invalid for pointer types; use changeElementType instead. | ||||
134 | LLT changeElementSize(unsigned NewEltSize) const { | ||||
135 | assert(!getScalarType().isPointer() && | ||||
136 | "invalid to directly change element size for pointers"); | ||||
137 | return isVector() ? LLT::vector(getNumElements(), NewEltSize) | ||||
138 | : LLT::scalar(NewEltSize); | ||||
139 | } | ||||
140 | |||||
141 | /// Return a vector or scalar with the same element type and the new number of | ||||
142 | /// elements. | ||||
143 | LLT changeNumElements(unsigned NewNumElts) const { | ||||
144 | return LLT::scalarOrVector(NewNumElts, getScalarType()); | ||||
145 | } | ||||
146 | |||||
147 | /// Return a type that is \p Factor times smaller. Reduces the number of | ||||
148 | /// elements if this is a vector, or the bitwidth for scalar/pointers. Does | ||||
149 | /// not attempt to handle cases that aren't evenly divisible. | ||||
150 | LLT divide(int Factor) const { | ||||
151 | assert(Factor != 1); | ||||
152 | if (isVector()) { | ||||
153 | assert(getNumElements() % Factor == 0); | ||||
154 | return scalarOrVector(getNumElements() / Factor, getElementType()); | ||||
155 | } | ||||
156 | |||||
157 | assert(getSizeInBits() % Factor == 0); | ||||
158 | return scalar(getSizeInBits() / Factor); | ||||
159 | } | ||||
160 | |||||
161 | bool isByteSized() const { return (getSizeInBits() & 7) == 0; } | ||||
162 | |||||
163 | unsigned getScalarSizeInBits() const { | ||||
164 | assert(RawData != 0 && "Invalid Type"); | ||||
165 | if (!IsVector) { | ||||
166 | if (!IsPointer) | ||||
167 | return getFieldValue(ScalarSizeFieldInfo); | ||||
168 | else | ||||
169 | return getFieldValue(PointerSizeFieldInfo); | ||||
170 | } else { | ||||
171 | if (!IsPointer) | ||||
172 | return getFieldValue(VectorSizeFieldInfo); | ||||
173 | else | ||||
174 | return getFieldValue(PointerVectorSizeFieldInfo); | ||||
175 | } | ||||
176 | } | ||||
177 | |||||
178 | unsigned getAddressSpace() const { | ||||
179 | assert(RawData != 0 && "Invalid Type"); | ||||
180 | assert(IsPointer && "cannot get address space of non-pointer type"); | ||||
181 | if (!IsVector) | ||||
182 | return getFieldValue(PointerAddressSpaceFieldInfo); | ||||
183 | else | ||||
184 | return getFieldValue(PointerVectorAddressSpaceFieldInfo); | ||||
185 | } | ||||
186 | |||||
187 | /// Returns the vector's element type. Only valid for vector types. | ||||
188 | LLT getElementType() const { | ||||
189 | assert(isVector() && "cannot get element type of scalar/aggregate"); | ||||
190 | if (IsPointer) | ||||
191 | return pointer(getAddressSpace(), getScalarSizeInBits()); | ||||
192 | else | ||||
193 | return scalar(getScalarSizeInBits()); | ||||
194 | } | ||||
195 | |||||
196 | void print(raw_ostream &OS) const; | ||||
197 | |||||
198 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | ||||
199 | LLVM_DUMP_METHOD void dump() const { | ||||
200 | print(dbgs()); | ||||
201 | dbgs() << '\n'; | ||||
202 | } | ||||
203 | #endif | ||||
204 | |||||
205 | bool operator==(const LLT &RHS) const { | ||||
206 | return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector && | ||||
207 | RHS.RawData == RawData; | ||||
208 | } | ||||
209 | |||||
210 | bool operator!=(const LLT &RHS) const { return !(*this == RHS); } | ||||
211 | |||||
212 | friend struct DenseMapInfo<LLT>; | ||||
213 | friend class GISelInstProfileBuilder; | ||||
214 | |||||
215 | private: | ||||
216 | /// LLT is packed into 64 bits as follows: | ||||
217 | /// isPointer : 1 | ||||
218 | /// isVector : 1 | ||||
219 | /// with 62 bits remaining for Kind-specific data, packed in bitfields | ||||
220 | /// as described below. As there isn't a simple portable way to pack bits | ||||
221 | /// into bitfields, here the different fields in the packed structure are | ||||
222 | /// described in static const *Field variables. Each of these variables | ||||
223 | /// is a 2-element array, with the first element describing the bitfield size | ||||
224 | /// and the second element describing the bitfield offset. | ||||
225 | typedef int BitFieldInfo[2]; | ||||
226 | /// | ||||
227 | /// This is how the bitfields are packed per Kind: | ||||
228 | /// * Invalid: | ||||
229 | /// gets encoded as RawData == 0, as that is an invalid encoding, since for | ||||
230 | /// valid encodings, SizeInBits/SizeOfElement must be larger than 0. | ||||
231 | /// * Non-pointer scalar (isPointer == 0 && isVector == 0): | ||||
232 | /// SizeInBits: 32; | ||||
233 | static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0}; | ||||
234 | /// * Pointer (isPointer == 1 && isVector == 0): | ||||
235 | /// SizeInBits: 16; | ||||
236 | /// AddressSpace: 24; | ||||
237 | static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0}; | ||||
238 | static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{ | ||||
239 | 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]}; | ||||
240 | /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1): | ||||
241 | /// NumElements: 16; | ||||
242 | /// SizeOfElement: 32; | ||||
243 | static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0}; | ||||
244 | static const constexpr BitFieldInfo VectorSizeFieldInfo{ | ||||
245 | 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]}; | ||||
246 | /// * Vector-of-pointer (isPointer == 1 && isVector == 1): | ||||
247 | /// NumElements: 16; | ||||
248 | /// SizeOfElement: 16; | ||||
249 | /// AddressSpace: 24; | ||||
250 | static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0}; | ||||
251 | static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{ | ||||
252 | 16, | ||||
253 | PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]}; | ||||
254 | static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{ | ||||
255 | 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]}; | ||||
256 | |||||
257 | uint64_t IsPointer : 1; | ||||
258 | uint64_t IsVector : 1; | ||||
259 | uint64_t RawData : 62; | ||||
260 | |||||
261 | static uint64_t getMask(const BitFieldInfo FieldInfo) { | ||||
262 | const int FieldSizeInBits = FieldInfo[0]; | ||||
263 | return (((uint64_t)1) << FieldSizeInBits) - 1; | ||||
264 | } | ||||
265 | static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) { | ||||
266 | assert(Val <= Mask && "Value too large for field"); | ||||
267 | return (Val & Mask) << Shift; | ||||
268 | } | ||||
269 | static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) { | ||||
270 | return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]); | ||||
271 | } | ||||
272 | uint64_t getFieldValue(const BitFieldInfo FieldInfo) const { | ||||
273 | return getMask(FieldInfo) & (RawData >> FieldInfo[1]); | ||||
274 | } | ||||
275 | |||||
276 | void init(bool IsPointer, bool IsVector, uint16_t NumElements, | ||||
277 | unsigned SizeInBits, unsigned AddressSpace) { | ||||
278 | this->IsPointer = IsPointer; | ||||
279 | this->IsVector = IsVector; | ||||
280 | if (!IsVector) { | ||||
281 | if (!IsPointer) | ||||
282 | RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo); | ||||
283 | else | ||||
284 | RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) | | ||||
285 | maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo); | ||||
286 | } else { | ||||
287 | assert(NumElements > 1 && "invalid number of vector elements"); | ||||
288 | if (!IsPointer) | ||||
289 | RawData = maskAndShift(NumElements, VectorElementsFieldInfo) | | ||||
290 | maskAndShift(SizeInBits, VectorSizeFieldInfo); | ||||
291 | else | ||||
292 | RawData = | ||||
293 | maskAndShift(NumElements, PointerVectorElementsFieldInfo) | | ||||
294 | maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) | | ||||
295 | maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo); | ||||
296 | } | ||||
297 | } | ||||
298 | |||||
299 | uint64_t getUniqueRAWLLTData() const { | ||||
300 | return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 | | ||||
301 | ((uint64_t)IsVector); | ||||
302 | } | ||||
303 | }; | ||||
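Worked example of the packing described above: a <4 x s32> is a vector-of-non-pointer, so NumElements = 4 occupies bits [0,16) of RawData and SizeOfElement = 32 occupies bits [16,48), giving RawData = 0x200004. A self-contained sketch reproducing that arithmetic (field widths and offsets copied from the comments; illustrative only, not the class itself):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Pack Val into a field of FieldSize bits at offset Shift, as the
    // private maskAndShift helpers above do.
    static uint64_t maskAndShift(uint64_t Val, int FieldSize, int Shift) {
      uint64_t Mask = (uint64_t(1) << FieldSize) - 1;
      assert(Val <= Mask && "value too large for field");
      return (Val & Mask) << Shift;
    }

    int main() {
      // Vector-of-non-pointer: NumElements{16, 0}, SizeOfElement{32, 16}.
      uint64_t RawData = maskAndShift(/*NumElements=*/4, 16, 0) |
                         maskAndShift(/*SizeOfElement=*/32, 32, 16);
      std::printf("<4 x s32> RawData = 0x%llx\n",
                  (unsigned long long)RawData); // 0x200004
      return 0;
    }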
304 | |||||
305 | inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) { | ||||
306 | Ty.print(OS); | ||||
307 | return OS; | ||||
308 | } | ||||
309 | |||||
310 | template<> struct DenseMapInfo<LLT> { | ||||
311 | static inline LLT getEmptyKey() { | ||||
312 | LLT Invalid; | ||||
313 | Invalid.IsPointer = true; | ||||
314 | return Invalid; | ||||
315 | } | ||||
316 | static inline LLT getTombstoneKey() { | ||||
317 | LLT Invalid; | ||||
318 | Invalid.IsVector = true; | ||||
319 | return Invalid; | ||||
320 | } | ||||
321 | static inline unsigned getHashValue(const LLT &Ty) { | ||||
322 | uint64_t Val = Ty.getUniqueRAWLLTData(); | ||||
323 | return DenseMapInfo<uint64_t>::getHashValue(Val); | ||||
324 | } | ||||
325 | static bool isEqual(const LLT &LHS, const LLT &RHS) { | ||||
326 | return LHS == RHS; | ||||
327 | } | ||||
328 | }; | ||||
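The DenseMapInfo specialization above lets LLT be used directly as a DenseMap key, with the empty and tombstone keys carved out of encodings that are otherwise invalid (RawData == 0 with IsPointer or IsVector set). A usage sketch, assuming an LLVM build environment for the headers (lookupCost is a hypothetical function):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/Support/LowLevelTypeImpl.h"
    using namespace llvm;

    // Map legalization-relevant types to an illustrative cost.
    static unsigned lookupCost() {
      DenseMap<LLT, unsigned> Cost;
      Cost[LLT::scalar(32)] = 1;
      Cost[LLT::vector(4, 32)] = 4;
      return Cost.lookup(LLT::vector(4, 32)); // 4
    }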
329 | |||||
330 | } | ||||
331 | |||||
332 | #endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |