File: | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |
Warning: | line 4453, column 49 Division by zero |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | ||||||||
2 | // | ||||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||||
6 | // | ||||||||
7 | //===----------------------------------------------------------------------===// | ||||||||
8 | /// \file | ||||||||
9 | /// This file implements the targeting of the Machinelegalizer class for | ||||||||
10 | /// AMDGPU. | ||||||||
11 | /// \todo This should be generated by TableGen. | ||||||||
12 | //===----------------------------------------------------------------------===// | ||||||||
13 | |||||||||
14 | #include "AMDGPULegalizerInfo.h" | ||||||||
15 | |||||||||
16 | #include "AMDGPU.h" | ||||||||
17 | #include "AMDGPUGlobalISelUtils.h" | ||||||||
18 | #include "AMDGPUInstrInfo.h" | ||||||||
19 | #include "AMDGPUTargetMachine.h" | ||||||||
20 | #include "SIMachineFunctionInfo.h" | ||||||||
21 | #include "Utils/AMDGPUBaseInfo.h" | ||||||||
22 | #include "llvm/ADT/ScopeExit.h" | ||||||||
23 | #include "llvm/BinaryFormat/ELF.h" | ||||||||
24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | ||||||||
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||||||||
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||||||||
27 | #include "llvm/IR/DiagnosticInfo.h" | ||||||||
28 | #include "llvm/IR/IntrinsicsAMDGPU.h" | ||||||||
29 | |||||||||
30 | #define DEBUG_TYPE"amdgpu-legalinfo" "amdgpu-legalinfo" | ||||||||
31 | |||||||||
32 | using namespace llvm; | ||||||||
33 | using namespace LegalizeActions; | ||||||||
34 | using namespace LegalizeMutations; | ||||||||
35 | using namespace LegalityPredicates; | ||||||||
36 | using namespace MIPatternMatch; | ||||||||
37 | |||||||||
38 | // Hack until load/store selection patterns support any tuple of legal types. | ||||||||
39 | static cl::opt<bool> EnableNewLegality( | ||||||||
40 | "amdgpu-global-isel-new-legality", | ||||||||
41 | cl::desc("Use GlobalISel desired legality, rather than try to use" | ||||||||
42 | "rules compatible with selection patterns"), | ||||||||
43 | cl::init(false), | ||||||||
44 | cl::ReallyHidden); | ||||||||
45 | |||||||||
46 | static constexpr unsigned MaxRegisterSize = 1024; | ||||||||
47 | |||||||||
48 | // Round the number of elements to the next power of two elements | ||||||||
49 | static LLT getPow2VectorType(LLT Ty) { | ||||||||
50 | unsigned NElts = Ty.getNumElements(); | ||||||||
51 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); | ||||||||
52 | return Ty.changeNumElements(Pow2NElts); | ||||||||
53 | } | ||||||||
54 | |||||||||
55 | // Round the number of bits to the next power of two bits | ||||||||
56 | static LLT getPow2ScalarType(LLT Ty) { | ||||||||
57 | unsigned Bits = Ty.getSizeInBits(); | ||||||||
58 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); | ||||||||
59 | return LLT::scalar(Pow2Bits); | ||||||||
60 | } | ||||||||
61 | |||||||||
62 | /// \returs true if this is an odd sized vector which should widen by adding an | ||||||||
63 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This | ||||||||
64 | /// excludes s1 vectors, which should always be scalarized. | ||||||||
65 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | ||||||||
66 | return [=](const LegalityQuery &Query) { | ||||||||
67 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
68 | if (!Ty.isVector()) | ||||||||
69 | return false; | ||||||||
70 | |||||||||
71 | const LLT EltTy = Ty.getElementType(); | ||||||||
72 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||||||
73 | return Ty.getNumElements() % 2 != 0 && | ||||||||
74 | EltSize > 1 && EltSize < 32 && | ||||||||
75 | Ty.getSizeInBits() % 32 != 0; | ||||||||
76 | }; | ||||||||
77 | } | ||||||||
78 | |||||||||
79 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { | ||||||||
80 | return [=](const LegalityQuery &Query) { | ||||||||
81 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
82 | return Ty.getSizeInBits() % 32 == 0; | ||||||||
83 | }; | ||||||||
84 | } | ||||||||
85 | |||||||||
86 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | ||||||||
87 | return [=](const LegalityQuery &Query) { | ||||||||
88 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
89 | const LLT EltTy = Ty.getScalarType(); | ||||||||
90 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | ||||||||
91 | }; | ||||||||
92 | } | ||||||||
93 | |||||||||
94 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | ||||||||
95 | return [=](const LegalityQuery &Query) { | ||||||||
96 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
97 | const LLT EltTy = Ty.getElementType(); | ||||||||
98 | return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); | ||||||||
99 | }; | ||||||||
100 | } | ||||||||
101 | |||||||||
102 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { | ||||||||
103 | return [=](const LegalityQuery &Query) { | ||||||||
104 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
105 | const LLT EltTy = Ty.getElementType(); | ||||||||
106 | unsigned Size = Ty.getSizeInBits(); | ||||||||
107 | unsigned Pieces = (Size + 63) / 64; | ||||||||
108 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; | ||||||||
109 | return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); | ||||||||
110 | }; | ||||||||
111 | } | ||||||||
112 | |||||||||
113 | // Increase the number of vector elements to reach the next multiple of 32-bit | ||||||||
114 | // type. | ||||||||
115 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | ||||||||
116 | return [=](const LegalityQuery &Query) { | ||||||||
117 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
118 | |||||||||
119 | const LLT EltTy = Ty.getElementType(); | ||||||||
120 | const int Size = Ty.getSizeInBits(); | ||||||||
121 | const int EltSize = EltTy.getSizeInBits(); | ||||||||
122 | const int NextMul32 = (Size + 31) / 32; | ||||||||
123 | |||||||||
124 | assert(EltSize < 32)(static_cast <bool> (EltSize < 32) ? void (0) : __assert_fail ("EltSize < 32", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 124, __extension__ __PRETTY_FUNCTION__)); | ||||||||
125 | |||||||||
126 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | ||||||||
127 | return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); | ||||||||
128 | }; | ||||||||
129 | } | ||||||||
130 | |||||||||
131 | static LLT getBitcastRegisterType(const LLT Ty) { | ||||||||
132 | const unsigned Size = Ty.getSizeInBits(); | ||||||||
133 | |||||||||
134 | LLT CoercedTy; | ||||||||
135 | if (Size <= 32) { | ||||||||
136 | // <2 x s8> -> s16 | ||||||||
137 | // <4 x s8> -> s32 | ||||||||
138 | return LLT::scalar(Size); | ||||||||
139 | } | ||||||||
140 | |||||||||
141 | return LLT::scalarOrVector(Size / 32, 32); | ||||||||
142 | } | ||||||||
143 | |||||||||
144 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { | ||||||||
145 | return [=](const LegalityQuery &Query) { | ||||||||
146 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
147 | return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); | ||||||||
148 | }; | ||||||||
149 | } | ||||||||
150 | |||||||||
151 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { | ||||||||
152 | return [=](const LegalityQuery &Query) { | ||||||||
153 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
154 | unsigned Size = Ty.getSizeInBits(); | ||||||||
155 | assert(Size % 32 == 0)(static_cast <bool> (Size % 32 == 0) ? void (0) : __assert_fail ("Size % 32 == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 155, __extension__ __PRETTY_FUNCTION__)); | ||||||||
156 | return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); | ||||||||
157 | }; | ||||||||
158 | } | ||||||||
159 | |||||||||
160 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | ||||||||
161 | return [=](const LegalityQuery &Query) { | ||||||||
162 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||||
163 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | ||||||||
164 | }; | ||||||||
165 | } | ||||||||
166 | |||||||||
167 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | ||||||||
168 | return [=](const LegalityQuery &Query) { | ||||||||
169 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||||
170 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | ||||||||
171 | }; | ||||||||
172 | } | ||||||||
173 | |||||||||
174 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | ||||||||
175 | return [=](const LegalityQuery &Query) { | ||||||||
176 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||||
177 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | ||||||||
178 | }; | ||||||||
179 | } | ||||||||
180 | |||||||||
181 | static bool isRegisterSize(unsigned Size) { | ||||||||
182 | return Size % 32 == 0 && Size <= MaxRegisterSize; | ||||||||
183 | } | ||||||||
184 | |||||||||
185 | static bool isRegisterVectorElementType(LLT EltTy) { | ||||||||
186 | const int EltSize = EltTy.getSizeInBits(); | ||||||||
187 | return EltSize == 16 || EltSize % 32 == 0; | ||||||||
188 | } | ||||||||
189 | |||||||||
190 | static bool isRegisterVectorType(LLT Ty) { | ||||||||
191 | const int EltSize = Ty.getElementType().getSizeInBits(); | ||||||||
192 | return EltSize == 32 || EltSize == 64 || | ||||||||
193 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | ||||||||
194 | EltSize == 128 || EltSize == 256; | ||||||||
195 | } | ||||||||
196 | |||||||||
197 | static bool isRegisterType(LLT Ty) { | ||||||||
198 | if (!isRegisterSize(Ty.getSizeInBits())) | ||||||||
199 | return false; | ||||||||
200 | |||||||||
201 | if (Ty.isVector()) | ||||||||
202 | return isRegisterVectorType(Ty); | ||||||||
203 | |||||||||
204 | return true; | ||||||||
205 | } | ||||||||
206 | |||||||||
207 | // Any combination of 32 or 64-bit elements up the maximum register size, and | ||||||||
208 | // multiples of v2s16. | ||||||||
209 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { | ||||||||
210 | return [=](const LegalityQuery &Query) { | ||||||||
211 | return isRegisterType(Query.Types[TypeIdx]); | ||||||||
212 | }; | ||||||||
213 | } | ||||||||
214 | |||||||||
215 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { | ||||||||
216 | return [=](const LegalityQuery &Query) { | ||||||||
217 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||||
218 | if (!QueryTy.isVector()) | ||||||||
219 | return false; | ||||||||
220 | const LLT EltTy = QueryTy.getElementType(); | ||||||||
221 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; | ||||||||
222 | }; | ||||||||
223 | } | ||||||||
224 | |||||||||
225 | // If we have a truncating store or an extending load with a data size larger | ||||||||
226 | // than 32-bits, we need to reduce to a 32-bit type. | ||||||||
227 | static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { | ||||||||
228 | return [=](const LegalityQuery &Query) { | ||||||||
229 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
230 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | ||||||||
231 | Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); | ||||||||
232 | }; | ||||||||
233 | } | ||||||||
234 | |||||||||
235 | // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we | ||||||||
236 | // handle some operations by just promoting the register during | ||||||||
237 | // selection. There are also d16 loads on GFX9+ which preserve the high bits. | ||||||||
238 | static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, | ||||||||
239 | bool IsLoad) { | ||||||||
240 | switch (AS) { | ||||||||
241 | case AMDGPUAS::PRIVATE_ADDRESS: | ||||||||
242 | // FIXME: Private element size. | ||||||||
243 | return ST.enableFlatScratch() ? 128 : 32; | ||||||||
244 | case AMDGPUAS::LOCAL_ADDRESS: | ||||||||
245 | return ST.useDS128() ? 128 : 64; | ||||||||
246 | case AMDGPUAS::GLOBAL_ADDRESS: | ||||||||
247 | case AMDGPUAS::CONSTANT_ADDRESS: | ||||||||
248 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: | ||||||||
249 | // Treat constant and global as identical. SMRD loads are sometimes usable for | ||||||||
250 | // global loads (ideally constant address space should be eliminated) | ||||||||
251 | // depending on the context. Legality cannot be context dependent, but | ||||||||
252 | // RegBankSelect can split the load as necessary depending on the pointer | ||||||||
253 | // register bank/uniformity and if the memory is invariant or not written in a | ||||||||
254 | // kernel. | ||||||||
255 | return IsLoad ? 512 : 128; | ||||||||
256 | default: | ||||||||
257 | // Flat addresses may contextually need to be split to 32-bit parts if they | ||||||||
258 | // may alias scratch depending on the subtarget. | ||||||||
259 | return 128; | ||||||||
260 | } | ||||||||
261 | } | ||||||||
262 | |||||||||
263 | static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, | ||||||||
264 | const LegalityQuery &Query) { | ||||||||
265 | const LLT Ty = Query.Types[0]; | ||||||||
266 | |||||||||
267 | // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD | ||||||||
268 | const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; | ||||||||
269 | |||||||||
270 | unsigned RegSize = Ty.getSizeInBits(); | ||||||||
271 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||||
272 | unsigned AlignBits = Query.MMODescrs[0].AlignInBits; | ||||||||
273 | unsigned AS = Query.Types[1].getAddressSpace(); | ||||||||
274 | |||||||||
275 | // All of these need to be custom lowered to cast the pointer operand. | ||||||||
276 | if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) | ||||||||
277 | return false; | ||||||||
278 | |||||||||
279 | // Do not handle extending vector loads. | ||||||||
280 | if (Ty.isVector() && MemSize != RegSize) | ||||||||
281 | return false; | ||||||||
282 | |||||||||
283 | // TODO: We should be able to widen loads if the alignment is high enough, but | ||||||||
284 | // we also need to modify the memory access size. | ||||||||
285 | #if 0 | ||||||||
286 | // Accept widening loads based on alignment. | ||||||||
287 | if (IsLoad && MemSize < Size) | ||||||||
288 | MemSize = std::max(MemSize, Align); | ||||||||
289 | #endif | ||||||||
290 | |||||||||
291 | // Only 1-byte and 2-byte to 32-bit extloads are valid. | ||||||||
292 | if (MemSize != RegSize && RegSize != 32) | ||||||||
293 | return false; | ||||||||
294 | |||||||||
295 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) | ||||||||
296 | return false; | ||||||||
297 | |||||||||
298 | switch (MemSize) { | ||||||||
299 | case 8: | ||||||||
300 | case 16: | ||||||||
301 | case 32: | ||||||||
302 | case 64: | ||||||||
303 | case 128: | ||||||||
304 | break; | ||||||||
305 | case 96: | ||||||||
306 | if (!ST.hasDwordx3LoadStores()) | ||||||||
307 | return false; | ||||||||
308 | break; | ||||||||
309 | case 256: | ||||||||
310 | case 512: | ||||||||
311 | // These may contextually need to be broken down. | ||||||||
312 | break; | ||||||||
313 | default: | ||||||||
314 | return false; | ||||||||
315 | } | ||||||||
316 | |||||||||
317 | assert(RegSize >= MemSize)(static_cast <bool> (RegSize >= MemSize) ? void (0) : __assert_fail ("RegSize >= MemSize", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 317, __extension__ __PRETTY_FUNCTION__)); | ||||||||
318 | |||||||||
319 | if (AlignBits < MemSize) { | ||||||||
320 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||||
321 | if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | ||||||||
322 | Align(AlignBits / 8))) | ||||||||
323 | return false; | ||||||||
324 | } | ||||||||
325 | |||||||||
326 | return true; | ||||||||
327 | } | ||||||||
328 | |||||||||
329 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so | ||||||||
330 | // workaround this. Eventually it should ignore the type for loads and only care | ||||||||
331 | // about the size. Return true in cases where we will workaround this for now by | ||||||||
332 | // bitcasting. | ||||||||
333 | static bool loadStoreBitcastWorkaround(const LLT Ty) { | ||||||||
334 | if (EnableNewLegality) | ||||||||
335 | return false; | ||||||||
336 | |||||||||
337 | const unsigned Size = Ty.getSizeInBits(); | ||||||||
338 | if (Size <= 64) | ||||||||
339 | return false; | ||||||||
340 | if (!Ty.isVector()) | ||||||||
341 | return true; | ||||||||
342 | |||||||||
343 | LLT EltTy = Ty.getElementType(); | ||||||||
344 | if (EltTy.isPointer()) | ||||||||
345 | return true; | ||||||||
346 | |||||||||
347 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||||
348 | return EltSize != 32 && EltSize != 64; | ||||||||
349 | } | ||||||||
350 | |||||||||
351 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { | ||||||||
352 | const LLT Ty = Query.Types[0]; | ||||||||
353 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && | ||||||||
354 | !loadStoreBitcastWorkaround(Ty); | ||||||||
355 | } | ||||||||
356 | |||||||||
357 | /// Return true if a load or store of the type should be lowered with a bitcast | ||||||||
358 | /// to a different type. | ||||||||
359 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, | ||||||||
360 | const unsigned MemSizeInBits) { | ||||||||
361 | const unsigned Size = Ty.getSizeInBits(); | ||||||||
362 | if (Size != MemSizeInBits) | ||||||||
363 | return Size <= 32 && Ty.isVector(); | ||||||||
364 | |||||||||
365 | if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) | ||||||||
366 | return true; | ||||||||
367 | return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && | ||||||||
368 | !isRegisterVectorElementType(Ty.getElementType()); | ||||||||
369 | } | ||||||||
370 | |||||||||
371 | /// Return true if we should legalize a load by widening an odd sized memory | ||||||||
372 | /// access up to the alignment. Note this case when the memory access itself | ||||||||
373 | /// changes, not the size of the result register. | ||||||||
374 | static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, | ||||||||
375 | unsigned AlignInBits, unsigned AddrSpace, | ||||||||
376 | unsigned Opcode) { | ||||||||
377 | // We don't want to widen cases that are naturally legal. | ||||||||
378 | if (isPowerOf2_32(SizeInBits)) | ||||||||
379 | return false; | ||||||||
380 | |||||||||
381 | // If we have 96-bit memory operations, we shouldn't touch them. Note we may | ||||||||
382 | // end up widening these for a scalar load during RegBankSelect, since there | ||||||||
383 | // aren't 96-bit scalar loads. | ||||||||
384 | if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) | ||||||||
385 | return false; | ||||||||
386 | |||||||||
387 | if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) | ||||||||
388 | return false; | ||||||||
389 | |||||||||
390 | // A load is known dereferenceable up to the alignment, so it's legal to widen | ||||||||
391 | // to it. | ||||||||
392 | // | ||||||||
393 | // TODO: Could check dereferenceable for less aligned cases. | ||||||||
394 | unsigned RoundedSize = NextPowerOf2(SizeInBits); | ||||||||
395 | if (AlignInBits < RoundedSize) | ||||||||
396 | return false; | ||||||||
397 | |||||||||
398 | // Do not widen if it would introduce a slow unaligned load. | ||||||||
399 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||||
400 | bool Fast = false; | ||||||||
401 | return TLI->allowsMisalignedMemoryAccessesImpl( | ||||||||
402 | RoundedSize, AddrSpace, Align(AlignInBits / 8), | ||||||||
403 | MachineMemOperand::MOLoad, &Fast) && | ||||||||
404 | Fast; | ||||||||
405 | } | ||||||||
406 | |||||||||
407 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, | ||||||||
408 | unsigned Opcode) { | ||||||||
409 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) | ||||||||
410 | return false; | ||||||||
411 | |||||||||
412 | return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, | ||||||||
413 | Query.MMODescrs[0].AlignInBits, | ||||||||
414 | Query.Types[1].getAddressSpace(), Opcode); | ||||||||
415 | } | ||||||||
416 | |||||||||
417 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | ||||||||
418 | const GCNTargetMachine &TM) | ||||||||
419 | : ST(ST_) { | ||||||||
420 | using namespace TargetOpcode; | ||||||||
421 | |||||||||
422 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | ||||||||
423 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | ||||||||
424 | }; | ||||||||
425 | |||||||||
426 | const LLT S1 = LLT::scalar(1); | ||||||||
427 | const LLT S8 = LLT::scalar(8); | ||||||||
428 | const LLT S16 = LLT::scalar(16); | ||||||||
429 | const LLT S32 = LLT::scalar(32); | ||||||||
430 | const LLT S64 = LLT::scalar(64); | ||||||||
431 | const LLT S128 = LLT::scalar(128); | ||||||||
432 | const LLT S256 = LLT::scalar(256); | ||||||||
433 | const LLT S512 = LLT::scalar(512); | ||||||||
434 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); | ||||||||
435 | |||||||||
436 | const LLT V2S8 = LLT::vector(2, 8); | ||||||||
437 | const LLT V2S16 = LLT::vector(2, 16); | ||||||||
438 | const LLT V4S16 = LLT::vector(4, 16); | ||||||||
439 | |||||||||
440 | const LLT V2S32 = LLT::vector(2, 32); | ||||||||
441 | const LLT V3S32 = LLT::vector(3, 32); | ||||||||
442 | const LLT V4S32 = LLT::vector(4, 32); | ||||||||
443 | const LLT V5S32 = LLT::vector(5, 32); | ||||||||
444 | const LLT V6S32 = LLT::vector(6, 32); | ||||||||
445 | const LLT V7S32 = LLT::vector(7, 32); | ||||||||
446 | const LLT V8S32 = LLT::vector(8, 32); | ||||||||
447 | const LLT V9S32 = LLT::vector(9, 32); | ||||||||
448 | const LLT V10S32 = LLT::vector(10, 32); | ||||||||
449 | const LLT V11S32 = LLT::vector(11, 32); | ||||||||
450 | const LLT V12S32 = LLT::vector(12, 32); | ||||||||
451 | const LLT V13S32 = LLT::vector(13, 32); | ||||||||
452 | const LLT V14S32 = LLT::vector(14, 32); | ||||||||
453 | const LLT V15S32 = LLT::vector(15, 32); | ||||||||
454 | const LLT V16S32 = LLT::vector(16, 32); | ||||||||
455 | const LLT V32S32 = LLT::vector(32, 32); | ||||||||
456 | |||||||||
457 | const LLT V2S64 = LLT::vector(2, 64); | ||||||||
458 | const LLT V3S64 = LLT::vector(3, 64); | ||||||||
459 | const LLT V4S64 = LLT::vector(4, 64); | ||||||||
460 | const LLT V5S64 = LLT::vector(5, 64); | ||||||||
461 | const LLT V6S64 = LLT::vector(6, 64); | ||||||||
462 | const LLT V7S64 = LLT::vector(7, 64); | ||||||||
463 | const LLT V8S64 = LLT::vector(8, 64); | ||||||||
464 | const LLT V16S64 = LLT::vector(16, 64); | ||||||||
465 | |||||||||
466 | std::initializer_list<LLT> AllS32Vectors = | ||||||||
467 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | ||||||||
468 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | ||||||||
469 | std::initializer_list<LLT> AllS64Vectors = | ||||||||
470 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | ||||||||
471 | |||||||||
472 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | ||||||||
473 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | ||||||||
474 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | ||||||||
475 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | ||||||||
476 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | ||||||||
477 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | ||||||||
478 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | ||||||||
479 | |||||||||
480 | const LLT CodePtr = FlatPtr; | ||||||||
481 | |||||||||
482 | const std::initializer_list<LLT> AddrSpaces64 = { | ||||||||
483 | GlobalPtr, ConstantPtr, FlatPtr | ||||||||
484 | }; | ||||||||
485 | |||||||||
486 | const std::initializer_list<LLT> AddrSpaces32 = { | ||||||||
487 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | ||||||||
488 | }; | ||||||||
489 | |||||||||
490 | const std::initializer_list<LLT> FPTypesBase = { | ||||||||
491 | S32, S64 | ||||||||
492 | }; | ||||||||
493 | |||||||||
494 | const std::initializer_list<LLT> FPTypes16 = { | ||||||||
495 | S32, S64, S16 | ||||||||
496 | }; | ||||||||
497 | |||||||||
498 | const std::initializer_list<LLT> FPTypesPK16 = { | ||||||||
499 | S32, S64, S16, V2S16 | ||||||||
500 | }; | ||||||||
501 | |||||||||
502 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; | ||||||||
503 | |||||||||
504 | auto &LegacyInfo = getLegacyLegalizerInfo(); | ||||||||
505 | LegacyInfo.setAction({G_BRCOND, S1}, | ||||||||
506 | LegacyLegalizeActions::Legal); // VCC branches | ||||||||
507 | LegacyInfo.setAction({G_BRCOND, S32}, | ||||||||
508 | LegacyLegalizeActions::Legal); // SCC branches | ||||||||
509 | |||||||||
510 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | ||||||||
511 | // elements for v3s16 | ||||||||
512 | getActionDefinitionsBuilder(G_PHI) | ||||||||
513 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) | ||||||||
514 | .legalFor(AllS32Vectors) | ||||||||
515 | .legalFor(AllS64Vectors) | ||||||||
516 | .legalFor(AddrSpaces64) | ||||||||
517 | .legalFor(AddrSpaces32) | ||||||||
518 | .legalIf(isPointer(0)) | ||||||||
519 | .clampScalar(0, S16, S256) | ||||||||
520 | .widenScalarToNextPow2(0, 32) | ||||||||
521 | .clampMaxNumElements(0, S32, 16) | ||||||||
522 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||||
523 | .scalarize(0); | ||||||||
524 | |||||||||
525 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { | ||||||||
526 | // Full set of gfx9 features. | ||||||||
527 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||||
528 | .legalFor({S32, S16, V2S16}) | ||||||||
529 | .clampScalar(0, S16, S32) | ||||||||
530 | .clampMaxNumElements(0, S16, 2) | ||||||||
531 | .scalarize(0) | ||||||||
532 | .widenScalarToNextPow2(0, 32); | ||||||||
533 | |||||||||
534 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) | ||||||||
535 | .legalFor({S32, S16, V2S16}) // Clamp modifier | ||||||||
536 | .minScalarOrElt(0, S16) | ||||||||
537 | .clampMaxNumElements(0, S16, 2) | ||||||||
538 | .scalarize(0) | ||||||||
539 | .widenScalarToNextPow2(0, 32) | ||||||||
540 | .lower(); | ||||||||
541 | } else if (ST.has16BitInsts()) { | ||||||||
542 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||||
543 | .legalFor({S32, S16}) | ||||||||
544 | .clampScalar(0, S16, S32) | ||||||||
545 | .scalarize(0) | ||||||||
546 | .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 | ||||||||
547 | |||||||||
548 | // Technically the saturating operations require clamp bit support, but this | ||||||||
549 | // was introduced at the same time as 16-bit operations. | ||||||||
550 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||||
551 | .legalFor({S32, S16}) // Clamp modifier | ||||||||
552 | .minScalar(0, S16) | ||||||||
553 | .scalarize(0) | ||||||||
554 | .widenScalarToNextPow2(0, 16) | ||||||||
555 | .lower(); | ||||||||
556 | |||||||||
557 | // We're just lowering this, but it helps get a better result to try to | ||||||||
558 | // coerce to the desired type first. | ||||||||
559 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||||||
560 | .minScalar(0, S16) | ||||||||
561 | .scalarize(0) | ||||||||
562 | .lower(); | ||||||||
563 | } else { | ||||||||
564 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||||
565 | .legalFor({S32}) | ||||||||
566 | .clampScalar(0, S32, S32) | ||||||||
567 | .scalarize(0); | ||||||||
568 | |||||||||
569 | if (ST.hasIntClamp()) { | ||||||||
570 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||||
571 | .legalFor({S32}) // Clamp modifier. | ||||||||
572 | .scalarize(0) | ||||||||
573 | .minScalarOrElt(0, S32) | ||||||||
574 | .lower(); | ||||||||
575 | } else { | ||||||||
576 | // Clamp bit support was added in VI, along with 16-bit operations. | ||||||||
577 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||||
578 | .minScalar(0, S32) | ||||||||
579 | .scalarize(0) | ||||||||
580 | .lower(); | ||||||||
581 | } | ||||||||
582 | |||||||||
583 | // FIXME: DAG expansion gets better results. The widening uses the smaller | ||||||||
584 | // range values and goes for the min/max lowering directly. | ||||||||
585 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||||||
586 | .minScalar(0, S32) | ||||||||
587 | .scalarize(0) | ||||||||
588 | .lower(); | ||||||||
589 | } | ||||||||
590 | |||||||||
591 | getActionDefinitionsBuilder( | ||||||||
592 | {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) | ||||||||
593 | .customFor({S32, S64}) | ||||||||
594 | .clampScalar(0, S32, S64) | ||||||||
595 | .widenScalarToNextPow2(0, 32) | ||||||||
596 | .scalarize(0); | ||||||||
597 | |||||||||
598 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | ||||||||
599 | .legalFor({S32}) | ||||||||
600 | .maxScalarOrElt(0, S32); | ||||||||
601 | |||||||||
602 | if (ST.hasVOP3PInsts()) { | ||||||||
603 | Mulh | ||||||||
604 | .clampMaxNumElements(0, S8, 2) | ||||||||
605 | .lowerFor({V2S8}); | ||||||||
606 | } | ||||||||
607 | |||||||||
608 | Mulh | ||||||||
609 | .scalarize(0) | ||||||||
610 | .lower(); | ||||||||
611 | |||||||||
612 | // Report legal for any types we can handle anywhere. For the cases only legal | ||||||||
613 | // on the SALU, RegBankSelect will be able to re-legalize. | ||||||||
614 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) | ||||||||
615 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) | ||||||||
616 | .clampScalar(0, S32, S64) | ||||||||
617 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||||
618 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) | ||||||||
619 | .widenScalarToNextPow2(0) | ||||||||
620 | .scalarize(0); | ||||||||
621 | |||||||||
  // Carry/borrow arithmetic: legal only with an s32 value and an s1 or s32
  // carry; everything else is widened to s32 then lowered.
  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  // Materializing constants: common scalar and pointer types are listed
  // explicitly; any other pointer type is also legal.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
                       LegacyLegalizeActions::Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  // Non-private globals need custom lowering (e.g. PC-relative addressing).
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
      .customIf(typeIsNot(0, PrivatePtr));

  LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);
668 | |||||||||
  // Basic FP arithmetic. S16/V2S16 legality is appended below based on
  // subtarget features; the common clamping/scalarization tails are applied
  // last so they see the final set of legal types.
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  // sin/cos and fdiv always need custom expansion (input scaling / scale ops).
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  // min/max are custom (not legal) because snan quieting depends on the
  // function's FP mode; the clamp bounds depend on 16-bit support.
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);
726 | |||||||||
  // fsqrt/ffloor: with 16-bit instructions both are legal down to s16.
  // Without them, s64 ffloor needs custom lowering on subtargets where the
  // fract instruction misbehaves (hasFractBug).
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    // s16 -> s64 goes through s32 in two steps.
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // frem has no hardware instruction; always custom-expanded.
  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);
789 | |||||||||
  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
                    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
                    .lowerFor({{S32, S64}})
                    .lowerIf(typeIs(1, S1))
                    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();
839 | |||||||||
  // trunc/ceil/rint: natively available from SEA_ISLANDS onward; older
  // subtargets need a custom s64 expansion.
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);
899 | |||||||||
  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  // Natural/base-10 exp/log and pow are expanded in terms of the base-2 ops.
  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);
957 | |||||||||
  // bswap and integer min/max/abs: richer legality (s16, packed v2s16) when
  // the subtarget has 16-bit / VOP3P instructions.
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer source to match the pointer width.
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer result to match the pointer width.
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
1041 | |||||||||
  // Predicate: does this G_LOAD/G_STORE (described by Query) need to be split
  // into multiple smaller memory operations? Used by the narrowScalarIf /
  // fewerElementsIf rules below.
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    // For extloads, treat the access as at least as wide as its alignment.
    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      // NOTE(review): if AlignBits were 0 this constructs Align(0), which is
      // invalid -- presumably MMO alignment is always >= 1 byte; verify.
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };
1081 | |||||||||
  // Minimum alignment (in bits) required for buffer accesses of each width;
  // 0 means "no alignment requirement" when unaligned buffer access is on.
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.
1089 | |||||||||
1090 | for (unsigned Op : {G_LOAD, G_STORE}) { | ||||||||
1091 | const bool IsStore = Op == G_STORE; | ||||||||
1092 | |||||||||
1093 | auto &Actions = getActionDefinitionsBuilder(Op); | ||||||||
1094 | // Explicitly list some common cases. | ||||||||
1095 | // TODO: Does this help compile time at all? | ||||||||
1096 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, | ||||||||
1097 | {V2S32, GlobalPtr, 64, GlobalAlign32}, | ||||||||
1098 | {V4S32, GlobalPtr, 128, GlobalAlign32}, | ||||||||
1099 | {S64, GlobalPtr, 64, GlobalAlign32}, | ||||||||
1100 | {V2S64, GlobalPtr, 128, GlobalAlign32}, | ||||||||
1101 | {V2S16, GlobalPtr, 32, GlobalAlign32}, | ||||||||
1102 | {S32, GlobalPtr, 8, GlobalAlign8}, | ||||||||
1103 | {S32, GlobalPtr, 16, GlobalAlign16}, | ||||||||
1104 | |||||||||
1105 | {S32, LocalPtr, 32, 32}, | ||||||||
1106 | {S64, LocalPtr, 64, 32}, | ||||||||
1107 | {V2S32, LocalPtr, 64, 32}, | ||||||||
1108 | {S32, LocalPtr, 8, 8}, | ||||||||
1109 | {S32, LocalPtr, 16, 16}, | ||||||||
1110 | {V2S16, LocalPtr, 32, 32}, | ||||||||
1111 | |||||||||
1112 | {S32, PrivatePtr, 32, 32}, | ||||||||
1113 | {S32, PrivatePtr, 8, 8}, | ||||||||
1114 | {S32, PrivatePtr, 16, 16}, | ||||||||
1115 | {V2S16, PrivatePtr, 32, 32}, | ||||||||
1116 | |||||||||
1117 | {S32, ConstantPtr, 32, GlobalAlign32}, | ||||||||
1118 | {V2S32, ConstantPtr, 64, GlobalAlign32}, | ||||||||
1119 | {V4S32, ConstantPtr, 128, GlobalAlign32}, | ||||||||
1120 | {S64, ConstantPtr, 64, GlobalAlign32}, | ||||||||
1121 | {V2S32, ConstantPtr, 32, GlobalAlign32}}); | ||||||||
1122 | Actions.legalIf( | ||||||||
1123 | [=](const LegalityQuery &Query) -> bool { | ||||||||
1124 | return isLoadStoreLegal(ST, Query); | ||||||||
1125 | }); | ||||||||
1126 | |||||||||
1127 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||||||
1128 | // 64-bits. | ||||||||
1129 | // | ||||||||
1130 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||||||
1131 | // inserting addrspacecasts. | ||||||||
1132 | Actions.customIf(typeIs(1, Constant32Ptr)); | ||||||||
1133 | |||||||||
1134 | // Turn any illegal element vectors into something easier to deal | ||||||||
1135 | // with. These will ultimately produce 32-bit scalar shifts to extract the | ||||||||
1136 | // parts anyway. | ||||||||
1137 | // | ||||||||
1138 | // For odd 16-bit element vectors, prefer to split those into pieces with | ||||||||
1139 | // 16-bit vector parts. | ||||||||
1140 | Actions.bitcastIf( | ||||||||
1141 | [=](const LegalityQuery &Query) -> bool { | ||||||||
1142 | return shouldBitcastLoadStoreType(ST, Query.Types[0], | ||||||||
1143 | Query.MMODescrs[0].SizeInBits); | ||||||||
1144 | }, bitcastToRegisterType(0)); | ||||||||
1145 | |||||||||
1146 | if (!IsStore) { | ||||||||
1147 | // Widen suitably aligned loads by loading extra bytes. The standard | ||||||||
1148 | // legalization actions can't properly express widening memory operands. | ||||||||
1149 | Actions.customIf([=](const LegalityQuery &Query) -> bool { | ||||||||
1150 | return shouldWidenLoad(ST, Query, G_LOAD); | ||||||||
1151 | }); | ||||||||
1152 | } | ||||||||
1153 | |||||||||
1154 | // FIXME: load/store narrowing should be moved to lower action | ||||||||
1155 | Actions | ||||||||
1156 | .narrowScalarIf( | ||||||||
1157 | [=](const LegalityQuery &Query) -> bool { | ||||||||
1158 | return !Query.Types[0].isVector() && | ||||||||
1159 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||||
1160 | }, | ||||||||
1161 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||||
1162 | const LLT DstTy = Query.Types[0]; | ||||||||
1163 | const LLT PtrTy = Query.Types[1]; | ||||||||
1164 | |||||||||
1165 | const unsigned DstSize = DstTy.getSizeInBits(); | ||||||||
1166 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||||
1167 | |||||||||
1168 | // Split extloads. | ||||||||
1169 | if (DstSize > MemSize) | ||||||||
1170 | return std::make_pair(0, LLT::scalar(MemSize)); | ||||||||
1171 | |||||||||
1172 | if (!isPowerOf2_32(DstSize)) { | ||||||||
1173 | // We're probably decomposing an odd sized store. Try to split | ||||||||
1174 | // to the widest type. TODO: Account for alignment. As-is it | ||||||||
1175 | // should be OK, since the new parts will be further legalized. | ||||||||
1176 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||||
1177 | return std::make_pair(0, LLT::scalar(FloorSize)); | ||||||||
1178 | } | ||||||||
1179 | |||||||||
1180 | if (DstSize > 32 && (DstSize % 32 != 0)) { | ||||||||
1181 | // FIXME: Need a way to specify non-extload of larger size if | ||||||||
1182 | // suitably aligned. | ||||||||
1183 | return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); | ||||||||
1184 | } | ||||||||
1185 | |||||||||
1186 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||||
1187 | PtrTy.getAddressSpace(), | ||||||||
1188 | Op == G_LOAD); | ||||||||
1189 | if (MemSize > MaxSize) | ||||||||
1190 | return std::make_pair(0, LLT::scalar(MaxSize)); | ||||||||
1191 | |||||||||
1192 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||||
1193 | return std::make_pair(0, LLT::scalar(Align)); | ||||||||
1194 | }) | ||||||||
1195 | .fewerElementsIf( | ||||||||
1196 | [=](const LegalityQuery &Query) -> bool { | ||||||||
1197 | return Query.Types[0].isVector() && | ||||||||
1198 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||||
1199 | }, | ||||||||
1200 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||||
1201 | const LLT DstTy = Query.Types[0]; | ||||||||
1202 | const LLT PtrTy = Query.Types[1]; | ||||||||
1203 | |||||||||
1204 | LLT EltTy = DstTy.getElementType(); | ||||||||
1205 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||||
1206 | PtrTy.getAddressSpace(), | ||||||||
1207 | Op == G_LOAD); | ||||||||
1208 | |||||||||
1209 | // FIXME: Handle widened to power of 2 results better. This ends | ||||||||
1210 | // up scalarizing. | ||||||||
1211 | // FIXME: 3 element stores scalarized on SI | ||||||||
1212 | |||||||||
1213 | // Split if it's too large for the address space. | ||||||||
1214 | if (Query.MMODescrs[0].SizeInBits > MaxSize) { | ||||||||
1215 | unsigned NumElts = DstTy.getNumElements(); | ||||||||
1216 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||||
1217 | |||||||||
1218 | if (MaxSize % EltSize == 0) { | ||||||||
1219 | return std::make_pair( | ||||||||
1220 | 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); | ||||||||
1221 | } | ||||||||
1222 | |||||||||
1223 | unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; | ||||||||
1224 | |||||||||
1225 | // FIXME: Refine when odd breakdowns handled | ||||||||
1226 | // The scalars will need to be re-legalized. | ||||||||
1227 | if (NumPieces == 1 || NumPieces >= NumElts || | ||||||||
1228 | NumElts % NumPieces != 0) | ||||||||
1229 | return std::make_pair(0, EltTy); | ||||||||
1230 | |||||||||
1231 | return std::make_pair(0, | ||||||||
1232 | LLT::vector(NumElts / NumPieces, EltTy)); | ||||||||
1233 | } | ||||||||
1234 | |||||||||
1235 | // FIXME: We could probably handle weird extending loads better. | ||||||||
1236 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||||
1237 | if (DstTy.getSizeInBits() > MemSize) | ||||||||
1238 | return std::make_pair(0, EltTy); | ||||||||
1239 | |||||||||
1240 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||||
1241 | unsigned DstSize = DstTy.getSizeInBits(); | ||||||||
1242 | if (!isPowerOf2_32(DstSize)) { | ||||||||
1243 | // We're probably decomposing an odd sized store. Try to split | ||||||||
1244 | // to the widest type. TODO: Account for alignment. As-is it | ||||||||
1245 | // should be OK, since the new parts will be further legalized. | ||||||||
1246 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||||
1247 | return std::make_pair( | ||||||||
1248 | 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); | ||||||||
1249 | } | ||||||||
1250 | |||||||||
1251 | // Need to split because of alignment. | ||||||||
1252 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||||
1253 | if (EltSize > Align && | ||||||||
1254 | (EltSize / Align < DstTy.getNumElements())) { | ||||||||
1255 | return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); | ||||||||
1256 | } | ||||||||
1257 | |||||||||
1258 | // May need relegalization for the scalars. | ||||||||
1259 | return std::make_pair(0, EltTy); | ||||||||
1260 | }) | ||||||||
1261 | .lowerIfMemSizeNotPow2() | ||||||||
1262 | .minScalar(0, S32) | ||||||||
1263 | .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) | ||||||||
1264 | .widenScalarToNextPow2(0) | ||||||||
1265 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) | ||||||||
1266 | .lower(); | ||||||||
1267 | } | ||||||||
1268 | |||||||||
  // Sign/zero-extending loads: only 8- and 16-bit memory accesses producing
  // s32 are legal; everything else is clamped to s32 and lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();
1299 | |||||||||
  // Integer read-modify-write atomics: s32/s64 on global, local and region
  // address spaces; flat is added only when the subtarget supports it.
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // FP add atomics are feature-gated: LDS/region first, s64 LDS on gfx90a,
  // and global fadd only where the instructions exist.
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
1328 | // TODO: Pointer types, any 32-bit or 64-bit vector | ||||||||
1329 | |||||||||
  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
        GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
        LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    // Any pointer result type is fine once the condition is s1 or s32.
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345 | |||||||||
1346 | // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can | ||||||||
1347 | // be more flexible with the shift amount type. | ||||||||
1348 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) | ||||||||
1349 | .legalFor({{S32, S32}, {S64, S32}}); | ||||||||
1350 | if (ST.has16BitInsts()) { | ||||||||
1351 | if (ST.hasVOP3PInsts()) { | ||||||||
1352 | Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) | ||||||||
1353 | .clampMaxNumElements(0, S16, 2); | ||||||||
1354 | } else | ||||||||
1355 | Shifts.legalFor({{S16, S16}}); | ||||||||
1356 | |||||||||
1357 | // TODO: Support 16-bit shift amounts for all types | ||||||||
1358 | Shifts.widenScalarIf( | ||||||||
1359 | [=](const LegalityQuery &Query) { | ||||||||
1360 | // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a | ||||||||
1361 | // 32-bit amount. | ||||||||
1362 | const LLT ValTy = Query.Types[0]; | ||||||||
1363 | const LLT AmountTy = Query.Types[1]; | ||||||||
1364 | return ValTy.getSizeInBits() <= 16 && | ||||||||
1365 | AmountTy.getSizeInBits() < 16; | ||||||||
1366 | }, changeTo(1, S16)); | ||||||||
1367 | Shifts.maxScalarIf(typeIs(0, S16), 1, S16); | ||||||||
1368 | Shifts.clampScalar(1, S32, S32); | ||||||||
1369 | Shifts.clampScalar(0, S16, S64); | ||||||||
1370 | Shifts.widenScalarToNextPow2(0, 16); | ||||||||
1371 | |||||||||
1372 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||||||
1373 | .minScalar(0, S16) | ||||||||
1374 | .scalarize(0) | ||||||||
1375 | .lower(); | ||||||||
1376 | } else { | ||||||||
1377 | // Make sure we legalize the shift amount type first, as the general | ||||||||
1378 | // expansion for the shifted type will produce much worse code if it hasn't | ||||||||
1379 | // been truncated already. | ||||||||
1380 | Shifts.clampScalar(1, S32, S32); | ||||||||
1381 | Shifts.clampScalar(0, S32, S64); | ||||||||
1382 | Shifts.widenScalarToNextPow2(0, 32); | ||||||||
1383 | |||||||||
1384 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||||||
1385 | .minScalar(0, S32) | ||||||||
1386 | .scalarize(0) | ||||||||
1387 | .lower(); | ||||||||
1388 | } | ||||||||
1389 | Shifts.scalarize(0); | ||||||||
1390 | |||||||||
1391 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | ||||||||
1392 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | ||||||||
1393 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | ||||||||
1394 | unsigned IdxTypeIdx = 2; | ||||||||
1395 | |||||||||
1396 | getActionDefinitionsBuilder(Op) | ||||||||
1397 | .customIf([=](const LegalityQuery &Query) { | ||||||||
1398 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||||||
1399 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||||||
1400 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | ||||||||
1401 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||||||
1402 | return (EltSize == 32 || EltSize == 64) && | ||||||||
1403 | VecTy.getSizeInBits() % 32 == 0 && | ||||||||
1404 | VecTy.getSizeInBits() <= MaxRegisterSize && | ||||||||
1405 | IdxTy.getSizeInBits() == 32; | ||||||||
1406 | }) | ||||||||
1407 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), | ||||||||
1408 | bitcastToVectorElement32(VecTypeIdx)) | ||||||||
1409 | //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) | ||||||||
1410 | .bitcastIf( | ||||||||
1411 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), | ||||||||
1412 | [=](const LegalityQuery &Query) { | ||||||||
1413 | // For > 64-bit element types, try to turn this into a 64-bit | ||||||||
1414 | // element vector since we may be able to do better indexing | ||||||||
1415 | // if this is scalar. If not, fall back to 32. | ||||||||
1416 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||||||
1417 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||||||
1418 | const unsigned DstEltSize = EltTy.getSizeInBits(); | ||||||||
1419 | const unsigned VecSize = VecTy.getSizeInBits(); | ||||||||
1420 | |||||||||
1421 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; | ||||||||
1422 | return std::make_pair( | ||||||||
1423 | VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); | ||||||||
1424 | }) | ||||||||
1425 | .clampScalar(EltTypeIdx, S32, S64) | ||||||||
1426 | .clampScalar(VecTypeIdx, S32, S64) | ||||||||
1427 | .clampScalar(IdxTypeIdx, S32, S32) | ||||||||
1428 | .clampMaxNumElements(VecTypeIdx, S32, 32) | ||||||||
1429 | // TODO: Clamp elements for 64-bit vectors? | ||||||||
1430 | // It should only be necessary with variable indexes. | ||||||||
1431 | // As a last resort, lower to the stack | ||||||||
1432 | .lower(); | ||||||||
1433 | } | ||||||||
1434 | |||||||||
1435 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | ||||||||
1436 | .unsupportedIf([=](const LegalityQuery &Query) { | ||||||||
1437 | const LLT &EltTy = Query.Types[1].getElementType(); | ||||||||
1438 | return Query.Types[0] != EltTy; | ||||||||
1439 | }); | ||||||||
1440 | |||||||||
1441 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | ||||||||
1442 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | ||||||||
1443 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | ||||||||
1444 | |||||||||
1445 | // FIXME: Doesn't handle extract of illegal sizes. | ||||||||
1446 | getActionDefinitionsBuilder(Op) | ||||||||
1447 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | ||||||||
1448 | // FIXME: Multiples of 16 should not be legal. | ||||||||
1449 | .legalIf([=](const LegalityQuery &Query) { | ||||||||
1450 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||||
1451 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||||
1452 | return (BigTy.getSizeInBits() % 32 == 0) && | ||||||||
1453 | (LitTy.getSizeInBits() % 16 == 0); | ||||||||
1454 | }) | ||||||||
1455 | .widenScalarIf( | ||||||||
1456 | [=](const LegalityQuery &Query) { | ||||||||
1457 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||||
1458 | return (BigTy.getScalarSizeInBits() < 16); | ||||||||
1459 | }, | ||||||||
1460 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | ||||||||
1461 | .widenScalarIf( | ||||||||
1462 | [=](const LegalityQuery &Query) { | ||||||||
1463 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||||
1464 | return (LitTy.getScalarSizeInBits() < 16); | ||||||||
1465 | }, | ||||||||
1466 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | ||||||||
1467 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||||
1468 | .widenScalarToNextPow2(BigTyIdx, 32); | ||||||||
1469 | |||||||||
1470 | } | ||||||||
1471 | |||||||||
1472 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||||||||
1473 | .legalForCartesianProduct(AllS32Vectors, {S32}) | ||||||||
1474 | .legalForCartesianProduct(AllS64Vectors, {S64}) | ||||||||
1475 | .clampNumElements(0, V16S32, V32S32) | ||||||||
1476 | .clampNumElements(0, V2S64, V16S64) | ||||||||
1477 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); | ||||||||
1478 | |||||||||
1479 | if (ST.hasScalarPackInsts()) { | ||||||||
1480 | BuildVector | ||||||||
1481 | // FIXME: Should probably widen s1 vectors straight to s32 | ||||||||
1482 | .minScalarOrElt(0, S16) | ||||||||
1483 | // Widen source elements and produce a G_BUILD_VECTOR_TRUNC | ||||||||
1484 | .minScalar(1, S32); | ||||||||
1485 | |||||||||
1486 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||||
1487 | .legalFor({V2S16, S32}) | ||||||||
1488 | .lower(); | ||||||||
1489 | BuildVector.minScalarOrElt(0, S32); | ||||||||
1490 | } else { | ||||||||
1491 | BuildVector.customFor({V2S16, S16}); | ||||||||
1492 | BuildVector.minScalarOrElt(0, S32); | ||||||||
1493 | |||||||||
1494 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||||
1495 | .customFor({V2S16, S32}) | ||||||||
1496 | .lower(); | ||||||||
1497 | } | ||||||||
1498 | |||||||||
1499 | BuildVector.legalIf(isRegisterType(0)); | ||||||||
1500 | |||||||||
1501 | // FIXME: Clamp maximum size | ||||||||
1502 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | ||||||||
1503 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||||
1504 | .clampMaxNumElements(0, S32, 32) | ||||||||
1505 | .clampMaxNumElements(1, S16, 2) // TODO: Make 4? | ||||||||
1506 | .clampMaxNumElements(0, S16, 64); | ||||||||
1507 | |||||||||
1508 | // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse | ||||||||
1509 | // pre-legalize. | ||||||||
1510 | if (ST.hasVOP3PInsts()) { | ||||||||
1511 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) | ||||||||
1512 | .customFor({V2S16, V2S16}) | ||||||||
1513 | .lower(); | ||||||||
1514 | } else | ||||||||
1515 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | ||||||||
1516 | |||||||||
1517 | // Merge/Unmerge | ||||||||
1518 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | ||||||||
1519 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | ||||||||
1520 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | ||||||||
1521 | |||||||||
1522 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | ||||||||
1523 | const LLT Ty = Query.Types[TypeIdx]; | ||||||||
1524 | if (Ty.isVector()) { | ||||||||
1525 | const LLT &EltTy = Ty.getElementType(); | ||||||||
1526 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) | ||||||||
1527 | return true; | ||||||||
1528 | if (!isPowerOf2_32(EltTy.getSizeInBits())) | ||||||||
1529 | return true; | ||||||||
1530 | } | ||||||||
1531 | return false; | ||||||||
1532 | }; | ||||||||
1533 | |||||||||
1534 | auto &Builder = getActionDefinitionsBuilder(Op) | ||||||||
1535 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||||
1536 | .lowerFor({{S16, V2S16}}) | ||||||||
1537 | .lowerIf([=](const LegalityQuery &Query) { | ||||||||
1538 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||||
1539 | return BigTy.getSizeInBits() == 32; | ||||||||
1540 | }) | ||||||||
1541 | // Try to widen to s16 first for small types. | ||||||||
1542 | // TODO: Only do this on targets with legal s16 shifts | ||||||||
1543 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) | ||||||||
1544 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | ||||||||
1545 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||||
1546 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | ||||||||
1547 | elementTypeIs(1, S16)), | ||||||||
1548 | changeTo(1, V2S16)) | ||||||||
1549 | // Clamp the little scalar to s8-s256 and make it a power of 2. It's not | ||||||||
1550 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | ||||||||
1551 | // valid. | ||||||||
1552 | .clampScalar(LitTyIdx, S32, S512) | ||||||||
1553 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | ||||||||
1554 | // Break up vectors with weird elements into scalars | ||||||||
1555 | .fewerElementsIf( | ||||||||
1556 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, | ||||||||
1557 | scalarize(0)) | ||||||||
1558 | .fewerElementsIf( | ||||||||
1559 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, | ||||||||
1560 | scalarize(1)) | ||||||||
1561 | .clampScalar(BigTyIdx, S32, MaxScalar); | ||||||||
1562 | |||||||||
1563 | if (Op == G_MERGE_VALUES) { | ||||||||
1564 | Builder.widenScalarIf( | ||||||||
1565 | // TODO: Use 16-bit shifts if legal for 8-bit values? | ||||||||
1566 | [=](const LegalityQuery &Query) { | ||||||||
1567 | const LLT Ty = Query.Types[LitTyIdx]; | ||||||||
1568 | return Ty.getSizeInBits() < 32; | ||||||||
1569 | }, | ||||||||
1570 | changeTo(LitTyIdx, S32)); | ||||||||
1571 | } | ||||||||
1572 | |||||||||
1573 | Builder.widenScalarIf( | ||||||||
1574 | [=](const LegalityQuery &Query) { | ||||||||
1575 | const LLT Ty = Query.Types[BigTyIdx]; | ||||||||
1576 | return !isPowerOf2_32(Ty.getSizeInBits()) && | ||||||||
1577 | Ty.getSizeInBits() % 16 != 0; | ||||||||
1578 | }, | ||||||||
1579 | [=](const LegalityQuery &Query) { | ||||||||
1580 | // Pick the next power of 2, or a multiple of 64 over 128. | ||||||||
1581 | // Whichever is smaller. | ||||||||
1582 | const LLT &Ty = Query.Types[BigTyIdx]; | ||||||||
1583 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | ||||||||
1584 | if (NewSizeInBits >= 256) { | ||||||||
1585 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | ||||||||
1586 | if (RoundedTo < NewSizeInBits) | ||||||||
1587 | NewSizeInBits = RoundedTo; | ||||||||
1588 | } | ||||||||
1589 | return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | ||||||||
1590 | }) | ||||||||
1591 | // Any vectors left are the wrong size. Scalarize them. | ||||||||
1592 | .scalarize(0) | ||||||||
1593 | .scalarize(1); | ||||||||
1594 | } | ||||||||
1595 | |||||||||
1596 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||||||
1597 | // RegBankSelect. | ||||||||
1598 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | ||||||||
1599 | .legalFor({{S32}, {S64}}); | ||||||||
1600 | |||||||||
1601 | if (ST.hasVOP3PInsts()) { | ||||||||
1602 | SextInReg.lowerFor({{V2S16}}) | ||||||||
1603 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | ||||||||
1604 | // get more vector shift opportunities, since we'll get those when | ||||||||
1605 | // expanded. | ||||||||
1606 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); | ||||||||
1607 | } else if (ST.has16BitInsts()) { | ||||||||
1608 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | ||||||||
1609 | } else { | ||||||||
1610 | // Prefer to promote to s32 before lowering if we don't have 16-bit | ||||||||
1611 | // shifts. This avoid a lot of intermediate truncate and extend operations. | ||||||||
1612 | SextInReg.lowerFor({{S32}, {S64}}); | ||||||||
1613 | } | ||||||||
1614 | |||||||||
1615 | SextInReg | ||||||||
1616 | .scalarize(0) | ||||||||
1617 | .clampScalar(0, S32, S64) | ||||||||
1618 | .lower(); | ||||||||
1619 | |||||||||
1620 | // TODO: Only Try to form v2s16 with legal packed instructions. | ||||||||
1621 | getActionDefinitionsBuilder(G_FSHR) | ||||||||
1622 | .legalFor({{S32, S32}}) | ||||||||
1623 | .lowerFor({{V2S16, V2S16}}) | ||||||||
1624 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) | ||||||||
1625 | .scalarize(0) | ||||||||
1626 | .lower(); | ||||||||
1627 | |||||||||
1628 | if (ST.hasVOP3PInsts()) { | ||||||||
1629 | getActionDefinitionsBuilder(G_FSHL) | ||||||||
1630 | .lowerFor({{V2S16, V2S16}}) | ||||||||
1631 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) | ||||||||
1632 | .scalarize(0) | ||||||||
1633 | .lower(); | ||||||||
1634 | } else { | ||||||||
1635 | getActionDefinitionsBuilder(G_FSHL) | ||||||||
1636 | .scalarize(0) | ||||||||
1637 | .lower(); | ||||||||
1638 | } | ||||||||
1639 | |||||||||
1640 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | ||||||||
1641 | .legalFor({S64}); | ||||||||
1642 | |||||||||
1643 | getActionDefinitionsBuilder(G_FENCE) | ||||||||
1644 | .alwaysLegal(); | ||||||||
1645 | |||||||||
1646 | getActionDefinitionsBuilder({G_SMULO, G_UMULO}) | ||||||||
1647 | .scalarize(0) | ||||||||
1648 | .minScalar(0, S32) | ||||||||
1649 | .lower(); | ||||||||
1650 | |||||||||
1651 | getActionDefinitionsBuilder({ | ||||||||
1652 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | ||||||||
1653 | G_FCOPYSIGN, | ||||||||
1654 | |||||||||
1655 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | ||||||||
1656 | G_ATOMICRMW_NAND, | ||||||||
1657 | G_ATOMICRMW_FSUB, | ||||||||
1658 | G_READ_REGISTER, | ||||||||
1659 | G_WRITE_REGISTER, | ||||||||
1660 | |||||||||
1661 | G_SADDO, G_SSUBO, | ||||||||
1662 | |||||||||
1663 | // TODO: Implement | ||||||||
1664 | G_FMINIMUM, G_FMAXIMUM}).lower(); | ||||||||
1665 | |||||||||
1666 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | ||||||||
1667 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | ||||||||
1668 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | ||||||||
1669 | .unsupported(); | ||||||||
1670 | |||||||||
1671 | getLegacyLegalizerInfo().computeTables(); | ||||||||
1672 | verify(*ST.getInstrInfo()); | ||||||||
1673 | } | ||||||||
1674 | |||||||||
// Entry point for all custom legalization actions registered in the
// constructor. Dispatches on the generic opcode to the matching legalize*
// helper; returns false when there is no custom lowering for the opcode,
// which the legalizer reports as a failure to legalize.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Int <-> FP conversions share one helper, distinguished by a signedness
  // flag.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  // All four min/max variants share one lowering.
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log/log10 are expanded via log2 scaled by the appropriate constant:
  // ln(2) for natural log, ln(2)/ln(10) for log10.
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
1751 | |||||||||
// Return an s32 register holding the aperture (the high 32 bits of a 64-bit
// flat pointer) for the LDS (local) or scratch (private) address space.
// On subtargets with aperture registers it is read with S_GETREG; otherwise
// it is loaded from the kernel's queue descriptor in memory. Returns an
// invalid Register on failure (queue pointer unavailable).
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  // Only the shared (LDS) and private (scratch) apertures exist.
  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, field bit offset, and field width (minus one) into
    // the S_GETREG immediate encoding.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // S_GETREG places the (WidthM1 + 1)-bit field in the low bits; shift it
    // up to reconstruct the 32-bit aperture base.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers: load the aperture base from the queue descriptor.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // Bail out if the queue pointer input argument cannot be obtained.
  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  // The load is 4 bytes, invariant, and dereferenceable: the queue struct is
  // read-only for the lifetime of the kernel.
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1810 | |||||||||
// Custom lowering for G_ADDRSPACE_CAST. Handles no-op casts (as bitcasts),
// 32-bit constant address space extension/truncation, and flat <-> segment
// casts, which require the segment aperture and explicit null-pointer
// mapping (flat null and segment null have different representations).
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts that do not change the pointer value degenerate to a bitcast;
  // just mutate the opcode in place.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Widen a 32-bit constant pointer by appending the function's known
    // high 32 address bits.
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    // Flat -> segment: keep the low 32 bits, but map a flat null pointer to
    // the segment's null value.
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  // Anything else must be segment -> flat; otherwise this cast is not ours
  // to handle.
  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  // High half of the flat pointer comes from the segment aperture.
  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  // Segment null maps to flat null; otherwise combine offset with aperture.
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1906 | |||||||||
1907 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||||||
1908 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
1909 | MachineIRBuilder &B) const { | ||||||||
1910 | Register Src = MI.getOperand(1).getReg(); | ||||||||
1911 | LLT Ty = MRI.getType(Src); | ||||||||
1912 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64)(static_cast <bool> (Ty.isScalar() && Ty.getSizeInBits () == 64) ? void (0) : __assert_fail ("Ty.isScalar() && Ty.getSizeInBits() == 64" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1912, __extension__ __PRETTY_FUNCTION__)); | ||||||||
1913 | |||||||||
1914 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||||||
1915 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||||||
1916 | |||||||||
1917 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||||||
1918 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||||||
1919 | |||||||||
1920 | // TODO: Should this propagate fast-math-flags? | ||||||||
1921 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||||||
1922 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||||||
1923 | |||||||||
1924 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||||||
1925 | auto Fabs = B.buildFAbs(Ty, Src); | ||||||||
1926 | |||||||||
1927 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||||||
1928 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||||||
1929 | MI.eraseFromParent(); | ||||||||
1930 | return true; | ||||||||
1931 | } | ||||||||
1932 | |||||||||
1933 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||||||
1934 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
1935 | MachineIRBuilder &B) const { | ||||||||
1936 | |||||||||
1937 | const LLT S1 = LLT::scalar(1); | ||||||||
1938 | const LLT S64 = LLT::scalar(64); | ||||||||
1939 | |||||||||
1940 | Register Src = MI.getOperand(1).getReg(); | ||||||||
1941 | assert(MRI.getType(Src) == S64)(static_cast <bool> (MRI.getType(Src) == S64) ? void (0 ) : __assert_fail ("MRI.getType(Src) == S64", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1941, __extension__ __PRETTY_FUNCTION__)); | ||||||||
1942 | |||||||||
1943 | // result = trunc(src) | ||||||||
1944 | // if (src > 0.0 && src != result) | ||||||||
1945 | // result += 1.0 | ||||||||
1946 | |||||||||
1947 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | ||||||||
1948 | |||||||||
1949 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||||||
1950 | const auto One = B.buildFConstant(S64, 1.0); | ||||||||
1951 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | ||||||||
1952 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | ||||||||
1953 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | ||||||||
1954 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||||||
1955 | |||||||||
1956 | // TODO: Should this propagate fast-math-flags? | ||||||||
1957 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||||||
1958 | return true; | ||||||||
1959 | } | ||||||||
1960 | |||||||||
1961 | bool AMDGPULegalizerInfo::legalizeFrem( | ||||||||
1962 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
1963 | MachineIRBuilder &B) const { | ||||||||
1964 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
1965 | Register Src0Reg = MI.getOperand(1).getReg(); | ||||||||
1966 | Register Src1Reg = MI.getOperand(2).getReg(); | ||||||||
1967 | auto Flags = MI.getFlags(); | ||||||||
1968 | LLT Ty = MRI.getType(DstReg); | ||||||||
1969 | |||||||||
1970 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); | ||||||||
1971 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); | ||||||||
1972 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); | ||||||||
1973 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); | ||||||||
1974 | MI.eraseFromParent(); | ||||||||
1975 | return true; | ||||||||
1976 | } | ||||||||
1977 | |||||||||
1978 | static MachineInstrBuilder extractF64Exponent(Register Hi, | ||||||||
1979 | MachineIRBuilder &B) { | ||||||||
1980 | const unsigned FractBits = 52; | ||||||||
1981 | const unsigned ExpBits = 11; | ||||||||
1982 | LLT S32 = LLT::scalar(32); | ||||||||
1983 | |||||||||
1984 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||||||
1985 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||||||
1986 | |||||||||
1987 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | ||||||||
1988 | .addUse(Hi) | ||||||||
1989 | .addUse(Const0.getReg(0)) | ||||||||
1990 | .addUse(Const1.getReg(0)); | ||||||||
1991 | |||||||||
1992 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||||||
1993 | } | ||||||||
1994 | |||||||||
1995 | bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( | ||||||||
1996 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
1997 | MachineIRBuilder &B) const { | ||||||||
1998 | const LLT S1 = LLT::scalar(1); | ||||||||
1999 | const LLT S32 = LLT::scalar(32); | ||||||||
2000 | const LLT S64 = LLT::scalar(64); | ||||||||
2001 | |||||||||
2002 | Register Src = MI.getOperand(1).getReg(); | ||||||||
2003 | assert(MRI.getType(Src) == S64)(static_cast <bool> (MRI.getType(Src) == S64) ? void (0 ) : __assert_fail ("MRI.getType(Src) == S64", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2003, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2004 | |||||||||
2005 | // TODO: Should this use extract since the low half is unused? | ||||||||
2006 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||||
2007 | Register Hi = Unmerge.getReg(1); | ||||||||
2008 | |||||||||
2009 | // Extract the upper half, since this is where we will find the sign and | ||||||||
2010 | // exponent. | ||||||||
2011 | auto Exp = extractF64Exponent(Hi, B); | ||||||||
2012 | |||||||||
2013 | const unsigned FractBits = 52; | ||||||||
2014 | |||||||||
2015 | // Extract the sign bit. | ||||||||
2016 | const auto SignBitMask = B.buildConstant(S32, UINT32_C(1)1U << 31); | ||||||||
2017 | auto SignBit = B.buildAnd(S32, Hi, SignBitMask); | ||||||||
2018 | |||||||||
2019 | const auto FractMask = B.buildConstant(S64, (UINT64_C(1)1UL << FractBits) - 1); | ||||||||
2020 | |||||||||
2021 | const auto Zero32 = B.buildConstant(S32, 0); | ||||||||
2022 | |||||||||
2023 | // Extend back to 64-bits. | ||||||||
2024 | auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); | ||||||||
2025 | |||||||||
2026 | auto Shr = B.buildAShr(S64, FractMask, Exp); | ||||||||
2027 | auto Not = B.buildNot(S64, Shr); | ||||||||
2028 | auto Tmp0 = B.buildAnd(S64, Src, Not); | ||||||||
2029 | auto FiftyOne = B.buildConstant(S32, FractBits - 1); | ||||||||
2030 | |||||||||
2031 | auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); | ||||||||
2032 | auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); | ||||||||
2033 | |||||||||
2034 | auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); | ||||||||
2035 | B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); | ||||||||
2036 | MI.eraseFromParent(); | ||||||||
2037 | return true; | ||||||||
2038 | } | ||||||||
2039 | |||||||||
2040 | bool AMDGPULegalizerInfo::legalizeITOFP( | ||||||||
2041 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
2042 | MachineIRBuilder &B, bool Signed) const { | ||||||||
2043 | |||||||||
2044 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2045 | Register Src = MI.getOperand(1).getReg(); | ||||||||
2046 | |||||||||
2047 | const LLT S64 = LLT::scalar(64); | ||||||||
2048 | const LLT S32 = LLT::scalar(32); | ||||||||
2049 | |||||||||
2050 | assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64)(static_cast <bool> (MRI.getType(Src) == S64 && MRI.getType(Dst) == S64) ? void (0) : __assert_fail ("MRI.getType(Src) == S64 && MRI.getType(Dst) == S64" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2050, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2051 | |||||||||
2052 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||||
2053 | |||||||||
2054 | auto CvtHi = Signed ? | ||||||||
2055 | B.buildSITOFP(S64, Unmerge.getReg(1)) : | ||||||||
2056 | B.buildUITOFP(S64, Unmerge.getReg(1)); | ||||||||
2057 | |||||||||
2058 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | ||||||||
2059 | |||||||||
2060 | auto ThirtyTwo = B.buildConstant(S32, 32); | ||||||||
2061 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | ||||||||
2062 | .addUse(CvtHi.getReg(0)) | ||||||||
2063 | .addUse(ThirtyTwo.getReg(0)); | ||||||||
2064 | |||||||||
2065 | // TODO: Should this propagate fast-math-flags? | ||||||||
2066 | B.buildFAdd(Dst, LdExp, CvtLo); | ||||||||
2067 | MI.eraseFromParent(); | ||||||||
2068 | return true; | ||||||||
2069 | } | ||||||||
2070 | |||||||||
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // The source is f32 or f64; the destination is always a 64-bit integer.
  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  // tf := trunc(val);
  // hif := floor(tf * 2^-32);
  // lof := tf - hif * 2^32; // lof is always positive due to floor.
  // hi := fptoi(hif);
  // lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    // Sign is an all-zeros or all-ones mask from the sign bit of Src.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  // K0 = 2^-32, K1 = -2^32 in the source FP type (bit patterns below).
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);           // tf * 2^-32
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);          // hif
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);  // lof = tf - hif*2^32

  // Only the f64 signed path converts the high half as signed; the f32
  // signed path already stripped the sign above and fixes it up below.
  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2139 | |||||||||
2140 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, | ||||||||
2141 | MachineInstr &MI) const { | ||||||||
2142 | MachineFunction &MF = Helper.MIRBuilder.getMF(); | ||||||||
2143 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||||
2144 | |||||||||
2145 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||||||
2146 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||||||
2147 | |||||||||
2148 | // With ieee_mode disabled, the instructions have the correct behavior | ||||||||
2149 | // already for G_FMINNUM/G_FMAXNUM | ||||||||
2150 | if (!MFI->getMode().IEEE) | ||||||||
2151 | return !IsIEEEOp; | ||||||||
2152 | |||||||||
2153 | if (IsIEEEOp) | ||||||||
2154 | return true; | ||||||||
2155 | |||||||||
2156 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||||||
2157 | } | ||||||||
2158 | |||||||||
2159 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||||||
2160 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
2161 | MachineIRBuilder &B) const { | ||||||||
2162 | // TODO: Should move some of this into LegalizerHelper. | ||||||||
2163 | |||||||||
2164 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||||
2165 | |||||||||
2166 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||||
2167 | // constant before this, so we shouldn't need | ||||||||
2168 | // getConstantVRegValWithLookThrough. | ||||||||
2169 | Optional<ValueAndVReg> MaybeIdxVal = | ||||||||
2170 | getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | ||||||||
2171 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||||||
2172 | return true; | ||||||||
2173 | const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | ||||||||
2174 | |||||||||
2175 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2176 | Register Vec = MI.getOperand(1).getReg(); | ||||||||
2177 | |||||||||
2178 | LLT VecTy = MRI.getType(Vec); | ||||||||
2179 | LLT EltTy = VecTy.getElementType(); | ||||||||
2180 | assert(EltTy == MRI.getType(Dst))(static_cast <bool> (EltTy == MRI.getType(Dst)) ? void ( 0) : __assert_fail ("EltTy == MRI.getType(Dst)", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2180, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2181 | |||||||||
2182 | if (IdxVal < VecTy.getNumElements()) | ||||||||
2183 | B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); | ||||||||
2184 | else | ||||||||
2185 | B.buildUndef(Dst); | ||||||||
2186 | |||||||||
2187 | MI.eraseFromParent(); | ||||||||
2188 | return true; | ||||||||
2189 | } | ||||||||
2190 | |||||||||
2191 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||||||
2192 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
2193 | MachineIRBuilder &B) const { | ||||||||
2194 | // TODO: Should move some of this into LegalizerHelper. | ||||||||
2195 | |||||||||
2196 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||||
2197 | |||||||||
2198 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||||
2199 | // constant before this, so we shouldn't need | ||||||||
2200 | // getConstantVRegValWithLookThrough. | ||||||||
2201 | Optional<ValueAndVReg> MaybeIdxVal = | ||||||||
2202 | getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); | ||||||||
2203 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||||||
2204 | return true; | ||||||||
2205 | |||||||||
2206 | int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | ||||||||
2207 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2208 | Register Vec = MI.getOperand(1).getReg(); | ||||||||
2209 | Register Ins = MI.getOperand(2).getReg(); | ||||||||
2210 | |||||||||
2211 | LLT VecTy = MRI.getType(Vec); | ||||||||
2212 | LLT EltTy = VecTy.getElementType(); | ||||||||
2213 | assert(EltTy == MRI.getType(Ins))(static_cast <bool> (EltTy == MRI.getType(Ins)) ? void ( 0) : __assert_fail ("EltTy == MRI.getType(Ins)", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2213, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2214 | |||||||||
2215 | if (IdxVal < VecTy.getNumElements()) | ||||||||
2216 | B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); | ||||||||
2217 | else | ||||||||
2218 | B.buildUndef(Dst); | ||||||||
2219 | |||||||||
2220 | MI.eraseFromParent(); | ||||||||
2221 | return true; | ||||||||
2222 | } | ||||||||
2223 | |||||||||
2224 | bool AMDGPULegalizerInfo::legalizeShuffleVector( | ||||||||
2225 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
2226 | MachineIRBuilder &B) const { | ||||||||
2227 | const LLT V2S16 = LLT::vector(2, 16); | ||||||||
2228 | |||||||||
2229 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2230 | Register Src0 = MI.getOperand(1).getReg(); | ||||||||
2231 | LLT DstTy = MRI.getType(Dst); | ||||||||
2232 | LLT SrcTy = MRI.getType(Src0); | ||||||||
2233 | |||||||||
2234 | if (SrcTy == V2S16 && DstTy == V2S16 && | ||||||||
2235 | AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) | ||||||||
2236 | return true; | ||||||||
2237 | |||||||||
2238 | MachineIRBuilder HelperBuilder(MI); | ||||||||
2239 | GISelObserverWrapper DummyObserver; | ||||||||
2240 | LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); | ||||||||
2241 | return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; | ||||||||
2242 | } | ||||||||
2243 | |||||||||
2244 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||||||
2245 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
2246 | MachineIRBuilder &B) const { | ||||||||
2247 | |||||||||
2248 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
2249 | Register SrcReg = MI.getOperand(1).getReg(); | ||||||||
2250 | LLT Ty = MRI.getType(DstReg); | ||||||||
2251 | unsigned Flags = MI.getFlags(); | ||||||||
2252 | |||||||||
2253 | Register TrigVal; | ||||||||
2254 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); | ||||||||
2255 | if (ST.hasTrigReducedRange()) { | ||||||||
2256 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||||||
2257 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||||||
2258 | .addUse(MulVal.getReg(0)) | ||||||||
2259 | .setMIFlags(Flags).getReg(0); | ||||||||
2260 | } else | ||||||||
2261 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||||||
2262 | |||||||||
2263 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||||||
2264 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||||||
2265 | B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) | ||||||||
2266 | .addUse(TrigVal) | ||||||||
2267 | .setMIFlags(Flags); | ||||||||
2268 | MI.eraseFromParent(); | ||||||||
2269 | return true; | ||||||||
2270 | } | ||||||||
2271 | |||||||||
2272 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, | ||||||||
2273 | MachineIRBuilder &B, | ||||||||
2274 | const GlobalValue *GV, | ||||||||
2275 | int64_t Offset, | ||||||||
2276 | unsigned GAFlags) const { | ||||||||
2277 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!")(static_cast <bool> (isInt<32>(Offset + 4) && "32-bit offset is expected!") ? void (0) : __assert_fail ("isInt<32>(Offset + 4) && \"32-bit offset is expected!\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2277, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2278 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | ||||||||
2279 | // to the following code sequence: | ||||||||
2280 | // | ||||||||
2281 | // For constant address space: | ||||||||
2282 | // s_getpc_b64 s[0:1] | ||||||||
2283 | // s_add_u32 s0, s0, $symbol | ||||||||
2284 | // s_addc_u32 s1, s1, 0 | ||||||||
2285 | // | ||||||||
2286 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||||
2287 | // a fixup or relocation is emitted to replace $symbol with a literal | ||||||||
2288 | // constant, which is a pc-relative offset from the encoding of the $symbol | ||||||||
2289 | // operand to the global variable. | ||||||||
2290 | // | ||||||||
2291 | // For global address space: | ||||||||
2292 | // s_getpc_b64 s[0:1] | ||||||||
2293 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | ||||||||
2294 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | ||||||||
2295 | // | ||||||||
2296 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||||
2297 | // fixups or relocations are emitted to replace $symbol@*@lo and | ||||||||
2298 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | ||||||||
2299 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | ||||||||
2300 | // operand to the global variable. | ||||||||
2301 | // | ||||||||
2302 | // What we want here is an offset from the value returned by s_getpc | ||||||||
2303 | // (which is the address of the s_add_u32 instruction) to the global | ||||||||
2304 | // variable, but since the encoding of $symbol starts 4 bytes after the start | ||||||||
2305 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | ||||||||
2306 | // small. This requires us to add 4 to the global variable offset in order to | ||||||||
2307 | // compute the correct address. Similarly for the s_addc_u32 instruction, the | ||||||||
2308 | // encoding of $symbol starts 12 bytes after the start of the s_add_u32 | ||||||||
2309 | // instruction. | ||||||||
2310 | |||||||||
2311 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||||
2312 | |||||||||
2313 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | ||||||||
2314 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | ||||||||
2315 | |||||||||
2316 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | ||||||||
2317 | .addDef(PCReg); | ||||||||
2318 | |||||||||
2319 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | ||||||||
2320 | if (GAFlags == SIInstrInfo::MO_NONE) | ||||||||
2321 | MIB.addImm(0); | ||||||||
2322 | else | ||||||||
2323 | MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); | ||||||||
2324 | |||||||||
2325 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||||||
2326 | |||||||||
2327 | if (PtrTy.getSizeInBits() == 32) | ||||||||
2328 | B.buildExtract(DstReg, PCReg, 0); | ||||||||
2329 | return true; | ||||||||
2330 | } | ||||||||
2331 | |||||||||
// Lower G_GLOBAL_VALUE: LDS/region globals become frame-relative constants;
// everything else becomes a pc-relative address, either direct or via a GOT
// load, depending on what the target lowering says is reachable.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS in a non-kernel function (other than the module-scope LDS
    // aggregate) cannot be allocated; diagnose instead of miscompiling.
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        // Selection will resolve the absolute LDS address via MO_ABS32_LO.
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory which size is not known at the compile time. They will be
        // allocated by the runtime and placed directly after the static
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          // The dynamic LDS block starts right after all static LDS, i.e. at
          // the total static group size.
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      // Static LDS: the address is simply the object's allocated offset.
      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    // LDS globals with initializers are not supported.
    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Directly reachable symbol: plain pc-relative fixup.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  // Reachable via a pc-relative relocation.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise load the 64-bit address from the GOT.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant adrdess.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2439 | |||||||||
2440 | static LLT widenToNextPowerOf2(LLT Ty) { | ||||||||
2441 | if (Ty.isVector()) | ||||||||
2442 | return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); | ||||||||
2443 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | ||||||||
2444 | } | ||||||||
2445 | |||||||||
// Custom-legalize loads: cast 32-bit constant-address-space pointers to the
// 64-bit constant space, and widen under-aligned / non-power-of-2 G_LOADs to
// the next power-of-2 memory size when profitable.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Rewrite the pointer operand as a full 64-bit constant pointer; the
    // observer must bracket the in-place operand change.
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  // Widening below only applies to plain loads, not extending variants.
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const unsigned MemSize = 8 * MMO->getSize();
  const Align MemAlign = MMO->getAlign();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, then truncate down to the requested width.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        // widenWithUnmerge emits the unmerge after MI's position, so step the
        // insert point past MI first, then come back to insert the load.
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2528 | |||||||||
2529 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||||||
2530 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||||
2531 | MachineIRBuilder &B) const { | ||||||||
2532 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||||
2533 | assert(Ty.isScalar())(static_cast <bool> (Ty.isScalar()) ? void (0) : __assert_fail ("Ty.isScalar()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2533, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2534 | |||||||||
2535 | MachineFunction &MF = B.getMF(); | ||||||||
2536 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||||
2537 | |||||||||
2538 | // TODO: Always legal with future ftz flag. | ||||||||
2539 | // FIXME: Do we need just output? | ||||||||
2540 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||||||
2541 | return true; | ||||||||
2542 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||||||
2543 | return true; | ||||||||
2544 | |||||||||
2545 | MachineIRBuilder HelperBuilder(MI); | ||||||||
2546 | GISelObserverWrapper DummyObserver; | ||||||||
2547 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||||
2548 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||||||
2549 | } | ||||||||
2550 | |||||||||
2551 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||||||
2552 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
2553 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
2554 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||||
2555 | Register CmpVal = MI.getOperand(2).getReg(); | ||||||||
2556 | Register NewVal = MI.getOperand(3).getReg(); | ||||||||
2557 | |||||||||
2558 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&(static_cast <bool> (AMDGPU::isFlatGlobalAddrSpace(MRI. getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && \"this should not have been custom lowered\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2559, __extension__ __PRETTY_FUNCTION__)) | ||||||||
2559 | "this should not have been custom lowered")(static_cast <bool> (AMDGPU::isFlatGlobalAddrSpace(MRI. getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && \"this should not have been custom lowered\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2559, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2560 | |||||||||
2561 | LLT ValTy = MRI.getType(CmpVal); | ||||||||
2562 | LLT VecTy = LLT::vector(2, ValTy); | ||||||||
2563 | |||||||||
2564 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||||||
2565 | |||||||||
2566 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||||||
2567 | .addDef(DstReg) | ||||||||
2568 | .addUse(PtrReg) | ||||||||
2569 | .addUse(PackedVal) | ||||||||
2570 | .setMemRefs(MI.memoperands()); | ||||||||
2571 | |||||||||
2572 | MI.eraseFromParent(); | ||||||||
2573 | return true; | ||||||||
2574 | } | ||||||||
2575 | |||||||||
2576 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||||||
2577 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||||||
2578 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2579 | Register Src = MI.getOperand(1).getReg(); | ||||||||
2580 | LLT Ty = B.getMRI()->getType(Dst); | ||||||||
2581 | unsigned Flags = MI.getFlags(); | ||||||||
2582 | |||||||||
2583 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||||||
2584 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||||||
2585 | |||||||||
2586 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||||||
2587 | MI.eraseFromParent(); | ||||||||
2588 | return true; | ||||||||
2589 | } | ||||||||
2590 | |||||||||
2591 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||||||
2592 | MachineIRBuilder &B) const { | ||||||||
2593 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2594 | Register Src = MI.getOperand(1).getReg(); | ||||||||
2595 | unsigned Flags = MI.getFlags(); | ||||||||
2596 | LLT Ty = B.getMRI()->getType(Dst); | ||||||||
2597 | |||||||||
2598 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||||||
2599 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||||||
2600 | B.buildFExp2(Dst, Mul, Flags); | ||||||||
2601 | MI.eraseFromParent(); | ||||||||
2602 | return true; | ||||||||
2603 | } | ||||||||
2604 | |||||||||
2605 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||||||
2606 | MachineIRBuilder &B) const { | ||||||||
2607 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2608 | Register Src0 = MI.getOperand(1).getReg(); | ||||||||
2609 | Register Src1 = MI.getOperand(2).getReg(); | ||||||||
2610 | unsigned Flags = MI.getFlags(); | ||||||||
2611 | LLT Ty = B.getMRI()->getType(Dst); | ||||||||
2612 | const LLT S16 = LLT::scalar(16); | ||||||||
2613 | const LLT S32 = LLT::scalar(32); | ||||||||
2614 | |||||||||
2615 | if (Ty == S32) { | ||||||||
2616 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||||||
2617 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||||
2618 | .addUse(Log.getReg(0)) | ||||||||
2619 | .addUse(Src1) | ||||||||
2620 | .setMIFlags(Flags); | ||||||||
2621 | B.buildFExp2(Dst, Mul, Flags); | ||||||||
2622 | } else if (Ty == S16) { | ||||||||
2623 | // There's no f16 fmul_legacy, so we need to convert for it. | ||||||||
2624 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||||||
2625 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||||||
2626 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||||||
2627 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||||
2628 | .addUse(Ext0.getReg(0)) | ||||||||
2629 | .addUse(Ext1.getReg(0)) | ||||||||
2630 | .setMIFlags(Flags); | ||||||||
2631 | |||||||||
2632 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||||||
2633 | } else | ||||||||
2634 | return false; | ||||||||
2635 | |||||||||
2636 | MI.eraseFromParent(); | ||||||||
2637 | return true; | ||||||||
2638 | } | ||||||||
2639 | |||||||||
2640 | // Find a source register, ignoring any possible source modifiers. | ||||||||
2641 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||||||
2642 | Register ModSrc = OrigSrc; | ||||||||
2643 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||||||
2644 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||||||
2645 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||||
2646 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||||
2647 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||||
2648 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||||
2649 | return ModSrc; | ||||||||
2650 | } | ||||||||
2651 | |||||||||
2652 | bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, | ||||||||
2653 | MachineRegisterInfo &MRI, | ||||||||
2654 | MachineIRBuilder &B) const { | ||||||||
2655 | |||||||||
2656 | const LLT S1 = LLT::scalar(1); | ||||||||
2657 | const LLT S64 = LLT::scalar(64); | ||||||||
2658 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2659 | Register OrigSrc = MI.getOperand(1).getReg(); | ||||||||
2660 | unsigned Flags = MI.getFlags(); | ||||||||
2661 | assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&(static_cast <bool> (ST.hasFractBug() && MRI.getType (Dst) == S64 && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("ST.hasFractBug() && MRI.getType(Dst) == S64 && \"this should not have been custom lowered\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2662, __extension__ __PRETTY_FUNCTION__)) | ||||||||
2662 | "this should not have been custom lowered")(static_cast <bool> (ST.hasFractBug() && MRI.getType (Dst) == S64 && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("ST.hasFractBug() && MRI.getType(Dst) == S64 && \"this should not have been custom lowered\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2662, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2663 | |||||||||
2664 | // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) | ||||||||
2665 | // is used instead. However, SI doesn't have V_FLOOR_F64, so the most | ||||||||
2666 | // efficient way to implement it is using V_FRACT_F64. The workaround for the | ||||||||
2667 | // V_FRACT bug is: | ||||||||
2668 | // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) | ||||||||
2669 | // | ||||||||
2670 | // Convert floor(x) to (x - fract(x)) | ||||||||
2671 | |||||||||
2672 | auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) | ||||||||
2673 | .addUse(OrigSrc) | ||||||||
2674 | .setMIFlags(Flags); | ||||||||
2675 | |||||||||
2676 | // Give source modifier matching some assistance before obscuring a foldable | ||||||||
2677 | // pattern. | ||||||||
2678 | |||||||||
2679 | // TODO: We can avoid the neg on the fract? The input sign to fract | ||||||||
2680 | // shouldn't matter? | ||||||||
2681 | Register ModSrc = stripAnySourceMods(OrigSrc, MRI); | ||||||||
2682 | |||||||||
2683 | auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); | ||||||||
2684 | |||||||||
2685 | Register Min = MRI.createGenericVirtualRegister(S64); | ||||||||
2686 | |||||||||
2687 | // We don't need to concern ourselves with the snan handling difference, so | ||||||||
2688 | // use the one which will directly select. | ||||||||
2689 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||||
2690 | if (MFI->getMode().IEEE) | ||||||||
2691 | B.buildFMinNumIEEE(Min, Fract, Const, Flags); | ||||||||
2692 | else | ||||||||
2693 | B.buildFMinNum(Min, Fract, Const, Flags); | ||||||||
2694 | |||||||||
2695 | Register CorrectedFract = Min; | ||||||||
2696 | if (!MI.getFlag(MachineInstr::FmNoNans)) { | ||||||||
2697 | auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); | ||||||||
2698 | CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); | ||||||||
2699 | } | ||||||||
2700 | |||||||||
2701 | auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); | ||||||||
2702 | B.buildFAdd(Dst, OrigSrc, NegFract, Flags); | ||||||||
2703 | |||||||||
2704 | MI.eraseFromParent(); | ||||||||
2705 | return true; | ||||||||
2706 | } | ||||||||
2707 | |||||||||
2708 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||||||
2709 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||||||
2710 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||||||
2711 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
2712 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2713 | const LLT S32 = LLT::scalar(32); | ||||||||
2714 | assert(MRI.getType(Dst) == LLT::vector(2, 16))(static_cast <bool> (MRI.getType(Dst) == LLT::vector(2, 16)) ? void (0) : __assert_fail ("MRI.getType(Dst) == LLT::vector(2, 16)" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2714, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2715 | |||||||||
2716 | Register Src0 = MI.getOperand(1).getReg(); | ||||||||
2717 | Register Src1 = MI.getOperand(2).getReg(); | ||||||||
2718 | assert(MRI.getType(Src0) == LLT::scalar(16))(static_cast <bool> (MRI.getType(Src0) == LLT::scalar(16 )) ? void (0) : __assert_fail ("MRI.getType(Src0) == LLT::scalar(16)" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2718, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2719 | |||||||||
2720 | auto Merge = B.buildMerge(S32, {Src0, Src1}); | ||||||||
2721 | B.buildBitcast(Dst, Merge); | ||||||||
2722 | |||||||||
2723 | MI.eraseFromParent(); | ||||||||
2724 | return true; | ||||||||
2725 | } | ||||||||
2726 | |||||||||
2727 | // Check that this is a G_XOR x, -1 | ||||||||
2728 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | ||||||||
2729 | if (MI.getOpcode() != TargetOpcode::G_XOR) | ||||||||
2730 | return false; | ||||||||
2731 | auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | ||||||||
2732 | return ConstVal && *ConstVal == -1; | ||||||||
2733 | } | ||||||||
2734 | |||||||||
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// The boolean result of a control-flow intrinsic must have exactly one
// (non-debug) use: a G_BRCOND in the same block, optionally reached through a
// single negation (G_XOR x, -1). On success:
//   - \p Br is set to the unconditional G_BR following the G_BRCOND, if one
//     exists (otherwise it is left untouched).
//   - \p UncondBrTarget is set to the block reached when the conditional
//     branch is not taken (the G_BR target, or the fallthrough successor).
//   - \p Negated is set to true if a negation was looked through; the
//     negating G_XOR is erased from the function as a side effect.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  // The single use of the condition.
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    UseMI->eraseFromParent();

    // Continue the walk from the single user of the negated condition.
    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    // Last instruction: the "false" edge is the natural fallthrough block.
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}
2777 | |||||||||
2778 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||||||
2779 | const ArgDescriptor *Arg, | ||||||||
2780 | const TargetRegisterClass *ArgRC, | ||||||||
2781 | LLT ArgTy) const { | ||||||||
2782 | MCRegister SrcReg = Arg->getRegister(); | ||||||||
2783 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected")(static_cast <bool> (Register::isPhysicalRegister(SrcReg ) && "Physical register expected") ? void (0) : __assert_fail ("Register::isPhysicalRegister(SrcReg) && \"Physical register expected\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2783, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2784 | assert(DstReg.isVirtual() && "Virtual register expected")(static_cast <bool> (DstReg.isVirtual() && "Virtual register expected" ) ? void (0) : __assert_fail ("DstReg.isVirtual() && \"Virtual register expected\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2784, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2785 | |||||||||
2786 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, | ||||||||
2787 | ArgTy); | ||||||||
2788 | if (Arg->isMasked()) { | ||||||||
2789 | // TODO: Should we try to emit this once in the entry block? | ||||||||
2790 | const LLT S32 = LLT::scalar(32); | ||||||||
2791 | const unsigned Mask = Arg->getMask(); | ||||||||
2792 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); | ||||||||
2793 | |||||||||
2794 | Register AndMaskSrc = LiveIn; | ||||||||
2795 | |||||||||
2796 | if (Shift != 0) { | ||||||||
2797 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||||||
2798 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||||||
2799 | } | ||||||||
2800 | |||||||||
2801 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||||||
2802 | } else { | ||||||||
2803 | B.buildCopy(DstReg, LiveIn); | ||||||||
2804 | } | ||||||||
2805 | |||||||||
2806 | return true; | ||||||||
2807 | } | ||||||||
2808 | |||||||||
2809 | bool AMDGPULegalizerInfo::loadInputValue( | ||||||||
2810 | Register DstReg, MachineIRBuilder &B, | ||||||||
2811 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||||
2812 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||||
2813 | const ArgDescriptor *Arg; | ||||||||
2814 | const TargetRegisterClass *ArgRC; | ||||||||
2815 | LLT ArgTy; | ||||||||
2816 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||||||
2817 | |||||||||
2818 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||||||
2819 | return false; // TODO: Handle these | ||||||||
2820 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | ||||||||
2821 | } | ||||||||
2822 | |||||||||
2823 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||||||
2824 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||||||
2825 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||||
2826 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | ||||||||
2827 | return false; | ||||||||
2828 | |||||||||
2829 | MI.eraseFromParent(); | ||||||||
2830 | return true; | ||||||||
2831 | } | ||||||||
2832 | |||||||||
2833 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||||||
2834 | MachineRegisterInfo &MRI, | ||||||||
2835 | MachineIRBuilder &B) const { | ||||||||
2836 | Register Dst = MI.getOperand(0).getReg(); | ||||||||
2837 | LLT DstTy = MRI.getType(Dst); | ||||||||
2838 | LLT S16 = LLT::scalar(16); | ||||||||
2839 | LLT S32 = LLT::scalar(32); | ||||||||
2840 | LLT S64 = LLT::scalar(64); | ||||||||
2841 | |||||||||
2842 | if (DstTy == S16) | ||||||||
2843 | return legalizeFDIV16(MI, MRI, B); | ||||||||
2844 | if (DstTy == S32) | ||||||||
2845 | return legalizeFDIV32(MI, MRI, B); | ||||||||
2846 | if (DstTy == S64) | ||||||||
2847 | return legalizeFDIV64(MI, MRI, B); | ||||||||
2848 | |||||||||
2849 | return false; | ||||||||
2850 | } | ||||||||
2851 | |||||||||
// Expand 32-bit unsigned division/remainder of X by Y without a hardware
// integer divide. Either \p DstDivReg or \p DstRemReg may be a null (invalid)
// Register, in which case that result is not emitted.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y). The hardware reciprocal of float(y) is scaled
  // by 0x4f7ffffe (a float just below 2^32) to get a fixed-point inverse z.
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR (unsigned Newton-Raphson): z += z * umulh(z, -y * z).
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate: q = umulh(x, z), r = x - q * y.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement: if r >= y, bump q and reduce r.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg) // The quotient chain is only needed when a div is requested.
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement, writing the final results.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
2894 | |||||||||
2895 | // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 | ||||||||
2896 | // | ||||||||
2897 | // Return lo, hi of result | ||||||||
2898 | // | ||||||||
2899 | // %cvt.lo = G_UITOFP Val.lo | ||||||||
2900 | // %cvt.hi = G_UITOFP Val.hi | ||||||||
2901 | // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo | ||||||||
2902 | // %rcp = G_AMDGPU_RCP_IFLAG %mad | ||||||||
2903 | // %mul1 = G_FMUL %rcp, 0x5f7ffffc | ||||||||
2904 | // %mul2 = G_FMUL %mul1, 2**(-32) | ||||||||
2905 | // %trunc = G_INTRINSIC_TRUNC %mul2 | ||||||||
2906 | // %mad2 = G_FMAD %trunc, -(2**32), %mul1 | ||||||||
2907 | // return {G_FPTOUI %mad2, G_FPTOUI %trunc} | ||||||||
2908 | static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, | ||||||||
2909 | Register Val) { | ||||||||
2910 | const LLT S32 = LLT::scalar(32); | ||||||||
2911 | auto Unmerge = B.buildUnmerge(S32, Val); | ||||||||
2912 | |||||||||
2913 | auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); | ||||||||
2914 | auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); | ||||||||
2915 | |||||||||
2916 | auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 | ||||||||
2917 | B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); | ||||||||
2918 | |||||||||
2919 | auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); | ||||||||
2920 | auto Mul1 = | ||||||||
2921 | B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); | ||||||||
2922 | |||||||||
2923 | // 2**(-32) | ||||||||
2924 | auto Mul2 = | ||||||||
2925 | B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); | ||||||||
2926 | auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); | ||||||||
2927 | |||||||||
2928 | // -(2**32) | ||||||||
2929 | auto Mad2 = B.buildFMAD(S32, Trunc, | ||||||||
2930 | B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); | ||||||||
2931 | |||||||||
2932 | auto ResultLo = B.buildFPTOUI(S32, Mad2); | ||||||||
2933 | auto ResultHi = B.buildFPTOUI(S32, Trunc); | ||||||||
2934 | |||||||||
2935 | return {ResultLo.getReg(0), ResultHi.getReg(0)}; | ||||||||
2936 | } | ||||||||
2937 | |||||||||
// Expand 64-bit unsigned division/remainder of Numer by Denom using a
// fixed-point reciprocal of the denominator (emitReciprocalU64) refined with
// two iterations, followed by two conditional correction steps on the
// quotient/remainder estimate. Either \p DstDivReg or \p DstRemReg may be a
// null (invalid) Register, in which case that result is not emitted.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement of the reciprocal: Add1 = Rcp + umulh(Rcp, -Denom * Rcp),
  // with the 64-bit add done on unmerged 32-bit halves with explicit carries.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  // No-carry variant of the high half; it seeds the second carry chain below.
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement: Add2 = Add1 + umulh(Add1, -Denom * Add1).
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate MulHi3 = umulh(Numer, Add2) and its remainder
  // Sub1 = Numer - Denom * MulHi3.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // No-borrow middle term, consumed by the Sub2 chain below.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 = -1 iff Sub1 >= Denom (compared 32 bits at a time), else 0.
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...  (first correction: Sub2 = Sub1 - Denom, Add3 = q + 1)
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 = -1 iff Sub2 >= Denom, same 32-bit-at-a-time comparison as C3.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)  (second correction: Sub3 = Sub2 - Denom, Add4 = q + 2)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
3052 | |||||||||
3053 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, | ||||||||
3054 | MachineRegisterInfo &MRI, | ||||||||
3055 | MachineIRBuilder &B) const { | ||||||||
3056 | Register DstDivReg, DstRemReg; | ||||||||
3057 | switch (MI.getOpcode()) { | ||||||||
3058 | default: | ||||||||
3059 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3059); | ||||||||
3060 | case AMDGPU::G_UDIV: { | ||||||||
3061 | DstDivReg = MI.getOperand(0).getReg(); | ||||||||
3062 | break; | ||||||||
3063 | } | ||||||||
3064 | case AMDGPU::G_UREM: { | ||||||||
3065 | DstRemReg = MI.getOperand(0).getReg(); | ||||||||
3066 | break; | ||||||||
3067 | } | ||||||||
3068 | case AMDGPU::G_UDIVREM: { | ||||||||
3069 | DstDivReg = MI.getOperand(0).getReg(); | ||||||||
3070 | DstRemReg = MI.getOperand(1).getReg(); | ||||||||
3071 | break; | ||||||||
3072 | } | ||||||||
3073 | } | ||||||||
3074 | |||||||||
3075 | const LLT S64 = LLT::scalar(64); | ||||||||
3076 | const LLT S32 = LLT::scalar(32); | ||||||||
3077 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||||||
3078 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||||||
3079 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||||||
3080 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||||
3081 | |||||||||
3082 | if (Ty == S32) | ||||||||
3083 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||||||
3084 | else if (Ty == S64) | ||||||||
3085 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||||||
3086 | else | ||||||||
3087 | return false; | ||||||||
3088 | |||||||||
3089 | MI.eraseFromParent(); | ||||||||
3090 | return true; | ||||||||
3091 | } | ||||||||
3092 | |||||||||
3093 | bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, | ||||||||
3094 | MachineRegisterInfo &MRI, | ||||||||
3095 | MachineIRBuilder &B) const { | ||||||||
3096 | const LLT S64 = LLT::scalar(64); | ||||||||
3097 | const LLT S32 = LLT::scalar(32); | ||||||||
3098 | |||||||||
3099 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||||
3100 | if (Ty != S32 && Ty != S64) | ||||||||
3101 | return false; | ||||||||
3102 | |||||||||
3103 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||||||
3104 | Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||||||
3105 | Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||||||
3106 | |||||||||
3107 | auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); | ||||||||
3108 | auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); | ||||||||
3109 | auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); | ||||||||
3110 | |||||||||
3111 | LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); | ||||||||
3112 | RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); | ||||||||
3113 | |||||||||
3114 | LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); | ||||||||
3115 | RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); | ||||||||
3116 | |||||||||
3117 | Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; | ||||||||
3118 | switch (MI.getOpcode()) { | ||||||||
3119 | default: | ||||||||
3120 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3120); | ||||||||
3121 | case AMDGPU::G_SDIV: { | ||||||||
3122 | DstDivReg = MI.getOperand(0).getReg(); | ||||||||
3123 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||||||
3124 | break; | ||||||||
3125 | } | ||||||||
3126 | case AMDGPU::G_SREM: { | ||||||||
3127 | DstRemReg = MI.getOperand(0).getReg(); | ||||||||
3128 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||||||
3129 | break; | ||||||||
3130 | } | ||||||||
3131 | case AMDGPU::G_SDIVREM: { | ||||||||
3132 | DstDivReg = MI.getOperand(0).getReg(); | ||||||||
3133 | DstRemReg = MI.getOperand(1).getReg(); | ||||||||
3134 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||||||
3135 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||||||
3136 | break; | ||||||||
3137 | } | ||||||||
3138 | } | ||||||||
3139 | |||||||||
3140 | if (Ty == S32) | ||||||||
3141 | legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||||||
3142 | else | ||||||||
3143 | legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||||||
3144 | |||||||||
3145 | if (DstDivReg) { | ||||||||
3146 | auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); | ||||||||
3147 | auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); | ||||||||
3148 | B.buildSub(DstDivReg, SignXor, Sign); | ||||||||
3149 | } | ||||||||
3150 | |||||||||
3151 | if (DstRemReg) { | ||||||||
3152 | auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS | ||||||||
3153 | auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); | ||||||||
3154 | B.buildSub(DstRemReg, SignXor, Sign); | ||||||||
3155 | } | ||||||||
3156 | |||||||||
3157 | MI.eraseFromParent(); | ||||||||
3158 | return true; | ||||||||
3159 | } | ||||||||
3160 | |||||||||
// Expand G_FDIV into a cheap reciprocal-based sequence when inaccurate
// results are acceptable (unsafe-fp-math is enabled or the instruction
// carries the 'afn' fast-math flag). Returns false when the fast expansion
// is not permitted, leaving MI in place for the precise lowering.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  // Special-case constant numerators of +/-1.0, which need no multiply.
  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(RHS)
    .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}
3209 | |||||||||
// Expand 64-bit G_FDIV using the hardware reciprocal refined with two
// Newton-Raphson iterations plus a final residual correction, when
// inaccurate results are acceptable. Returns false when the fast expansion
// is not permitted.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(Y)
    .setMIFlags(Flags);

  // Two Newton-Raphson steps: R' = R + R * (1 - Y * R), expressed with FMAs.
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // Quotient estimate, then correct it with the residual X - Y * Ret.
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
3246 | |||||||||
// Lower 16-bit G_FDIV: compute the quotient in 32-bit precision via the
// hardware reciprocal, truncate back to f16, then repair special cases
// (infinities, zero denominators, NaNs) with the div_fixup intrinsic.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based expansion when fast math permits it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  // f16 promotes to f32 losslessly, so the wide computation is exact enough.
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  // div_fixup operand order is (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3281 | |||||||||
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value. Disabling restores the function's
  // default single-precision denorm setting rather than forcing flush.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE takes SP denorm bits in [1:0] and DP/FP16 bits in [3:2].
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    // Targets without S_DENORM_MODE update a 2-bit field at offset 4 of the
    // MODE hardware register via S_SETREG.
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
3311 | |||||||||
// Lower 32-bit G_FDIV with the div_scale / Newton-Raphson / div_fmas /
// div_fixup sequence, which yields correctly rounded results. FP32 denormal
// support is temporarily enabled around the FMA chain when the function's
// FP mode has it disabled, since the iteration requires denormals.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based expansion when fast math permits it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale brings the operands into a range where the iteration is
  // accurate; the trailing immediate selects which operand the result is
  // scaled for (0 = denominator, 1 = numerator). The S1 result is the
  // condition consumed by div_fmas.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Newton-Raphson refinement of the reciprocal, then of the quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  // div_fmas folds the final correction, steered by the div_scale condition.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup(quotient estimate, denominator, numerator) repairs special
  // cases: infinities, zero denominators, NaNs.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3380 | |||||||||
3381 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | ||||||||
3382 | MachineRegisterInfo &MRI, | ||||||||
3383 | MachineIRBuilder &B) const { | ||||||||
3384 | if (legalizeFastUnsafeFDIV64(MI, MRI, B)) | ||||||||
3385 | return true; | ||||||||
3386 | |||||||||
3387 | Register Res = MI.getOperand(0).getReg(); | ||||||||
3388 | Register LHS = MI.getOperand(1).getReg(); | ||||||||
3389 | Register RHS = MI.getOperand(2).getReg(); | ||||||||
3390 | |||||||||
3391 | uint16_t Flags = MI.getFlags(); | ||||||||
3392 | |||||||||
3393 | LLT S64 = LLT::scalar(64); | ||||||||
3394 | LLT S1 = LLT::scalar(1); | ||||||||
3395 | |||||||||
3396 | auto One = B.buildFConstant(S64, 1.0); | ||||||||
3397 | |||||||||
3398 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||||
3399 | .addUse(LHS) | ||||||||
3400 | .addUse(RHS) | ||||||||
3401 | .addImm(0) | ||||||||
3402 | .setMIFlags(Flags); | ||||||||
3403 | |||||||||
3404 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | ||||||||
3405 | |||||||||
3406 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | ||||||||
3407 | .addUse(DivScale0.getReg(0)) | ||||||||
3408 | .setMIFlags(Flags); | ||||||||
3409 | |||||||||
3410 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | ||||||||
3411 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | ||||||||
3412 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | ||||||||
3413 | |||||||||
3414 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||||
3415 | .addUse(LHS) | ||||||||
3416 | .addUse(RHS) | ||||||||
3417 | .addImm(1) | ||||||||
3418 | .setMIFlags(Flags); | ||||||||
3419 | |||||||||
3420 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | ||||||||
3421 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | ||||||||
3422 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | ||||||||
3423 | |||||||||
3424 | Register Scale; | ||||||||
3425 | if (!ST.hasUsableDivScaleConditionOutput()) { | ||||||||
3426 | // Workaround a hardware bug on SI where the condition output from div_scale | ||||||||
3427 | // is not usable. | ||||||||
3428 | |||||||||
3429 | LLT S32 = LLT::scalar(32); | ||||||||
3430 | |||||||||
3431 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | ||||||||
3432 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | ||||||||
3433 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | ||||||||
3434 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | ||||||||
3435 | |||||||||
3436 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | ||||||||
3437 | Scale1Unmerge.getReg(1)); | ||||||||
3438 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | ||||||||
3439 | Scale0Unmerge.getReg(1)); | ||||||||
3440 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | ||||||||
3441 | } else { | ||||||||
3442 | Scale = DivScale1.getReg(1); | ||||||||
3443 | } | ||||||||
3444 | |||||||||
3445 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | ||||||||
3446 | .addUse(Fma4.getReg(0)) | ||||||||
3447 | .addUse(Fma3.getReg(0)) | ||||||||
3448 | .addUse(Mul.getReg(0)) | ||||||||
3449 | .addUse(Scale) | ||||||||
3450 | .setMIFlags(Flags); | ||||||||
3451 | |||||||||
3452 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) | ||||||||
3453 | .addUse(Fmas.getReg(0)) | ||||||||
3454 | .addUse(RHS) | ||||||||
3455 | .addUse(LHS) | ||||||||
3456 | .setMIFlags(Flags); | ||||||||
3457 | |||||||||
3458 | MI.eraseFromParent(); | ||||||||
3459 | return true; | ||||||||
3460 | } | ||||||||
3461 | |||||||||
3462 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||||||
3463 | MachineRegisterInfo &MRI, | ||||||||
3464 | MachineIRBuilder &B) const { | ||||||||
3465 | Register Res = MI.getOperand(0).getReg(); | ||||||||
3466 | Register LHS = MI.getOperand(2).getReg(); | ||||||||
3467 | Register RHS = MI.getOperand(3).getReg(); | ||||||||
3468 | uint16_t Flags = MI.getFlags(); | ||||||||
3469 | |||||||||
3470 | LLT S32 = LLT::scalar(32); | ||||||||
3471 | LLT S1 = LLT::scalar(1); | ||||||||
3472 | |||||||||
3473 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||||||
3474 | const APFloat C0Val(1.0f); | ||||||||
3475 | |||||||||
3476 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||||||
3477 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||||||
3478 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); | ||||||||
3479 | |||||||||
3480 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||||||
3481 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||||||
3482 | |||||||||
3483 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||||||
3484 | |||||||||
3485 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||||
3486 | .addUse(Mul0.getReg(0)) | ||||||||
3487 | .setMIFlags(Flags); | ||||||||
3488 | |||||||||
3489 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||||||
3490 | |||||||||
3491 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||||||
3492 | |||||||||
3493 | MI.eraseFromParent(); | ||||||||
3494 | return true; | ||||||||
3495 | } | ||||||||
3496 | |||||||||
3497 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | ||||||||
3498 | // FIXME: Why do we handle this one but not other removed instructions? | ||||||||
3499 | // | ||||||||
3500 | // Reciprocal square root. The clamp prevents infinite results, clamping | ||||||||
3501 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | ||||||||
3502 | // +-max_float. | ||||||||
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Pre-VI targets still have the clamped instruction; leave the intrinsic
  // alone (it is directly selectable there).
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  // Only the f32 and f64 forms exist.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
    .addUse(Src)
    .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp the result into [-max_float, +max_float].
  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
3545 | |||||||||
3546 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||||||
3547 | switch (IID) { | ||||||||
3548 | case Intrinsic::amdgcn_ds_fadd: | ||||||||
3549 | return AMDGPU::G_ATOMICRMW_FADD; | ||||||||
3550 | case Intrinsic::amdgcn_ds_fmin: | ||||||||
3551 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||||||
3552 | case Intrinsic::amdgcn_ds_fmax: | ||||||||
3553 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||||||
3554 | default: | ||||||||
3555 | llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3555); | ||||||||
3556 | } | ||||||||
3557 | } | ||||||||
3558 | |||||||||
// Rewrite a DS FP atomic intrinsic in place into its generic atomic opcode,
// dropping the intrinsic-ID operand and the trailing immediate operands.
// The observer must be notified since operands are added/removed.
bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  // Remove operands 6, 5, 4 — iterate backwards so indices stay valid.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}
3576 | |||||||||
// Materialize the implicit-argument pointer into DstReg: the implicit
// arguments live at a fixed offset past the explicit kernel arguments, so
// this computes kernarg_segment_ptr + offset. Returns false if the kernarg
// segment pointer input is unavailable.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  // Offset type matches the pointer width for the ptr_add.
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}
3595 | |||||||||
3596 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||||||
3597 | MachineRegisterInfo &MRI, | ||||||||
3598 | MachineIRBuilder &B) const { | ||||||||
3599 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||||
3600 | if (!MFI->isEntryFunction()) { | ||||||||
3601 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||||
3602 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||||||
3603 | } | ||||||||
3604 | |||||||||
3605 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
3606 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||||||
3607 | return false; | ||||||||
3608 | |||||||||
3609 | MI.eraseFromParent(); | ||||||||
3610 | return true; | ||||||||
3611 | } | ||||||||
3612 | |||||||||
3613 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||||||
3614 | MachineRegisterInfo &MRI, | ||||||||
3615 | MachineIRBuilder &B, | ||||||||
3616 | unsigned AddrSpace) const { | ||||||||
3617 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||||||
3618 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||||||
3619 | Register Hi32 = Unmerge.getReg(1); | ||||||||
3620 | |||||||||
3621 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||||||
3622 | MI.eraseFromParent(); | ||||||||
3623 | return true; | ||||||||
3624 | } | ||||||||
3625 | |||||||||
3626 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||||||
3627 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||||||
3628 | // split between the instruction's voffset and immoffset fields) and soffset | ||||||||
3629 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||||||
3630 | // the instruction's soffset field). This function takes the first kind of | ||||||||
3631 | // offset and figures out how to split it between voffset and immoffset. | ||||||||
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value that fits in the instruction's 12-bit immediate field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // Peel any known-constant component off the offset.
  std::tie(BaseReg, TotalConstOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold the part that doesn't fit back into the variable (voffset) side.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The instruction always needs a voffset register, even if it is zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}
3678 | |||||||||
3679 | /// Handle register layout difference for f16 images for some subtargets. | ||||||||
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    // Unpacked layout: each f16 element occupies the low half of its own
    // 32-bit register, so widen element-wise with anyext.
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // Hardware-bug workaround: pad the packed data out to the unpacked
    // register count with undef, since the affected targets read the extra
    // registers.
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // Packed layout needs no adjustment.
  return Reg;
}
3735 | |||||||||
3736 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||||||
3737 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||||||
3738 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||||
3739 | LLT Ty = MRI->getType(VData); | ||||||||
3740 | |||||||||
3741 | const LLT S16 = LLT::scalar(16); | ||||||||
3742 | |||||||||
3743 | // Fixup illegal register types for i8 stores. | ||||||||
3744 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||||||
3745 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||||||
3746 | return AnyExt; | ||||||||
3747 | } | ||||||||
3748 | |||||||||
3749 | if (Ty.isVector()) { | ||||||||
3750 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||||||
3751 | if (IsFormat) | ||||||||
3752 | return handleD16VData(B, *MRI, VData); | ||||||||
3753 | } | ||||||||
3754 | } | ||||||||
3755 | |||||||||
3756 | return VData; | ||||||||
3757 | } | ||||||||
3758 | |||||||||
// Legalize the raw/struct buffer-store intrinsic family (plain, format, and
// typed variants) into the corresponding target-specific generic store
// opcode with the canonical operand layout.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  // Widen/repack source types the selector cannot handle directly.
  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the offset into a voffset register plus the 12-bit immediate
  // field, and reflect any known constant offset in the memory operand.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Plain buffer stores pick the opcode from the access size.
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  // Raw variants still carry a vindex operand; use zero.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3848 | |||||||||
/// Lower a raw/struct buffer load intrinsic to the corresponding
/// G_AMDGPU_BUFFER_LOAD* / G_AMDGPU_TBUFFER_LOAD* pseudo instruction.
///
/// \p IsFormat selects the format variants; \p IsTyped selects the tbuffer
/// (typed) variants, which carry an extra format immediate operand.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    // Every subsequent source operand shifts up by one.
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    // Typed variants carry the buffer format as an immediate.
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  // D16 only applies to format variants producing 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the voffset into a part that fits the instruction's immediate
  // offset field and a remaining variable voffset; widen the MMO if a
  // known constant offset was peeled off.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // Non-format loads narrower than a dword use zero-extending pseudos.
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar loads are widened to s32. D16 vector results on
  // unpacked-D16 subtargets load into a 32-bit-element vector that is
  // repacked below.
  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  // Raw variants have no vindex operand; materialize a zero.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // The load wrote a temporary register; emit the fixups after it.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // Truncate the widened result of the extending load back to the
    // original narrow type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
3968 | |||||||||
3969 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||||||
3970 | MachineIRBuilder &B, | ||||||||
3971 | bool IsInc) const { | ||||||||
3972 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : | ||||||||
3973 | AMDGPU::G_AMDGPU_ATOMIC_DEC; | ||||||||
3974 | B.buildInstr(Opc) | ||||||||
3975 | .addDef(MI.getOperand(0).getReg()) | ||||||||
3976 | .addUse(MI.getOperand(2).getReg()) | ||||||||
3977 | .addUse(MI.getOperand(3).getReg()) | ||||||||
3978 | .cloneMemRefs(MI); | ||||||||
3979 | MI.eraseFromParent(); | ||||||||
3980 | return true; | ||||||||
3981 | } | ||||||||
3982 | |||||||||
/// Map a raw/struct buffer atomic intrinsic ID to the generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode used during legalization.
/// Aborts on any intrinsic that is not a buffer atomic.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
4038 | |||||||||
/// Lower a raw/struct buffer atomic intrinsic to the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo instruction.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  // cmpswap carries an extra compare-value source operand.
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    // No def: every source operand index below shifts down by one.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  // Expected operand count when a vindex is present: base 7, +1 for the
  // cmpswap compare value, +1 for a returned def.
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the voffset into a constant part that fits the instruction's
  // immediate offset field and a remaining variable voffset; widen the MMO
  // if a known constant offset was peeled off.
  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  // Raw variants have no vindex operand; materialize a zero.
  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
4112 | |||||||||
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Walks the vaddr operands [VAddrStart, VAddrEnd) of \p MI and appends the
/// packed dword-sized values to \p PackedAddrs. Gradient operands are packed
/// only when \p IsG16, coordinate operands only when \p IsA16; operands
/// before GradientStart are bitcast to v2s16 (assumes they are already
/// 32 bits wide — TODO confirm against the intrinsic definitions).
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < Intr->GradientStart) {
      // Pre-gradient operands: reinterpret as a v2s16 dword.
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
               (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          // Odd-sized gradient halves: the last element of each half cannot
          // pair with its neighbor from the other half.
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        // Pad the lone s16 value with undef to fill the dword.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pair this value with its successor into one dword; skip the
        // successor on the next iteration.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
4162 | |||||||||
4163 | /// Convert from separate vaddr components to a single vector address register, | ||||||||
4164 | /// and replace the remaining operands with $noreg. | ||||||||
4165 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||||||
4166 | int DimIdx, int NumVAddrs) { | ||||||||
4167 | const LLT S32 = LLT::scalar(32); | ||||||||
4168 | |||||||||
4169 | SmallVector<Register, 8> AddrRegs; | ||||||||
4170 | for (int I = 0; I != NumVAddrs; ++I) { | ||||||||
4171 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||||
4172 | if (SrcOp.isReg()) { | ||||||||
4173 | AddrRegs.push_back(SrcOp.getReg()); | ||||||||
4174 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32)(static_cast <bool> (B.getMRI()->getType(SrcOp.getReg ()) == S32) ? void (0) : __assert_fail ("B.getMRI()->getType(SrcOp.getReg()) == S32" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4174, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4175 | } | ||||||||
4176 | } | ||||||||
4177 | |||||||||
4178 | int NumAddrRegs = AddrRegs.size(); | ||||||||
4179 | if (NumAddrRegs != 1) { | ||||||||
4180 | // Round up to 8 elements for v5-v7 | ||||||||
4181 | // FIXME: Missing intermediate sized register classes and instructions. | ||||||||
4182 | if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { | ||||||||
4183 | const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); | ||||||||
4184 | auto Undef = B.buildUndef(S32); | ||||||||
4185 | AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); | ||||||||
4186 | NumAddrRegs = RoundedNumRegs; | ||||||||
4187 | } | ||||||||
4188 | |||||||||
4189 | auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); | ||||||||
4190 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||||||
4191 | } | ||||||||
4192 | |||||||||
4193 | for (int I = 1; I != NumVAddrs; ++I) { | ||||||||
4194 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||||
4195 | if (SrcOp.isReg()) | ||||||||
4196 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||||||
4197 | } | ||||||||
4198 | } | ||||||||
4199 | |||||||||
4200 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | ||||||||
4201 | /// | ||||||||
4202 | /// Depending on the subtarget, load/store with 16-bit element data need to be | ||||||||
4203 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | ||||||||
4204 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | ||||||||
4205 | /// registers. | ||||||||
4206 | /// | ||||||||
4207 | /// We don't want to directly select image instructions just yet, but also want | ||||||||
4208 | /// to exposes all register repacking to the legalizer/combiners. We also don't | ||||||||
4209 | /// want a selected instrution entering RegBankSelect. In order to avoid | ||||||||
4210 | /// defining a multitude of intermediate image instructions, directly hack on | ||||||||
4211 | /// the intrinsic's arguments. In cases like a16 addreses, this requires padding | ||||||||
4212 | /// now unnecessary arguments with $noreg. | ||||||||
4213 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||||||
4214 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | ||||||||
4215 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | ||||||||
4216 | |||||||||
4217 | const unsigned NumDefs = MI.getNumExplicitDefs(); | ||||||||
4218 | const unsigned ArgOffset = NumDefs + 1; | ||||||||
4219 | bool IsTFE = NumDefs == 2; | ||||||||
| |||||||||
4220 | // We are only processing the operands of d16 image operations on subtargets | ||||||||
4221 | // that use the unpacked register layout, or need to repack the TFE result. | ||||||||
4222 | |||||||||
4223 | // TODO: Do we need to guard against already legalized intrinsics? | ||||||||
4224 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||||||
4225 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | ||||||||
4226 | |||||||||
4227 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||||
4228 | const LLT S32 = LLT::scalar(32); | ||||||||
4229 | const LLT S16 = LLT::scalar(16); | ||||||||
4230 | const LLT V2S16 = LLT::vector(2, 16); | ||||||||
4231 | |||||||||
4232 | unsigned DMask = 0; | ||||||||
4233 | |||||||||
4234 | // Check for 16 bit addresses and pack if true. | ||||||||
4235 | LLT GradTy = | ||||||||
4236 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | ||||||||
4237 | LLT AddrTy = | ||||||||
4238 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | ||||||||
4239 | const bool IsG16 = GradTy == S16; | ||||||||
4240 | const bool IsA16 = AddrTy == S16; | ||||||||
4241 | |||||||||
4242 | int DMaskLanes = 0; | ||||||||
4243 | if (!BaseOpcode->Atomic) { | ||||||||
4244 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | ||||||||
4245 | if (BaseOpcode->Gather4) { | ||||||||
4246 | DMaskLanes = 4; | ||||||||
4247 | } else if (DMask != 0) { | ||||||||
4248 | DMaskLanes = countPopulation(DMask); | ||||||||
4249 | } else if (!IsTFE && !BaseOpcode->Store) { | ||||||||
4250 | // If dmask is 0, this is a no-op load. This can be eliminated. | ||||||||
4251 | B.buildUndef(MI.getOperand(0)); | ||||||||
4252 | MI.eraseFromParent(); | ||||||||
4253 | return true; | ||||||||
4254 | } | ||||||||
4255 | } | ||||||||
4256 | |||||||||
4257 | Observer.changingInstr(MI); | ||||||||
4258 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | ||||||||
4259 | |||||||||
4260 | unsigned NewOpcode = NumDefs == 0 ? | ||||||||
4261 | AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | ||||||||
4262 | |||||||||
4263 | // Track that we legalized this | ||||||||
4264 | MI.setDesc(B.getTII().get(NewOpcode)); | ||||||||
4265 | |||||||||
4266 | // Expecting to get an error flag since TFC is on - and dmask is 0 Force | ||||||||
4267 | // dmask to be at least 1 otherwise the instruction will fail | ||||||||
4268 | if (IsTFE
| ||||||||
4269 | DMask = 0x1; | ||||||||
4270 | DMaskLanes = 1; | ||||||||
4271 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | ||||||||
4272 | } | ||||||||
4273 | |||||||||
4274 | if (BaseOpcode->Atomic
| ||||||||
4275 | Register VData0 = MI.getOperand(2).getReg(); | ||||||||
4276 | LLT Ty = MRI->getType(VData0); | ||||||||
4277 | |||||||||
4278 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | ||||||||
4279 | if (Ty.isVector()) | ||||||||
4280 | return false; | ||||||||
4281 | |||||||||
4282 | if (BaseOpcode->AtomicX2) { | ||||||||
4283 | Register VData1 = MI.getOperand(3).getReg(); | ||||||||
4284 | // The two values are packed in one register. | ||||||||
4285 | LLT PackedTy = LLT::vector(2, Ty); | ||||||||
4286 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | ||||||||
4287 | MI.getOperand(2).setReg(Concat.getReg(0)); | ||||||||
4288 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | ||||||||
4289 | } | ||||||||
4290 | } | ||||||||
4291 | |||||||||
4292 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | ||||||||
4293 | |||||||||
4294 | // Optimize _L to _LZ when _L is zero | ||||||||
4295 | if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = | ||||||||
4296 | AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { | ||||||||
4297 | const ConstantFP *ConstantLod; | ||||||||
4298 | |||||||||
4299 | if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, | ||||||||
4300 | m_GFCst(ConstantLod))) { | ||||||||
4301 | if (ConstantLod->isZero() || ConstantLod->isNegative()) { | ||||||||
4302 | // Set new opcode to _lz variant of _l, and change the intrinsic ID. | ||||||||
4303 | const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = | ||||||||
4304 | AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, | ||||||||
4305 | Intr->Dim); | ||||||||
4306 | |||||||||
4307 | // The starting indexes should remain in the same place. | ||||||||
4308 | --CorrectedNumVAddrs; | ||||||||
4309 | |||||||||
4310 | MI.getOperand(MI.getNumExplicitDefs()) | ||||||||
4311 | .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); | ||||||||
4312 | MI.RemoveOperand(ArgOffset + Intr->LodIndex); | ||||||||
4313 | Intr = NewImageDimIntr; | ||||||||
4314 | } | ||||||||
4315 | } | ||||||||
4316 | } | ||||||||
4317 | |||||||||
4318 | // Optimize _mip away, when 'lod' is zero | ||||||||
4319 | if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { | ||||||||
4320 | int64_t ConstantLod; | ||||||||
4321 | if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, | ||||||||
4322 | m_ICst(ConstantLod))) { | ||||||||
4323 | if (ConstantLod == 0) { | ||||||||
4324 | // TODO: Change intrinsic opcode and remove operand instead or replacing | ||||||||
4325 | // it with 0, as the _L to _LZ handling is done above. | ||||||||
4326 | MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); | ||||||||
4327 | --CorrectedNumVAddrs; | ||||||||
4328 | } | ||||||||
4329 | } | ||||||||
4330 | } | ||||||||
4331 | |||||||||
4332 | // Rewrite the addressing register layout before doing anything else. | ||||||||
4333 | if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { | ||||||||
4334 | // 16 bit gradients are supported, but are tied to the A16 control | ||||||||
4335 | // so both gradients and addresses must be 16 bit | ||||||||
4336 | return false; | ||||||||
4337 | } | ||||||||
4338 | |||||||||
4339 | if (IsA16
| ||||||||
4340 | // A16 not supported | ||||||||
4341 | return false; | ||||||||
4342 | } | ||||||||
4343 | |||||||||
4344 | if (IsA16
| ||||||||
4345 | if (Intr->NumVAddrs > 1) { | ||||||||
4346 | SmallVector<Register, 4> PackedRegs; | ||||||||
4347 | |||||||||
4348 | packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, | ||||||||
4349 | IsG16); | ||||||||
4350 | |||||||||
4351 | // See also below in the non-a16 branch | ||||||||
4352 | const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); | ||||||||
4353 | |||||||||
4354 | if (!UseNSA && PackedRegs.size() > 1) { | ||||||||
4355 | LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); | ||||||||
4356 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | ||||||||
4357 | PackedRegs[0] = Concat.getReg(0); | ||||||||
4358 | PackedRegs.resize(1); | ||||||||
4359 | } | ||||||||
4360 | |||||||||
4361 | const unsigned NumPacked = PackedRegs.size(); | ||||||||
4362 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | ||||||||
4363 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||||
4364 | if (!SrcOp.isReg()) { | ||||||||
4365 | assert(SrcOp.isImm() && SrcOp.getImm() == 0)(static_cast <bool> (SrcOp.isImm() && SrcOp.getImm () == 0) ? void (0) : __assert_fail ("SrcOp.isImm() && SrcOp.getImm() == 0" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4365, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4366 | continue; | ||||||||
4367 | } | ||||||||
4368 | |||||||||
4369 | assert(SrcOp.getReg() != AMDGPU::NoRegister)(static_cast <bool> (SrcOp.getReg() != AMDGPU::NoRegister ) ? void (0) : __assert_fail ("SrcOp.getReg() != AMDGPU::NoRegister" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4369, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4370 | |||||||||
4371 | if (I - Intr->VAddrStart < NumPacked) | ||||||||
4372 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | ||||||||
4373 | else | ||||||||
4374 | SrcOp.setReg(AMDGPU::NoRegister); | ||||||||
4375 | } | ||||||||
4376 | } | ||||||||
4377 | } else { | ||||||||
4378 | // If the register allocator cannot place the address registers contiguously | ||||||||
4379 | // without introducing moves, then using the non-sequential address encoding | ||||||||
4380 | // is always preferable, since it saves VALU instructions and is usually a | ||||||||
4381 | // wash in terms of code size or even better. | ||||||||
4382 | // | ||||||||
4383 | // However, we currently have no way of hinting to the register allocator | ||||||||
4384 | // that MIMG addresses should be placed contiguously when it is possible to | ||||||||
4385 | // do so, so force non-NSA for the common 2-address case as a heuristic. | ||||||||
4386 | // | ||||||||
4387 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | ||||||||
4388 | // allocation when possible. | ||||||||
4389 | const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); | ||||||||
4390 | |||||||||
4391 | if (!UseNSA
| ||||||||
4392 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | ||||||||
4393 | Intr->NumVAddrs); | ||||||||
4394 | } | ||||||||
4395 | |||||||||
4396 | int Flags = 0; | ||||||||
4397 | if (IsA16
| ||||||||
4398 | Flags |= 1; | ||||||||
4399 | if (IsG16
| ||||||||
4400 | Flags |= 2; | ||||||||
4401 | MI.addOperand(MachineOperand::CreateImm(Flags)); | ||||||||
4402 | |||||||||
4403 | if (BaseOpcode->Store) { // No TFE for stores? | ||||||||
4404 | // TODO: Handle dmask trim | ||||||||
4405 | Register VData = MI.getOperand(1).getReg(); | ||||||||
4406 | LLT Ty = MRI->getType(VData); | ||||||||
4407 | if (!Ty.isVector() || Ty.getElementType() != S16) | ||||||||
4408 | return true; | ||||||||
4409 | |||||||||
4410 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | ||||||||
4411 | if (RepackedReg != VData) { | ||||||||
4412 | MI.getOperand(1).setReg(RepackedReg); | ||||||||
4413 | } | ||||||||
4414 | |||||||||
4415 | return true; | ||||||||
4416 | } | ||||||||
4417 | |||||||||
4418 | Register DstReg = MI.getOperand(0).getReg(); | ||||||||
4419 | LLT Ty = MRI->getType(DstReg); | ||||||||
4420 | const LLT EltTy = Ty.getScalarType(); | ||||||||
4421 | const bool IsD16 = Ty.getScalarType() == S16; | ||||||||
4422 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||||||
4423 | |||||||||
4424 | // Confirm that the return type is large enough for the dmask specified | ||||||||
4425 | if (NumElts
| ||||||||
4426 | return false; | ||||||||
4427 | |||||||||
4428 | if (NumElts > 4 || DMaskLanes
| ||||||||
4429 | return false; | ||||||||
4430 | |||||||||
4431 | const unsigned AdjustedNumElts = DMaskLanes
| ||||||||
4432 | const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); | ||||||||
4433 | |||||||||
4434 | // The raw dword aligned data component of the load. The only legal cases | ||||||||
4435 | // where this matters should be when using the packed D16 format, for | ||||||||
4436 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, | ||||||||
4437 | LLT RoundedTy; | ||||||||
4438 | |||||||||
4439 | // S32 vector to to cover all data, plus TFE result element. | ||||||||
4440 | LLT TFETy; | ||||||||
4441 | |||||||||
4442 | // Register type to use for each loaded component. Will be S32 or V2S16. | ||||||||
4443 | LLT RegTy; | ||||||||
4444 | |||||||||
4445 | if (IsD16
| ||||||||
4446 | RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); | ||||||||
4447 | TFETy = LLT::vector(AdjustedNumElts + 1, 32); | ||||||||
4448 | RegTy = S32; | ||||||||
4449 | } else { | ||||||||
4450 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||||
4451 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | ||||||||
4452 | unsigned RoundedSize = 32 * RoundedElts; | ||||||||
4453 | RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); | ||||||||
| |||||||||
4454 | TFETy = LLT::vector(RoundedSize / 32 + 1, S32); | ||||||||
4455 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | ||||||||
4456 | } | ||||||||
4457 | |||||||||
4458 | // The return type does not need adjustment. | ||||||||
4459 | // TODO: Should we change s16 case to s32 or <2 x s16>? | ||||||||
4460 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | ||||||||
4461 | return true; | ||||||||
4462 | |||||||||
4463 | Register Dst1Reg; | ||||||||
4464 | |||||||||
4465 | // Insert after the instruction. | ||||||||
4466 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||||||
4467 | |||||||||
4468 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | ||||||||
4469 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | ||||||||
4470 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | ||||||||
4471 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | ||||||||
4472 | |||||||||
4473 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | ||||||||
4474 | |||||||||
4475 | MI.getOperand(0).setReg(NewResultReg); | ||||||||
4476 | |||||||||
4477 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||||||
4478 | // type. The intruction really returns these two values in one contiguous | ||||||||
4479 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||||||
4480 | // return type to use a single register result. | ||||||||
4481 | |||||||||
4482 | if (IsTFE) { | ||||||||
4483 | Dst1Reg = MI.getOperand(1).getReg(); | ||||||||
4484 | if (MRI->getType(Dst1Reg) != S32) | ||||||||
4485 | return false; | ||||||||
4486 | |||||||||
4487 | // TODO: Make sure the TFE operand bit is set. | ||||||||
4488 | MI.RemoveOperand(1); | ||||||||
4489 | |||||||||
4490 | // Handle the easy case that requires no repack instructions. | ||||||||
4491 | if (Ty == S32) { | ||||||||
4492 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | ||||||||
4493 | return true; | ||||||||
4494 | } | ||||||||
4495 | } | ||||||||
4496 | |||||||||
4497 | // Now figure out how to copy the new result register back into the old | ||||||||
4498 | // result. | ||||||||
4499 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | ||||||||
4500 | |||||||||
4501 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | ||||||||
4502 | |||||||||
4503 | if (ResultNumRegs == 1) { | ||||||||
4504 | assert(!IsTFE)(static_cast <bool> (!IsTFE) ? void (0) : __assert_fail ("!IsTFE", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4504, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4505 | ResultRegs[0] = NewResultReg; | ||||||||
4506 | } else { | ||||||||
4507 | // We have to repack into a new vector of some kind. | ||||||||
4508 | for (int I = 0; I != NumDataRegs; ++I) | ||||||||
4509 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | ||||||||
4510 | B.buildUnmerge(ResultRegs, NewResultReg); | ||||||||
4511 | |||||||||
4512 | // Drop the final TFE element to get the data part. The TFE result is | ||||||||
4513 | // directly written to the right place already. | ||||||||
4514 | if (IsTFE) | ||||||||
4515 | ResultRegs.resize(NumDataRegs); | ||||||||
4516 | } | ||||||||
4517 | |||||||||
4518 | // For an s16 scalar result, we form an s32 result with a truncate regardless | ||||||||
4519 | // of packed vs. unpacked. | ||||||||
4520 | if (IsD16 && !Ty.isVector()) { | ||||||||
4521 | B.buildTrunc(DstReg, ResultRegs[0]); | ||||||||
4522 | return true; | ||||||||
4523 | } | ||||||||
4524 | |||||||||
4525 | // Avoid a build/concat_vector of 1 entry. | ||||||||
4526 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | ||||||||
4527 | B.buildBitcast(DstReg, ResultRegs[0]); | ||||||||
4528 | return true; | ||||||||
4529 | } | ||||||||
4530 | |||||||||
4531 | assert(Ty.isVector())(static_cast <bool> (Ty.isVector()) ? void (0) : __assert_fail ("Ty.isVector()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4531, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4532 | |||||||||
4533 | if (IsD16) { | ||||||||
4534 | // For packed D16 results with TFE enabled, all the data components are | ||||||||
4535 | // S32. Cast back to the expected type. | ||||||||
4536 | // | ||||||||
4537 | // TODO: We don't really need to use load s32 elements. We would only need one | ||||||||
4538 | // cast for the TFE result if a multiple of v2s16 was used. | ||||||||
4539 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | ||||||||
4540 | for (Register &Reg : ResultRegs) | ||||||||
4541 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | ||||||||
4542 | } else if (ST.hasUnpackedD16VMem()) { | ||||||||
4543 | for (Register &Reg : ResultRegs) | ||||||||
4544 | Reg = B.buildTrunc(S16, Reg).getReg(0); | ||||||||
4545 | } | ||||||||
4546 | } | ||||||||
4547 | |||||||||
4548 | auto padWithUndef = [&](LLT Ty, int NumElts) { | ||||||||
4549 | if (NumElts == 0) | ||||||||
4550 | return; | ||||||||
4551 | Register Undef = B.buildUndef(Ty).getReg(0); | ||||||||
4552 | for (int I = 0; I != NumElts; ++I) | ||||||||
4553 | ResultRegs.push_back(Undef); | ||||||||
4554 | }; | ||||||||
4555 | |||||||||
4556 | // Pad out any elements eliminated due to the dmask. | ||||||||
4557 | LLT ResTy = MRI->getType(ResultRegs[0]); | ||||||||
4558 | if (!ResTy.isVector()) { | ||||||||
4559 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | ||||||||
4560 | B.buildBuildVector(DstReg, ResultRegs); | ||||||||
4561 | return true; | ||||||||
4562 | } | ||||||||
4563 | |||||||||
4564 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16)(static_cast <bool> (!ST.hasUnpackedD16VMem() && ResTy == V2S16) ? void (0) : __assert_fail ("!ST.hasUnpackedD16VMem() && ResTy == V2S16" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4564, __extension__ __PRETTY_FUNCTION__)); | ||||||||
4565 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | ||||||||
4566 | |||||||||
4567 | // Deal with the one annoying legal case. | ||||||||
4568 | const LLT V3S16 = LLT::vector(3, 16); | ||||||||
4569 | if (Ty == V3S16) { | ||||||||
4570 | padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); | ||||||||
4571 | auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); | ||||||||
4572 | B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); | ||||||||
4573 | return true; | ||||||||
4574 | } | ||||||||
4575 | |||||||||
4576 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | ||||||||
4577 | B.buildConcatVectors(DstReg, ResultRegs); | ||||||||
4578 | return true; | ||||||||
4579 | } | ||||||||
4580 | |||||||||
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  LegalizerHelper &Helper, MachineInstr &MI) const {
  // Lower amdgcn.s.buffer.load to G_AMDGPU_S_BUFFER_LOAD: attach a memory
  // operand, bitcast awkward result types, and widen non-power-of-2 results.
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  // Notify the observer before any mutation; changedInstr() is paired below.
  Observer.changingInstr(MI);

  if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    // bitcastDst inserted a cast and gave MI a fresh result register; re-read
    // it and re-anchor the insert point at MI for the code below.
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  // Round the bit size up to whole bytes for the memory operand.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
4630 | |||||||||
4631 | // TODO: Move to selection | ||||||||
4632 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||||||
4633 | MachineRegisterInfo &MRI, | ||||||||
4634 | MachineIRBuilder &B) const { | ||||||||
4635 | if (!ST.isTrapHandlerEnabled() || | ||||||||
4636 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | ||||||||
4637 | return legalizeTrapEndpgm(MI, MRI, B); | ||||||||
4638 | |||||||||
4639 | if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { | ||||||||
4640 | switch (*HsaAbiVer) { | ||||||||
4641 | case ELF::ELFABIVERSION_AMDGPU_HSA_V2: | ||||||||
4642 | case ELF::ELFABIVERSION_AMDGPU_HSA_V3: | ||||||||
4643 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||||
4644 | case ELF::ELFABIVERSION_AMDGPU_HSA_V4: | ||||||||
4645 | return ST.supportsGetDoorbellID() ? | ||||||||
4646 | legalizeTrapHsa(MI, MRI, B) : | ||||||||
4647 | legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||||
4648 | } | ||||||||
4649 | } | ||||||||
4650 | |||||||||
4651 | llvm_unreachable("Unknown trap handler")::llvm::llvm_unreachable_internal("Unknown trap handler", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4651); | ||||||||
4652 | } | ||||||||
4653 | |||||||||
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // No trap handler is available: lower the trap to S_ENDPGM, which simply
  // terminates the wave.
  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  MI.eraseFromParent();
  return true;
}
4660 | |||||||||
4661 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | ||||||||
4662 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
4663 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||||||
4664 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||||||
4665 | Register LiveIn = | ||||||||
4666 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||||
4667 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||||||
4668 | return false; | ||||||||
4669 | |||||||||
4670 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||||||
4671 | B.buildCopy(SGPR01, LiveIn); | ||||||||
4672 | B.buildInstr(AMDGPU::S_TRAP) | ||||||||
4673 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||||||
4674 | .addReg(SGPR01, RegState::Implicit); | ||||||||
4675 | |||||||||
4676 | MI.eraseFromParent(); | ||||||||
4677 | return true; | ||||||||
4678 | } | ||||||||
4679 | |||||||||
bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // HSA ABI V4 with doorbell support: the trap handler needs no queue pointer,
  // so a bare S_TRAP with the AMDHSA trap ID suffices.
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}
4687 | |||||||||
4688 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||||||
4689 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||||
4690 | // Is non-HSA path or trap-handler disabled? then, report a warning | ||||||||
4691 | // accordingly | ||||||||
4692 | if (!ST.isTrapHandlerEnabled() || | ||||||||
4693 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | ||||||||
4694 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||||||
4695 | "debugtrap handler not supported", | ||||||||
4696 | MI.getDebugLoc(), DS_Warning); | ||||||||
4697 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||||||
4698 | Ctx.diagnose(NoTrap); | ||||||||
4699 | } else { | ||||||||
4700 | // Insert debug-trap instruction | ||||||||
4701 | B.buildInstr(AMDGPU::S_TRAP) | ||||||||
4702 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | ||||||||
4703 | } | ||||||||
4704 | |||||||||
4705 | MI.eraseFromParent(); | ||||||||
4706 | return true; | ||||||||
4707 | } | ||||||||
4708 | |||||||||
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  // Lower amdgcn.image.bvh.intersect.ray to G_AMDGPU_INTRIN_BVH_INTERSECT_RAY,
  // flattening the node pointer, extent, origin, direction and inverse
  // direction into a flat list of 32-bit operands in the order the NSA
  // encoding expects.
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  // The instruction only exists with the GFX10_A encoding.
  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  // Pick the opcode variant from the two dimensions: 16- vs 32-bit ray
  // coordinates (a16) and 32- vs 64-bit node pointers.
  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;

  SmallVector<Register, 12> Ops;
  if (Is64) {
    // A 64-bit node pointer is passed as two 32-bit halves.
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  // Append the x/y/z lanes of a <4 x s32> source, dropping the unused 4th
  // element.
  auto packLanes = [&Ops, &S32, &B] (Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
  if (IsA16) {
    // For a16, dir and inv_dir are halves packed pairwise into three s32
    // registers: (dir.x,dir.y), (dir.z,inv.x), (inv.y,inv.z).
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  // Texture descriptor, a16 flag, and the original memory operands come last.
  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
4788 | |||||||||
/// Top-level custom legalization dispatch for AMDGPU intrinsics. Returns true
/// when the intrinsic was handled (or needs no change), false to report a
/// legalization failure.
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    // verifyCFIntrinsic locates the G_BRCOND user (and optional trailing
    // G_BR) that consume this control-flow intrinsic's result.
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      // If the condition was fed through a G_XOR with true, the branch
      // targets are logically swapped.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it directly.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    // Image dimension intrinsics share one table-driven legalizer; anything
    // else needs no custom handling.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
1 | //== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==// | ||||
2 | // | ||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
6 | // | ||||
7 | //===----------------------------------------------------------------------===// | ||||
8 | /// \file | ||||
9 | /// Implement a low-level type suitable for MachineInstr level instruction | ||||
10 | /// selection. | ||||
11 | /// | ||||
12 | /// For a type attached to a MachineInstr, we only care about 2 details: total | ||||
13 | /// size and the number of vector lanes (if any). Accordingly, there are 4 | ||||
14 | /// possible valid type-kinds: | ||||
15 | /// | ||||
16 | /// * `sN` for scalars and aggregates | ||||
17 | /// * `<N x sM>` for vectors, which must have at least 2 elements. | ||||
18 | /// * `pN` for pointers | ||||
/// * `<N x pM>` for vectors of pointers
19 | /// | ||||
20 | /// Other information required for correct selection is expected to be carried | ||||
21 | /// by the opcode, or non-type flags. For example the distinction between G_ADD | ||||
22 | /// and G_FADD for int/float or fast-math flags. | ||||
23 | /// | ||||
24 | //===----------------------------------------------------------------------===// | ||||
25 | |||||
26 | #ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H | ||||
27 | #define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H | ||||
28 | |||||
29 | #include "llvm/ADT/DenseMapInfo.h" | ||||
30 | #include "llvm/Support/Debug.h" | ||||
31 | #include "llvm/Support/MachineValueType.h" | ||||
32 | #include <cassert> | ||||
33 | |||||
34 | namespace llvm { | ||||
35 | |||||
36 | class DataLayout; | ||||
37 | class Type; | ||||
38 | class raw_ostream; | ||||
39 | |||||
40 | class LLT { | ||||
41 | public: | ||||
  /// Get a low-level scalar or aggregate "bag of bits".
  /// \p SizeInBits must be non-zero.
  static LLT scalar(unsigned SizeInBits) {
    assert(SizeInBits > 0 && "invalid scalar size");
    return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0,
               SizeInBits, /*AddressSpace=*/0};
  }
48 | |||||
  /// Get a low-level pointer in the given address space.
  /// \p SizeInBits must be non-zero.
  static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
    assert(SizeInBits > 0 && "invalid pointer size");
    return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0,
               SizeInBits, AddressSpace};
  }
55 | |||||
  /// Get a low-level vector of some number of elements and element width.
  /// \p NumElements must be at least 2; single-element "vectors" are
  /// represented as plain scalars.
  static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) {
    assert(NumElements > 1 && "invalid number of vector elements");
    assert(ScalarSizeInBits > 0 && "invalid vector element size");
    return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements,
               ScalarSizeInBits, /*AddressSpace=*/0};
  }
64 | |||||
65 | /// Get a low-level vector of some number of elements and element type. | ||||
66 | static LLT vector(uint16_t NumElements, LLT ScalarTy) { | ||||
67 | assert(NumElements > 1 && "invalid number of vector elements")(static_cast <bool> (NumElements > 1 && "invalid number of vector elements" ) ? void (0) : __assert_fail ("NumElements > 1 && \"invalid number of vector elements\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 67, __extension__ __PRETTY_FUNCTION__)); | ||||
68 | assert(!ScalarTy.isVector() && "invalid vector element type")(static_cast <bool> (!ScalarTy.isVector() && "invalid vector element type" ) ? void (0) : __assert_fail ("!ScalarTy.isVector() && \"invalid vector element type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 68, __extension__ __PRETTY_FUNCTION__)); | ||||
69 | return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements, | ||||
70 | ScalarTy.getSizeInBits(), | ||||
71 | ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0}; | ||||
72 | } | ||||
73 | |||||
74 | static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) { | ||||
75 | return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy); | ||||
76 | } | ||||
77 | |||||
78 | static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) { | ||||
79 | return scalarOrVector(NumElements, LLT::scalar(ScalarSize)); | ||||
80 | } | ||||
81 | |||||
82 | explicit LLT(bool isPointer, bool isVector, uint16_t NumElements, | ||||
83 | unsigned SizeInBits, unsigned AddressSpace) { | ||||
84 | init(isPointer, isVector, NumElements, SizeInBits, AddressSpace); | ||||
85 | } | ||||
86 | explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {} | ||||
87 | |||||
88 | explicit LLT(MVT VT); | ||||
89 | |||||
90 | bool isValid() const { return RawData != 0; } | ||||
91 | |||||
92 | bool isScalar() const { return isValid() && !IsPointer && !IsVector; } | ||||
93 | |||||
94 | bool isPointer() const { return isValid() && IsPointer && !IsVector; } | ||||
95 | |||||
96 | bool isVector() const { return isValid() && IsVector; } | ||||
97 | |||||
98 | /// Returns the number of elements in a vector LLT. Must only be called on | ||||
99 | /// vector types. | ||||
100 | uint16_t getNumElements() const { | ||||
101 | assert(IsVector && "cannot get number of elements on scalar/aggregate")(static_cast <bool> (IsVector && "cannot get number of elements on scalar/aggregate" ) ? void (0) : __assert_fail ("IsVector && \"cannot get number of elements on scalar/aggregate\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 101, __extension__ __PRETTY_FUNCTION__)); | ||||
102 | if (!IsPointer) | ||||
103 | return getFieldValue(VectorElementsFieldInfo); | ||||
104 | else | ||||
105 | return getFieldValue(PointerVectorElementsFieldInfo); | ||||
106 | } | ||||
107 | |||||
108 | /// Returns the total size of the type. Must only be called on sized types. | ||||
109 | unsigned getSizeInBits() const { | ||||
110 | if (isPointer() || isScalar()) | ||||
111 | return getScalarSizeInBits(); | ||||
112 | return getScalarSizeInBits() * getNumElements(); | ||||
113 | } | ||||
114 | |||||
115 | /// Returns the total size of the type in bytes, i.e. number of whole bytes | ||||
116 | /// needed to represent the size in bits. Must only be called on sized types. | ||||
117 | unsigned getSizeInBytes() const { | ||||
118 | return (getSizeInBits() + 7) / 8; | ||||
119 | } | ||||
120 | |||||
121 | LLT getScalarType() const { | ||||
122 | return isVector() ? getElementType() : *this; | ||||
123 | } | ||||
124 | |||||
125 | /// If this type is a vector, return a vector with the same number of elements | ||||
126 | /// but the new element type. Otherwise, return the new element type. | ||||
127 | LLT changeElementType(LLT NewEltTy) const { | ||||
128 | return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy; | ||||
129 | } | ||||
130 | |||||
131 | /// If this type is a vector, return a vector with the same number of elements | ||||
132 | /// but the new element size. Otherwise, return the new element type. Invalid | ||||
133 | /// for pointer types. For pointer types, use changeElementType. | ||||
134 | LLT changeElementSize(unsigned NewEltSize) const { | ||||
135 | assert(!getScalarType().isPointer() &&(static_cast <bool> (!getScalarType().isPointer() && "invalid to directly change element size for pointers") ? void (0) : __assert_fail ("!getScalarType().isPointer() && \"invalid to directly change element size for pointers\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 136, __extension__ __PRETTY_FUNCTION__)) | ||||
136 | "invalid to directly change element size for pointers")(static_cast <bool> (!getScalarType().isPointer() && "invalid to directly change element size for pointers") ? void (0) : __assert_fail ("!getScalarType().isPointer() && \"invalid to directly change element size for pointers\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 136, __extension__ __PRETTY_FUNCTION__)); | ||||
137 | return isVector() ? LLT::vector(getNumElements(), NewEltSize) | ||||
138 | : LLT::scalar(NewEltSize); | ||||
139 | } | ||||
140 | |||||
141 | /// Return a vector or scalar with the same element type and the new number of | ||||
142 | /// elements. | ||||
143 | LLT changeNumElements(unsigned NewNumElts) const { | ||||
144 | return LLT::scalarOrVector(NewNumElts, getScalarType()); | ||||
145 | } | ||||
146 | |||||
147 | /// Return a type that is \p Factor times smaller. Reduces the number of | ||||
148 | /// elements if this is a vector, or the bitwidth for scalar/pointers. Does | ||||
149 | /// not attempt to handle cases that aren't evenly divisible. | ||||
150 | LLT divide(int Factor) const { | ||||
151 | assert(Factor != 1)(static_cast <bool> (Factor != 1) ? void (0) : __assert_fail ("Factor != 1", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 151, __extension__ __PRETTY_FUNCTION__)); | ||||
152 | if (isVector()) { | ||||
153 | assert(getNumElements() % Factor == 0)(static_cast <bool> (getNumElements() % Factor == 0) ? void (0) : __assert_fail ("getNumElements() % Factor == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 153, __extension__ __PRETTY_FUNCTION__)); | ||||
154 | return scalarOrVector(getNumElements() / Factor, getElementType()); | ||||
155 | } | ||||
156 | |||||
157 | assert(getSizeInBits() % Factor == 0)(static_cast <bool> (getSizeInBits() % Factor == 0) ? void (0) : __assert_fail ("getSizeInBits() % Factor == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 157, __extension__ __PRETTY_FUNCTION__)); | ||||
158 | return scalar(getSizeInBits() / Factor); | ||||
159 | } | ||||
160 | |||||
161 | bool isByteSized() const { return (getSizeInBits() & 7) == 0; } | ||||
162 | |||||
163 | unsigned getScalarSizeInBits() const { | ||||
164 | assert(RawData != 0 && "Invalid Type")(static_cast <bool> (RawData != 0 && "Invalid Type" ) ? void (0) : __assert_fail ("RawData != 0 && \"Invalid Type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 164, __extension__ __PRETTY_FUNCTION__)); | ||||
165 | if (!IsVector) { | ||||
166 | if (!IsPointer) | ||||
167 | return getFieldValue(ScalarSizeFieldInfo); | ||||
168 | else | ||||
169 | return getFieldValue(PointerSizeFieldInfo); | ||||
170 | } else { | ||||
171 | if (!IsPointer) | ||||
172 | return getFieldValue(VectorSizeFieldInfo); | ||||
173 | else | ||||
174 | return getFieldValue(PointerVectorSizeFieldInfo); | ||||
175 | } | ||||
176 | } | ||||
177 | |||||
178 | unsigned getAddressSpace() const { | ||||
179 | assert(RawData != 0 && "Invalid Type")(static_cast <bool> (RawData != 0 && "Invalid Type" ) ? void (0) : __assert_fail ("RawData != 0 && \"Invalid Type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 179, __extension__ __PRETTY_FUNCTION__)); | ||||
180 | assert(IsPointer && "cannot get address space of non-pointer type")(static_cast <bool> (IsPointer && "cannot get address space of non-pointer type" ) ? void (0) : __assert_fail ("IsPointer && \"cannot get address space of non-pointer type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 180, __extension__ __PRETTY_FUNCTION__)); | ||||
181 | if (!IsVector) | ||||
182 | return getFieldValue(PointerAddressSpaceFieldInfo); | ||||
183 | else | ||||
184 | return getFieldValue(PointerVectorAddressSpaceFieldInfo); | ||||
185 | } | ||||
186 | |||||
187 | /// Returns the vector's element type. Only valid for vector types. | ||||
188 | LLT getElementType() const { | ||||
189 | assert(isVector() && "cannot get element type of scalar/aggregate")(static_cast <bool> (isVector() && "cannot get element type of scalar/aggregate" ) ? void (0) : __assert_fail ("isVector() && \"cannot get element type of scalar/aggregate\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 189, __extension__ __PRETTY_FUNCTION__)); | ||||
190 | if (IsPointer) | ||||
191 | return pointer(getAddressSpace(), getScalarSizeInBits()); | ||||
192 | else | ||||
193 | return scalar(getScalarSizeInBits()); | ||||
194 | } | ||||
195 | |||||
196 | void print(raw_ostream &OS) const; | ||||
197 | |||||
198 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | ||||
199 | LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) void dump() const { | ||||
200 | print(dbgs()); | ||||
201 | dbgs() << '\n'; | ||||
202 | } | ||||
203 | #endif | ||||
204 | |||||
205 | bool operator==(const LLT &RHS) const { | ||||
206 | return IsPointer
| ||||
207 | RHS.RawData == RawData; | ||||
208 | } | ||||
209 | |||||
210 | bool operator!=(const LLT &RHS) const { return !(*this == RHS); } | ||||
211 | |||||
212 | friend struct DenseMapInfo<LLT>; | ||||
213 | friend class GISelInstProfileBuilder; | ||||
214 | |||||
215 | private: | ||||
216 | /// LLT is packed into 64 bits as follows: | ||||
217 | /// isPointer : 1 | ||||
218 | /// isVector : 1 | ||||
219 | /// with 62 bits remaining for Kind-specific data, packed in bitfields | ||||
220 | /// as described below. As there isn't a simple portable way to pack bits | ||||
221 | /// into bitfields, here the different fields in the packed structure is | ||||
222 | /// described in static const *Field variables. Each of these variables | ||||
223 | /// is a 2-element array, with the first element describing the bitfield size | ||||
224 | /// and the second element describing the bitfield offset. | ||||
225 | typedef int BitFieldInfo[2]; | ||||
226 | /// | ||||
227 | /// This is how the bitfields are packed per Kind: | ||||
228 | /// * Invalid: | ||||
229 | /// gets encoded as RawData == 0, as that is an invalid encoding, since for | ||||
230 | /// valid encodings, SizeInBits/SizeOfElement must be larger than 0. | ||||
231 | /// * Non-pointer scalar (isPointer == 0 && isVector == 0): | ||||
232 | /// SizeInBits: 32; | ||||
233 | static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0}; | ||||
234 | /// * Pointer (isPointer == 1 && isVector == 0): | ||||
235 | /// SizeInBits: 16; | ||||
236 | /// AddressSpace: 24; | ||||
237 | static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0}; | ||||
238 | static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{ | ||||
239 | 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]}; | ||||
240 | /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1): | ||||
241 | /// NumElements: 16; | ||||
242 | /// SizeOfElement: 32; | ||||
243 | static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0}; | ||||
244 | static const constexpr BitFieldInfo VectorSizeFieldInfo{ | ||||
245 | 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]}; | ||||
246 | /// * Vector-of-pointer (isPointer == 1 && isVector == 1): | ||||
247 | /// NumElements: 16; | ||||
248 | /// SizeOfElement: 16; | ||||
249 | /// AddressSpace: 24; | ||||
250 | static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0}; | ||||
251 | static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{ | ||||
252 | 16, | ||||
253 | PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]}; | ||||
254 | static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{ | ||||
255 | 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]}; | ||||
256 | |||||
257 | uint64_t IsPointer : 1; | ||||
258 | uint64_t IsVector : 1; | ||||
259 | uint64_t RawData : 62; | ||||
260 | |||||
261 | static uint64_t getMask(const BitFieldInfo FieldInfo) { | ||||
262 | const int FieldSizeInBits = FieldInfo[0]; | ||||
263 | return (((uint64_t)1) << FieldSizeInBits) - 1; | ||||
264 | } | ||||
265 | static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) { | ||||
266 | assert(Val <= Mask && "Value too large for field")(static_cast <bool> (Val <= Mask && "Value too large for field" ) ? void (0) : __assert_fail ("Val <= Mask && \"Value too large for field\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 266, __extension__ __PRETTY_FUNCTION__)); | ||||
267 | return (Val & Mask) << Shift; | ||||
268 | } | ||||
269 | static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) { | ||||
270 | return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]); | ||||
271 | } | ||||
272 | uint64_t getFieldValue(const BitFieldInfo FieldInfo) const { | ||||
273 | return getMask(FieldInfo) & (RawData >> FieldInfo[1]); | ||||
274 | } | ||||
275 | |||||
276 | void init(bool IsPointer, bool IsVector, uint16_t NumElements, | ||||
277 | unsigned SizeInBits, unsigned AddressSpace) { | ||||
278 | this->IsPointer = IsPointer; | ||||
279 | this->IsVector = IsVector; | ||||
280 | if (!IsVector) { | ||||
281 | if (!IsPointer) | ||||
282 | RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo); | ||||
283 | else | ||||
284 | RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) | | ||||
285 | maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo); | ||||
286 | } else { | ||||
287 | assert(NumElements > 1 && "invalid number of vector elements")(static_cast <bool> (NumElements > 1 && "invalid number of vector elements" ) ? void (0) : __assert_fail ("NumElements > 1 && \"invalid number of vector elements\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 287, __extension__ __PRETTY_FUNCTION__)); | ||||
288 | if (!IsPointer) | ||||
289 | RawData = maskAndShift(NumElements, VectorElementsFieldInfo) | | ||||
290 | maskAndShift(SizeInBits, VectorSizeFieldInfo); | ||||
291 | else | ||||
292 | RawData = | ||||
293 | maskAndShift(NumElements, PointerVectorElementsFieldInfo) | | ||||
294 | maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) | | ||||
295 | maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo); | ||||
296 | } | ||||
297 | } | ||||
298 | |||||
299 | uint64_t getUniqueRAWLLTData() const { | ||||
300 | return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 | | ||||
301 | ((uint64_t)IsVector); | ||||
302 | } | ||||
303 | }; | ||||
304 | |||||
305 | inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) { | ||||
306 | Ty.print(OS); | ||||
307 | return OS; | ||||
308 | } | ||||
309 | |||||
310 | template<> struct DenseMapInfo<LLT> { | ||||
311 | static inline LLT getEmptyKey() { | ||||
312 | LLT Invalid; | ||||
313 | Invalid.IsPointer = true; | ||||
314 | return Invalid; | ||||
315 | } | ||||
316 | static inline LLT getTombstoneKey() { | ||||
317 | LLT Invalid; | ||||
318 | Invalid.IsVector = true; | ||||
319 | return Invalid; | ||||
320 | } | ||||
321 | static inline unsigned getHashValue(const LLT &Ty) { | ||||
322 | uint64_t Val = Ty.getUniqueRAWLLTData(); | ||||
323 | return DenseMapInfo<uint64_t>::getHashValue(Val); | ||||
324 | } | ||||
325 | static bool isEqual(const LLT &LHS, const LLT &RHS) { | ||||
326 | return LHS == RHS; | ||||
327 | } | ||||
328 | }; | ||||
329 | |||||
330 | } | ||||
331 | |||||
332 | #endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |