File: | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |
Warning: | line 2874, column 62 The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int' |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | /// \file | ||||||
9 | /// This file implements the targeting of the Machinelegalizer class for | ||||||
10 | /// AMDGPU. | ||||||
11 | /// \todo This should be generated by TableGen. | ||||||
12 | //===----------------------------------------------------------------------===// | ||||||
13 | |||||||
14 | #include "AMDGPULegalizerInfo.h" | ||||||
15 | |||||||
16 | #include "AMDGPU.h" | ||||||
17 | #include "AMDGPUGlobalISelUtils.h" | ||||||
18 | #include "AMDGPUInstrInfo.h" | ||||||
19 | #include "AMDGPUTargetMachine.h" | ||||||
20 | #include "SIMachineFunctionInfo.h" | ||||||
21 | #include "Utils/AMDGPUBaseInfo.h" | ||||||
22 | #include "llvm/ADT/ScopeExit.h" | ||||||
23 | #include "llvm/BinaryFormat/ELF.h" | ||||||
24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | ||||||
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||||||
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||||||
27 | #include "llvm/IR/DiagnosticInfo.h" | ||||||
28 | #include "llvm/IR/IntrinsicsAMDGPU.h" | ||||||
29 | |||||||
30 | #define DEBUG_TYPE "amdgpu-legalinfo"
31 | |||||||
32 | using namespace llvm; | ||||||
33 | using namespace LegalizeActions; | ||||||
34 | using namespace LegalizeMutations; | ||||||
35 | using namespace LegalityPredicates; | ||||||
36 | using namespace MIPatternMatch; | ||||||
37 | |||||||
38 | // Hack until load/store selection patterns support any tuple of legal types. | ||||||
39 | static cl::opt<bool> EnableNewLegality( | ||||||
40 | "amdgpu-global-isel-new-legality", | ||||||
41 | cl::desc("Use GlobalISel desired legality, rather than try to use" | ||||||
42 | "rules compatible with selection patterns"), | ||||||
43 | cl::init(false), | ||||||
44 | cl::ReallyHidden); | ||||||
45 | |||||||
// Widest register size, in bits, that any legal type may occupy.
static constexpr unsigned MaxRegisterSize = 1024;
47 | |||||||
48 | // Round the number of elements to the next power of two elements | ||||||
49 | static LLT getPow2VectorType(LLT Ty) { | ||||||
50 | unsigned NElts = Ty.getNumElements(); | ||||||
51 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); | ||||||
52 | return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); | ||||||
53 | } | ||||||
54 | |||||||
55 | // Round the number of bits to the next power of two bits | ||||||
56 | static LLT getPow2ScalarType(LLT Ty) { | ||||||
57 | unsigned Bits = Ty.getSizeInBits(); | ||||||
58 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); | ||||||
59 | return LLT::scalar(Pow2Bits); | ||||||
60 | } | ||||||
61 | |||||||
62 | /// \returs true if this is an odd sized vector which should widen by adding an | ||||||
63 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This | ||||||
64 | /// excludes s1 vectors, which should always be scalarized. | ||||||
65 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | ||||||
66 | return [=](const LegalityQuery &Query) { | ||||||
67 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
68 | if (!Ty.isVector()) | ||||||
69 | return false; | ||||||
70 | |||||||
71 | const LLT EltTy = Ty.getElementType(); | ||||||
72 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||||
73 | return Ty.getNumElements() % 2 != 0 && | ||||||
74 | EltSize > 1 && EltSize < 32 && | ||||||
75 | Ty.getSizeInBits() % 32 != 0; | ||||||
76 | }; | ||||||
77 | } | ||||||
78 | |||||||
79 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { | ||||||
80 | return [=](const LegalityQuery &Query) { | ||||||
81 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
82 | return Ty.getSizeInBits() % 32 == 0; | ||||||
83 | }; | ||||||
84 | } | ||||||
85 | |||||||
86 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | ||||||
87 | return [=](const LegalityQuery &Query) { | ||||||
88 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
89 | const LLT EltTy = Ty.getScalarType(); | ||||||
90 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | ||||||
91 | }; | ||||||
92 | } | ||||||
93 | |||||||
94 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | ||||||
95 | return [=](const LegalityQuery &Query) { | ||||||
96 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
97 | const LLT EltTy = Ty.getElementType(); | ||||||
98 | return std::make_pair(TypeIdx, | ||||||
99 | LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); | ||||||
100 | }; | ||||||
101 | } | ||||||
102 | |||||||
// Mutation: break the vector at TypeIdx into pieces of at most 64 bits each,
// returning the reduced element count for the mutated type.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    // Number of 64-bit pieces needed to cover the whole vector (round up).
    unsigned Pieces = (Size + 63) / 64;
    // NOTE(review): the element count is rounded up before dividing by the
    // piece count; presumably to keep odd-sized vectors fully covered --
    // confirm against callers.
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}
115 | |||||||
116 | // Increase the number of vector elements to reach the next multiple of 32-bit | ||||||
117 | // type. | ||||||
118 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | ||||||
119 | return [=](const LegalityQuery &Query) { | ||||||
120 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
121 | |||||||
122 | const LLT EltTy = Ty.getElementType(); | ||||||
123 | const int Size = Ty.getSizeInBits(); | ||||||
124 | const int EltSize = EltTy.getSizeInBits(); | ||||||
125 | const int NextMul32 = (Size + 31) / 32; | ||||||
126 | |||||||
127 | assert(EltSize < 32)(static_cast <bool> (EltSize < 32) ? void (0) : __assert_fail ("EltSize < 32", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 127, __extension__ __PRETTY_FUNCTION__)); | ||||||
128 | |||||||
129 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | ||||||
130 | return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); | ||||||
131 | }; | ||||||
132 | } | ||||||
133 | |||||||
134 | static LLT getBitcastRegisterType(const LLT Ty) { | ||||||
135 | const unsigned Size = Ty.getSizeInBits(); | ||||||
136 | |||||||
137 | LLT CoercedTy; | ||||||
138 | if (Size <= 32) { | ||||||
139 | // <2 x s8> -> s16 | ||||||
140 | // <4 x s8> -> s32 | ||||||
141 | return LLT::scalar(Size); | ||||||
142 | } | ||||||
143 | |||||||
144 | return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); | ||||||
145 | } | ||||||
146 | |||||||
147 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { | ||||||
148 | return [=](const LegalityQuery &Query) { | ||||||
149 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
150 | return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); | ||||||
151 | }; | ||||||
152 | } | ||||||
153 | |||||||
154 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { | ||||||
155 | return [=](const LegalityQuery &Query) { | ||||||
156 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
157 | unsigned Size = Ty.getSizeInBits(); | ||||||
158 | assert(Size % 32 == 0)(static_cast <bool> (Size % 32 == 0) ? void (0) : __assert_fail ("Size % 32 == 0", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 158, __extension__ __PRETTY_FUNCTION__)); | ||||||
159 | return std::make_pair( | ||||||
160 | TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); | ||||||
161 | }; | ||||||
162 | } | ||||||
163 | |||||||
164 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | ||||||
165 | return [=](const LegalityQuery &Query) { | ||||||
166 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
167 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | ||||||
168 | }; | ||||||
169 | } | ||||||
170 | |||||||
171 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | ||||||
172 | return [=](const LegalityQuery &Query) { | ||||||
173 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
174 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | ||||||
175 | }; | ||||||
176 | } | ||||||
177 | |||||||
178 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | ||||||
179 | return [=](const LegalityQuery &Query) { | ||||||
180 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
181 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | ||||||
182 | }; | ||||||
183 | } | ||||||
184 | |||||||
185 | static bool isRegisterSize(unsigned Size) { | ||||||
186 | return Size % 32 == 0 && Size <= MaxRegisterSize; | ||||||
187 | } | ||||||
188 | |||||||
189 | static bool isRegisterVectorElementType(LLT EltTy) { | ||||||
190 | const int EltSize = EltTy.getSizeInBits(); | ||||||
191 | return EltSize == 16 || EltSize % 32 == 0; | ||||||
192 | } | ||||||
193 | |||||||
194 | static bool isRegisterVectorType(LLT Ty) { | ||||||
195 | const int EltSize = Ty.getElementType().getSizeInBits(); | ||||||
196 | return EltSize == 32 || EltSize == 64 || | ||||||
197 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | ||||||
198 | EltSize == 128 || EltSize == 256; | ||||||
199 | } | ||||||
200 | |||||||
201 | static bool isRegisterType(LLT Ty) { | ||||||
202 | if (!isRegisterSize(Ty.getSizeInBits())) | ||||||
203 | return false; | ||||||
204 | |||||||
205 | if (Ty.isVector()) | ||||||
206 | return isRegisterVectorType(Ty); | ||||||
207 | |||||||
208 | return true; | ||||||
209 | } | ||||||
210 | |||||||
211 | // Any combination of 32 or 64-bit elements up the maximum register size, and | ||||||
212 | // multiples of v2s16. | ||||||
213 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { | ||||||
214 | return [=](const LegalityQuery &Query) { | ||||||
215 | return isRegisterType(Query.Types[TypeIdx]); | ||||||
216 | }; | ||||||
217 | } | ||||||
218 | |||||||
219 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { | ||||||
220 | return [=](const LegalityQuery &Query) { | ||||||
221 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
222 | if (!QueryTy.isVector()) | ||||||
223 | return false; | ||||||
224 | const LLT EltTy = QueryTy.getElementType(); | ||||||
225 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; | ||||||
226 | }; | ||||||
227 | } | ||||||
228 | |||||||
229 | // If we have a truncating store or an extending load with a data size larger | ||||||
230 | // than 32-bits, we need to reduce to a 32-bit type. | ||||||
231 | static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { | ||||||
232 | return [=](const LegalityQuery &Query) { | ||||||
233 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
234 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | ||||||
235 | Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); | ||||||
236 | }; | ||||||
237 | } | ||||||
238 | |||||||
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
//
// Returns the maximum memory access size, in bits, that is legal for the
// given address space before a load/store must be split.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if
    // they may alias scratch depending on the subtarget.
    return 128;
  }
}
266 | |||||||
267 | static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, | ||||||
268 | const LegalityQuery &Query) { | ||||||
269 | const LLT Ty = Query.Types[0]; | ||||||
270 | |||||||
271 | // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD | ||||||
272 | const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; | ||||||
273 | |||||||
274 | unsigned RegSize = Ty.getSizeInBits(); | ||||||
275 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||||
276 | unsigned AlignBits = Query.MMODescrs[0].AlignInBits; | ||||||
277 | unsigned AS = Query.Types[1].getAddressSpace(); | ||||||
278 | |||||||
279 | // All of these need to be custom lowered to cast the pointer operand. | ||||||
280 | if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) | ||||||
281 | return false; | ||||||
282 | |||||||
283 | // Do not handle extending vector loads. | ||||||
284 | if (Ty.isVector() && MemSize != RegSize) | ||||||
285 | return false; | ||||||
286 | |||||||
287 | // TODO: We should be able to widen loads if the alignment is high enough, but | ||||||
288 | // we also need to modify the memory access size. | ||||||
289 | #if 0 | ||||||
290 | // Accept widening loads based on alignment. | ||||||
291 | if (IsLoad && MemSize < Size) | ||||||
292 | MemSize = std::max(MemSize, Align); | ||||||
293 | #endif | ||||||
294 | |||||||
295 | // Only 1-byte and 2-byte to 32-bit extloads are valid. | ||||||
296 | if (MemSize != RegSize && RegSize != 32) | ||||||
297 | return false; | ||||||
298 | |||||||
299 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) | ||||||
300 | return false; | ||||||
301 | |||||||
302 | switch (MemSize) { | ||||||
303 | case 8: | ||||||
304 | case 16: | ||||||
305 | case 32: | ||||||
306 | case 64: | ||||||
307 | case 128: | ||||||
308 | break; | ||||||
309 | case 96: | ||||||
310 | if (!ST.hasDwordx3LoadStores()) | ||||||
311 | return false; | ||||||
312 | break; | ||||||
313 | case 256: | ||||||
314 | case 512: | ||||||
315 | // These may contextually need to be broken down. | ||||||
316 | break; | ||||||
317 | default: | ||||||
318 | return false; | ||||||
319 | } | ||||||
320 | |||||||
321 | assert(RegSize >= MemSize)(static_cast <bool> (RegSize >= MemSize) ? void (0) : __assert_fail ("RegSize >= MemSize", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 321, __extension__ __PRETTY_FUNCTION__)); | ||||||
322 | |||||||
323 | if (AlignBits < MemSize) { | ||||||
324 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
325 | if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | ||||||
326 | Align(AlignBits / 8))) | ||||||
327 | return false; | ||||||
328 | } | ||||||
329 | |||||||
330 | return true; | ||||||
331 | } | ||||||
332 | |||||||
333 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so | ||||||
334 | // workaround this. Eventually it should ignore the type for loads and only care | ||||||
335 | // about the size. Return true in cases where we will workaround this for now by | ||||||
336 | // bitcasting. | ||||||
337 | static bool loadStoreBitcastWorkaround(const LLT Ty) { | ||||||
338 | if (EnableNewLegality) | ||||||
339 | return false; | ||||||
340 | |||||||
341 | const unsigned Size = Ty.getSizeInBits(); | ||||||
342 | if (Size <= 64) | ||||||
343 | return false; | ||||||
344 | if (!Ty.isVector()) | ||||||
345 | return true; | ||||||
346 | |||||||
347 | LLT EltTy = Ty.getElementType(); | ||||||
348 | if (EltTy.isPointer()) | ||||||
349 | return true; | ||||||
350 | |||||||
351 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
352 | return EltSize != 32 && EltSize != 64; | ||||||
353 | } | ||||||
354 | |||||||
355 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { | ||||||
356 | const LLT Ty = Query.Types[0]; | ||||||
357 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && | ||||||
358 | !loadStoreBitcastWorkaround(Ty); | ||||||
359 | } | ||||||
360 | |||||||
361 | /// Return true if a load or store of the type should be lowered with a bitcast | ||||||
362 | /// to a different type. | ||||||
363 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, | ||||||
364 | const LLT MemTy) { | ||||||
365 | const unsigned MemSizeInBits = MemTy.getSizeInBits(); | ||||||
366 | const unsigned Size = Ty.getSizeInBits(); | ||||||
367 | if (Size != MemSizeInBits) | ||||||
368 | return Size <= 32 && Ty.isVector(); | ||||||
369 | |||||||
370 | if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) | ||||||
371 | return true; | ||||||
372 | |||||||
373 | // Don't try to handle bitcasting vector ext loads for now. | ||||||
374 | return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && | ||||||
375 | (Size <= 32 || isRegisterSize(Size)) && | ||||||
376 | !isRegisterVectorElementType(Ty.getElementType()); | ||||||
377 | } | ||||||
378 | |||||||
379 | /// Return true if we should legalize a load by widening an odd sized memory | ||||||
380 | /// access up to the alignment. Note this case when the memory access itself | ||||||
381 | /// changes, not the size of the result register. | ||||||
382 | static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, | ||||||
383 | unsigned AlignInBits, unsigned AddrSpace, | ||||||
384 | unsigned Opcode) { | ||||||
385 | unsigned SizeInBits = MemoryTy.getSizeInBits(); | ||||||
386 | // We don't want to widen cases that are naturally legal. | ||||||
387 | if (isPowerOf2_32(SizeInBits)) | ||||||
388 | return false; | ||||||
389 | |||||||
390 | // If we have 96-bit memory operations, we shouldn't touch them. Note we may | ||||||
391 | // end up widening these for a scalar load during RegBankSelect, since there | ||||||
392 | // aren't 96-bit scalar loads. | ||||||
393 | if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) | ||||||
394 | return false; | ||||||
395 | |||||||
396 | if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) | ||||||
397 | return false; | ||||||
398 | |||||||
399 | // A load is known dereferenceable up to the alignment, so it's legal to widen | ||||||
400 | // to it. | ||||||
401 | // | ||||||
402 | // TODO: Could check dereferenceable for less aligned cases. | ||||||
403 | unsigned RoundedSize = NextPowerOf2(SizeInBits); | ||||||
404 | if (AlignInBits < RoundedSize) | ||||||
405 | return false; | ||||||
406 | |||||||
407 | // Do not widen if it would introduce a slow unaligned load. | ||||||
408 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
409 | bool Fast = false; | ||||||
410 | return TLI->allowsMisalignedMemoryAccessesImpl( | ||||||
411 | RoundedSize, AddrSpace, Align(AlignInBits / 8), | ||||||
412 | MachineMemOperand::MOLoad, &Fast) && | ||||||
413 | Fast; | ||||||
414 | } | ||||||
415 | |||||||
416 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, | ||||||
417 | unsigned Opcode) { | ||||||
418 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) | ||||||
419 | return false; | ||||||
420 | |||||||
421 | return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, | ||||||
422 | Query.MMODescrs[0].AlignInBits, | ||||||
423 | Query.Types[1].getAddressSpace(), Opcode); | ||||||
424 | } | ||||||
425 | |||||||
426 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | ||||||
427 | const GCNTargetMachine &TM) | ||||||
428 | : ST(ST_) { | ||||||
429 | using namespace TargetOpcode; | ||||||
430 | |||||||
431 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | ||||||
432 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | ||||||
433 | }; | ||||||
434 | |||||||
435 | const LLT S1 = LLT::scalar(1); | ||||||
436 | const LLT S8 = LLT::scalar(8); | ||||||
437 | const LLT S16 = LLT::scalar(16); | ||||||
438 | const LLT S32 = LLT::scalar(32); | ||||||
439 | const LLT S64 = LLT::scalar(64); | ||||||
440 | const LLT S128 = LLT::scalar(128); | ||||||
441 | const LLT S256 = LLT::scalar(256); | ||||||
442 | const LLT S512 = LLT::scalar(512); | ||||||
443 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); | ||||||
444 | |||||||
445 | const LLT V2S8 = LLT::fixed_vector(2, 8); | ||||||
446 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||||
447 | const LLT V4S16 = LLT::fixed_vector(4, 16); | ||||||
448 | |||||||
449 | const LLT V2S32 = LLT::fixed_vector(2, 32); | ||||||
450 | const LLT V3S32 = LLT::fixed_vector(3, 32); | ||||||
451 | const LLT V4S32 = LLT::fixed_vector(4, 32); | ||||||
452 | const LLT V5S32 = LLT::fixed_vector(5, 32); | ||||||
453 | const LLT V6S32 = LLT::fixed_vector(6, 32); | ||||||
454 | const LLT V7S32 = LLT::fixed_vector(7, 32); | ||||||
455 | const LLT V8S32 = LLT::fixed_vector(8, 32); | ||||||
456 | const LLT V9S32 = LLT::fixed_vector(9, 32); | ||||||
457 | const LLT V10S32 = LLT::fixed_vector(10, 32); | ||||||
458 | const LLT V11S32 = LLT::fixed_vector(11, 32); | ||||||
459 | const LLT V12S32 = LLT::fixed_vector(12, 32); | ||||||
460 | const LLT V13S32 = LLT::fixed_vector(13, 32); | ||||||
461 | const LLT V14S32 = LLT::fixed_vector(14, 32); | ||||||
462 | const LLT V15S32 = LLT::fixed_vector(15, 32); | ||||||
463 | const LLT V16S32 = LLT::fixed_vector(16, 32); | ||||||
464 | const LLT V32S32 = LLT::fixed_vector(32, 32); | ||||||
465 | |||||||
466 | const LLT V2S64 = LLT::fixed_vector(2, 64); | ||||||
467 | const LLT V3S64 = LLT::fixed_vector(3, 64); | ||||||
468 | const LLT V4S64 = LLT::fixed_vector(4, 64); | ||||||
469 | const LLT V5S64 = LLT::fixed_vector(5, 64); | ||||||
470 | const LLT V6S64 = LLT::fixed_vector(6, 64); | ||||||
471 | const LLT V7S64 = LLT::fixed_vector(7, 64); | ||||||
472 | const LLT V8S64 = LLT::fixed_vector(8, 64); | ||||||
473 | const LLT V16S64 = LLT::fixed_vector(16, 64); | ||||||
474 | |||||||
475 | std::initializer_list<LLT> AllS32Vectors = | ||||||
476 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | ||||||
477 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | ||||||
478 | std::initializer_list<LLT> AllS64Vectors = | ||||||
479 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | ||||||
480 | |||||||
481 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | ||||||
482 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | ||||||
483 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | ||||||
484 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | ||||||
485 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | ||||||
486 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | ||||||
487 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | ||||||
488 | |||||||
489 | const LLT CodePtr = FlatPtr; | ||||||
490 | |||||||
491 | const std::initializer_list<LLT> AddrSpaces64 = { | ||||||
492 | GlobalPtr, ConstantPtr, FlatPtr | ||||||
493 | }; | ||||||
494 | |||||||
495 | const std::initializer_list<LLT> AddrSpaces32 = { | ||||||
496 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | ||||||
497 | }; | ||||||
498 | |||||||
499 | const std::initializer_list<LLT> FPTypesBase = { | ||||||
500 | S32, S64 | ||||||
501 | }; | ||||||
502 | |||||||
503 | const std::initializer_list<LLT> FPTypes16 = { | ||||||
504 | S32, S64, S16 | ||||||
505 | }; | ||||||
506 | |||||||
507 | const std::initializer_list<LLT> FPTypesPK16 = { | ||||||
508 | S32, S64, S16, V2S16 | ||||||
509 | }; | ||||||
510 | |||||||
511 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; | ||||||
512 | |||||||
513 | // s1 for VCC branches, s32 for SCC branches. | ||||||
514 | getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); | ||||||
515 | |||||||
516 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | ||||||
517 | // elements for v3s16 | ||||||
518 | getActionDefinitionsBuilder(G_PHI) | ||||||
519 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) | ||||||
520 | .legalFor(AllS32Vectors) | ||||||
521 | .legalFor(AllS64Vectors) | ||||||
522 | .legalFor(AddrSpaces64) | ||||||
523 | .legalFor(AddrSpaces32) | ||||||
524 | .legalIf(isPointer(0)) | ||||||
525 | .clampScalar(0, S16, S256) | ||||||
526 | .widenScalarToNextPow2(0, 32) | ||||||
527 | .clampMaxNumElements(0, S32, 16) | ||||||
528 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
529 | .scalarize(0); | ||||||
530 | |||||||
531 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { | ||||||
532 | // Full set of gfx9 features. | ||||||
533 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
534 | .legalFor({S32, S16, V2S16}) | ||||||
535 | .clampScalar(0, S16, S32) | ||||||
536 | .clampMaxNumElements(0, S16, 2) | ||||||
537 | .scalarize(0) | ||||||
538 | .widenScalarToNextPow2(0, 32); | ||||||
539 | |||||||
540 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) | ||||||
541 | .legalFor({S32, S16, V2S16}) // Clamp modifier | ||||||
542 | .minScalarOrElt(0, S16) | ||||||
543 | .clampMaxNumElements(0, S16, 2) | ||||||
544 | .scalarize(0) | ||||||
545 | .widenScalarToNextPow2(0, 32) | ||||||
546 | .lower(); | ||||||
547 | } else if (ST.has16BitInsts()) { | ||||||
548 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
549 | .legalFor({S32, S16}) | ||||||
550 | .clampScalar(0, S16, S32) | ||||||
551 | .scalarize(0) | ||||||
552 | .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 | ||||||
553 | |||||||
554 | // Technically the saturating operations require clamp bit support, but this | ||||||
555 | // was introduced at the same time as 16-bit operations. | ||||||
556 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||
557 | .legalFor({S32, S16}) // Clamp modifier | ||||||
558 | .minScalar(0, S16) | ||||||
559 | .scalarize(0) | ||||||
560 | .widenScalarToNextPow2(0, 16) | ||||||
561 | .lower(); | ||||||
562 | |||||||
563 | // We're just lowering this, but it helps get a better result to try to | ||||||
564 | // coerce to the desired type first. | ||||||
565 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||||
566 | .minScalar(0, S16) | ||||||
567 | .scalarize(0) | ||||||
568 | .lower(); | ||||||
569 | } else { | ||||||
570 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
571 | .legalFor({S32}) | ||||||
572 | .clampScalar(0, S32, S32) | ||||||
573 | .scalarize(0); | ||||||
574 | |||||||
575 | if (ST.hasIntClamp()) { | ||||||
576 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||
577 | .legalFor({S32}) // Clamp modifier. | ||||||
578 | .scalarize(0) | ||||||
579 | .minScalarOrElt(0, S32) | ||||||
580 | .lower(); | ||||||
581 | } else { | ||||||
582 | // Clamp bit support was added in VI, along with 16-bit operations. | ||||||
583 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||
584 | .minScalar(0, S32) | ||||||
585 | .scalarize(0) | ||||||
586 | .lower(); | ||||||
587 | } | ||||||
588 | |||||||
589 | // FIXME: DAG expansion gets better results. The widening uses the smaller | ||||||
590 | // range values and goes for the min/max lowering directly. | ||||||
591 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||||
592 | .minScalar(0, S32) | ||||||
593 | .scalarize(0) | ||||||
594 | .lower(); | ||||||
595 | } | ||||||
596 | |||||||
597 | getActionDefinitionsBuilder( | ||||||
598 | {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) | ||||||
599 | .customFor({S32, S64}) | ||||||
600 | .clampScalar(0, S32, S64) | ||||||
601 | .widenScalarToNextPow2(0, 32) | ||||||
602 | .scalarize(0); | ||||||
603 | |||||||
604 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | ||||||
605 | .legalFor({S32}) | ||||||
606 | .maxScalarOrElt(0, S32); | ||||||
607 | |||||||
608 | if (ST.hasVOP3PInsts()) { | ||||||
609 | Mulh | ||||||
610 | .clampMaxNumElements(0, S8, 2) | ||||||
611 | .lowerFor({V2S8}); | ||||||
612 | } | ||||||
613 | |||||||
614 | Mulh | ||||||
615 | .scalarize(0) | ||||||
616 | .lower(); | ||||||
617 | |||||||
618 | // Report legal for any types we can handle anywhere. For the cases only legal | ||||||
619 | // on the SALU, RegBankSelect will be able to re-legalize. | ||||||
620 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) | ||||||
621 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) | ||||||
622 | .clampScalar(0, S32, S64) | ||||||
623 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
624 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) | ||||||
625 | .widenScalarToNextPow2(0) | ||||||
626 | .scalarize(0); | ||||||
627 | |||||||
628 | getActionDefinitionsBuilder({G_UADDO, G_USUBO, | ||||||
629 | G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) | ||||||
630 | .legalFor({{S32, S1}, {S32, S32}}) | ||||||
631 | .minScalar(0, S32) | ||||||
632 | // TODO: .scalarize(0) | ||||||
633 | .lower(); | ||||||
634 | |||||||
635 | getActionDefinitionsBuilder(G_BITCAST) | ||||||
636 | // Don't worry about the size constraint. | ||||||
637 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||
638 | .lower(); | ||||||
639 | |||||||
640 | |||||||
641 | getActionDefinitionsBuilder(G_CONSTANT) | ||||||
642 | .legalFor({S1, S32, S64, S16, GlobalPtr, | ||||||
643 | LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) | ||||||
644 | .legalIf(isPointer(0)) | ||||||
645 | .clampScalar(0, S32, S64) | ||||||
646 | .widenScalarToNextPow2(0); | ||||||
647 | |||||||
648 | getActionDefinitionsBuilder(G_FCONSTANT) | ||||||
649 | .legalFor({S32, S64, S16}) | ||||||
650 | .clampScalar(0, S16, S64); | ||||||
651 | |||||||
652 | getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) | ||||||
653 | .legalIf(isRegisterType(0)) | ||||||
654 | // s1 and s16 are special cases because they have legal operations on | ||||||
655 | // them, but don't really occupy registers in the normal way. | ||||||
656 | .legalFor({S1, S16}) | ||||||
657 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
658 | .clampScalarOrElt(0, S32, MaxScalar) | ||||||
659 | .widenScalarToNextPow2(0, 32) | ||||||
660 | .clampMaxNumElements(0, S32, 16); | ||||||
661 | |||||||
662 | getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); | ||||||
663 | |||||||
664 | // If the amount is divergent, we have to do a wave reduction to get the | ||||||
665 | // maximum value, so this is expanded during RegBankSelect. | ||||||
666 | getActionDefinitionsBuilder(G_DYN_STACKALLOC) | ||||||
667 | .legalFor({{PrivatePtr, S32}}); | ||||||
668 | |||||||
669 | getActionDefinitionsBuilder(G_GLOBAL_VALUE) | ||||||
670 | .customIf(typeIsNot(0, PrivatePtr)); | ||||||
671 | |||||||
672 | getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); | ||||||
673 | |||||||
674 | auto &FPOpActions = getActionDefinitionsBuilder( | ||||||
675 | { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) | ||||||
676 | .legalFor({S32, S64}); | ||||||
677 | auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) | ||||||
678 | .customFor({S32, S64}); | ||||||
679 | auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) | ||||||
680 | .customFor({S32, S64}); | ||||||
681 | |||||||
682 | if (ST.has16BitInsts()) { | ||||||
683 | if (ST.hasVOP3PInsts()) | ||||||
684 | FPOpActions.legalFor({S16, V2S16}); | ||||||
685 | else | ||||||
686 | FPOpActions.legalFor({S16}); | ||||||
687 | |||||||
688 | TrigActions.customFor({S16}); | ||||||
689 | FDIVActions.customFor({S16}); | ||||||
690 | } | ||||||
691 | |||||||
692 | auto &MinNumMaxNum = getActionDefinitionsBuilder({ | ||||||
693 | G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); | ||||||
694 | |||||||
695 | if (ST.hasVOP3PInsts()) { | ||||||
696 | MinNumMaxNum.customFor(FPTypesPK16) | ||||||
697 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
698 | .clampMaxNumElements(0, S16, 2) | ||||||
699 | .clampScalar(0, S16, S64) | ||||||
700 | .scalarize(0); | ||||||
701 | } else if (ST.has16BitInsts()) { | ||||||
702 | MinNumMaxNum.customFor(FPTypes16) | ||||||
703 | .clampScalar(0, S16, S64) | ||||||
704 | .scalarize(0); | ||||||
705 | } else { | ||||||
706 | MinNumMaxNum.customFor(FPTypesBase) | ||||||
707 | .clampScalar(0, S32, S64) | ||||||
708 | .scalarize(0); | ||||||
709 | } | ||||||
710 | |||||||
711 | if (ST.hasVOP3PInsts()) | ||||||
712 | FPOpActions.clampMaxNumElements(0, S16, 2); | ||||||
713 | |||||||
714 | FPOpActions | ||||||
715 | .scalarize(0) | ||||||
716 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
717 | |||||||
718 | TrigActions | ||||||
719 | .scalarize(0) | ||||||
720 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
721 | |||||||
722 | FDIVActions | ||||||
723 | .scalarize(0) | ||||||
724 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
725 | |||||||
726 | getActionDefinitionsBuilder({G_FNEG, G_FABS}) | ||||||
727 | .legalFor(FPTypesPK16) | ||||||
728 | .clampMaxNumElements(0, S16, 2) | ||||||
729 | .scalarize(0) | ||||||
730 | .clampScalar(0, S16, S64); | ||||||
731 | |||||||
732 | if (ST.has16BitInsts()) { | ||||||
733 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) | ||||||
734 | .legalFor({S32, S64, S16}) | ||||||
735 | .scalarize(0) | ||||||
736 | .clampScalar(0, S16, S64); | ||||||
737 | } else { | ||||||
738 | getActionDefinitionsBuilder(G_FSQRT) | ||||||
739 | .legalFor({S32, S64}) | ||||||
740 | .scalarize(0) | ||||||
741 | .clampScalar(0, S32, S64); | ||||||
742 | |||||||
743 | if (ST.hasFractBug()) { | ||||||
744 | getActionDefinitionsBuilder(G_FFLOOR) | ||||||
745 | .customFor({S64}) | ||||||
746 | .legalFor({S32, S64}) | ||||||
747 | .scalarize(0) | ||||||
748 | .clampScalar(0, S32, S64); | ||||||
749 | } else { | ||||||
750 | getActionDefinitionsBuilder(G_FFLOOR) | ||||||
751 | .legalFor({S32, S64}) | ||||||
752 | .scalarize(0) | ||||||
753 | .clampScalar(0, S32, S64); | ||||||
754 | } | ||||||
755 | } | ||||||
756 | |||||||
757 | getActionDefinitionsBuilder(G_FPTRUNC) | ||||||
758 | .legalFor({{S32, S64}, {S16, S32}}) | ||||||
759 | .scalarize(0) | ||||||
760 | .lower(); | ||||||
761 | |||||||
762 | getActionDefinitionsBuilder(G_FPEXT) | ||||||
763 | .legalFor({{S64, S32}, {S32, S16}}) | ||||||
764 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) | ||||||
765 | .scalarize(0); | ||||||
766 | |||||||
767 | getActionDefinitionsBuilder(G_FSUB) | ||||||
768 | // Use actual fsub instruction | ||||||
769 | .legalFor({S32}) | ||||||
770 | // Must use fadd + fneg | ||||||
771 | .lowerFor({S64, S16, V2S16}) | ||||||
772 | .scalarize(0) | ||||||
773 | .clampScalar(0, S32, S64); | ||||||
774 | |||||||
775 | // Whether this is legal depends on the floating point mode for the function. | ||||||
776 | auto &FMad = getActionDefinitionsBuilder(G_FMAD); | ||||||
777 | if (ST.hasMadF16() && ST.hasMadMacF32Insts()) | ||||||
778 | FMad.customFor({S32, S16}); | ||||||
779 | else if (ST.hasMadMacF32Insts()) | ||||||
780 | FMad.customFor({S32}); | ||||||
781 | else if (ST.hasMadF16()) | ||||||
782 | FMad.customFor({S16}); | ||||||
783 | FMad.scalarize(0) | ||||||
784 | .lower(); | ||||||
785 | |||||||
786 | auto &FRem = getActionDefinitionsBuilder(G_FREM); | ||||||
787 | if (ST.has16BitInsts()) { | ||||||
788 | FRem.customFor({S16, S32, S64}); | ||||||
789 | } else { | ||||||
790 | FRem.minScalar(0, S32) | ||||||
791 | .customFor({S32, S64}); | ||||||
792 | } | ||||||
793 | FRem.scalarize(0); | ||||||
794 | |||||||
795 | // TODO: Do we need to clamp maximum bitwidth? | ||||||
796 | getActionDefinitionsBuilder(G_TRUNC) | ||||||
797 | .legalIf(isScalar(0)) | ||||||
798 | .legalFor({{V2S16, V2S32}}) | ||||||
799 | .clampMaxNumElements(0, S16, 2) | ||||||
800 | // Avoid scalarizing in cases that should be truly illegal. In unresolvable | ||||||
801 | // situations (like an invalid implicit use), we don't want to infinite loop | ||||||
802 | // in the legalizer. | ||||||
803 | .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) | ||||||
804 | .alwaysLegal(); | ||||||
805 | |||||||
806 | getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) | ||||||
807 | .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, | ||||||
808 | {S32, S1}, {S64, S1}, {S16, S1}}) | ||||||
809 | .scalarize(0) | ||||||
810 | .clampScalar(0, S32, S64) | ||||||
811 | .widenScalarToNextPow2(1, 32); | ||||||
812 | |||||||
813 | // TODO: Split s1->s64 during regbankselect for VALU. | ||||||
814 | auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) | ||||||
815 | .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) | ||||||
816 | .lowerIf(typeIs(1, S1)) | ||||||
817 | .customFor({{S32, S64}, {S64, S64}}); | ||||||
818 | if (ST.has16BitInsts()) | ||||||
819 | IToFP.legalFor({{S16, S16}}); | ||||||
820 | IToFP.clampScalar(1, S32, S64) | ||||||
821 | .minScalar(0, S32) | ||||||
822 | .scalarize(0) | ||||||
823 | .widenScalarToNextPow2(1); | ||||||
824 | |||||||
825 | auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) | ||||||
826 | .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) | ||||||
827 | .customFor({{S64, S32}, {S64, S64}}) | ||||||
828 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); | ||||||
829 | if (ST.has16BitInsts()) | ||||||
830 | FPToI.legalFor({{S16, S16}}); | ||||||
831 | else | ||||||
832 | FPToI.minScalar(1, S32); | ||||||
833 | |||||||
834 | FPToI.minScalar(0, S32) | ||||||
835 | .widenScalarToNextPow2(0, 32) | ||||||
836 | .scalarize(0) | ||||||
837 | .lower(); | ||||||
838 | |||||||
839 | // Lower roundeven into G_FRINT | ||||||
840 | getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) | ||||||
841 | .scalarize(0) | ||||||
842 | .lower(); | ||||||
843 | |||||||
844 | if (ST.has16BitInsts()) { | ||||||
845 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
846 | .legalFor({S16, S32, S64}) | ||||||
847 | .clampScalar(0, S16, S64) | ||||||
848 | .scalarize(0); | ||||||
849 | } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { | ||||||
850 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
851 | .legalFor({S32, S64}) | ||||||
852 | .clampScalar(0, S32, S64) | ||||||
853 | .scalarize(0); | ||||||
854 | } else { | ||||||
855 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
856 | .legalFor({S32}) | ||||||
857 | .customFor({S64}) | ||||||
858 | .clampScalar(0, S32, S64) | ||||||
859 | .scalarize(0); | ||||||
860 | } | ||||||
861 | |||||||
862 | getActionDefinitionsBuilder(G_PTR_ADD) | ||||||
863 | .legalIf(all(isPointer(0), sameSize(0, 1))) | ||||||
864 | .scalarize(0) | ||||||
865 | .scalarSameSizeAs(1, 0); | ||||||
866 | |||||||
867 | getActionDefinitionsBuilder(G_PTRMASK) | ||||||
868 | .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) | ||||||
869 | .scalarSameSizeAs(1, 0) | ||||||
870 | .scalarize(0); | ||||||
871 | |||||||
872 | auto &CmpBuilder = | ||||||
873 | getActionDefinitionsBuilder(G_ICMP) | ||||||
874 | // The compare output type differs based on the register bank of the output, | ||||||
875 | // so make both s1 and s32 legal. | ||||||
876 | // | ||||||
877 | // Scalar compares producing output in scc will be promoted to s32, as that | ||||||
878 | // is the allocatable register type that will be needed for the copy from | ||||||
879 | // scc. This will be promoted during RegBankSelect, and we assume something | ||||||
880 | // before that won't try to use s32 result types. | ||||||
881 | // | ||||||
882 | // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg | ||||||
883 | // bank. | ||||||
884 | .legalForCartesianProduct( | ||||||
885 | {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) | ||||||
886 | .legalForCartesianProduct( | ||||||
887 | {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); | ||||||
888 | if (ST.has16BitInsts()) { | ||||||
889 | CmpBuilder.legalFor({{S1, S16}}); | ||||||
890 | } | ||||||
891 | |||||||
892 | CmpBuilder | ||||||
893 | .widenScalarToNextPow2(1) | ||||||
894 | .clampScalar(1, S32, S64) | ||||||
895 | .scalarize(0) | ||||||
896 | .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); | ||||||
897 | |||||||
898 | getActionDefinitionsBuilder(G_FCMP) | ||||||
899 | .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) | ||||||
900 | .widenScalarToNextPow2(1) | ||||||
901 | .clampScalar(1, S32, S64) | ||||||
902 | .scalarize(0); | ||||||
903 | |||||||
904 | // FIXME: fpow has a selection pattern that should move to custom lowering. | ||||||
905 | auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); | ||||||
906 | if (ST.has16BitInsts()) | ||||||
907 | Exp2Ops.legalFor({S32, S16}); | ||||||
908 | else | ||||||
909 | Exp2Ops.legalFor({S32}); | ||||||
910 | Exp2Ops.clampScalar(0, MinScalarFPTy, S32); | ||||||
911 | Exp2Ops.scalarize(0); | ||||||
912 | |||||||
913 | auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); | ||||||
914 | if (ST.has16BitInsts()) | ||||||
915 | ExpOps.customFor({{S32}, {S16}}); | ||||||
916 | else | ||||||
917 | ExpOps.customFor({S32}); | ||||||
918 | ExpOps.clampScalar(0, MinScalarFPTy, S32) | ||||||
919 | .scalarize(0); | ||||||
920 | |||||||
921 | getActionDefinitionsBuilder(G_FPOWI) | ||||||
922 | .clampScalar(0, MinScalarFPTy, S32) | ||||||
923 | .lower(); | ||||||
924 | |||||||
925 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||||
926 | getActionDefinitionsBuilder(G_CTPOP) | ||||||
927 | .legalFor({{S32, S32}, {S32, S64}}) | ||||||
928 | .clampScalar(0, S32, S32) | ||||||
929 | .clampScalar(1, S32, S64) | ||||||
930 | .scalarize(0) | ||||||
931 | .widenScalarToNextPow2(0, 32) | ||||||
932 | .widenScalarToNextPow2(1, 32); | ||||||
933 | |||||||
934 | // The hardware instructions return a different result on 0 than the generic | ||||||
935 | // instructions expect. The hardware produces -1, but these produce the | ||||||
936 | // bitwidth. | ||||||
937 | getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) | ||||||
938 | .scalarize(0) | ||||||
939 | .clampScalar(0, S32, S32) | ||||||
940 | .clampScalar(1, S32, S64) | ||||||
941 | .widenScalarToNextPow2(0, 32) | ||||||
942 | .widenScalarToNextPow2(1, 32) | ||||||
943 | .custom(); | ||||||
944 | |||||||
945 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||||
946 | getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) | ||||||
947 | .legalFor({{S32, S32}, {S32, S64}}) | ||||||
948 | .clampScalar(0, S32, S32) | ||||||
949 | .clampScalar(1, S32, S64) | ||||||
950 | .scalarize(0) | ||||||
951 | .widenScalarToNextPow2(0, 32) | ||||||
952 | .widenScalarToNextPow2(1, 32); | ||||||
953 | |||||||
954 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||||
955 | // RegBankSelect. | ||||||
956 | getActionDefinitionsBuilder(G_BITREVERSE) | ||||||
957 | .legalFor({S32, S64}) | ||||||
958 | .clampScalar(0, S32, S64) | ||||||
959 | .scalarize(0) | ||||||
960 | .widenScalarToNextPow2(0); | ||||||
961 | |||||||
962 | if (ST.has16BitInsts()) { | ||||||
963 | getActionDefinitionsBuilder(G_BSWAP) | ||||||
964 | .legalFor({S16, S32, V2S16}) | ||||||
965 | .clampMaxNumElements(0, S16, 2) | ||||||
966 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | ||||||
967 | // narrowScalar limitation. | ||||||
968 | .widenScalarToNextPow2(0) | ||||||
969 | .clampScalar(0, S16, S32) | ||||||
970 | .scalarize(0); | ||||||
971 | |||||||
972 | if (ST.hasVOP3PInsts()) { | ||||||
973 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||||
974 | .legalFor({S32, S16, V2S16}) | ||||||
975 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
976 | .clampMaxNumElements(0, S16, 2) | ||||||
977 | .minScalar(0, S16) | ||||||
978 | .widenScalarToNextPow2(0) | ||||||
979 | .scalarize(0) | ||||||
980 | .lower(); | ||||||
981 | } else { | ||||||
982 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||||
983 | .legalFor({S32, S16}) | ||||||
984 | .widenScalarToNextPow2(0) | ||||||
985 | .minScalar(0, S16) | ||||||
986 | .scalarize(0) | ||||||
987 | .lower(); | ||||||
988 | } | ||||||
989 | } else { | ||||||
990 | // TODO: Should have same legality without v_perm_b32 | ||||||
991 | getActionDefinitionsBuilder(G_BSWAP) | ||||||
992 | .legalFor({S32}) | ||||||
993 | .lowerIf(scalarNarrowerThan(0, 32)) | ||||||
994 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | ||||||
995 | // narrowScalar limitation. | ||||||
996 | .widenScalarToNextPow2(0) | ||||||
997 | .maxScalar(0, S32) | ||||||
998 | .scalarize(0) | ||||||
999 | .lower(); | ||||||
1000 | |||||||
1001 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||||
1002 | .legalFor({S32}) | ||||||
1003 | .minScalar(0, S32) | ||||||
1004 | .widenScalarToNextPow2(0) | ||||||
1005 | .scalarize(0) | ||||||
1006 | .lower(); | ||||||
1007 | } | ||||||
1008 | |||||||
1009 | getActionDefinitionsBuilder(G_INTTOPTR) | ||||||
1010 | // List the common cases | ||||||
1011 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||||
1012 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||||
1013 | .scalarize(0) | ||||||
1014 | // Accept any address space as long as the size matches | ||||||
1015 | .legalIf(sameSize(0, 1)) | ||||||
1016 | .widenScalarIf(smallerThan(1, 0), | ||||||
1017 | [](const LegalityQuery &Query) { | ||||||
1018 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||||
1019 | }) | ||||||
1020 | .narrowScalarIf(largerThan(1, 0), | ||||||
1021 | [](const LegalityQuery &Query) { | ||||||
1022 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||||
1023 | }); | ||||||
1024 | |||||||
1025 | getActionDefinitionsBuilder(G_PTRTOINT) | ||||||
1026 | // List the common cases | ||||||
1027 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||||
1028 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||||
1029 | .scalarize(0) | ||||||
1030 | // Accept any address space as long as the size matches | ||||||
1031 | .legalIf(sameSize(0, 1)) | ||||||
1032 | .widenScalarIf(smallerThan(0, 1), | ||||||
1033 | [](const LegalityQuery &Query) { | ||||||
1034 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||||
1035 | }) | ||||||
1036 | .narrowScalarIf( | ||||||
1037 | largerThan(0, 1), | ||||||
1038 | [](const LegalityQuery &Query) { | ||||||
1039 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||||
1040 | }); | ||||||
1041 | |||||||
1042 | getActionDefinitionsBuilder(G_ADDRSPACE_CAST) | ||||||
1043 | .scalarize(0) | ||||||
1044 | .custom(); | ||||||
1045 | |||||||
  // Predicate deciding whether a G_LOAD/G_STORE must be broken into smaller
  // pieces (narrowScalar for scalars, fewerElements for vectors) rather than
  // handled as a single memory operation. Captures the subtarget by value.
  //
  // Returns true when any of the following holds:
  //   - a vector extload (register size > memory size) needs splitting,
  //   - the access exceeds the maximum size for its address space,
  //   - the dword count is an unsupported shape (3 dwords without
  //     dwordx3 support, or a non-power-of-2 count),
  //   - the access is under-aligned and the target disallows that
  //     misaligned access.
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    // For an extending load, a sufficiently large alignment lets us treat
    // the access as if it were as wide as the alignment (the extra bytes
    // are loadable), so take the max before the vector-split check.
    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    // Vector result wider than what memory provides: must split.
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    // Accesses larger than the address space's hardware limit must split.
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      // 3-dword accesses only exist on subtargets with dwordx3 load/store.
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    // Under-aligned access: defer to the target's misaligned-access policy
    // (split when the target does not allow the misaligned access).
    // NOTE(review): assumes AlignBits >= 8 here so Align(AlignBits / 8) is
    // nonzero — appears to hold for all MMO alignments reaching this point.
    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };
1085 | |||||||
1086 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; | ||||||
1087 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; | ||||||
1088 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; | ||||||
1089 | |||||||
1090 | // TODO: Refine based on subtargets which support unaligned access or 128-bit | ||||||
1091 | // LDS | ||||||
1092 | // TODO: Unsupported flat for SI. | ||||||
1093 | |||||||
1094 | for (unsigned Op : {G_LOAD, G_STORE}) { | ||||||
1095 | const bool IsStore = Op == G_STORE; | ||||||
1096 | |||||||
1097 | auto &Actions = getActionDefinitionsBuilder(Op); | ||||||
1098 | // Explicitly list some common cases. | ||||||
1099 | // TODO: Does this help compile time at all? | ||||||
1100 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, | ||||||
1101 | {V2S32, GlobalPtr, V2S32, GlobalAlign32}, | ||||||
1102 | {V4S32, GlobalPtr, V4S32, GlobalAlign32}, | ||||||
1103 | {S64, GlobalPtr, S64, GlobalAlign32}, | ||||||
1104 | {V2S64, GlobalPtr, V2S64, GlobalAlign32}, | ||||||
1105 | {V2S16, GlobalPtr, V2S16, GlobalAlign32}, | ||||||
1106 | {S32, GlobalPtr, S8, GlobalAlign8}, | ||||||
1107 | {S32, GlobalPtr, S16, GlobalAlign16}, | ||||||
1108 | |||||||
1109 | {S32, LocalPtr, S32, 32}, | ||||||
1110 | {S64, LocalPtr, S64, 32}, | ||||||
1111 | {V2S32, LocalPtr, V2S32, 32}, | ||||||
1112 | {S32, LocalPtr, S8, 8}, | ||||||
1113 | {S32, LocalPtr, S16, 16}, | ||||||
1114 | {V2S16, LocalPtr, S32, 32}, | ||||||
1115 | |||||||
1116 | {S32, PrivatePtr, S32, 32}, | ||||||
1117 | {S32, PrivatePtr, S8, 8}, | ||||||
1118 | {S32, PrivatePtr, S16, 16}, | ||||||
1119 | {V2S16, PrivatePtr, S32, 32}, | ||||||
1120 | |||||||
1121 | {S32, ConstantPtr, S32, GlobalAlign32}, | ||||||
1122 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}, | ||||||
1123 | {V4S32, ConstantPtr, V4S32, GlobalAlign32}, | ||||||
1124 | {S64, ConstantPtr, S64, GlobalAlign32}, | ||||||
1125 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); | ||||||
1126 | Actions.legalIf( | ||||||
1127 | [=](const LegalityQuery &Query) -> bool { | ||||||
1128 | return isLoadStoreLegal(ST, Query); | ||||||
1129 | }); | ||||||
1130 | |||||||
1131 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||||
1132 | // 64-bits. | ||||||
1133 | // | ||||||
1134 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||||
1135 | // inserting addrspacecasts. | ||||||
1136 | Actions.customIf(typeIs(1, Constant32Ptr)); | ||||||
1137 | |||||||
1138 | // Turn any illegal element vectors into something easier to deal | ||||||
1139 | // with. These will ultimately produce 32-bit scalar shifts to extract the | ||||||
1140 | // parts anyway. | ||||||
1141 | // | ||||||
1142 | // For odd 16-bit element vectors, prefer to split those into pieces with | ||||||
1143 | // 16-bit vector parts. | ||||||
1144 | Actions.bitcastIf( | ||||||
1145 | [=](const LegalityQuery &Query) -> bool { | ||||||
1146 | return shouldBitcastLoadStoreType(ST, Query.Types[0], | ||||||
1147 | Query.MMODescrs[0].MemoryTy); | ||||||
1148 | }, bitcastToRegisterType(0)); | ||||||
1149 | |||||||
1150 | if (!IsStore) { | ||||||
1151 | // Widen suitably aligned loads by loading extra bytes. The standard | ||||||
1152 | // legalization actions can't properly express widening memory operands. | ||||||
1153 | Actions.customIf([=](const LegalityQuery &Query) -> bool { | ||||||
1154 | return shouldWidenLoad(ST, Query, G_LOAD); | ||||||
1155 | }); | ||||||
1156 | } | ||||||
1157 | |||||||
1158 | // FIXME: load/store narrowing should be moved to lower action | ||||||
1159 | Actions | ||||||
1160 | .narrowScalarIf( | ||||||
1161 | [=](const LegalityQuery &Query) -> bool { | ||||||
1162 | return !Query.Types[0].isVector() && | ||||||
1163 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||
1164 | }, | ||||||
1165 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||
1166 | const LLT DstTy = Query.Types[0]; | ||||||
1167 | const LLT PtrTy = Query.Types[1]; | ||||||
1168 | |||||||
1169 | const unsigned DstSize = DstTy.getSizeInBits(); | ||||||
1170 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||||
1171 | |||||||
1172 | // Split extloads. | ||||||
1173 | if (DstSize > MemSize) | ||||||
1174 | return std::make_pair(0, LLT::scalar(MemSize)); | ||||||
1175 | |||||||
1176 | if (!isPowerOf2_32(DstSize)) { | ||||||
1177 | // We're probably decomposing an odd sized store. Try to split | ||||||
1178 | // to the widest type. TODO: Account for alignment. As-is it | ||||||
1179 | // should be OK, since the new parts will be further legalized. | ||||||
1180 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||
1181 | return std::make_pair(0, LLT::scalar(FloorSize)); | ||||||
1182 | } | ||||||
1183 | |||||||
1184 | if (DstSize > 32 && (DstSize % 32 != 0)) { | ||||||
1185 | // FIXME: Need a way to specify non-extload of larger size if | ||||||
1186 | // suitably aligned. | ||||||
1187 | return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); | ||||||
1188 | } | ||||||
1189 | |||||||
1190 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||
1191 | PtrTy.getAddressSpace(), | ||||||
1192 | Op == G_LOAD); | ||||||
1193 | if (MemSize > MaxSize) | ||||||
1194 | return std::make_pair(0, LLT::scalar(MaxSize)); | ||||||
1195 | |||||||
1196 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
1197 | return std::make_pair(0, LLT::scalar(Align)); | ||||||
1198 | }) | ||||||
1199 | .fewerElementsIf( | ||||||
1200 | [=](const LegalityQuery &Query) -> bool { | ||||||
1201 | return Query.Types[0].isVector() && | ||||||
1202 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||
1203 | }, | ||||||
1204 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||
1205 | const LLT DstTy = Query.Types[0]; | ||||||
1206 | const LLT PtrTy = Query.Types[1]; | ||||||
1207 | |||||||
1208 | LLT EltTy = DstTy.getElementType(); | ||||||
1209 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||
1210 | PtrTy.getAddressSpace(), | ||||||
1211 | Op == G_LOAD); | ||||||
1212 | |||||||
1213 | // FIXME: Handle widened to power of 2 results better. This ends | ||||||
1214 | // up scalarizing. | ||||||
1215 | // FIXME: 3 element stores scalarized on SI | ||||||
1216 | |||||||
1217 | // Split if it's too large for the address space. | ||||||
1218 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||||
1219 | if (MemSize > MaxSize) { | ||||||
1220 | unsigned NumElts = DstTy.getNumElements(); | ||||||
1221 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1222 | |||||||
1223 | if (MaxSize % EltSize == 0) { | ||||||
1224 | return std::make_pair( | ||||||
1225 | 0, LLT::scalarOrVector( | ||||||
1226 | ElementCount::getFixed(MaxSize / EltSize), EltTy)); | ||||||
1227 | } | ||||||
1228 | |||||||
1229 | unsigned NumPieces = MemSize / MaxSize; | ||||||
1230 | |||||||
1231 | // FIXME: Refine when odd breakdowns handled | ||||||
1232 | // The scalars will need to be re-legalized. | ||||||
1233 | if (NumPieces == 1 || NumPieces >= NumElts || | ||||||
1234 | NumElts % NumPieces != 0) | ||||||
1235 | return std::make_pair(0, EltTy); | ||||||
1236 | |||||||
1237 | return std::make_pair( | ||||||
1238 | 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); | ||||||
1239 | } | ||||||
1240 | |||||||
1241 | // FIXME: We could probably handle weird extending loads better. | ||||||
1242 | if (DstTy.getSizeInBits() > MemSize) | ||||||
1243 | return std::make_pair(0, EltTy); | ||||||
1244 | |||||||
1245 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1246 | unsigned DstSize = DstTy.getSizeInBits(); | ||||||
1247 | if (!isPowerOf2_32(DstSize)) { | ||||||
1248 | // We're probably decomposing an odd sized store. Try to split | ||||||
1249 | // to the widest type. TODO: Account for alignment. As-is it | ||||||
1250 | // should be OK, since the new parts will be further legalized. | ||||||
1251 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||
1252 | return std::make_pair( | ||||||
1253 | 0, LLT::scalarOrVector( | ||||||
1254 | ElementCount::getFixed(FloorSize / EltSize), EltTy)); | ||||||
1255 | } | ||||||
1256 | |||||||
1257 | // Need to split because of alignment. | ||||||
1258 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
1259 | if (EltSize > Align && | ||||||
1260 | (EltSize / Align < DstTy.getNumElements())) { | ||||||
1261 | return std::make_pair( | ||||||
1262 | 0, LLT::fixed_vector(EltSize / Align, EltTy)); | ||||||
1263 | } | ||||||
1264 | |||||||
1265 | // May need relegalization for the scalars. | ||||||
1266 | return std::make_pair(0, EltTy); | ||||||
1267 | }) | ||||||
1268 | .minScalar(0, S32) | ||||||
1269 | .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) | ||||||
1270 | .widenScalarToNextPow2(0) | ||||||
1271 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) | ||||||
1272 | .lower(); | ||||||
1273 | } | ||||||
1274 | |||||||
1275 | // FIXME: Unaligned accesses not lowered. | ||||||
1276 | auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) | ||||||
1277 | .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, | ||||||
1278 | {S32, GlobalPtr, S16, 2 * 8}, | ||||||
1279 | {S32, LocalPtr, S8, 8}, | ||||||
1280 | {S32, LocalPtr, S16, 16}, | ||||||
1281 | {S32, PrivatePtr, S8, 8}, | ||||||
1282 | {S32, PrivatePtr, S16, 16}, | ||||||
1283 | {S32, ConstantPtr, S8, 8}, | ||||||
1284 | {S32, ConstantPtr, S16, 2 * 8}}) | ||||||
1285 | .legalIf( | ||||||
1286 | [=](const LegalityQuery &Query) -> bool { | ||||||
1287 | return isLoadStoreLegal(ST, Query); | ||||||
1288 | }); | ||||||
1289 | |||||||
1290 | if (ST.hasFlatAddressSpace()) { | ||||||
1291 | ExtLoads.legalForTypesWithMemDesc( | ||||||
1292 | {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); | ||||||
1293 | } | ||||||
1294 | |||||||
1295 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||||
1296 | // 64-bits. | ||||||
1297 | // | ||||||
1298 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||||
1299 | // inserting addrspacecasts. | ||||||
1300 | ExtLoads.customIf(typeIs(1, Constant32Ptr)); | ||||||
1301 | |||||||
1302 | ExtLoads.clampScalar(0, S32, S32) | ||||||
1303 | .widenScalarToNextPow2(0) | ||||||
1304 | .lower(); | ||||||
1305 | |||||||
1306 | auto &Atomics = getActionDefinitionsBuilder( | ||||||
1307 | {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, | ||||||
1308 | G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, | ||||||
1309 | G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, | ||||||
1310 | G_ATOMICRMW_UMIN}) | ||||||
1311 | .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, | ||||||
1312 | {S64, GlobalPtr}, {S64, LocalPtr}, | ||||||
1313 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||||
1314 | if (ST.hasFlatAddressSpace()) { | ||||||
1315 | Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); | ||||||
1316 | } | ||||||
1317 | |||||||
1318 | auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); | ||||||
1319 | if (ST.hasLDSFPAtomics()) { | ||||||
1320 | Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); | ||||||
1321 | if (ST.hasGFX90AInsts()) | ||||||
1322 | Atomic.legalFor({{S64, LocalPtr}}); | ||||||
1323 | } | ||||||
1324 | if (ST.hasAtomicFaddInsts()) | ||||||
1325 | Atomic.legalFor({{S32, GlobalPtr}}); | ||||||
1326 | |||||||
1327 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output | ||||||
1328 | // demarshalling | ||||||
1329 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) | ||||||
1330 | .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, | ||||||
1331 | {S32, FlatPtr}, {S64, FlatPtr}}) | ||||||
1332 | .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, | ||||||
1333 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||||
1334 | // TODO: Pointer types, any 32-bit or 64-bit vector | ||||||
1335 | |||||||
1336 | // Condition should be s32 for scalar, s1 for vector. | ||||||
1337 | getActionDefinitionsBuilder(G_SELECT) | ||||||
1338 | .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, | ||||||
1339 | LocalPtr, FlatPtr, PrivatePtr, | ||||||
1340 | LLT::fixed_vector(2, LocalPtr), | ||||||
1341 | LLT::fixed_vector(2, PrivatePtr)}, | ||||||
1342 | {S1, S32}) | ||||||
1343 | .clampScalar(0, S16, S64) | ||||||
1344 | .scalarize(1) | ||||||
1345 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
1346 | .fewerElementsIf(numElementsNotEven(0), scalarize(0)) | ||||||
1347 | .clampMaxNumElements(0, S32, 2) | ||||||
1348 | .clampMaxNumElements(0, LocalPtr, 2) | ||||||
1349 | .clampMaxNumElements(0, PrivatePtr, 2) | ||||||
1350 | .scalarize(0) | ||||||
1351 | .widenScalarToNextPow2(0) | ||||||
1352 | .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); | ||||||
1353 | |||||||
1354 | // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can | ||||||
1355 | // be more flexible with the shift amount type. | ||||||
1356 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) | ||||||
1357 | .legalFor({{S32, S32}, {S64, S32}}); | ||||||
1358 | if (ST.has16BitInsts()) { | ||||||
1359 | if (ST.hasVOP3PInsts()) { | ||||||
1360 | Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) | ||||||
1361 | .clampMaxNumElements(0, S16, 2); | ||||||
1362 | } else | ||||||
1363 | Shifts.legalFor({{S16, S16}}); | ||||||
1364 | |||||||
1365 | // TODO: Support 16-bit shift amounts for all types | ||||||
1366 | Shifts.widenScalarIf( | ||||||
1367 | [=](const LegalityQuery &Query) { | ||||||
1368 | // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a | ||||||
1369 | // 32-bit amount. | ||||||
1370 | const LLT ValTy = Query.Types[0]; | ||||||
1371 | const LLT AmountTy = Query.Types[1]; | ||||||
1372 | return ValTy.getSizeInBits() <= 16 && | ||||||
1373 | AmountTy.getSizeInBits() < 16; | ||||||
1374 | }, changeTo(1, S16)); | ||||||
1375 | Shifts.maxScalarIf(typeIs(0, S16), 1, S16); | ||||||
1376 | Shifts.clampScalar(1, S32, S32); | ||||||
1377 | Shifts.clampScalar(0, S16, S64); | ||||||
1378 | Shifts.widenScalarToNextPow2(0, 16); | ||||||
1379 | |||||||
1380 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||||
1381 | .minScalar(0, S16) | ||||||
1382 | .scalarize(0) | ||||||
1383 | .lower(); | ||||||
1384 | } else { | ||||||
1385 | // Make sure we legalize the shift amount type first, as the general | ||||||
1386 | // expansion for the shifted type will produce much worse code if it hasn't | ||||||
1387 | // been truncated already. | ||||||
1388 | Shifts.clampScalar(1, S32, S32); | ||||||
1389 | Shifts.clampScalar(0, S32, S64); | ||||||
1390 | Shifts.widenScalarToNextPow2(0, 32); | ||||||
1391 | |||||||
1392 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||||
1393 | .minScalar(0, S32) | ||||||
1394 | .scalarize(0) | ||||||
1395 | .lower(); | ||||||
1396 | } | ||||||
1397 | Shifts.scalarize(0); | ||||||
1398 | |||||||
1399 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | ||||||
1400 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | ||||||
1401 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | ||||||
1402 | unsigned IdxTypeIdx = 2; | ||||||
1403 | |||||||
1404 | getActionDefinitionsBuilder(Op) | ||||||
1405 | .customIf([=](const LegalityQuery &Query) { | ||||||
1406 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||||
1407 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||||
1408 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | ||||||
1409 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1410 | return (EltSize == 32 || EltSize == 64) && | ||||||
1411 | VecTy.getSizeInBits() % 32 == 0 && | ||||||
1412 | VecTy.getSizeInBits() <= MaxRegisterSize && | ||||||
1413 | IdxTy.getSizeInBits() == 32; | ||||||
1414 | }) | ||||||
1415 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), | ||||||
1416 | bitcastToVectorElement32(VecTypeIdx)) | ||||||
1417 | //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) | ||||||
1418 | .bitcastIf( | ||||||
1419 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), | ||||||
1420 | [=](const LegalityQuery &Query) { | ||||||
1421 | // For > 64-bit element types, try to turn this into a 64-bit | ||||||
1422 | // element vector since we may be able to do better indexing | ||||||
1423 | // if this is scalar. If not, fall back to 32. | ||||||
1424 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||||
1425 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||||
1426 | const unsigned DstEltSize = EltTy.getSizeInBits(); | ||||||
1427 | const unsigned VecSize = VecTy.getSizeInBits(); | ||||||
1428 | |||||||
1429 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; | ||||||
1430 | return std::make_pair( | ||||||
1431 | VecTypeIdx, | ||||||
1432 | LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); | ||||||
1433 | }) | ||||||
1434 | .clampScalar(EltTypeIdx, S32, S64) | ||||||
1435 | .clampScalar(VecTypeIdx, S32, S64) | ||||||
1436 | .clampScalar(IdxTypeIdx, S32, S32) | ||||||
1437 | .clampMaxNumElements(VecTypeIdx, S32, 32) | ||||||
1438 | // TODO: Clamp elements for 64-bit vectors? | ||||||
1439 | // It should only be necessary with variable indexes. | ||||||
1440 | // As a last resort, lower to the stack | ||||||
1441 | .lower(); | ||||||
1442 | } | ||||||
1443 | |||||||
1444 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | ||||||
1445 | .unsupportedIf([=](const LegalityQuery &Query) { | ||||||
1446 | const LLT &EltTy = Query.Types[1].getElementType(); | ||||||
1447 | return Query.Types[0] != EltTy; | ||||||
1448 | }); | ||||||
1449 | |||||||
1450 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | ||||||
1451 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | ||||||
1452 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | ||||||
1453 | |||||||
1454 | // FIXME: Doesn't handle extract of illegal sizes. | ||||||
1455 | getActionDefinitionsBuilder(Op) | ||||||
1456 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | ||||||
1457 | // FIXME: Multiples of 16 should not be legal. | ||||||
1458 | .legalIf([=](const LegalityQuery &Query) { | ||||||
1459 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
1460 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||
1461 | return (BigTy.getSizeInBits() % 32 == 0) && | ||||||
1462 | (LitTy.getSizeInBits() % 16 == 0); | ||||||
1463 | }) | ||||||
1464 | .widenScalarIf( | ||||||
1465 | [=](const LegalityQuery &Query) { | ||||||
1466 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
1467 | return (BigTy.getScalarSizeInBits() < 16); | ||||||
1468 | }, | ||||||
1469 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | ||||||
1470 | .widenScalarIf( | ||||||
1471 | [=](const LegalityQuery &Query) { | ||||||
1472 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||
1473 | return (LitTy.getScalarSizeInBits() < 16); | ||||||
1474 | }, | ||||||
1475 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | ||||||
1476 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||
1477 | .widenScalarToNextPow2(BigTyIdx, 32); | ||||||
1478 | |||||||
1479 | } | ||||||
1480 | |||||||
1481 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||||||
1482 | .legalForCartesianProduct(AllS32Vectors, {S32}) | ||||||
1483 | .legalForCartesianProduct(AllS64Vectors, {S64}) | ||||||
1484 | .clampNumElements(0, V16S32, V32S32) | ||||||
1485 | .clampNumElements(0, V2S64, V16S64) | ||||||
1486 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); | ||||||
1487 | |||||||
1488 | if (ST.hasScalarPackInsts()) { | ||||||
1489 | BuildVector | ||||||
1490 | // FIXME: Should probably widen s1 vectors straight to s32 | ||||||
1491 | .minScalarOrElt(0, S16) | ||||||
1492 | // Widen source elements and produce a G_BUILD_VECTOR_TRUNC | ||||||
1493 | .minScalar(1, S32); | ||||||
1494 | |||||||
1495 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||
1496 | .legalFor({V2S16, S32}) | ||||||
1497 | .lower(); | ||||||
1498 | BuildVector.minScalarOrElt(0, S32); | ||||||
1499 | } else { | ||||||
1500 | BuildVector.customFor({V2S16, S16}); | ||||||
1501 | BuildVector.minScalarOrElt(0, S32); | ||||||
1502 | |||||||
1503 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||
1504 | .customFor({V2S16, S32}) | ||||||
1505 | .lower(); | ||||||
1506 | } | ||||||
1507 | |||||||
1508 | BuildVector.legalIf(isRegisterType(0)); | ||||||
1509 | |||||||
1510 | // FIXME: Clamp maximum size | ||||||
1511 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | ||||||
1512 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||
1513 | .clampMaxNumElements(0, S32, 32) | ||||||
1514 | .clampMaxNumElements(1, S16, 2) // TODO: Make 4? | ||||||
1515 | .clampMaxNumElements(0, S16, 64); | ||||||
1516 | |||||||
1517 | // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse | ||||||
1518 | // pre-legalize. | ||||||
1519 | if (ST.hasVOP3PInsts()) { | ||||||
1520 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) | ||||||
1521 | .customFor({V2S16, V2S16}) | ||||||
1522 | .lower(); | ||||||
1523 | } else | ||||||
1524 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | ||||||
1525 | |||||||
1526 | // Merge/Unmerge | ||||||
1527 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | ||||||
1528 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | ||||||
1529 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | ||||||
1530 | |||||||
1531 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | ||||||
1532 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
1533 | if (Ty.isVector()) { | ||||||
1534 | const LLT &EltTy = Ty.getElementType(); | ||||||
1535 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) | ||||||
1536 | return true; | ||||||
1537 | if (!isPowerOf2_32(EltTy.getSizeInBits())) | ||||||
1538 | return true; | ||||||
1539 | } | ||||||
1540 | return false; | ||||||
1541 | }; | ||||||
1542 | |||||||
1543 | auto &Builder = getActionDefinitionsBuilder(Op) | ||||||
1544 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||
1545 | .lowerFor({{S16, V2S16}}) | ||||||
1546 | .lowerIf([=](const LegalityQuery &Query) { | ||||||
1547 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
1548 | return BigTy.getSizeInBits() == 32; | ||||||
1549 | }) | ||||||
1550 | // Try to widen to s16 first for small types. | ||||||
1551 | // TODO: Only do this on targets with legal s16 shifts | ||||||
1552 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) | ||||||
1553 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | ||||||
1554 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||
1555 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | ||||||
1556 | elementTypeIs(1, S16)), | ||||||
1557 | changeTo(1, V2S16)) | ||||||
1558 | // Clamp the little scalar to s8-s256 and make it a power of 2. It's not | ||||||
1559 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | ||||||
1560 | // valid. | ||||||
1561 | .clampScalar(LitTyIdx, S32, S512) | ||||||
1562 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | ||||||
1563 | // Break up vectors with weird elements into scalars | ||||||
1564 | .fewerElementsIf( | ||||||
1565 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, | ||||||
1566 | scalarize(0)) | ||||||
1567 | .fewerElementsIf( | ||||||
1568 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, | ||||||
1569 | scalarize(1)) | ||||||
1570 | .clampScalar(BigTyIdx, S32, MaxScalar); | ||||||
1571 | |||||||
1572 | if (Op == G_MERGE_VALUES) { | ||||||
1573 | Builder.widenScalarIf( | ||||||
1574 | // TODO: Use 16-bit shifts if legal for 8-bit values? | ||||||
1575 | [=](const LegalityQuery &Query) { | ||||||
1576 | const LLT Ty = Query.Types[LitTyIdx]; | ||||||
1577 | return Ty.getSizeInBits() < 32; | ||||||
1578 | }, | ||||||
1579 | changeTo(LitTyIdx, S32)); | ||||||
1580 | } | ||||||
1581 | |||||||
1582 | Builder.widenScalarIf( | ||||||
1583 | [=](const LegalityQuery &Query) { | ||||||
1584 | const LLT Ty = Query.Types[BigTyIdx]; | ||||||
1585 | return !isPowerOf2_32(Ty.getSizeInBits()) && | ||||||
1586 | Ty.getSizeInBits() % 16 != 0; | ||||||
1587 | }, | ||||||
1588 | [=](const LegalityQuery &Query) { | ||||||
1589 | // Pick the next power of 2, or a multiple of 64 over 128. | ||||||
1590 | // Whichever is smaller. | ||||||
1591 | const LLT &Ty = Query.Types[BigTyIdx]; | ||||||
1592 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | ||||||
1593 | if (NewSizeInBits >= 256) { | ||||||
1594 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | ||||||
1595 | if (RoundedTo < NewSizeInBits) | ||||||
1596 | NewSizeInBits = RoundedTo; | ||||||
1597 | } | ||||||
1598 | return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | ||||||
1599 | }) | ||||||
1600 | // Any vectors left are the wrong size. Scalarize them. | ||||||
1601 | .scalarize(0) | ||||||
1602 | .scalarize(1); | ||||||
1603 | } | ||||||
1604 | |||||||
1605 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||||
1606 | // RegBankSelect. | ||||||
1607 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | ||||||
1608 | .legalFor({{S32}, {S64}}); | ||||||
1609 | |||||||
1610 | if (ST.hasVOP3PInsts()) { | ||||||
1611 | SextInReg.lowerFor({{V2S16}}) | ||||||
1612 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | ||||||
1613 | // get more vector shift opportunities, since we'll get those when | ||||||
1614 | // expanded. | ||||||
1615 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); | ||||||
1616 | } else if (ST.has16BitInsts()) { | ||||||
1617 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | ||||||
1618 | } else { | ||||||
1619 | // Prefer to promote to s32 before lowering if we don't have 16-bit | ||||||
1620 | // shifts. This avoid a lot of intermediate truncate and extend operations. | ||||||
1621 | SextInReg.lowerFor({{S32}, {S64}}); | ||||||
1622 | } | ||||||
1623 | |||||||
1624 | SextInReg | ||||||
1625 | .scalarize(0) | ||||||
1626 | .clampScalar(0, S32, S64) | ||||||
1627 | .lower(); | ||||||
1628 | |||||||
1629 | // TODO: Only Try to form v2s16 with legal packed instructions. | ||||||
1630 | getActionDefinitionsBuilder(G_FSHR) | ||||||
1631 | .legalFor({{S32, S32}}) | ||||||
1632 | .lowerFor({{V2S16, V2S16}}) | ||||||
1633 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) | ||||||
1634 | .scalarize(0) | ||||||
1635 | .lower(); | ||||||
1636 | |||||||
1637 | if (ST.hasVOP3PInsts()) { | ||||||
1638 | getActionDefinitionsBuilder(G_FSHL) | ||||||
1639 | .lowerFor({{V2S16, V2S16}}) | ||||||
1640 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) | ||||||
1641 | .scalarize(0) | ||||||
1642 | .lower(); | ||||||
1643 | } else { | ||||||
1644 | getActionDefinitionsBuilder(G_FSHL) | ||||||
1645 | .scalarize(0) | ||||||
1646 | .lower(); | ||||||
1647 | } | ||||||
1648 | |||||||
1649 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | ||||||
1650 | .legalFor({S64}); | ||||||
1651 | |||||||
1652 | getActionDefinitionsBuilder(G_FENCE) | ||||||
1653 | .alwaysLegal(); | ||||||
1654 | |||||||
1655 | getActionDefinitionsBuilder({G_SMULO, G_UMULO}) | ||||||
1656 | .scalarize(0) | ||||||
1657 | .minScalar(0, S32) | ||||||
1658 | .lower(); | ||||||
1659 | |||||||
1660 | getActionDefinitionsBuilder({G_SBFX, G_UBFX}) | ||||||
1661 | .legalFor({{S32, S32}, {S64, S32}}) | ||||||
1662 | .clampScalar(1, S32, S32) | ||||||
1663 | .clampScalar(0, S32, S64) | ||||||
1664 | .widenScalarToNextPow2(0) | ||||||
1665 | .scalarize(0); | ||||||
1666 | |||||||
1667 | getActionDefinitionsBuilder({ | ||||||
1668 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | ||||||
1669 | G_FCOPYSIGN, | ||||||
1670 | |||||||
1671 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | ||||||
1672 | G_ATOMICRMW_NAND, | ||||||
1673 | G_ATOMICRMW_FSUB, | ||||||
1674 | G_READ_REGISTER, | ||||||
1675 | G_WRITE_REGISTER, | ||||||
1676 | |||||||
1677 | G_SADDO, G_SSUBO, | ||||||
1678 | |||||||
1679 | // TODO: Implement | ||||||
1680 | G_FMINIMUM, G_FMAXIMUM}).lower(); | ||||||
1681 | |||||||
1682 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | ||||||
1683 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | ||||||
1684 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | ||||||
1685 | .unsupported(); | ||||||
1686 | |||||||
1687 | getLegacyLegalizerInfo().computeTables(); | ||||||
1688 | verify(*ST.getInstrInfo()); | ||||||
1689 | } | ||||||
1690 | |||||||
/// Dispatch entry point for all opcodes marked Custom in the legalizer rules
/// built by the constructor. Each case delegates to a dedicated expansion
/// helper; returning false means the opcode has no custom handling here and
/// legalization of it fails.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // NOTE: each helper is responsible for replacing MI and erasing it from its
  // parent block (or mutating it in place, as legalizeAddrSpaceCast does for
  // no-op casts).
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed and unsigned int<->fp conversions share one helper, distinguished
  // by the trailing bool.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log/log10 are lowered as scaled log2; the constant is the scale factor.
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return")::llvm::llvm_unreachable_internal("expected switch to return" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1768);
}
1770 | |||||||
1771 | Register AMDGPULegalizerInfo::getSegmentAperture( | ||||||
1772 | unsigned AS, | ||||||
1773 | MachineRegisterInfo &MRI, | ||||||
1774 | MachineIRBuilder &B) const { | ||||||
1775 | MachineFunction &MF = B.getMF(); | ||||||
1776 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||||
1777 | const LLT S32 = LLT::scalar(32); | ||||||
1778 | |||||||
1779 | assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)(static_cast <bool> (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) ? void (0) : __assert_fail ("AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1779, __extension__ __PRETTY_FUNCTION__)); | ||||||
1780 | |||||||
1781 | if (ST.hasApertureRegs()) { | ||||||
1782 | // FIXME: Use inline constants (src_{shared, private}_base) instead of | ||||||
1783 | // getreg. | ||||||
1784 | unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1785 | AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : | ||||||
1786 | AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; | ||||||
1787 | unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1788 | AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : | ||||||
1789 | AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; | ||||||
1790 | unsigned Encoding = | ||||||
1791 | AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | | ||||||
1792 | Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | | ||||||
1793 | WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; | ||||||
1794 | |||||||
1795 | Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | ||||||
1796 | |||||||
1797 | B.buildInstr(AMDGPU::S_GETREG_B32) | ||||||
1798 | .addDef(GetReg) | ||||||
1799 | .addImm(Encoding); | ||||||
1800 | MRI.setType(GetReg, S32); | ||||||
1801 | |||||||
1802 | auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); | ||||||
1803 | return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); | ||||||
1804 | } | ||||||
1805 | |||||||
1806 | Register QueuePtr = MRI.createGenericVirtualRegister( | ||||||
1807 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||
1808 | |||||||
1809 | if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||||
1810 | return Register(); | ||||||
1811 | |||||||
1812 | // Offset into amd_queue_t for group_segment_aperture_base_hi / | ||||||
1813 | // private_segment_aperture_base_hi. | ||||||
1814 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; | ||||||
1815 | |||||||
1816 | // TODO: can we be smarter about machine pointer info? | ||||||
1817 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||||
1818 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
1819 | PtrInfo, | ||||||
1820 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
1821 | MachineMemOperand::MOInvariant, | ||||||
1822 | LLT::scalar(32), commonAlignment(Align(64), StructOffset)); | ||||||
1823 | |||||||
1824 | Register LoadAddr; | ||||||
1825 | |||||||
1826 | B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); | ||||||
1827 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | ||||||
1828 | } | ||||||
1829 | |||||||
/// Custom-legalize G_ADDRSPACE_CAST. Handles no-op casts, 32-bit constant
/// address truncation/extension, and flat<->segment conversions (which need a
/// null-pointer check and the segment aperture).
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector())(static_cast <bool> (!DstTy.isVector()) ? void (0) : __assert_fail ("!DstTy.isVector()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1846, __extension__ __PRETTY_FUNCTION__));

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts that do not change the bit pattern become plain bitcasts; MI is
  // mutated in place rather than replaced.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate. The 32-bit constant address space is the low half of the
    // 64-bit constant address space.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||(static_cast <bool> (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS) ? void (0) : __assert_fail ("DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1879, __extension__ __PRETTY_FUNCTION__))
           DestAS == AMDGPUAS::PRIVATE_ADDRESS)(static_cast <bool> (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS) ? void (0) : __assert_fail ("DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1879, __extension__ __PRETTY_FUNCTION__));
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A flat null pointer must map to the segment's null value, which is not
    // bit pattern 0; select between the truncated pointer and segment null.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  // Segment -> flat: splice the aperture in as the high half, mapping segment
  // null to flat null.
  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1925 | |||||||
1926 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||||
1927 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1928 | MachineIRBuilder &B) const { | ||||||
1929 | Register Src = MI.getOperand(1).getReg(); | ||||||
1930 | LLT Ty = MRI.getType(Src); | ||||||
1931 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64)(static_cast <bool> (Ty.isScalar() && Ty.getSizeInBits () == 64) ? void (0) : __assert_fail ("Ty.isScalar() && Ty.getSizeInBits() == 64" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1931, __extension__ __PRETTY_FUNCTION__)); | ||||||
1932 | |||||||
1933 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||||
1934 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||||
1935 | |||||||
1936 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||||
1937 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||||
1938 | |||||||
1939 | // TODO: Should this propagate fast-math-flags? | ||||||
1940 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||||
1941 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||||
1942 | |||||||
1943 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||||
1944 | auto Fabs = B.buildFAbs(Ty, Src); | ||||||
1945 | |||||||
1946 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||||
1947 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||||
1948 | MI.eraseFromParent(); | ||||||
1949 | return true; | ||||||
1950 | } | ||||||
1951 | |||||||
1952 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||||
1953 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1954 | MachineIRBuilder &B) const { | ||||||
1955 | |||||||
1956 | const LLT S1 = LLT::scalar(1); | ||||||
1957 | const LLT S64 = LLT::scalar(64); | ||||||
1958 | |||||||
1959 | Register Src = MI.getOperand(1).getReg(); | ||||||
1960 | assert(MRI.getType(Src) == S64)(static_cast <bool> (MRI.getType(Src) == S64) ? void (0 ) : __assert_fail ("MRI.getType(Src) == S64", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 1960, __extension__ __PRETTY_FUNCTION__)); | ||||||
1961 | |||||||
1962 | // result = trunc(src) | ||||||
1963 | // if (src > 0.0 && src != result) | ||||||
1964 | // result += 1.0 | ||||||
1965 | |||||||
1966 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | ||||||
1967 | |||||||
1968 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||||
1969 | const auto One = B.buildFConstant(S64, 1.0); | ||||||
1970 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | ||||||
1971 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | ||||||
1972 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | ||||||
1973 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||||
1974 | |||||||
1975 | // TODO: Should this propagate fast-math-flags? | ||||||
1976 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||||
1977 | return true; | ||||||
1978 | } | ||||||
1979 | |||||||
1980 | bool AMDGPULegalizerInfo::legalizeFrem( | ||||||
1981 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1982 | MachineIRBuilder &B) const { | ||||||
1983 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
1984 | Register Src0Reg = MI.getOperand(1).getReg(); | ||||||
1985 | Register Src1Reg = MI.getOperand(2).getReg(); | ||||||
1986 | auto Flags = MI.getFlags(); | ||||||
1987 | LLT Ty = MRI.getType(DstReg); | ||||||
1988 | |||||||
1989 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); | ||||||
1990 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); | ||||||
1991 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); | ||||||
1992 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); | ||||||
1993 | MI.eraseFromParent(); | ||||||
1994 | return true; | ||||||
1995 | } | ||||||
1996 | |||||||
1997 | static MachineInstrBuilder extractF64Exponent(Register Hi, | ||||||
1998 | MachineIRBuilder &B) { | ||||||
1999 | const unsigned FractBits = 52; | ||||||
2000 | const unsigned ExpBits = 11; | ||||||
2001 | LLT S32 = LLT::scalar(32); | ||||||
2002 | |||||||
2003 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||||
2004 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||||
2005 | |||||||
2006 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | ||||||
2007 | .addUse(Hi) | ||||||
2008 | .addUse(Const0.getReg(0)) | ||||||
2009 | .addUse(Const1.getReg(0)); | ||||||
2010 | |||||||
2011 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||||
2012 | } | ||||||
2013 | |||||||
/// Expand 64-bit G_INTRINSIC_TRUNC by masking mantissa bits according to the
/// unbiased exponent of the input:
///   exp < 0  -> |value| < 1, so the result is +/-0 (sign bit only)
///   exp > 51 -> the value is already integral; return the source
///   else     -> clear the low (52 - exp) fractional mantissa bits
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64)(static_cast <bool> (MRI.getType(Src) == S64) ? void (0 ) : __assert_fail ("MRI.getType(Src) == S64", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2022, __extension__ __PRETTY_FUNCTION__));

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1)1U << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  // Mask covering all 52 fractional (mantissa) bits of an f64.
  const auto FractMask = B.buildConstant(S64, (UINT64_C(1)1UL << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Arithmetic-shift the mantissa mask right by the exponent: the surviving
  // low bits are the fractional bits that must be cleared for this exponent.
  // (For negative exponents the sign-extension makes the mask all-ones, but
  // that case is overridden by the ExpLt0 select below.)
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  // exp < 0: result is signed zero; exp > 51: already integral, keep Src.
  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
2058 | |||||||
2059 | bool AMDGPULegalizerInfo::legalizeITOFP( | ||||||
2060 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2061 | MachineIRBuilder &B, bool Signed) const { | ||||||
2062 | |||||||
2063 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2064 | Register Src = MI.getOperand(1).getReg(); | ||||||
2065 | |||||||
2066 | const LLT S64 = LLT::scalar(64); | ||||||
2067 | const LLT S32 = LLT::scalar(32); | ||||||
2068 | |||||||
2069 | assert(MRI.getType(Src) == S64)(static_cast <bool> (MRI.getType(Src) == S64) ? void (0 ) : __assert_fail ("MRI.getType(Src) == S64", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2069, __extension__ __PRETTY_FUNCTION__)); | ||||||
2070 | |||||||
2071 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||
2072 | auto ThirtyTwo = B.buildConstant(S32, 32); | ||||||
2073 | |||||||
2074 | if (MRI.getType(Dst) == S64) { | ||||||
2075 | auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) | ||||||
2076 | : B.buildUITOFP(S64, Unmerge.getReg(1)); | ||||||
2077 | |||||||
2078 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | ||||||
2079 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | ||||||
2080 | .addUse(CvtHi.getReg(0)) | ||||||
2081 | .addUse(ThirtyTwo.getReg(0)); | ||||||
2082 | |||||||
2083 | // TODO: Should this propagate fast-math-flags? | ||||||
2084 | B.buildFAdd(Dst, LdExp, CvtLo); | ||||||
2085 | MI.eraseFromParent(); | ||||||
2086 | return true; | ||||||
2087 | } | ||||||
2088 | |||||||
2089 | assert(MRI.getType(Dst) == S32)(static_cast <bool> (MRI.getType(Dst) == S32) ? void (0 ) : __assert_fail ("MRI.getType(Dst) == S32", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2089, __extension__ __PRETTY_FUNCTION__)); | ||||||
2090 | |||||||
2091 | auto One = B.buildConstant(S32, 1); | ||||||
2092 | |||||||
2093 | MachineInstrBuilder ShAmt; | ||||||
2094 | if (Signed) { | ||||||
2095 | auto ThirtyOne = B.buildConstant(S32, 31); | ||||||
2096 | auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); | ||||||
2097 | auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); | ||||||
2098 | auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); | ||||||
2099 | auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, | ||||||
2100 | /*HasSideEffects=*/false) | ||||||
2101 | .addUse(Unmerge.getReg(1)); | ||||||
2102 | auto LS2 = B.buildSub(S32, LS, One); | ||||||
2103 | ShAmt = B.buildUMin(S32, LS2, MaxShAmt); | ||||||
2104 | } else | ||||||
2105 | ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); | ||||||
2106 | auto Norm = B.buildShl(S64, Src, ShAmt); | ||||||
2107 | auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); | ||||||
2108 | auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); | ||||||
2109 | auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); | ||||||
2110 | auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); | ||||||
2111 | auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); | ||||||
2112 | B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, | ||||||
2113 | /*HasSideEffects=*/false) | ||||||
2114 | .addUse(FVal.getReg(0)) | ||||||
2115 | .addUse(Scale.getReg(0)); | ||||||
2116 | MI.eraseFromParent(); | ||||||
2117 | return true; | ||||||
2118 | } | ||||||
2119 | |||||||
2120 | // TODO: Copied from DAG implementation. Verify logic and document how this | ||||||
2121 | // actually works. | ||||||
2122 | bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, | ||||||
2123 | MachineRegisterInfo &MRI, | ||||||
2124 | MachineIRBuilder &B, | ||||||
2125 | bool Signed) const { | ||||||
2126 | |||||||
2127 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2128 | Register Src = MI.getOperand(1).getReg(); | ||||||
2129 | |||||||
2130 | const LLT S64 = LLT::scalar(64); | ||||||
2131 | const LLT S32 = LLT::scalar(32); | ||||||
2132 | |||||||
2133 | const LLT SrcLT = MRI.getType(Src); | ||||||
2134 | assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64)(static_cast <bool> ((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64) ? void (0) : __assert_fail ("(SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2134, __extension__ __PRETTY_FUNCTION__)); | ||||||
2135 | |||||||
2136 | unsigned Flags = MI.getFlags(); | ||||||
2137 | |||||||
2138 | // The basic idea of converting a floating point number into a pair of 32-bit | ||||||
2139 | // integers is illustrated as follows: | ||||||
2140 | // | ||||||
2141 | // tf := trunc(val); | ||||||
2142 | // hif := floor(tf * 2^-32); | ||||||
2143 | // lof := tf - hif * 2^32; // lof is always positive due to floor. | ||||||
2144 | // hi := fptoi(hif); | ||||||
2145 | // lo := fptoi(lof); | ||||||
2146 | // | ||||||
2147 | auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); | ||||||
2148 | MachineInstrBuilder Sign; | ||||||
2149 | if (Signed && SrcLT == S32) { | ||||||
2150 | // However, a 32-bit floating point number has only 23 bits mantissa and | ||||||
2151 | // it's not enough to hold all the significant bits of `lof` if val is | ||||||
2152 | // negative. To avoid the loss of precision, We need to take the absolute | ||||||
2153 | // value after truncating and flip the result back based on the original | ||||||
2154 | // signedness. | ||||||
2155 | Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); | ||||||
2156 | Trunc = B.buildFAbs(S32, Trunc, Flags); | ||||||
2157 | } | ||||||
2158 | MachineInstrBuilder K0, K1; | ||||||
2159 | if (SrcLT == S64) { | ||||||
2160 | K0 = B.buildFConstant(S64, | ||||||
2161 | BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)0x3df0000000000000UL)); | ||||||
2162 | K1 = B.buildFConstant(S64, | ||||||
2163 | BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)0xc1f0000000000000UL)); | ||||||
2164 | } else { | ||||||
2165 | K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)0x2f800000U)); | ||||||
2166 | K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)0xcf800000U)); | ||||||
2167 | } | ||||||
2168 | |||||||
2169 | auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); | ||||||
2170 | auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); | ||||||
2171 | auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); | ||||||
2172 | |||||||
2173 | auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) | ||||||
2174 | : B.buildFPTOUI(S32, FloorMul); | ||||||
2175 | auto Lo = B.buildFPTOUI(S32, Fma); | ||||||
2176 | |||||||
2177 | if (Signed && SrcLT == S32) { | ||||||
2178 | // Flip the result based on the signedness, which is either all 0s or 1s. | ||||||
2179 | Sign = B.buildMerge(S64, {Sign, Sign}); | ||||||
2180 | // r := xor({lo, hi}, sign) - sign; | ||||||
2181 | B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); | ||||||
2182 | } else | ||||||
2183 | B.buildMerge(Dst, {Lo, Hi}); | ||||||
2184 | MI.eraseFromParent(); | ||||||
2185 | |||||||
2186 | return true; | ||||||
2187 | } | ||||||
2188 | |||||||
2189 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, | ||||||
2190 | MachineInstr &MI) const { | ||||||
2191 | MachineFunction &MF = Helper.MIRBuilder.getMF(); | ||||||
2192 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2193 | |||||||
2194 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||||
2195 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||||
2196 | |||||||
2197 | // With ieee_mode disabled, the instructions have the correct behavior | ||||||
2198 | // already for G_FMINNUM/G_FMAXNUM | ||||||
2199 | if (!MFI->getMode().IEEE) | ||||||
2200 | return !IsIEEEOp; | ||||||
2201 | |||||||
2202 | if (IsIEEEOp) | ||||||
2203 | return true; | ||||||
2204 | |||||||
2205 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||||
2206 | } | ||||||
2207 | |||||||
2208 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||||
2209 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2210 | MachineIRBuilder &B) const { | ||||||
2211 | // TODO: Should move some of this into LegalizerHelper. | ||||||
2212 | |||||||
2213 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
2214 | |||||||
2215 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||
2216 | // constant before this, so we shouldn't need | ||||||
2217 | // getConstantVRegValWithLookThrough. | ||||||
2218 | Optional<ValueAndVReg> MaybeIdxVal = | ||||||
2219 | getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | ||||||
2220 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||||
2221 | return true; | ||||||
2222 | const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | ||||||
2223 | |||||||
2224 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2225 | Register Vec = MI.getOperand(1).getReg(); | ||||||
2226 | |||||||
2227 | LLT VecTy = MRI.getType(Vec); | ||||||
2228 | LLT EltTy = VecTy.getElementType(); | ||||||
2229 | assert(EltTy == MRI.getType(Dst))(static_cast <bool> (EltTy == MRI.getType(Dst)) ? void ( 0) : __assert_fail ("EltTy == MRI.getType(Dst)", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2229, __extension__ __PRETTY_FUNCTION__)); | ||||||
2230 | |||||||
2231 | if (IdxVal < VecTy.getNumElements()) | ||||||
2232 | B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); | ||||||
2233 | else | ||||||
2234 | B.buildUndef(Dst); | ||||||
2235 | |||||||
2236 | MI.eraseFromParent(); | ||||||
2237 | return true; | ||||||
2238 | } | ||||||
2239 | |||||||
2240 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||||
2241 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2242 | MachineIRBuilder &B) const { | ||||||
2243 | // TODO: Should move some of this into LegalizerHelper. | ||||||
2244 | |||||||
2245 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
2246 | |||||||
2247 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||
2248 | // constant before this, so we shouldn't need | ||||||
2249 | // getConstantVRegValWithLookThrough. | ||||||
2250 | Optional<ValueAndVReg> MaybeIdxVal = | ||||||
2251 | getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); | ||||||
2252 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||||
2253 | return true; | ||||||
2254 | |||||||
2255 | int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | ||||||
2256 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2257 | Register Vec = MI.getOperand(1).getReg(); | ||||||
2258 | Register Ins = MI.getOperand(2).getReg(); | ||||||
2259 | |||||||
2260 | LLT VecTy = MRI.getType(Vec); | ||||||
2261 | LLT EltTy = VecTy.getElementType(); | ||||||
2262 | assert(EltTy == MRI.getType(Ins))(static_cast <bool> (EltTy == MRI.getType(Ins)) ? void ( 0) : __assert_fail ("EltTy == MRI.getType(Ins)", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2262, __extension__ __PRETTY_FUNCTION__)); | ||||||
2263 | |||||||
2264 | if (IdxVal < VecTy.getNumElements()) | ||||||
2265 | B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); | ||||||
2266 | else | ||||||
2267 | B.buildUndef(Dst); | ||||||
2268 | |||||||
2269 | MI.eraseFromParent(); | ||||||
2270 | return true; | ||||||
2271 | } | ||||||
2272 | |||||||
2273 | bool AMDGPULegalizerInfo::legalizeShuffleVector( | ||||||
2274 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2275 | MachineIRBuilder &B) const { | ||||||
2276 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||||
2277 | |||||||
2278 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2279 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2280 | LLT DstTy = MRI.getType(Dst); | ||||||
2281 | LLT SrcTy = MRI.getType(Src0); | ||||||
2282 | |||||||
2283 | if (SrcTy == V2S16 && DstTy == V2S16 && | ||||||
2284 | AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) | ||||||
2285 | return true; | ||||||
2286 | |||||||
2287 | MachineIRBuilder HelperBuilder(MI); | ||||||
2288 | GISelObserverWrapper DummyObserver; | ||||||
2289 | LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); | ||||||
2290 | return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; | ||||||
2291 | } | ||||||
2292 | |||||||
2293 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||||
2294 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2295 | MachineIRBuilder &B) const { | ||||||
2296 | |||||||
2297 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2298 | Register SrcReg = MI.getOperand(1).getReg(); | ||||||
2299 | LLT Ty = MRI.getType(DstReg); | ||||||
2300 | unsigned Flags = MI.getFlags(); | ||||||
2301 | |||||||
2302 | Register TrigVal; | ||||||
2303 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); | ||||||
2304 | if (ST.hasTrigReducedRange()) { | ||||||
2305 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||||
2306 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||||
2307 | .addUse(MulVal.getReg(0)) | ||||||
2308 | .setMIFlags(Flags).getReg(0); | ||||||
2309 | } else | ||||||
2310 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||||
2311 | |||||||
2312 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||||
2313 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||||
2314 | B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) | ||||||
2315 | .addUse(TrigVal) | ||||||
2316 | .setMIFlags(Flags); | ||||||
2317 | MI.eraseFromParent(); | ||||||
2318 | return true; | ||||||
2319 | } | ||||||
2320 | |||||||
2321 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, | ||||||
2322 | MachineIRBuilder &B, | ||||||
2323 | const GlobalValue *GV, | ||||||
2324 | int64_t Offset, | ||||||
2325 | unsigned GAFlags) const { | ||||||
2326 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!")(static_cast <bool> (isInt<32>(Offset + 4) && "32-bit offset is expected!") ? void (0) : __assert_fail ("isInt<32>(Offset + 4) && \"32-bit offset is expected!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2326, __extension__ __PRETTY_FUNCTION__)); | ||||||
2327 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | ||||||
2328 | // to the following code sequence: | ||||||
2329 | // | ||||||
2330 | // For constant address space: | ||||||
2331 | // s_getpc_b64 s[0:1] | ||||||
2332 | // s_add_u32 s0, s0, $symbol | ||||||
2333 | // s_addc_u32 s1, s1, 0 | ||||||
2334 | // | ||||||
2335 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
2336 | // a fixup or relocation is emitted to replace $symbol with a literal | ||||||
2337 | // constant, which is a pc-relative offset from the encoding of the $symbol | ||||||
2338 | // operand to the global variable. | ||||||
2339 | // | ||||||
2340 | // For global address space: | ||||||
2341 | // s_getpc_b64 s[0:1] | ||||||
2342 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | ||||||
2343 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | ||||||
2344 | // | ||||||
2345 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
2346 | // fixups or relocations are emitted to replace $symbol@*@lo and | ||||||
2347 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | ||||||
2348 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | ||||||
2349 | // operand to the global variable. | ||||||
2350 | // | ||||||
2351 | // What we want here is an offset from the value returned by s_getpc | ||||||
2352 | // (which is the address of the s_add_u32 instruction) to the global | ||||||
2353 | // variable, but since the encoding of $symbol starts 4 bytes after the start | ||||||
2354 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | ||||||
2355 | // small. This requires us to add 4 to the global variable offset in order to | ||||||
2356 | // compute the correct address. Similarly for the s_addc_u32 instruction, the | ||||||
2357 | // encoding of $symbol starts 12 bytes after the start of the s_add_u32 | ||||||
2358 | // instruction. | ||||||
2359 | |||||||
2360 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
2361 | |||||||
2362 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | ||||||
2363 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | ||||||
2364 | |||||||
2365 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | ||||||
2366 | .addDef(PCReg); | ||||||
2367 | |||||||
2368 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | ||||||
2369 | if (GAFlags == SIInstrInfo::MO_NONE) | ||||||
2370 | MIB.addImm(0); | ||||||
2371 | else | ||||||
2372 | MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); | ||||||
2373 | |||||||
2374 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||||
2375 | |||||||
2376 | if (PtrTy.getSizeInBits() == 32) | ||||||
2377 | B.buildExtract(DstReg, PCReg, 0); | ||||||
2378 | return true; | ||||||
2379 | } | ||||||
2380 | |||||||
// Custom legalization for G_GLOBAL_VALUE. LDS (local/region address space)
// globals are resolved to their allocated offsets, or diagnosed when they
// cannot be supported; all other globals are materialized either with a
// PC-relative address sequence or by loading their address from the GOT.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS globals referenced outside a module entry function (and not the
    // special module-LDS aggregate) cannot be allocated here.
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        // Mark the operand for absolute-low relocation and keep the
        // instruction; it will be handled later.
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory which size is not known at the compile time. They will be
        // allocated by the runtime and placed directly after the static
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          // The dynamic array starts at the end of the statically allocated
          // LDS, i.e. at groupstaticsize.
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      // Statically allocate the LDS global and fold its offset into a
      // constant.
      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    // LDS globals with a defined initializer are unsupported.
    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Prefer a direct PC-relative address when the target allows it.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise, load the 64-bit address from the GOT entry addressed
  // PC-relatively.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2489 | |||||||
2490 | static LLT widenToNextPowerOf2(LLT Ty) { | ||||||
2491 | if (Ty.isVector()) | ||||||
2492 | return Ty.changeElementCount( | ||||||
2493 | ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); | ||||||
2494 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | ||||||
2495 | } | ||||||
2496 | |||||||
// Custom legalization for loads:
//  * casts 32-bit constant-address-space pointers to the 64-bit constant
//    address space, and
//  * widens non-power-of-2 G_LOADs up to the next power of two when
//    shouldWidenLoad allows it, truncating/extracting the extra bits away.
// Returns true if the instruction was changed or erased here.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Rewrite the pointer operand as a 64-bit constant pointer, bracketing
    // the in-place change with observer notifications.
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  // Only plain G_LOAD is widened below; extending loads are left alone.
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar case: load the wide value and truncate down to the requested
      // width.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        // Move the insert point past MI so widenWithUnmerge's artifacts are
        // emitted after it, then restore it to emit the wide load in MI's
        // place.
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2580 | |||||||
2581 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||||
2582 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2583 | MachineIRBuilder &B) const { | ||||||
2584 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
2585 | assert(Ty.isScalar())(static_cast <bool> (Ty.isScalar()) ? void (0) : __assert_fail ("Ty.isScalar()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2585, __extension__ __PRETTY_FUNCTION__)); | ||||||
2586 | |||||||
2587 | MachineFunction &MF = B.getMF(); | ||||||
2588 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2589 | |||||||
2590 | // TODO: Always legal with future ftz flag. | ||||||
2591 | // FIXME: Do we need just output? | ||||||
2592 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||||
2593 | return true; | ||||||
2594 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||||
2595 | return true; | ||||||
2596 | |||||||
2597 | MachineIRBuilder HelperBuilder(MI); | ||||||
2598 | GISelObserverWrapper DummyObserver; | ||||||
2599 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||
2600 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||||
2601 | } | ||||||
2602 | |||||||
2603 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||||
2604 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2605 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2606 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||
2607 | Register CmpVal = MI.getOperand(2).getReg(); | ||||||
2608 | Register NewVal = MI.getOperand(3).getReg(); | ||||||
2609 | |||||||
2610 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&(static_cast <bool> (AMDGPU::isFlatGlobalAddrSpace(MRI. getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && \"this should not have been custom lowered\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2611, __extension__ __PRETTY_FUNCTION__)) | ||||||
2611 | "this should not have been custom lowered")(static_cast <bool> (AMDGPU::isFlatGlobalAddrSpace(MRI. getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered" ) ? void (0) : __assert_fail ("AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && \"this should not have been custom lowered\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2611, __extension__ __PRETTY_FUNCTION__)); | ||||||
2612 | |||||||
2613 | LLT ValTy = MRI.getType(CmpVal); | ||||||
2614 | LLT VecTy = LLT::fixed_vector(2, ValTy); | ||||||
2615 | |||||||
2616 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||||
2617 | |||||||
2618 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||||
2619 | .addDef(DstReg) | ||||||
2620 | .addUse(PtrReg) | ||||||
2621 | .addUse(PackedVal) | ||||||
2622 | .setMemRefs(MI.memoperands()); | ||||||
2623 | |||||||
2624 | MI.eraseFromParent(); | ||||||
2625 | return true; | ||||||
2626 | } | ||||||
2627 | |||||||
2628 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||||
2629 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||||
2630 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2631 | Register Src = MI.getOperand(1).getReg(); | ||||||
2632 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2633 | unsigned Flags = MI.getFlags(); | ||||||
2634 | |||||||
2635 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||||
2636 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||||
2637 | |||||||
2638 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||||
2639 | MI.eraseFromParent(); | ||||||
2640 | return true; | ||||||
2641 | } | ||||||
2642 | |||||||
2643 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||||
2644 | MachineIRBuilder &B) const { | ||||||
2645 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2646 | Register Src = MI.getOperand(1).getReg(); | ||||||
2647 | unsigned Flags = MI.getFlags(); | ||||||
2648 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2649 | |||||||
2650 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||||
2651 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||||
2652 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2653 | MI.eraseFromParent(); | ||||||
2654 | return true; | ||||||
2655 | } | ||||||
2656 | |||||||
2657 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||||
2658 | MachineIRBuilder &B) const { | ||||||
2659 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2660 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2661 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2662 | unsigned Flags = MI.getFlags(); | ||||||
2663 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2664 | const LLT S16 = LLT::scalar(16); | ||||||
2665 | const LLT S32 = LLT::scalar(32); | ||||||
2666 | |||||||
2667 | if (Ty == S32) { | ||||||
2668 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||||
2669 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2670 | .addUse(Log.getReg(0)) | ||||||
2671 | .addUse(Src1) | ||||||
2672 | .setMIFlags(Flags); | ||||||
2673 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2674 | } else if (Ty == S16) { | ||||||
2675 | // There's no f16 fmul_legacy, so we need to convert for it. | ||||||
2676 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||||
2677 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||||
2678 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||||
2679 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2680 | .addUse(Ext0.getReg(0)) | ||||||
2681 | .addUse(Ext1.getReg(0)) | ||||||
2682 | .setMIFlags(Flags); | ||||||
2683 | |||||||
2684 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||||
2685 | } else | ||||||
2686 | return false; | ||||||
2687 | |||||||
2688 | MI.eraseFromParent(); | ||||||
2689 | return true; | ||||||
2690 | } | ||||||
2691 | |||||||
2692 | // Find a source register, ignoring any possible source modifiers. | ||||||
2693 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||||
2694 | Register ModSrc = OrigSrc; | ||||||
2695 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||||
2696 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||||
2697 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2698 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2699 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2700 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2701 | return ModSrc; | ||||||
2702 | } | ||||||
2703 | |||||||
// Custom-lower G_FFLOOR for s64 on subtargets with the V_FRACT bug
// (and no V_FLOOR_F64), emitting floor(x) = x - fract(x).
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
                   .addUse(OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest double strictly below 1.0, used to
  // clamp the buggy V_FRACT result per the workaround above.
  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // NOTE(review): FCMP_ORD x, x is true when x is *not* NaN, yet the select
    // yields ModSrc on true — this reads inverted relative to the
    // "isnan(x) ? x : min(...)" comment above; confirm polarity against the
    // SelectionDAG f64 floor lowering before touching this.
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
2759 | |||||||
2760 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||||
2761 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||||
2762 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||||
2763 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2764 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2765 | const LLT S32 = LLT::scalar(32); | ||||||
2766 | assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16))(static_cast <bool> (MRI.getType(Dst) == LLT::fixed_vector (2, 16)) ? void (0) : __assert_fail ("MRI.getType(Dst) == LLT::fixed_vector(2, 16)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2766, __extension__ __PRETTY_FUNCTION__)); | ||||||
2767 | |||||||
2768 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2769 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2770 | assert(MRI.getType(Src0) == LLT::scalar(16))(static_cast <bool> (MRI.getType(Src0) == LLT::scalar(16 )) ? void (0) : __assert_fail ("MRI.getType(Src0) == LLT::scalar(16)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2770, __extension__ __PRETTY_FUNCTION__)); | ||||||
2771 | |||||||
2772 | auto Merge = B.buildMerge(S32, {Src0, Src1}); | ||||||
2773 | B.buildBitcast(Dst, Merge); | ||||||
2774 | |||||||
2775 | MI.eraseFromParent(); | ||||||
2776 | return true; | ||||||
2777 | } | ||||||
2778 | |||||||
2779 | // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to | ||||||
2780 | // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input | ||||||
2781 | // case with a single min instruction instead of a compare+select. | ||||||
2782 | bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, | ||||||
2783 | MachineRegisterInfo &MRI, | ||||||
2784 | MachineIRBuilder &B) const { | ||||||
2785 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2786 | Register Src = MI.getOperand(1).getReg(); | ||||||
2787 | LLT DstTy = MRI.getType(Dst); | ||||||
2788 | LLT SrcTy = MRI.getType(Src); | ||||||
2789 | |||||||
2790 | unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ | ||||||
2791 | ? AMDGPU::G_AMDGPU_FFBH_U32 | ||||||
2792 | : AMDGPU::G_AMDGPU_FFBL_B32; | ||||||
2793 | auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); | ||||||
2794 | B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); | ||||||
2795 | |||||||
2796 | MI.eraseFromParent(); | ||||||
2797 | return true; | ||||||
2798 | } | ||||||
2799 | |||||||
2800 | // Check that this is a G_XOR x, -1 | ||||||
2801 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | ||||||
2802 | if (MI.getOpcode() != TargetOpcode::G_XOR) | ||||||
2803 | return false; | ||||||
2804 | auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | ||||||
2805 | return ConstVal && *ConstVal == -1; | ||||||
2806 | } | ||||||
2807 | |||||||
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// On success, \p Br is set to the unconditional G_BR following the conditional
// branch (or left untouched in the fallthrough case), \p UncondBrTarget to the
// block reached when the condition is not taken, and \p Negated to whether a
// G_XOR cond, -1 between the intrinsic and the branch was folded away.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  // The intrinsic's result must feed exactly one non-debug user.
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  // Look through a single "not" (G_XOR x, -1) feeding the branch.
  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    UseMI->eraseFromParent();

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  // The (possibly negated) condition must be consumed by a G_BRCOND in the
  // same block as the intrinsic.
  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    // Fallthrough: the "else" destination is the next block in layout order.
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}
2850 | |||||||
2851 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||||
2852 | const ArgDescriptor *Arg, | ||||||
2853 | const TargetRegisterClass *ArgRC, | ||||||
2854 | LLT ArgTy) const { | ||||||
2855 | MCRegister SrcReg = Arg->getRegister(); | ||||||
2856 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected")(static_cast <bool> (Register::isPhysicalRegister(SrcReg ) && "Physical register expected") ? void (0) : __assert_fail ("Register::isPhysicalRegister(SrcReg) && \"Physical register expected\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2856, __extension__ __PRETTY_FUNCTION__)); | ||||||
2857 | assert(DstReg.isVirtual() && "Virtual register expected")(static_cast <bool> (DstReg.isVirtual() && "Virtual register expected" ) ? void (0) : __assert_fail ("DstReg.isVirtual() && \"Virtual register expected\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2857, __extension__ __PRETTY_FUNCTION__)); | ||||||
2858 | |||||||
2859 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, | ||||||
2860 | ArgTy); | ||||||
2861 | if (Arg->isMasked()) { | ||||||
2862 | // TODO: Should we try to emit this once in the entry block? | ||||||
2863 | const LLT S32 = LLT::scalar(32); | ||||||
2864 | const unsigned Mask = Arg->getMask(); | ||||||
2865 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); | ||||||
2866 | |||||||
2867 | Register AndMaskSrc = LiveIn; | ||||||
2868 | |||||||
2869 | if (Shift
| ||||||
2870 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||||
2871 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||||
2872 | } | ||||||
2873 | |||||||
2874 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||||
| |||||||
2875 | } else { | ||||||
2876 | B.buildCopy(DstReg, LiveIn); | ||||||
2877 | } | ||||||
2878 | |||||||
2879 | return true; | ||||||
2880 | } | ||||||
2881 | |||||||
2882 | bool AMDGPULegalizerInfo::loadInputValue( | ||||||
2883 | Register DstReg, MachineIRBuilder &B, | ||||||
2884 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
2885 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2886 | const ArgDescriptor *Arg; | ||||||
2887 | const TargetRegisterClass *ArgRC; | ||||||
2888 | LLT ArgTy; | ||||||
2889 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||||
2890 | |||||||
2891 | if (!Arg) { | ||||||
2892 | assert(ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)(static_cast <bool> (ArgType == AMDGPUFunctionArgInfo:: KERNARG_SEGMENT_PTR) ? void (0) : __assert_fail ("ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 2892, __extension__ __PRETTY_FUNCTION__)); | ||||||
2893 | // The intrinsic may appear when we have a 0 sized kernarg segment, in which | ||||||
2894 | // case the pointer argument may be missing and we use null. | ||||||
2895 | B.buildConstant(DstReg, 0); | ||||||
2896 | return true; | ||||||
2897 | } | ||||||
2898 | |||||||
2899 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||||
2900 | return false; // TODO: Handle these | ||||||
2901 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | ||||||
2902 | } | ||||||
2903 | |||||||
2904 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||||
2905 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||||
2906 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
2907 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | ||||||
2908 | return false; | ||||||
2909 | |||||||
2910 | MI.eraseFromParent(); | ||||||
2911 | return true; | ||||||
2912 | } | ||||||
2913 | |||||||
2914 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||||
2915 | MachineRegisterInfo &MRI, | ||||||
2916 | MachineIRBuilder &B) const { | ||||||
2917 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2918 | LLT DstTy = MRI.getType(Dst); | ||||||
2919 | LLT S16 = LLT::scalar(16); | ||||||
2920 | LLT S32 = LLT::scalar(32); | ||||||
2921 | LLT S64 = LLT::scalar(64); | ||||||
2922 | |||||||
2923 | if (DstTy == S16) | ||||||
2924 | return legalizeFDIV16(MI, MRI, B); | ||||||
2925 | if (DstTy == S32) | ||||||
2926 | return legalizeFDIV32(MI, MRI, B); | ||||||
2927 | if (DstTy == S64) | ||||||
2928 | return legalizeFDIV64(MI, MRI, B); | ||||||
2929 | |||||||
2930 | return false; | ||||||
2931 | } | ||||||
2932 | |||||||
// Expand 32-bit unsigned division/remainder without hardware divide, using a
// float-reciprocal estimate refined by Newton-Raphson plus two conditional
// correction steps. Either destination may be a null Register, in which case
// that result (and its final select) is not emitted.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  // 0x4f7ffffe is just below 2**32 as an f32, scaling the reciprocal into a
  // fixed-point inverse.
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement. The remainder select is emitted
  // unconditionally because the second refinement reads R even in div-only
  // mode.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
2975 | |||||||
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  // Split the 64-bit input into its 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Recombine as a float: hi * 2**32 + lo.
  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // 0x5f7ffffc scales the reciprocal up so the split below yields 32-bit
  // lo/hi fixed-point words.
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
3018 | |||||||
// Expand 64-bit unsigned division/remainder: compute a 64-bit fixed-point
// reciprocal of the denominator (emitReciprocalU64) refined by two
// Newton-Raphson steps carried out on 32-bit halves with explicit
// carry/borrow chains, then apply up to two conditional corrections to the
// quotient/remainder estimate. Either destination may be a null Register.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement step: rcp += rcp * mulhi(rcp, -denom * rcp).
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  // Add1_HiNc is the high half without the carry-in; the second step's carry
  // chain below re-applies the low-half carry.
  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement step, same shape as the first.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate and corresponding remainder: numer - denom * q.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 != 0 <=> remainder estimate >= denom (64-bit compare built from the
  // 32-bit halves; the low compare only matters when the highs are equal).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 <=> even the once-corrected remainder is still >= denom.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
3133 | |||||||
3134 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, | ||||||
3135 | MachineRegisterInfo &MRI, | ||||||
3136 | MachineIRBuilder &B) const { | ||||||
3137 | Register DstDivReg, DstRemReg; | ||||||
3138 | switch (MI.getOpcode()) { | ||||||
3139 | default: | ||||||
3140 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3140); | ||||||
3141 | case AMDGPU::G_UDIV: { | ||||||
3142 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3143 | break; | ||||||
3144 | } | ||||||
3145 | case AMDGPU::G_UREM: { | ||||||
3146 | DstRemReg = MI.getOperand(0).getReg(); | ||||||
3147 | break; | ||||||
3148 | } | ||||||
3149 | case AMDGPU::G_UDIVREM: { | ||||||
3150 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3151 | DstRemReg = MI.getOperand(1).getReg(); | ||||||
3152 | break; | ||||||
3153 | } | ||||||
3154 | } | ||||||
3155 | |||||||
3156 | const LLT S64 = LLT::scalar(64); | ||||||
3157 | const LLT S32 = LLT::scalar(32); | ||||||
3158 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||||
3159 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||||
3160 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||||
3161 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
3162 | |||||||
3163 | if (Ty == S32) | ||||||
3164 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||||
3165 | else if (Ty == S64) | ||||||
3166 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||||
3167 | else | ||||||
3168 | return false; | ||||||
3169 | |||||||
3170 | MI.eraseFromParent(); | ||||||
3171 | return true; | ||||||
3172 | } | ||||||
3173 | |||||||
3174 | bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, | ||||||
3175 | MachineRegisterInfo &MRI, | ||||||
3176 | MachineIRBuilder &B) const { | ||||||
3177 | const LLT S64 = LLT::scalar(64); | ||||||
3178 | const LLT S32 = LLT::scalar(32); | ||||||
3179 | |||||||
3180 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
3181 | if (Ty != S32 && Ty != S64) | ||||||
3182 | return false; | ||||||
3183 | |||||||
3184 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||||
3185 | Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||||
3186 | Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||||
3187 | |||||||
3188 | auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); | ||||||
3189 | auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); | ||||||
3190 | auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); | ||||||
3191 | |||||||
3192 | LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); | ||||||
3193 | RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); | ||||||
3194 | |||||||
3195 | LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); | ||||||
3196 | RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); | ||||||
3197 | |||||||
3198 | Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; | ||||||
3199 | switch (MI.getOpcode()) { | ||||||
3200 | default: | ||||||
3201 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3201); | ||||||
3202 | case AMDGPU::G_SDIV: { | ||||||
3203 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3204 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3205 | break; | ||||||
3206 | } | ||||||
3207 | case AMDGPU::G_SREM: { | ||||||
3208 | DstRemReg = MI.getOperand(0).getReg(); | ||||||
3209 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3210 | break; | ||||||
3211 | } | ||||||
3212 | case AMDGPU::G_SDIVREM: { | ||||||
3213 | DstDivReg = MI.getOperand(0).getReg(); | ||||||
3214 | DstRemReg = MI.getOperand(1).getReg(); | ||||||
3215 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3216 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||||
3217 | break; | ||||||
3218 | } | ||||||
3219 | } | ||||||
3220 | |||||||
3221 | if (Ty == S32) | ||||||
3222 | legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||||
3223 | else | ||||||
3224 | legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||||
3225 | |||||||
3226 | if (DstDivReg) { | ||||||
3227 | auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); | ||||||
3228 | auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); | ||||||
3229 | B.buildSub(DstDivReg, SignXor, Sign); | ||||||
3230 | } | ||||||
3231 | |||||||
3232 | if (DstRemReg) { | ||||||
3233 | auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS | ||||||
3234 | auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); | ||||||
3235 | B.buildSub(DstRemReg, SignXor, Sign); | ||||||
3236 | } | ||||||
3237 | |||||||
3238 | MI.eraseFromParent(); | ||||||
3239 | return true; | ||||||
3240 | } | ||||||
3241 | |||||||
3242 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | ||||||
3243 | MachineRegisterInfo &MRI, | ||||||
3244 | MachineIRBuilder &B) const { | ||||||
3245 | Register Res = MI.getOperand(0).getReg(); | ||||||
3246 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3247 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3248 | uint16_t Flags = MI.getFlags(); | ||||||
3249 | LLT ResTy = MRI.getType(Res); | ||||||
3250 | |||||||
3251 | const MachineFunction &MF = B.getMF(); | ||||||
3252 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||||
3253 | MI.getFlag(MachineInstr::FmAfn); | ||||||
3254 | |||||||
3255 | if (!AllowInaccurateRcp) | ||||||
3256 | return false; | ||||||
3257 | |||||||
3258 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | ||||||
3259 | // 1 / x -> RCP(x) | ||||||
3260 | if (CLHS->isExactlyValue(1.0)) { | ||||||
3261 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
3262 | .addUse(RHS) | ||||||
3263 | .setMIFlags(Flags); | ||||||
3264 | |||||||
3265 | MI.eraseFromParent(); | ||||||
3266 | return true; | ||||||
3267 | } | ||||||
3268 | |||||||
3269 | // -1 / x -> RCP( FNEG(x) ) | ||||||
3270 | if (CLHS->isExactlyValue(-1.0)) { | ||||||
3271 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | ||||||
3272 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
3273 | .addUse(FNeg.getReg(0)) | ||||||
3274 | .setMIFlags(Flags); | ||||||
3275 | |||||||
3276 | MI.eraseFromParent(); | ||||||
3277 | return true; | ||||||
3278 | } | ||||||
3279 | } | ||||||
3280 | |||||||
3281 | // x / y -> x * (1.0 / y) | ||||||
3282 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
3283 | .addUse(RHS) | ||||||
3284 | .setMIFlags(Flags); | ||||||
3285 | B.buildFMul(Res, LHS, RCP, Flags); | ||||||
3286 | |||||||
3287 | MI.eraseFromParent(); | ||||||
3288 | return true; | ||||||
3289 | } | ||||||
3290 | |||||||
3291 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, | ||||||
3292 | MachineRegisterInfo &MRI, | ||||||
3293 | MachineIRBuilder &B) const { | ||||||
3294 | Register Res = MI.getOperand(0).getReg(); | ||||||
3295 | Register X = MI.getOperand(1).getReg(); | ||||||
3296 | Register Y = MI.getOperand(2).getReg(); | ||||||
3297 | uint16_t Flags = MI.getFlags(); | ||||||
3298 | LLT ResTy = MRI.getType(Res); | ||||||
3299 | |||||||
3300 | const MachineFunction &MF = B.getMF(); | ||||||
3301 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||||
3302 | MI.getFlag(MachineInstr::FmAfn); | ||||||
3303 | |||||||
3304 | if (!AllowInaccurateRcp) | ||||||
3305 | return false; | ||||||
3306 | |||||||
3307 | auto NegY = B.buildFNeg(ResTy, Y); | ||||||
3308 | auto One = B.buildFConstant(ResTy, 1.0); | ||||||
3309 | |||||||
3310 | auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
3311 | .addUse(Y) | ||||||
3312 | .setMIFlags(Flags); | ||||||
3313 | |||||||
3314 | auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); | ||||||
3315 | R = B.buildFMA(ResTy, Tmp0, R, R); | ||||||
3316 | |||||||
3317 | auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); | ||||||
3318 | R = B.buildFMA(ResTy, Tmp1, R, R); | ||||||
3319 | |||||||
3320 | auto Ret = B.buildFMul(ResTy, X, R); | ||||||
3321 | auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); | ||||||
3322 | |||||||
3323 | B.buildFMA(Res, Tmp2, R, Ret); | ||||||
3324 | MI.eraseFromParent(); | ||||||
3325 | return true; | ||||||
3326 | } | ||||||
3327 | |||||||
3328 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | ||||||
3329 | MachineRegisterInfo &MRI, | ||||||
3330 | MachineIRBuilder &B) const { | ||||||
3331 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||||
3332 | return true; | ||||||
3333 | |||||||
3334 | Register Res = MI.getOperand(0).getReg(); | ||||||
3335 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3336 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3337 | |||||||
3338 | uint16_t Flags = MI.getFlags(); | ||||||
3339 | |||||||
3340 | LLT S16 = LLT::scalar(16); | ||||||
3341 | LLT S32 = LLT::scalar(32); | ||||||
3342 | |||||||
3343 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | ||||||
3344 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | ||||||
3345 | |||||||
3346 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3347 | .addUse(RHSExt.getReg(0)) | ||||||
3348 | .setMIFlags(Flags); | ||||||
3349 | |||||||
3350 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | ||||||
3351 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | ||||||
3352 | |||||||
3353 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
3354 | .addUse(RDst.getReg(0)) | ||||||
3355 | .addUse(RHS) | ||||||
3356 | .addUse(LHS) | ||||||
3357 | .setMIFlags(Flags); | ||||||
3358 | |||||||
3359 | MI.eraseFromParent(); | ||||||
3360 | return true; | ||||||
3361 | } | ||||||
3362 | |||||||
3363 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | ||||||
3364 | // to enable denorm mode. When 'Enable' is false, disable denorm mode. | ||||||
3365 | static void toggleSPDenormMode(bool Enable, | ||||||
3366 | MachineIRBuilder &B, | ||||||
3367 | const GCNSubtarget &ST, | ||||||
3368 | AMDGPU::SIModeRegisterDefaults Mode) { | ||||||
3369 | // Set SP denorm mode to this value. | ||||||
3370 | unsigned SPDenormMode = | ||||||
3371 | Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue(); | ||||||
3372 | |||||||
3373 | if (ST.hasDenormModeInst()) { | ||||||
3374 | // Preserve default FP64FP16 denorm mode while updating FP32 mode. | ||||||
3375 | uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); | ||||||
3376 | |||||||
3377 | uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | ||||||
3378 | B.buildInstr(AMDGPU::S_DENORM_MODE) | ||||||
3379 | .addImm(NewDenormModeValue); | ||||||
3380 | |||||||
3381 | } else { | ||||||
3382 | // Select FP32 bit field in mode register. | ||||||
3383 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | ||||||
3384 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | ||||||
3385 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | ||||||
3386 | |||||||
3387 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | ||||||
3388 | .addImm(SPDenormMode) | ||||||
3389 | .addImm(SPDenormModeBitField); | ||||||
3390 | } | ||||||
3391 | } | ||||||
3392 | |||||||
// Legalize f32 G_FDIV with the precise sequence: scale the operands with
// div_scale, refine an rcp estimate of the scaled denominator with fused
// multiply-adds (with FP32 denormals forced on around the FMAs), then
// combine with div_fmas and div_fixup.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-only expansion when fast math permits it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // The trailing immediate (0 vs 1) selects which scaled operand each
  // div_scale call produces; the s1 result is the scale condition.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                     .addUse(DenominatorScaled.getReg(0))
                     .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // If the function does not already run with FP32 denormals enabled, enable
  // them just around the refinement FMAs.
  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the function's default denormal mode.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
                .addUse(Fma4.getReg(0))
                .addUse(Fma1.getReg(0))
                .addUse(Fma3.getReg(0))
                .addUse(NumeratorScaled.getReg(1))
                .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3461 | |||||||
3462 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | ||||||
3463 | MachineRegisterInfo &MRI, | ||||||
3464 | MachineIRBuilder &B) const { | ||||||
3465 | if (legalizeFastUnsafeFDIV64(MI, MRI, B)) | ||||||
3466 | return true; | ||||||
3467 | |||||||
3468 | Register Res = MI.getOperand(0).getReg(); | ||||||
3469 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3470 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3471 | |||||||
3472 | uint16_t Flags = MI.getFlags(); | ||||||
3473 | |||||||
3474 | LLT S64 = LLT::scalar(64); | ||||||
3475 | LLT S1 = LLT::scalar(1); | ||||||
3476 | |||||||
3477 | auto One = B.buildFConstant(S64, 1.0); | ||||||
3478 | |||||||
3479 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
3480 | .addUse(LHS) | ||||||
3481 | .addUse(RHS) | ||||||
3482 | .addImm(0) | ||||||
3483 | .setMIFlags(Flags); | ||||||
3484 | |||||||
3485 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | ||||||
3486 | |||||||
3487 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | ||||||
3488 | .addUse(DivScale0.getReg(0)) | ||||||
3489 | .setMIFlags(Flags); | ||||||
3490 | |||||||
3491 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | ||||||
3492 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | ||||||
3493 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | ||||||
3494 | |||||||
3495 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
3496 | .addUse(LHS) | ||||||
3497 | .addUse(RHS) | ||||||
3498 | .addImm(1) | ||||||
3499 | .setMIFlags(Flags); | ||||||
3500 | |||||||
3501 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | ||||||
3502 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | ||||||
3503 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | ||||||
3504 | |||||||
3505 | Register Scale; | ||||||
3506 | if (!ST.hasUsableDivScaleConditionOutput()) { | ||||||
3507 | // Workaround a hardware bug on SI where the condition output from div_scale | ||||||
3508 | // is not usable. | ||||||
3509 | |||||||
3510 | LLT S32 = LLT::scalar(32); | ||||||
3511 | |||||||
3512 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | ||||||
3513 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | ||||||
3514 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | ||||||
3515 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | ||||||
3516 | |||||||
3517 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | ||||||
3518 | Scale1Unmerge.getReg(1)); | ||||||
3519 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | ||||||
3520 | Scale0Unmerge.getReg(1)); | ||||||
3521 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | ||||||
3522 | } else { | ||||||
3523 | Scale = DivScale1.getReg(1); | ||||||
3524 | } | ||||||
3525 | |||||||
3526 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | ||||||
3527 | .addUse(Fma4.getReg(0)) | ||||||
3528 | .addUse(Fma3.getReg(0)) | ||||||
3529 | .addUse(Mul.getReg(0)) | ||||||
3530 | .addUse(Scale) | ||||||
3531 | .setMIFlags(Flags); | ||||||
3532 | |||||||
3533 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) | ||||||
3534 | .addUse(Fmas.getReg(0)) | ||||||
3535 | .addUse(RHS) | ||||||
3536 | .addUse(LHS) | ||||||
3537 | .setMIFlags(Flags); | ||||||
3538 | |||||||
3539 | MI.eraseFromParent(); | ||||||
3540 | return true; | ||||||
3541 | } | ||||||
3542 | |||||||
3543 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||||
3544 | MachineRegisterInfo &MRI, | ||||||
3545 | MachineIRBuilder &B) const { | ||||||
3546 | Register Res = MI.getOperand(0).getReg(); | ||||||
3547 | Register LHS = MI.getOperand(2).getReg(); | ||||||
3548 | Register RHS = MI.getOperand(3).getReg(); | ||||||
3549 | uint16_t Flags = MI.getFlags(); | ||||||
3550 | |||||||
3551 | LLT S32 = LLT::scalar(32); | ||||||
3552 | LLT S1 = LLT::scalar(1); | ||||||
3553 | |||||||
3554 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||||
3555 | const APFloat C0Val(1.0f); | ||||||
3556 | |||||||
3557 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||||
3558 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||||
3559 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); | ||||||
3560 | |||||||
3561 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||||
3562 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||||
3563 | |||||||
3564 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||||
3565 | |||||||
3566 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3567 | .addUse(Mul0.getReg(0)) | ||||||
3568 | .setMIFlags(Flags); | ||||||
3569 | |||||||
3570 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||||
3571 | |||||||
3572 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||||
3573 | |||||||
3574 | MI.eraseFromParent(); | ||||||
3575 | return true; | ||||||
3576 | } | ||||||
3577 | |||||||
3578 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | ||||||
3579 | // FIXME: Why do we handle this one but not other removed instructions? | ||||||
3580 | // | ||||||
3581 | // Reciprocal square root. The clamp prevents infinite results, clamping | ||||||
3582 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | ||||||
3583 | // +-max_float. | ||||||
3584 | bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, | ||||||
3585 | MachineRegisterInfo &MRI, | ||||||
3586 | MachineIRBuilder &B) const { | ||||||
3587 | if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) | ||||||
3588 | return true; | ||||||
3589 | |||||||
3590 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3591 | Register Src = MI.getOperand(2).getReg(); | ||||||
3592 | auto Flags = MI.getFlags(); | ||||||
3593 | |||||||
3594 | LLT Ty = MRI.getType(Dst); | ||||||
3595 | |||||||
3596 | const fltSemantics *FltSemantics; | ||||||
3597 | if (Ty == LLT::scalar(32)) | ||||||
3598 | FltSemantics = &APFloat::IEEEsingle(); | ||||||
3599 | else if (Ty == LLT::scalar(64)) | ||||||
3600 | FltSemantics = &APFloat::IEEEdouble(); | ||||||
3601 | else | ||||||
3602 | return false; | ||||||
3603 | |||||||
3604 | auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) | ||||||
3605 | .addUse(Src) | ||||||
3606 | .setMIFlags(Flags); | ||||||
3607 | |||||||
3608 | // We don't need to concern ourselves with the snan handling difference, since | ||||||
3609 | // the rsq quieted (or not) so use the one which will directly select. | ||||||
3610 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3611 | const bool UseIEEE = MFI->getMode().IEEE; | ||||||
3612 | |||||||
3613 | auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); | ||||||
3614 | auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : | ||||||
3615 | B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); | ||||||
3616 | |||||||
3617 | auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); | ||||||
3618 | |||||||
3619 | if (UseIEEE) | ||||||
3620 | B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); | ||||||
3621 | else | ||||||
3622 | B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); | ||||||
3623 | MI.eraseFromParent(); | ||||||
3624 | return true; | ||||||
3625 | } | ||||||
3626 | |||||||
3627 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||||
3628 | switch (IID) { | ||||||
3629 | case Intrinsic::amdgcn_ds_fadd: | ||||||
3630 | return AMDGPU::G_ATOMICRMW_FADD; | ||||||
3631 | case Intrinsic::amdgcn_ds_fmin: | ||||||
3632 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||||
3633 | case Intrinsic::amdgcn_ds_fmax: | ||||||
3634 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||||
3635 | default: | ||||||
3636 | llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3636); | ||||||
3637 | } | ||||||
3638 | } | ||||||
3639 | |||||||
3640 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | ||||||
3641 | MachineInstr &MI, | ||||||
3642 | Intrinsic::ID IID) const { | ||||||
3643 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
3644 | Observer.changingInstr(MI); | ||||||
3645 | |||||||
3646 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | ||||||
3647 | |||||||
3648 | // The remaining operands were used to set fields in the MemOperand on | ||||||
3649 | // construction. | ||||||
3650 | for (int I = 6; I > 3; --I) | ||||||
3651 | MI.RemoveOperand(I); | ||||||
3652 | |||||||
3653 | MI.RemoveOperand(1); // Remove the intrinsic ID. | ||||||
3654 | Observer.changedInstr(MI); | ||||||
3655 | return true; | ||||||
3656 | } | ||||||
3657 | |||||||
3658 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | ||||||
3659 | MachineRegisterInfo &MRI, | ||||||
3660 | MachineIRBuilder &B) const { | ||||||
3661 | uint64_t Offset = | ||||||
3662 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||||
3663 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||||
3664 | LLT DstTy = MRI.getType(DstReg); | ||||||
3665 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||||
3666 | |||||||
3667 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||||
3668 | if (!loadInputValue(KernargPtrReg, B, | ||||||
3669 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||||
3670 | return false; | ||||||
3671 | |||||||
3672 | // FIXME: This should be nuw | ||||||
3673 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||||
3674 | return true; | ||||||
3675 | } | ||||||
3676 | |||||||
3677 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||||
3678 | MachineRegisterInfo &MRI, | ||||||
3679 | MachineIRBuilder &B) const { | ||||||
3680 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3681 | if (!MFI->isEntryFunction()) { | ||||||
3682 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3683 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||||
3684 | } | ||||||
3685 | |||||||
3686 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
3687 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||||
3688 | return false; | ||||||
3689 | |||||||
3690 | MI.eraseFromParent(); | ||||||
3691 | return true; | ||||||
3692 | } | ||||||
3693 | |||||||
3694 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||||
3695 | MachineRegisterInfo &MRI, | ||||||
3696 | MachineIRBuilder &B, | ||||||
3697 | unsigned AddrSpace) const { | ||||||
3698 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||||
3699 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||||
3700 | Register Hi32 = Unmerge.getReg(1); | ||||||
3701 | |||||||
3702 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||||
3703 | MI.eraseFromParent(); | ||||||
3704 | return true; | ||||||
3705 | } | ||||||
3706 | |||||||
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
// Returns the register for voffset and the immediate for the immoffset field.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = 4095; // Largest value the immoffset field can hold.
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // Peel any known-constant component off the offset.
  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold any overflow back into the variable (voffset) component.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // voffset is always materialized as a register, even when zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}
3757 | |||||||
3758 | /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. | ||||||
3759 | void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, | ||||||
3760 | Register VOffset, Register SOffset, | ||||||
3761 | unsigned ImmOffset, Register VIndex, | ||||||
3762 | MachineRegisterInfo &MRI) const { | ||||||
3763 | Optional<ValueAndVReg> MaybeVOffsetVal = | ||||||
3764 | getConstantVRegValWithLookThrough(VOffset, MRI); | ||||||
3765 | Optional<ValueAndVReg> MaybeSOffsetVal = | ||||||
3766 | getConstantVRegValWithLookThrough(SOffset, MRI); | ||||||
3767 | Optional<ValueAndVReg> MaybeVIndexVal = | ||||||
3768 | getConstantVRegValWithLookThrough(VIndex, MRI); | ||||||
3769 | // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, | ||||||
3770 | // update the MMO with that offset. The stride is unknown so we can only do | ||||||
3771 | // this if VIndex is constant 0. | ||||||
3772 | if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && | ||||||
3773 | MaybeVIndexVal->Value == 0) { | ||||||
3774 | uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + | ||||||
3775 | MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; | ||||||
3776 | MMO->setOffset(TotalOffset); | ||||||
3777 | } else { | ||||||
3778 | // We don't have a constant combined offset to use in the MMO. Give up. | ||||||
3779 | MMO->setValue((Value *)nullptr); | ||||||
3780 | } | ||||||
3781 | } | ||||||
3782 | |||||||
3783 | /// Handle register layout difference for f16 images for some subtargets. | ||||||
3784 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||||
3785 | MachineRegisterInfo &MRI, | ||||||
3786 | Register Reg, | ||||||
3787 | bool ImageStore) const { | ||||||
3788 | const LLT S16 = LLT::scalar(16); | ||||||
3789 | const LLT S32 = LLT::scalar(32); | ||||||
3790 | LLT StoreVT = MRI.getType(Reg); | ||||||
3791 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16)(static_cast <bool> (StoreVT.isVector() && StoreVT .getElementType() == S16) ? void (0) : __assert_fail ("StoreVT.isVector() && StoreVT.getElementType() == S16" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3791, __extension__ __PRETTY_FUNCTION__)); | ||||||
3792 | |||||||
3793 | if (ST.hasUnpackedD16VMem()) { | ||||||
3794 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
3795 | |||||||
3796 | SmallVector<Register, 4> WideRegs; | ||||||
3797 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3798 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||||
3799 | |||||||
3800 | int NumElts = StoreVT.getNumElements(); | ||||||
3801 | |||||||
3802 | return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) | ||||||
3803 | .getReg(0); | ||||||
3804 | } | ||||||
3805 | |||||||
3806 | if (ImageStore && ST.hasImageStoreD16Bug()) { | ||||||
3807 | if (StoreVT.getNumElements() == 2) { | ||||||
3808 | SmallVector<Register, 4> PackedRegs; | ||||||
3809 | Reg = B.buildBitcast(S32, Reg).getReg(0); | ||||||
3810 | PackedRegs.push_back(Reg); | ||||||
3811 | PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); | ||||||
3812 | return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) | ||||||
3813 | .getReg(0); | ||||||
3814 | } | ||||||
3815 | |||||||
3816 | if (StoreVT.getNumElements() == 3) { | ||||||
3817 | SmallVector<Register, 4> PackedRegs; | ||||||
3818 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
3819 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3820 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||
3821 | PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); | ||||||
3822 | Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); | ||||||
3823 | return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); | ||||||
3824 | } | ||||||
3825 | |||||||
3826 | if (StoreVT.getNumElements() == 4) { | ||||||
3827 | SmallVector<Register, 4> PackedRegs; | ||||||
3828 | Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); | ||||||
3829 | auto Unmerge = B.buildUnmerge(S32, Reg); | ||||||
3830 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3831 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||
3832 | PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); | ||||||
3833 | return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) | ||||||
3834 | .getReg(0); | ||||||
3835 | } | ||||||
3836 | |||||||
3837 | llvm_unreachable("invalid data type")::llvm::llvm_unreachable_internal("invalid data type", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3837); | ||||||
3838 | } | ||||||
3839 | |||||||
3840 | return Reg; | ||||||
3841 | } | ||||||
3842 | |||||||
3843 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||||
3844 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||||
3845 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
3846 | LLT Ty = MRI->getType(VData); | ||||||
3847 | |||||||
3848 | const LLT S16 = LLT::scalar(16); | ||||||
3849 | |||||||
3850 | // Fixup illegal register types for i8 stores. | ||||||
3851 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||||
3852 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||||
3853 | return AnyExt; | ||||||
3854 | } | ||||||
3855 | |||||||
3856 | if (Ty.isVector()) { | ||||||
3857 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||||
3858 | if (IsFormat) | ||||||
3859 | return handleD16VData(B, *MRI, VData); | ||||||
3860 | } | ||||||
3861 | } | ||||||
3862 | |||||||
3863 | return VData; | ||||||
3864 | } | ||||||
3865 | |||||||
// Legalize a raw/struct (t)buffer store intrinsic into the corresponding
// G_AMDGPU_*BUFFER_STORE* instruction: fix up the source register type,
// decode the operand list (which differs between raw/struct and
// typed/untyped variants), split the offset, and emit the new instruction.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  // D16 handling only applies to format stores of 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex; use a constant zero.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the offset into the voffset register and the immediate field, and
  // reflect the final constant offset (if known) in the MMO.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  // Select the target opcode; untyped non-format stores pick a width-specific
  // opcode from the memory size.
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3952 | |||||||
/// Lower a raw/struct buffer load intrinsic (plain, format, or typed-format)
/// to the corresponding G_AMDGPU_*BUFFER_LOAD* pseudo instruction, widening
/// sub-dword and unpacked-D16 results as required by the subtarget.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants behave like struct variants with a zero index.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Fold the constant part of the voffset into the immediate offset field and
  // reflect the final address computation in the memory operand.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // Non-format loads pick the opcode from the memory access size.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar loads are widened to 32 bits; a scalar D16 result is
  // likewise produced in a 32-bit register and truncated afterwards.
  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    // Unpacked-D16 hardware writes each 16-bit element into its own dword.
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // The load wrote a temporary; emit the fixup after the load instruction.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // Truncate the widened result back to the original narrow type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
4070 | |||||||
4071 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||||
4072 | MachineIRBuilder &B, | ||||||
4073 | bool IsInc) const { | ||||||
4074 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : | ||||||
4075 | AMDGPU::G_AMDGPU_ATOMIC_DEC; | ||||||
4076 | B.buildInstr(Opc) | ||||||
4077 | .addDef(MI.getOperand(0).getReg()) | ||||||
4078 | .addUse(MI.getOperand(2).getReg()) | ||||||
4079 | .addUse(MI.getOperand(3).getReg()) | ||||||
4080 | .cloneMemRefs(MI); | ||||||
4081 | MI.eraseFromParent(); | ||||||
4082 | return true; | ||||||
4083 | } | ||||||
4084 | |||||||
/// Map a raw/struct buffer atomic intrinsic ID to the generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode used by the legalized MIR.
/// Raw and struct variants of the same operation map to the same pseudo.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  // Note: the legacy (non raw/struct) fadd intrinsic is also handled here.
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
4140 | |||||||
4141 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | ||||||
4142 | MachineIRBuilder &B, | ||||||
4143 | Intrinsic::ID IID) const { | ||||||
4144 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | ||||||
4145 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | ||||||
4146 | const bool HasReturn = MI.getNumExplicitDefs() != 0; | ||||||
4147 | |||||||
4148 | Register Dst; | ||||||
4149 | |||||||
4150 | int OpOffset = 0; | ||||||
4151 | if (HasReturn) { | ||||||
4152 | // A few FP atomics do not support return values. | ||||||
4153 | Dst = MI.getOperand(0).getReg(); | ||||||
4154 | } else { | ||||||
4155 | OpOffset = -1; | ||||||
4156 | } | ||||||
4157 | |||||||
4158 | Register VData = MI.getOperand(2 + OpOffset).getReg(); | ||||||
4159 | Register CmpVal; | ||||||
4160 | |||||||
4161 | if (IsCmpSwap) { | ||||||
4162 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | ||||||
4163 | ++OpOffset; | ||||||
4164 | } | ||||||
4165 | |||||||
4166 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | ||||||
4167 | const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; | ||||||
4168 | |||||||
4169 | // The struct intrinsic variants add one additional operand over raw. | ||||||
4170 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
4171 | Register VIndex; | ||||||
4172 | if (HasVIndex) { | ||||||
4173 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | ||||||
4174 | ++OpOffset; | ||||||
4175 | } else { | ||||||
4176 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | ||||||
4177 | } | ||||||
4178 | |||||||
4179 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
4180 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | ||||||
4181 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | ||||||
4182 | |||||||
4183 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
4184 | |||||||
4185 | unsigned ImmOffset; | ||||||
4186 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||||
4187 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); | ||||||
4188 | |||||||
4189 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); | ||||||
4190 | |||||||
4191 | if (HasReturn) | ||||||
4192 | MIB.addDef(Dst); | ||||||
4193 | |||||||
4194 | MIB.addUse(VData); // vdata | ||||||
4195 | |||||||
4196 | if (IsCmpSwap) | ||||||
4197 | MIB.addReg(CmpVal); | ||||||
4198 | |||||||
4199 | MIB.addUse(RSrc) // rsrc | ||||||
4200 | .addUse(VIndex) // vindex | ||||||
4201 | .addUse(VOffset) // voffset | ||||||
4202 | .addUse(SOffset) // soffset | ||||||
4203 | .addImm(ImmOffset) // offset(imm) | ||||||
4204 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
4205 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
4206 | .addMemOperand(MMO); | ||||||
4207 | |||||||
4208 | MI.eraseFromParent(); | ||||||
4209 | return true; | ||||||
4210 | } | ||||||
4211 | |||||||
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Address operands in [VAddrStart, VAddrEnd) are processed in order: 32-bit
/// operands (or 16-bit ones that must stay unpacked for this mode) are
/// bitcast to v2s16 as-is, while adjacent 16-bit operands are paired into a
/// single v2s16 register. Odd trailing elements are padded with undef.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // Operand index ranges: [VAddrStart, GradientStart) are the non-gradient,
    // non-coordinate operands; [GradientStart, CoordStart) are gradients;
    // [CoordStart, VAddrEnd) are coordinates.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        // No valid partner: pad the high half of the dword with undef.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand and the next one into a single v2s16, and skip
        // the consumed partner operand.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
4260 | |||||||
4261 | /// Convert from separate vaddr components to a single vector address register, | ||||||
4262 | /// and replace the remaining operands with $noreg. | ||||||
4263 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||||
4264 | int DimIdx, int NumVAddrs) { | ||||||
4265 | const LLT S32 = LLT::scalar(32); | ||||||
4266 | |||||||
4267 | SmallVector<Register, 8> AddrRegs; | ||||||
4268 | for (int I = 0; I != NumVAddrs; ++I) { | ||||||
4269 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||
4270 | if (SrcOp.isReg()) { | ||||||
4271 | AddrRegs.push_back(SrcOp.getReg()); | ||||||
4272 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32)(static_cast <bool> (B.getMRI()->getType(SrcOp.getReg ()) == S32) ? void (0) : __assert_fail ("B.getMRI()->getType(SrcOp.getReg()) == S32" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4272, __extension__ __PRETTY_FUNCTION__)); | ||||||
4273 | } | ||||||
4274 | } | ||||||
4275 | |||||||
4276 | int NumAddrRegs = AddrRegs.size(); | ||||||
4277 | if (NumAddrRegs != 1) { | ||||||
4278 | // Above 8 elements round up to next power of 2 (i.e. 16). | ||||||
4279 | if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { | ||||||
4280 | const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); | ||||||
4281 | auto Undef = B.buildUndef(S32); | ||||||
4282 | AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); | ||||||
4283 | NumAddrRegs = RoundedNumRegs; | ||||||
4284 | } | ||||||
4285 | |||||||
4286 | auto VAddr = | ||||||
4287 | B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); | ||||||
4288 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||||
4289 | } | ||||||
4290 | |||||||
4291 | for (int I = 1; I != NumVAddrs; ++I) { | ||||||
4292 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||
4293 | if (SrcOp.isReg()) | ||||||
4294 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||||
4295 | } | ||||||
4296 | } | ||||||
4297 | |||||||
4298 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | ||||||
4299 | /// | ||||||
4300 | /// Depending on the subtarget, load/store with 16-bit element data need to be | ||||||
4301 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | ||||||
4302 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | ||||||
4303 | /// registers. | ||||||
4304 | /// | ||||||
4305 | /// We don't want to directly select image instructions just yet, but also want | ||||||
4306 | /// to exposes all register repacking to the legalizer/combiners. We also don't | ||||||
4307 | /// want a selected instrution entering RegBankSelect. In order to avoid | ||||||
4308 | /// defining a multitude of intermediate image instructions, directly hack on | ||||||
4309 | /// the intrinsic's arguments. In cases like a16 addreses, this requires padding | ||||||
4310 | /// now unnecessary arguments with $noreg. | ||||||
4311 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||||
4312 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | ||||||
4313 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | ||||||
4314 | |||||||
4315 | const unsigned NumDefs = MI.getNumExplicitDefs(); | ||||||
4316 | const unsigned ArgOffset = NumDefs + 1; | ||||||
4317 | bool IsTFE = NumDefs == 2; | ||||||
4318 | // We are only processing the operands of d16 image operations on subtargets | ||||||
4319 | // that use the unpacked register layout, or need to repack the TFE result. | ||||||
4320 | |||||||
4321 | // TODO: Do we need to guard against already legalized intrinsics? | ||||||
4322 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||||
4323 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | ||||||
4324 | |||||||
4325 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
4326 | const LLT S32 = LLT::scalar(32); | ||||||
4327 | const LLT S16 = LLT::scalar(16); | ||||||
4328 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||||
4329 | |||||||
4330 | unsigned DMask = 0; | ||||||
4331 | |||||||
4332 | // Check for 16 bit addresses and pack if true. | ||||||
4333 | LLT GradTy = | ||||||
4334 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | ||||||
4335 | LLT AddrTy = | ||||||
4336 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | ||||||
4337 | const bool IsG16 = GradTy == S16; | ||||||
4338 | const bool IsA16 = AddrTy == S16; | ||||||
4339 | |||||||
4340 | int DMaskLanes = 0; | ||||||
4341 | if (!BaseOpcode->Atomic) { | ||||||
4342 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | ||||||
4343 | if (BaseOpcode->Gather4) { | ||||||
4344 | DMaskLanes = 4; | ||||||
4345 | } else if (DMask != 0) { | ||||||
4346 | DMaskLanes = countPopulation(DMask); | ||||||
4347 | } else if (!IsTFE && !BaseOpcode->Store) { | ||||||
4348 | // If dmask is 0, this is a no-op load. This can be eliminated. | ||||||
4349 | B.buildUndef(MI.getOperand(0)); | ||||||
4350 | MI.eraseFromParent(); | ||||||
4351 | return true; | ||||||
4352 | } | ||||||
4353 | } | ||||||
4354 | |||||||
4355 | Observer.changingInstr(MI); | ||||||
4356 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | ||||||
4357 | |||||||
4358 | unsigned NewOpcode = NumDefs == 0 ? | ||||||
4359 | AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | ||||||
4360 | |||||||
4361 | // Track that we legalized this | ||||||
4362 | MI.setDesc(B.getTII().get(NewOpcode)); | ||||||
4363 | |||||||
4364 | // Expecting to get an error flag since TFC is on - and dmask is 0 Force | ||||||
4365 | // dmask to be at least 1 otherwise the instruction will fail | ||||||
4366 | if (IsTFE && DMask == 0) { | ||||||
4367 | DMask = 0x1; | ||||||
4368 | DMaskLanes = 1; | ||||||
4369 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | ||||||
4370 | } | ||||||
4371 | |||||||
4372 | if (BaseOpcode->Atomic) { | ||||||
4373 | Register VData0 = MI.getOperand(2).getReg(); | ||||||
4374 | LLT Ty = MRI->getType(VData0); | ||||||
4375 | |||||||
4376 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | ||||||
4377 | if (Ty.isVector()) | ||||||
4378 | return false; | ||||||
4379 | |||||||
4380 | if (BaseOpcode->AtomicX2) { | ||||||
4381 | Register VData1 = MI.getOperand(3).getReg(); | ||||||
4382 | // The two values are packed in one register. | ||||||
4383 | LLT PackedTy = LLT::fixed_vector(2, Ty); | ||||||
4384 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | ||||||
4385 | MI.getOperand(2).setReg(Concat.getReg(0)); | ||||||
4386 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | ||||||
4387 | } | ||||||
4388 | } | ||||||
4389 | |||||||
4390 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | ||||||
4391 | |||||||
4392 | // Optimize _L to _LZ when _L is zero | ||||||
4393 | if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = | ||||||
4394 | AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { | ||||||
4395 | const ConstantFP *ConstantLod; | ||||||
4396 | |||||||
4397 | if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, | ||||||
4398 | m_GFCst(ConstantLod))) { | ||||||
4399 | if (ConstantLod->isZero() || ConstantLod->isNegative()) { | ||||||
4400 | // Set new opcode to _lz variant of _l, and change the intrinsic ID. | ||||||
4401 | const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = | ||||||
4402 | AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, | ||||||
4403 | Intr->Dim); | ||||||
4404 | |||||||
4405 | // The starting indexes should remain in the same place. | ||||||
4406 | --CorrectedNumVAddrs; | ||||||
4407 | |||||||
4408 | MI.getOperand(MI.getNumExplicitDefs()) | ||||||
4409 | .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); | ||||||
4410 | MI.RemoveOperand(ArgOffset + Intr->LodIndex); | ||||||
4411 | Intr = NewImageDimIntr; | ||||||
4412 | } | ||||||
4413 | } | ||||||
4414 | } | ||||||
4415 | |||||||
4416 | // Optimize _mip away, when 'lod' is zero | ||||||
4417 | if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { | ||||||
4418 | int64_t ConstantLod; | ||||||
4419 | if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, | ||||||
4420 | m_ICst(ConstantLod))) { | ||||||
4421 | if (ConstantLod == 0) { | ||||||
4422 | // TODO: Change intrinsic opcode and remove operand instead or replacing | ||||||
4423 | // it with 0, as the _L to _LZ handling is done above. | ||||||
4424 | MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); | ||||||
4425 | --CorrectedNumVAddrs; | ||||||
4426 | } | ||||||
4427 | } | ||||||
4428 | } | ||||||
4429 | |||||||
4430 | // Rewrite the addressing register layout before doing anything else. | ||||||
4431 | if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { | ||||||
4432 | // 16 bit gradients are supported, but are tied to the A16 control | ||||||
4433 | // so both gradients and addresses must be 16 bit | ||||||
4434 | return false; | ||||||
4435 | } | ||||||
4436 | |||||||
4437 | if (IsA16 && !ST.hasA16()) { | ||||||
4438 | // A16 not supported | ||||||
4439 | return false; | ||||||
4440 | } | ||||||
4441 | |||||||
4442 | if (IsA16 || IsG16) { | ||||||
4443 | if (Intr->NumVAddrs > 1) { | ||||||
4444 | SmallVector<Register, 4> PackedRegs; | ||||||
4445 | |||||||
4446 | packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, | ||||||
4447 | IsG16); | ||||||
4448 | |||||||
4449 | // See also below in the non-a16 branch | ||||||
4450 | const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && | ||||||
4451 | PackedRegs.size() <= ST.getNSAMaxSize(); | ||||||
4452 | |||||||
4453 | if (!UseNSA && PackedRegs.size() > 1) { | ||||||
4454 | LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); | ||||||
4455 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | ||||||
4456 | PackedRegs[0] = Concat.getReg(0); | ||||||
4457 | PackedRegs.resize(1); | ||||||
4458 | } | ||||||
4459 | |||||||
4460 | const unsigned NumPacked = PackedRegs.size(); | ||||||
4461 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | ||||||
4462 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||
4463 | if (!SrcOp.isReg()) { | ||||||
4464 | assert(SrcOp.isImm() && SrcOp.getImm() == 0)(static_cast <bool> (SrcOp.isImm() && SrcOp.getImm () == 0) ? void (0) : __assert_fail ("SrcOp.isImm() && SrcOp.getImm() == 0" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4464, __extension__ __PRETTY_FUNCTION__)); | ||||||
4465 | continue; | ||||||
4466 | } | ||||||
4467 | |||||||
4468 | assert(SrcOp.getReg() != AMDGPU::NoRegister)(static_cast <bool> (SrcOp.getReg() != AMDGPU::NoRegister ) ? void (0) : __assert_fail ("SrcOp.getReg() != AMDGPU::NoRegister" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4468, __extension__ __PRETTY_FUNCTION__)); | ||||||
4469 | |||||||
4470 | if (I - Intr->VAddrStart < NumPacked) | ||||||
4471 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | ||||||
4472 | else | ||||||
4473 | SrcOp.setReg(AMDGPU::NoRegister); | ||||||
4474 | } | ||||||
4475 | } | ||||||
4476 | } else { | ||||||
4477 | // If the register allocator cannot place the address registers contiguously | ||||||
4478 | // without introducing moves, then using the non-sequential address encoding | ||||||
4479 | // is always preferable, since it saves VALU instructions and is usually a | ||||||
4480 | // wash in terms of code size or even better. | ||||||
4481 | // | ||||||
4482 | // However, we currently have no way of hinting to the register allocator | ||||||
4483 | // that MIMG addresses should be placed contiguously when it is possible to | ||||||
4484 | // do so, so force non-NSA for the common 2-address case as a heuristic. | ||||||
4485 | // | ||||||
4486 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | ||||||
4487 | // allocation when possible. | ||||||
4488 | const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && | ||||||
4489 | CorrectedNumVAddrs <= ST.getNSAMaxSize(); | ||||||
4490 | |||||||
4491 | if (!UseNSA && Intr->NumVAddrs > 1) | ||||||
4492 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | ||||||
4493 | Intr->NumVAddrs); | ||||||
4494 | } | ||||||
4495 | |||||||
4496 | int Flags = 0; | ||||||
4497 | if (IsA16) | ||||||
4498 | Flags |= 1; | ||||||
4499 | if (IsG16) | ||||||
4500 | Flags |= 2; | ||||||
4501 | MI.addOperand(MachineOperand::CreateImm(Flags)); | ||||||
4502 | |||||||
4503 | if (BaseOpcode->Store) { // No TFE for stores? | ||||||
4504 | // TODO: Handle dmask trim | ||||||
4505 | Register VData = MI.getOperand(1).getReg(); | ||||||
4506 | LLT Ty = MRI->getType(VData); | ||||||
4507 | if (!Ty.isVector() || Ty.getElementType() != S16) | ||||||
4508 | return true; | ||||||
4509 | |||||||
4510 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | ||||||
4511 | if (RepackedReg != VData) { | ||||||
4512 | MI.getOperand(1).setReg(RepackedReg); | ||||||
4513 | } | ||||||
4514 | |||||||
4515 | return true; | ||||||
4516 | } | ||||||
4517 | |||||||
4518 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
4519 | LLT Ty = MRI->getType(DstReg); | ||||||
4520 | const LLT EltTy = Ty.getScalarType(); | ||||||
4521 | const bool IsD16 = Ty.getScalarType() == S16; | ||||||
4522 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||||
4523 | |||||||
4524 | // Confirm that the return type is large enough for the dmask specified | ||||||
4525 | if (NumElts < DMaskLanes) | ||||||
4526 | return false; | ||||||
4527 | |||||||
4528 | if (NumElts > 4 || DMaskLanes > 4) | ||||||
4529 | return false; | ||||||
4530 | |||||||
4531 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; | ||||||
4532 | const LLT AdjustedTy = | ||||||
4533 | Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); | ||||||
4534 | |||||||
4535 | // The raw dword aligned data component of the load. The only legal cases | ||||||
4536 | // where this matters should be when using the packed D16 format, for | ||||||
4537 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, | ||||||
4538 | LLT RoundedTy; | ||||||
4539 | |||||||
4540 | // S32 vector to to cover all data, plus TFE result element. | ||||||
4541 | LLT TFETy; | ||||||
4542 | |||||||
4543 | // Register type to use for each loaded component. Will be S32 or V2S16. | ||||||
4544 | LLT RegTy; | ||||||
4545 | |||||||
4546 | if (IsD16 && ST.hasUnpackedD16VMem()) { | ||||||
4547 | RoundedTy = | ||||||
4548 | LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); | ||||||
4549 | TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); | ||||||
4550 | RegTy = S32; | ||||||
4551 | } else { | ||||||
4552 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
4553 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | ||||||
4554 | unsigned RoundedSize = 32 * RoundedElts; | ||||||
4555 | RoundedTy = LLT::scalarOrVector( | ||||||
4556 | ElementCount::getFixed(RoundedSize / EltSize), EltSize); | ||||||
4557 | TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); | ||||||
4558 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | ||||||
4559 | } | ||||||
4560 | |||||||
4561 | // The return type does not need adjustment. | ||||||
4562 | // TODO: Should we change s16 case to s32 or <2 x s16>? | ||||||
4563 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | ||||||
4564 | return true; | ||||||
4565 | |||||||
4566 | Register Dst1Reg; | ||||||
4567 | |||||||
4568 | // Insert after the instruction. | ||||||
4569 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||||
4570 | |||||||
4571 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | ||||||
4572 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | ||||||
4573 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | ||||||
4574 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | ||||||
4575 | |||||||
4576 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | ||||||
4577 | |||||||
4578 | MI.getOperand(0).setReg(NewResultReg); | ||||||
4579 | |||||||
4580 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||||
4581 | // type. The intruction really returns these two values in one contiguous | ||||||
4582 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||||
4583 | // return type to use a single register result. | ||||||
4584 | |||||||
4585 | if (IsTFE) { | ||||||
4586 | Dst1Reg = MI.getOperand(1).getReg(); | ||||||
4587 | if (MRI->getType(Dst1Reg) != S32) | ||||||
4588 | return false; | ||||||
4589 | |||||||
4590 | // TODO: Make sure the TFE operand bit is set. | ||||||
4591 | MI.RemoveOperand(1); | ||||||
4592 | |||||||
4593 | // Handle the easy case that requires no repack instructions. | ||||||
4594 | if (Ty == S32) { | ||||||
4595 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | ||||||
4596 | return true; | ||||||
4597 | } | ||||||
4598 | } | ||||||
4599 | |||||||
4600 | // Now figure out how to copy the new result register back into the old | ||||||
4601 | // result. | ||||||
4602 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | ||||||
4603 | |||||||
4604 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | ||||||
4605 | |||||||
4606 | if (ResultNumRegs == 1) { | ||||||
4607 | assert(!IsTFE)(static_cast <bool> (!IsTFE) ? void (0) : __assert_fail ("!IsTFE", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4607, __extension__ __PRETTY_FUNCTION__)); | ||||||
4608 | ResultRegs[0] = NewResultReg; | ||||||
4609 | } else { | ||||||
4610 | // We have to repack into a new vector of some kind. | ||||||
4611 | for (int I = 0; I != NumDataRegs; ++I) | ||||||
4612 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | ||||||
4613 | B.buildUnmerge(ResultRegs, NewResultReg); | ||||||
4614 | |||||||
4615 | // Drop the final TFE element to get the data part. The TFE result is | ||||||
4616 | // directly written to the right place already. | ||||||
4617 | if (IsTFE) | ||||||
4618 | ResultRegs.resize(NumDataRegs); | ||||||
4619 | } | ||||||
4620 | |||||||
4621 | // For an s16 scalar result, we form an s32 result with a truncate regardless | ||||||
4622 | // of packed vs. unpacked. | ||||||
4623 | if (IsD16 && !Ty.isVector()) { | ||||||
4624 | B.buildTrunc(DstReg, ResultRegs[0]); | ||||||
4625 | return true; | ||||||
4626 | } | ||||||
4627 | |||||||
4628 | // Avoid a build/concat_vector of 1 entry. | ||||||
4629 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | ||||||
4630 | B.buildBitcast(DstReg, ResultRegs[0]); | ||||||
4631 | return true; | ||||||
4632 | } | ||||||
4633 | |||||||
4634 | assert(Ty.isVector())(static_cast <bool> (Ty.isVector()) ? void (0) : __assert_fail ("Ty.isVector()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4634, __extension__ __PRETTY_FUNCTION__)); | ||||||
4635 | |||||||
4636 | if (IsD16) { | ||||||
4637 | // For packed D16 results with TFE enabled, all the data components are | ||||||
4638 | // S32. Cast back to the expected type. | ||||||
4639 | // | ||||||
4640 | // TODO: We don't really need to use load s32 elements. We would only need one | ||||||
4641 | // cast for the TFE result if a multiple of v2s16 was used. | ||||||
4642 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | ||||||
4643 | for (Register &Reg : ResultRegs) | ||||||
4644 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | ||||||
4645 | } else if (ST.hasUnpackedD16VMem()) { | ||||||
4646 | for (Register &Reg : ResultRegs) | ||||||
4647 | Reg = B.buildTrunc(S16, Reg).getReg(0); | ||||||
4648 | } | ||||||
4649 | } | ||||||
4650 | |||||||
4651 | auto padWithUndef = [&](LLT Ty, int NumElts) { | ||||||
4652 | if (NumElts == 0) | ||||||
4653 | return; | ||||||
4654 | Register Undef = B.buildUndef(Ty).getReg(0); | ||||||
4655 | for (int I = 0; I != NumElts; ++I) | ||||||
4656 | ResultRegs.push_back(Undef); | ||||||
4657 | }; | ||||||
4658 | |||||||
4659 | // Pad out any elements eliminated due to the dmask. | ||||||
4660 | LLT ResTy = MRI->getType(ResultRegs[0]); | ||||||
4661 | if (!ResTy.isVector()) { | ||||||
4662 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | ||||||
4663 | B.buildBuildVector(DstReg, ResultRegs); | ||||||
4664 | return true; | ||||||
4665 | } | ||||||
4666 | |||||||
4667 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16)(static_cast <bool> (!ST.hasUnpackedD16VMem() && ResTy == V2S16) ? void (0) : __assert_fail ("!ST.hasUnpackedD16VMem() && ResTy == V2S16" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4667, __extension__ __PRETTY_FUNCTION__)); | ||||||
4668 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | ||||||
4669 | |||||||
4670 | // Deal with the one annoying legal case. | ||||||
4671 | const LLT V3S16 = LLT::fixed_vector(3, 16); | ||||||
4672 | if (Ty == V3S16) { | ||||||
4673 | padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); | ||||||
4674 | auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); | ||||||
4675 | B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); | ||||||
4676 | return true; | ||||||
4677 | } | ||||||
4678 | |||||||
4679 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | ||||||
4680 | B.buildConcatVectors(DstReg, ResultRegs); | ||||||
4681 | return true; | ||||||
4682 | } | ||||||
4683 | |||||||
4684 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||||
4685 | LegalizerHelper &Helper, MachineInstr &MI) const { | ||||||
4686 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
4687 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
4688 | |||||||
4689 | Register Dst = MI.getOperand(0).getReg(); | ||||||
4690 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
4691 | unsigned Size = Ty.getSizeInBits(); | ||||||
4692 | MachineFunction &MF = B.getMF(); | ||||||
4693 | |||||||
4694 | Observer.changingInstr(MI); | ||||||
4695 | |||||||
4696 | if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { | ||||||
4697 | Ty = getBitcastRegisterType(Ty); | ||||||
4698 | Helper.bitcastDst(MI, Ty, 0); | ||||||
4699 | Dst = MI.getOperand(0).getReg(); | ||||||
4700 | B.setInsertPt(B.getMBB(), MI); | ||||||
4701 | } | ||||||
4702 | |||||||
4703 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||||
4704 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||||
4705 | // allowed to add one. | ||||||
4706 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||||
4707 | MI.RemoveOperand(1); // Remove intrinsic ID | ||||||
4708 | |||||||
4709 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||||
4710 | // TODO: Should this use datalayout alignment? | ||||||
4711 | const unsigned MemSize = (Size + 7) / 8; | ||||||
4712 | const Align MemAlign(4); | ||||||
4713 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
4714 | MachinePointerInfo(), | ||||||
4715 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
4716 | MachineMemOperand::MOInvariant, | ||||||
4717 | MemSize, MemAlign); | ||||||
4718 | MI.addMemOperand(MF, MMO); | ||||||
4719 | |||||||
4720 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||||
4721 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||||
4722 | // out this needs to be converted to a vector load during RegBankSelect. | ||||||
4723 | if (!isPowerOf2_32(Size)) { | ||||||
4724 | if (Ty.isVector()) | ||||||
4725 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||||
4726 | else | ||||||
4727 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||||
4728 | } | ||||||
4729 | |||||||
4730 | Observer.changedInstr(MI); | ||||||
4731 | return true; | ||||||
4732 | } | ||||||
4733 | |||||||
4734 | // TODO: Move to selection | ||||||
4735 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||||
4736 | MachineRegisterInfo &MRI, | ||||||
4737 | MachineIRBuilder &B) const { | ||||||
4738 | if (!ST.isTrapHandlerEnabled() || | ||||||
4739 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | ||||||
4740 | return legalizeTrapEndpgm(MI, MRI, B); | ||||||
4741 | |||||||
4742 | if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { | ||||||
4743 | switch (*HsaAbiVer) { | ||||||
4744 | case ELF::ELFABIVERSION_AMDGPU_HSA_V2: | ||||||
4745 | case ELF::ELFABIVERSION_AMDGPU_HSA_V3: | ||||||
4746 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||
4747 | case ELF::ELFABIVERSION_AMDGPU_HSA_V4: | ||||||
4748 | return ST.supportsGetDoorbellID() ? | ||||||
4749 | legalizeTrapHsa(MI, MRI, B) : | ||||||
4750 | legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||||
4751 | } | ||||||
4752 | } | ||||||
4753 | |||||||
4754 | llvm_unreachable("Unknown trap handler")::llvm::llvm_unreachable_internal("Unknown trap handler", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4754); | ||||||
4755 | } | ||||||
4756 | |||||||
4757 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( | ||||||
4758 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4759 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | ||||||
4760 | MI.eraseFromParent(); | ||||||
4761 | return true; | ||||||
4762 | } | ||||||
4763 | |||||||
4764 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | ||||||
4765 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4766 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||||
4767 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||||
4768 | Register LiveIn = | ||||||
4769 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||
4770 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||||
4771 | return false; | ||||||
4772 | |||||||
4773 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||||
4774 | B.buildCopy(SGPR01, LiveIn); | ||||||
4775 | B.buildInstr(AMDGPU::S_TRAP) | ||||||
4776 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||||
4777 | .addReg(SGPR01, RegState::Implicit); | ||||||
4778 | |||||||
4779 | MI.eraseFromParent(); | ||||||
4780 | return true; | ||||||
4781 | } | ||||||
4782 | |||||||
4783 | bool AMDGPULegalizerInfo::legalizeTrapHsa( | ||||||
4784 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4785 | B.buildInstr(AMDGPU::S_TRAP) | ||||||
4786 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); | ||||||
4787 | MI.eraseFromParent(); | ||||||
4788 | return true; | ||||||
4789 | } | ||||||
4790 | |||||||
4791 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||||
4792 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4793 | // Is non-HSA path or trap-handler disabled? then, report a warning | ||||||
4794 | // accordingly | ||||||
4795 | if (!ST.isTrapHandlerEnabled() || | ||||||
4796 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | ||||||
4797 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||||
4798 | "debugtrap handler not supported", | ||||||
4799 | MI.getDebugLoc(), DS_Warning); | ||||||
4800 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||||
4801 | Ctx.diagnose(NoTrap); | ||||||
4802 | } else { | ||||||
4803 | // Insert debug-trap instruction | ||||||
4804 | B.buildInstr(AMDGPU::S_TRAP) | ||||||
4805 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | ||||||
4806 | } | ||||||
4807 | |||||||
4808 | MI.eraseFromParent(); | ||||||
4809 | return true; | ||||||
4810 | } | ||||||
4811 | |||||||
4812 | bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, | ||||||
4813 | MachineIRBuilder &B) const { | ||||||
4814 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
4815 | const LLT S16 = LLT::scalar(16); | ||||||
4816 | const LLT S32 = LLT::scalar(32); | ||||||
4817 | |||||||
4818 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
4819 | Register NodePtr = MI.getOperand(2).getReg(); | ||||||
4820 | Register RayExtent = MI.getOperand(3).getReg(); | ||||||
4821 | Register RayOrigin = MI.getOperand(4).getReg(); | ||||||
4822 | Register RayDir = MI.getOperand(5).getReg(); | ||||||
4823 | Register RayInvDir = MI.getOperand(6).getReg(); | ||||||
4824 | Register TDescr = MI.getOperand(7).getReg(); | ||||||
4825 | |||||||
4826 | if (!ST.hasGFX10_AEncoding()) { | ||||||
4827 | DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), | ||||||
4828 | "intrinsic not supported on subtarget", | ||||||
4829 | MI.getDebugLoc()); | ||||||
4830 | B.getMF().getFunction().getContext().diagnose(BadIntrin); | ||||||
4831 | return false; | ||||||
4832 | } | ||||||
4833 | |||||||
4834 | const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; | ||||||
4835 | const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; | ||||||
4836 | const unsigned NumVDataDwords = 4; | ||||||
4837 | const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); | ||||||
4838 | const bool UseNSA = | ||||||
4839 | ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); | ||||||
4840 | const unsigned BaseOpcodes[2][2] = { | ||||||
4841 | {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, | ||||||
4842 | {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, | ||||||
4843 | AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; | ||||||
4844 | int Opcode; | ||||||
4845 | if (UseNSA) { | ||||||
4846 | Opcode = | ||||||
4847 | AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, | ||||||
4848 | NumVDataDwords, NumVAddrDwords); | ||||||
4849 | } else { | ||||||
4850 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], | ||||||
4851 | AMDGPU::MIMGEncGfx10Default, NumVDataDwords, | ||||||
4852 | PowerOf2Ceil(NumVAddrDwords)); | ||||||
4853 | } | ||||||
4854 | assert(Opcode != -1)(static_cast <bool> (Opcode != -1) ? void (0) : __assert_fail ("Opcode != -1", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4854, __extension__ __PRETTY_FUNCTION__)); | ||||||
4855 | |||||||
4856 | SmallVector<Register, 12> Ops; | ||||||
4857 | if (Is64) { | ||||||
4858 | auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); | ||||||
4859 | Ops.push_back(Unmerge.getReg(0)); | ||||||
4860 | Ops.push_back(Unmerge.getReg(1)); | ||||||
4861 | } else { | ||||||
4862 | Ops.push_back(NodePtr); | ||||||
4863 | } | ||||||
4864 | Ops.push_back(RayExtent); | ||||||
4865 | |||||||
4866 | auto packLanes = [&Ops, &S32, &B] (Register Src) { | ||||||
4867 | auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); | ||||||
4868 | Ops.push_back(Unmerge.getReg(0)); | ||||||
4869 | Ops.push_back(Unmerge.getReg(1)); | ||||||
4870 | Ops.push_back(Unmerge.getReg(2)); | ||||||
4871 | }; | ||||||
4872 | |||||||
4873 | packLanes(RayOrigin); | ||||||
4874 | if (IsA16) { | ||||||
4875 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); | ||||||
4876 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); | ||||||
4877 | Register R1 = MRI.createGenericVirtualRegister(S32); | ||||||
4878 | Register R2 = MRI.createGenericVirtualRegister(S32); | ||||||
4879 | Register R3 = MRI.createGenericVirtualRegister(S32); | ||||||
4880 | B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); | ||||||
4881 | B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); | ||||||
4882 | B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); | ||||||
4883 | Ops.push_back(R1); | ||||||
4884 | Ops.push_back(R2); | ||||||
4885 | Ops.push_back(R3); | ||||||
4886 | } else { | ||||||
4887 | packLanes(RayDir); | ||||||
4888 | packLanes(RayInvDir); | ||||||
4889 | } | ||||||
4890 | |||||||
4891 | if (!UseNSA) { | ||||||
4892 | // Build a single vector containing all the operands so far prepared. | ||||||
4893 | LLT OpTy = LLT::fixed_vector(Ops.size(), 32); | ||||||
4894 | Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); | ||||||
4895 | Ops.clear(); | ||||||
4896 | Ops.push_back(MergedOps); | ||||||
4897 | } | ||||||
4898 | |||||||
4899 | auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) | ||||||
4900 | .addDef(DstReg) | ||||||
4901 | .addImm(Opcode); | ||||||
4902 | |||||||
4903 | for (Register R : Ops) { | ||||||
4904 | MIB.addUse(R); | ||||||
4905 | } | ||||||
4906 | |||||||
4907 | MIB.addUse(TDescr) | ||||||
4908 | .addImm(IsA16 ? 1 : 0) | ||||||
4909 | .cloneMemRefs(MI); | ||||||
4910 | |||||||
4911 | MI.eraseFromParent(); | ||||||
4912 | return true; | ||||||
4913 | } | ||||||
4914 | |||||||
4915 | bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||||||
4916 | MachineInstr &MI) const { | ||||||
4917 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
4918 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
4919 | |||||||
4920 | // Replace the use G_BRCOND with the exec manipulate and branch pseudos. | ||||||
4921 | auto IntrID = MI.getIntrinsicID(); | ||||||
4922 | switch (IntrID) { | ||||||
| |||||||
4923 | case Intrinsic::amdgcn_if: | ||||||
4924 | case Intrinsic::amdgcn_else: { | ||||||
4925 | MachineInstr *Br = nullptr; | ||||||
4926 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||
4927 | bool Negated = false; | ||||||
4928 | if (MachineInstr *BrCond = | ||||||
4929 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||
4930 | const SIRegisterInfo *TRI | ||||||
4931 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
4932 | |||||||
4933 | Register Def = MI.getOperand(1).getReg(); | ||||||
4934 | Register Use = MI.getOperand(3).getReg(); | ||||||
4935 | |||||||
4936 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||
4937 | |||||||
4938 | if (Negated) | ||||||
4939 | std::swap(CondBrTarget, UncondBrTarget); | ||||||
4940 | |||||||
4941 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||
4942 | if (IntrID == Intrinsic::amdgcn_if) { | ||||||
4943 | B.buildInstr(AMDGPU::SI_IF) | ||||||
4944 | .addDef(Def) | ||||||
4945 | .addUse(Use) | ||||||
4946 | .addMBB(UncondBrTarget); | ||||||
4947 | } else { | ||||||
4948 | B.buildInstr(AMDGPU::SI_ELSE) | ||||||
4949 | .addDef(Def) | ||||||
4950 | .addUse(Use) | ||||||
4951 | .addMBB(UncondBrTarget); | ||||||
4952 | } | ||||||
4953 | |||||||
4954 | if (Br) { | ||||||
4955 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||
4956 | } else { | ||||||
4957 | // The IRTranslator skips inserting the G_BR for fallthrough cases, but | ||||||
4958 | // since we're swapping branch targets it needs to be reinserted. | ||||||
4959 | // FIXME: IRTranslator should probably not do this | ||||||
4960 | B.buildBr(*CondBrTarget); | ||||||
4961 | } | ||||||
4962 | |||||||
4963 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||||
4964 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||||
4965 | MI.eraseFromParent(); | ||||||
4966 | BrCond->eraseFromParent(); | ||||||
4967 | return true; | ||||||
4968 | } | ||||||
4969 | |||||||
4970 | return false; | ||||||
4971 | } | ||||||
4972 | case Intrinsic::amdgcn_loop: { | ||||||
4973 | MachineInstr *Br = nullptr; | ||||||
4974 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||
4975 | bool Negated = false; | ||||||
4976 | if (MachineInstr *BrCond = | ||||||
4977 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||
4978 | const SIRegisterInfo *TRI | ||||||
4979 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
4980 | |||||||
4981 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||
4982 | Register Reg = MI.getOperand(2).getReg(); | ||||||
4983 | |||||||
4984 | if (Negated) | ||||||
4985 | std::swap(CondBrTarget, UncondBrTarget); | ||||||
4986 | |||||||
4987 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||
4988 | B.buildInstr(AMDGPU::SI_LOOP) | ||||||
4989 | .addUse(Reg) | ||||||
4990 | .addMBB(UncondBrTarget); | ||||||
4991 | |||||||
4992 | if (Br) | ||||||
4993 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||
4994 | else | ||||||
4995 | B.buildBr(*CondBrTarget); | ||||||
4996 | |||||||
4997 | MI.eraseFromParent(); | ||||||
4998 | BrCond->eraseFromParent(); | ||||||
4999 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||||
5000 | return true; | ||||||
5001 | } | ||||||
5002 | |||||||
5003 | return false; | ||||||
5004 | } | ||||||
5005 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||||
5006 | if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { | ||||||
5007 | // This only makes sense to call in a kernel, so just lower to null. | ||||||
5008 | B.buildConstant(MI.getOperand(0).getReg(), 0); | ||||||
5009 | MI.eraseFromParent(); | ||||||
5010 | return true; | ||||||
5011 | } | ||||||
5012 | |||||||
5013 | return legalizePreloadedArgIntrin( | ||||||
5014 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||
5015 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||||
5016 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||||
5017 | case Intrinsic::amdgcn_workitem_id_x: | ||||||
5018 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5019 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||||
5020 | case Intrinsic::amdgcn_workitem_id_y: | ||||||
5021 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5022 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||||
5023 | case Intrinsic::amdgcn_workitem_id_z: | ||||||
5024 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5025 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||||
5026 | case Intrinsic::amdgcn_workgroup_id_x: | ||||||
5027 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5028 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||||
5029 | case Intrinsic::amdgcn_workgroup_id_y: | ||||||
5030 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5031 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||||
5032 | case Intrinsic::amdgcn_workgroup_id_z: | ||||||
5033 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5034 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||||
5035 | case Intrinsic::amdgcn_dispatch_ptr: | ||||||
5036 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5037 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||||
5038 | case Intrinsic::amdgcn_queue_ptr: | ||||||
5039 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5040 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||||
5041 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||||
5042 | return legalizePreloadedArgIntrin( | ||||||
5043 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||||
5044 | case Intrinsic::amdgcn_dispatch_id: | ||||||
5045 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
5046 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||||
5047 | case Intrinsic::amdgcn_fdiv_fast: | ||||||
5048 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||||
5049 | case Intrinsic::amdgcn_is_shared: | ||||||
5050 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||||
5051 | case Intrinsic::amdgcn_is_private: | ||||||
5052 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||||
5053 | case Intrinsic::amdgcn_wavefrontsize: { | ||||||
5054 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||||
5055 | MI.eraseFromParent(); | ||||||
5056 | return true; | ||||||
5057 | } | ||||||
5058 | case Intrinsic::amdgcn_s_buffer_load: | ||||||
5059 | return legalizeSBufferLoad(Helper, MI); | ||||||
5060 | case Intrinsic::amdgcn_raw_buffer_store: | ||||||
5061 | case Intrinsic::amdgcn_struct_buffer_store: | ||||||
5062 | return legalizeBufferStore(MI, MRI, B, false, false); | ||||||
5063 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||||
5064 | case Intrinsic::amdgcn_struct_buffer_store_format: | ||||||
5065 | return legalizeBufferStore(MI, MRI, B, false, true); | ||||||
5066 | case Intrinsic::amdgcn_raw_tbuffer_store: | ||||||
5067 | case Intrinsic::amdgcn_struct_tbuffer_store: | ||||||
5068 | return legalizeBufferStore(MI, MRI, B, true, true); | ||||||
5069 | case Intrinsic::amdgcn_raw_buffer_load: | ||||||
5070 | case Intrinsic::amdgcn_struct_buffer_load: | ||||||
5071 | return legalizeBufferLoad(MI, MRI, B, false, false); | ||||||
5072 | case Intrinsic::amdgcn_raw_buffer_load_format: | ||||||
5073 | case Intrinsic::amdgcn_struct_buffer_load_format: | ||||||
5074 | return legalizeBufferLoad(MI, MRI, B, true, false); | ||||||
5075 | case Intrinsic::amdgcn_raw_tbuffer_load: | ||||||
5076 | case Intrinsic::amdgcn_struct_tbuffer_load: | ||||||
5077 | return legalizeBufferLoad(MI, MRI, B, true, true); | ||||||
5078 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||
5079 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||
5080 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||
5081 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||
5082 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||
5083 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||
5084 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||
5085 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||
5086 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||
5087 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||
5088 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||
5089 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||
5090 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||
5091 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||
5092 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||
5093 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||
5094 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||
5095 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||
5096 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||
5097 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||
5098 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||
5099 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||
5100 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||
5101 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||
5102 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||||
5103 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||||
5104 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||
5105 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||
5106 | case Intrinsic::amdgcn_buffer_atomic_fadd: | ||||||
5107 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||||
5108 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||||
5109 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||||
5110 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||||
5111 | return legalizeBufferAtomic(MI, B, IntrID); | ||||||
5112 | case Intrinsic::amdgcn_atomic_inc: | ||||||
5113 | return legalizeAtomicIncDec(MI, B, true); | ||||||
5114 | case Intrinsic::amdgcn_atomic_dec: | ||||||
5115 | return legalizeAtomicIncDec(MI, B, false); | ||||||
5116 | case Intrinsic::trap: | ||||||
5117 | return legalizeTrapIntrinsic(MI, MRI, B); | ||||||
5118 | case Intrinsic::debugtrap: | ||||||
5119 | return legalizeDebugTrapIntrinsic(MI, MRI, B); | ||||||
5120 | case Intrinsic::amdgcn_rsq_clamp: | ||||||
5121 | return legalizeRsqClampIntrinsic(MI, MRI, B); | ||||||
5122 | case Intrinsic::amdgcn_ds_fadd: | ||||||
5123 | case Intrinsic::amdgcn_ds_fmin: | ||||||
5124 | case Intrinsic::amdgcn_ds_fmax: | ||||||
5125 | return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); | ||||||
5126 | case Intrinsic::amdgcn_image_bvh_intersect_ray: | ||||||
5127 | return legalizeBVHIntrinsic(MI, B); | ||||||
5128 | default: { | ||||||
5129 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||||||
5130 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | ||||||
5131 | return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); | ||||||
5132 | return true; | ||||||
5133 | } | ||||||
5134 | } | ||||||
5135 | |||||||
5136 | return true; | ||||||
5137 | } |
1 | //==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
10 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
11 | |
12 | #include "llvm/CodeGen/Register.h" |
13 | #include "llvm/Pass.h" |
14 | |
15 | namespace llvm { |
16 | |
17 | class Function; |
18 | class LLT; |
19 | class raw_ostream; |
20 | class TargetRegisterClass; |
21 | class TargetRegisterInfo; |
22 | |
23 | struct ArgDescriptor { |
24 | private: |
25 | friend struct AMDGPUFunctionArgInfo; |
26 | friend class AMDGPUArgumentUsageInfo; |
27 | |
28 | union { |
29 | MCRegister Reg; |
30 | unsigned StackOffset; |
31 | }; |
32 | |
33 | // Bitmask to locate argument within the register. |
34 | unsigned Mask; |
35 | |
36 | bool IsStack : 1; |
37 | bool IsSet : 1; |
38 | |
39 | public: |
40 | constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, |
41 | bool IsStack = false, bool IsSet = false) |
42 | : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} |
43 | |
44 | static constexpr ArgDescriptor createRegister(Register Reg, |
45 | unsigned Mask = ~0u) { |
46 | return ArgDescriptor(Reg, Mask, false, true); |
47 | } |
48 | |
49 | static constexpr ArgDescriptor createStack(unsigned Offset, |
50 | unsigned Mask = ~0u) { |
51 | return ArgDescriptor(Offset, Mask, true, true); |
52 | } |
53 | |
54 | static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, |
55 | unsigned Mask) { |
56 | return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); |
57 | } |
58 | |
59 | bool isSet() const { |
60 | return IsSet; |
61 | } |
62 | |
63 | explicit operator bool() const { |
64 | return isSet(); |
65 | } |
66 | |
67 | bool isRegister() const { |
68 | return !IsStack; |
69 | } |
70 | |
71 | MCRegister getRegister() const { |
72 | assert(!IsStack)(static_cast <bool> (!IsStack) ? void (0) : __assert_fail ("!IsStack", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h" , 72, __extension__ __PRETTY_FUNCTION__)); |
73 | return Reg; |
74 | } |
75 | |
76 | unsigned getStackOffset() const { |
77 | assert(IsStack)(static_cast <bool> (IsStack) ? void (0) : __assert_fail ("IsStack", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h" , 77, __extension__ __PRETTY_FUNCTION__)); |
78 | return StackOffset; |
79 | } |
80 | |
81 | unsigned getMask() const { |
82 | return Mask; |
83 | } |
84 | |
85 | bool isMasked() const { |
86 | return Mask != ~0u; |
87 | } |
88 | |
89 | void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; |
90 | }; |
91 | |
92 | inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { |
93 | Arg.print(OS); |
94 | return OS; |
95 | } |
96 | |
97 | struct AMDGPUFunctionArgInfo { |
98 | enum PreloadedValue { |
99 | // SGPRS: |
100 | PRIVATE_SEGMENT_BUFFER = 0, |
101 | DISPATCH_PTR = 1, |
102 | QUEUE_PTR = 2, |
103 | KERNARG_SEGMENT_PTR = 3, |
104 | DISPATCH_ID = 4, |
105 | FLAT_SCRATCH_INIT = 5, |
106 | WORKGROUP_ID_X = 10, |
107 | WORKGROUP_ID_Y = 11, |
108 | WORKGROUP_ID_Z = 12, |
109 | PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, |
110 | IMPLICIT_BUFFER_PTR = 15, |
111 | IMPLICIT_ARG_PTR = 16, |
112 | |
113 | // VGPRS: |
114 | WORKITEM_ID_X = 17, |
115 | WORKITEM_ID_Y = 18, |
116 | WORKITEM_ID_Z = 19, |
117 | FIRST_VGPR_VALUE = WORKITEM_ID_X |
118 | }; |
119 | |
120 | // Kernel input registers setup for the HSA ABI in allocation order. |
121 | |
122 | // User SGPRs in kernels |
123 | // XXX - Can these require argument spills? |
124 | ArgDescriptor PrivateSegmentBuffer; |
125 | ArgDescriptor DispatchPtr; |
126 | ArgDescriptor QueuePtr; |
127 | ArgDescriptor KernargSegmentPtr; |
128 | ArgDescriptor DispatchID; |
129 | ArgDescriptor FlatScratchInit; |
130 | ArgDescriptor PrivateSegmentSize; |
131 | |
132 | // System SGPRs in kernels. |
133 | ArgDescriptor WorkGroupIDX; |
134 | ArgDescriptor WorkGroupIDY; |
135 | ArgDescriptor WorkGroupIDZ; |
136 | ArgDescriptor WorkGroupInfo; |
137 | ArgDescriptor PrivateSegmentWaveByteOffset; |
138 | |
139 | // Pointer with offset from kernargsegmentptr to where special ABI arguments |
140 | // are passed to callable functions. |
141 | ArgDescriptor ImplicitArgPtr; |
142 | |
143 | // Input registers for non-HSA ABI |
144 | ArgDescriptor ImplicitBufferPtr; |
145 | |
146 | // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed |
147 | // into v0, 10 bits per dimension if packed-tid is set. |
148 | ArgDescriptor WorkItemIDX; |
149 | ArgDescriptor WorkItemIDY; |
150 | ArgDescriptor WorkItemIDZ; |
151 | |
152 | std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> |
153 | getPreloadedValue(PreloadedValue Value) const; |
154 | |
155 | static constexpr AMDGPUFunctionArgInfo fixedABILayout(); |
156 | }; |
157 | |
/// Immutable pass that maps each Function to the AMDGPUFunctionArgInfo
/// describing where its special ABI input arguments were placed.
/// NOTE(review): the exact producer of these entries (setFuncArgInfo callers)
/// is outside this file — confirm against the SIMachineFunctionInfo users.
class AMDGPUArgumentUsageInfo : public ImmutablePass {
private:
  // Per-function argument placement info, populated via setFuncArgInfo().
  DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;

public:
  static char ID;

  // Shared fallback descriptors; presumably returned by lookupFuncArgInfo for
  // functions absent from the map (external / fixed-ABI callees) — the
  // out-of-line definition is not visible here, so confirm in the .cpp.
  static const AMDGPUFunctionArgInfo ExternFunctionInfo;
  static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;

  AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }

  // This pass only records data; it preserves all other analyses.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool doFinalization(Module &M) override;

  void print(raw_ostream &OS, const Module *M = nullptr) const override;

  /// Record (or overwrite) the argument info for \p F.
  void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
    ArgInfoMap[&F] = ArgInfo;
  }

  /// Retrieve the argument info for \p F (defined out of line).
  const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
};
185 | |
186 | } // end namespace llvm |
187 | |
188 | #endif |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file contains some functions that are useful for math stuff. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||||
14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||||
15 | |||||||
16 | #include "llvm/Support/Compiler.h" | ||||||
17 | #include <cassert> | ||||||
18 | #include <climits> | ||||||
19 | #include <cmath> | ||||||
20 | #include <cstdint> | ||||||
21 | #include <cstring> | ||||||
22 | #include <limits> | ||||||
23 | #include <type_traits> | ||||||
24 | |||||||
25 | #ifdef __ANDROID_NDK__ | ||||||
26 | #include <android/api-level.h> | ||||||
27 | #endif | ||||||
28 | |||||||
29 | #ifdef _MSC_VER | ||||||
30 | // Declare these intrinsics manually rather including intrin.h. It's very | ||||||
31 | // expensive, and MathExtras.h is popular. | ||||||
32 | // #include <intrin.h> | ||||||
33 | extern "C" { | ||||||
34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||||
35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||||
37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
38 | } | ||||||
39 | #endif | ||||||
40 | |||||||
41 | namespace llvm { | ||||||
42 | |||||||
/// The behavior an operation has on an input of 0.
/// The count* functions below accept ZB_Width or ZB_Undefined; the find*Set
/// functions accept ZB_Max or ZB_Undefined (see their \param docs).
enum ZeroBehavior {
  /// The returned value is undefined.
  ZB_Undefined,
  /// The returned value is numeric_limits<T>::max()
  ZB_Max,
  /// The returned value is numeric_limits<T>::digits
  ZB_Width
};
52 | |||||||
/// Mathematical constants.
namespace numbers {
// TODO: Track C++20 std::numbers.
// TODO: Favor using the hexadecimal FP constants (requires C++17).
// Each entry notes its correctly-rounded hexadecimal-FP form and, where one
// exists, the matching OEIS sequence for the digits.
constexpr double e          = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
                 egamma     = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
                 ln2        = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
                 ln10       = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
                 log2e      = 1.4426950408889634074, // (0x1.71547652b82feP+0)
                 log10e     = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
                 pi         = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
                 inv_pi     = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
                 sqrtpi     = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
                 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
                 sqrt2      = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
                 inv_sqrt2  = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
                 sqrt3      = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
                 inv_sqrt3  = .57735026918962576451, // (0x1.279a74590331cP-1)
                 phi        = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
constexpr float ef          = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
                egammaf     = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
                ln2f        = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
                ln10f       = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
                log2ef      = 1.44269504F, // (0x1.715476P+0)
                log10ef     = .434294482F, // (0x1.bcb7b2P-2)
                pif         = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
                inv_pif     = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
                sqrtpif     = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
                inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
                sqrt2f      = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
                inv_sqrt2f  = .707106781F, // (0x1.6a09e6P-1)
                sqrt3f      = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
                inv_sqrt3f  = .577350269F, // (0x1.279a74P-1)
                phif        = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
} // namespace numbers
88 | |||||||
89 | namespace detail { | ||||||
90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||||
91 | static unsigned count(T Val, ZeroBehavior) { | ||||||
92 | if (!Val) | ||||||
93 | return std::numeric_limits<T>::digits; | ||||||
94 | if (Val & 0x1) | ||||||
95 | return 0; | ||||||
96 | |||||||
97 | // Bisection method. | ||||||
98 | unsigned ZeroBits = 0; | ||||||
99 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||||
100 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||||
101 | while (Shift) { | ||||||
102 | if ((Val & Mask) == 0) { | ||||||
103 | Val >>= Shift; | ||||||
104 | ZeroBits |= Shift; | ||||||
105 | } | ||||||
106 | Shift >>= 1; | ||||||
107 | Mask >>= Shift; | ||||||
108 | } | ||||||
109 | return ZeroBits; | ||||||
110 | } | ||||||
111 | }; | ||||||
112 | |||||||
113 | #if defined(__GNUC__4) || defined(_MSC_VER) | ||||||
114 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||||
115 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
116 | if (ZB
| ||||||
117 | return 32; | ||||||
118 | |||||||
119 | #if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4) | ||||||
120 | return __builtin_ctz(Val); | ||||||
121 | #elif defined(_MSC_VER) | ||||||
122 | unsigned long Index; | ||||||
123 | _BitScanForward(&Index, Val); | ||||||
124 | return Index; | ||||||
125 | #endif | ||||||
126 | } | ||||||
127 | }; | ||||||
128 | |||||||
129 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
130 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||||
131 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
132 | if (ZB != ZB_Undefined && Val == 0) | ||||||
133 | return 64; | ||||||
134 | |||||||
135 | #if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4) | ||||||
136 | return __builtin_ctzll(Val); | ||||||
137 | #elif defined(_MSC_VER) | ||||||
138 | unsigned long Index; | ||||||
139 | _BitScanForward64(&Index, Val); | ||||||
140 | return Index; | ||||||
141 | #endif | ||||||
142 | } | ||||||
143 | }; | ||||||
144 | #endif | ||||||
145 | #endif | ||||||
146 | } // namespace detail | ||||||
147 | |||||||
/// Count number of 0's from the least significant bit to the most
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
/// valid arguments.
template <typename T>
unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T) so 4- and 8-byte types pick the intrinsic-based
  // specializations when the toolchain provides them.
  return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
}
162 | |||||||
163 | namespace detail { | ||||||
template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
  // Portable fallback used when no intrinsic-based specialization applies.
  // The ZeroBehavior argument is unused: a zero input always yields the full
  // bit width here (i.e. ZB_Width semantics).
  static unsigned count(T Val, ZeroBehavior) {
    if (!Val)
      return std::numeric_limits<T>::digits;

    // Bisection method: if the upper half of the current window is nonzero,
    // narrow to it (shift down); otherwise record Shift leading zeros.  The
    // recorded shifts sum to the leading-zero count.
    unsigned ZeroBits = 0;
    for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
      T Tmp = Val >> Shift;
      if (Tmp)
        Val = Tmp;
      else
        ZeroBits |= Shift;
    }
    return ZeroBits;
  }
};
181 | |||||||
182 | #if defined(__GNUC__4) || defined(_MSC_VER) | ||||||
183 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||||
184 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
185 | if (ZB != ZB_Undefined && Val == 0) | ||||||
186 | return 32; | ||||||
187 | |||||||
188 | #if __has_builtin(__builtin_clz)1 || defined(__GNUC__4) | ||||||
189 | return __builtin_clz(Val); | ||||||
190 | #elif defined(_MSC_VER) | ||||||
191 | unsigned long Index; | ||||||
192 | _BitScanReverse(&Index, Val); | ||||||
193 | return Index ^ 31; | ||||||
194 | #endif | ||||||
195 | } | ||||||
196 | }; | ||||||
197 | |||||||
198 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
199 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||||
200 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
201 | if (ZB != ZB_Undefined && Val == 0) | ||||||
202 | return 64; | ||||||
203 | |||||||
204 | #if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4) | ||||||
205 | return __builtin_clzll(Val); | ||||||
206 | #elif defined(_MSC_VER) | ||||||
207 | unsigned long Index; | ||||||
208 | _BitScanReverse64(&Index, Val); | ||||||
209 | return Index ^ 63; | ||||||
210 | #endif | ||||||
211 | } | ||||||
212 | }; | ||||||
213 | #endif | ||||||
214 | #endif | ||||||
215 | } // namespace detail | ||||||
216 | |||||||
/// Count number of 0's from the most significant bit to the least
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
/// valid arguments.
template <typename T>
unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T) so 4- and 8-byte types pick the intrinsic-based
  // specializations when the toolchain provides them.
  return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
}
231 | |||||||
/// Get the index of the first set bit starting from the least
/// significant bit.
///
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
/// valid arguments.
template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
  if (ZB == ZB_Max && Val == 0)
    return std::numeric_limits<T>::max();

  // For a nonzero value, the index of the lowest set bit equals the number of
  // trailing zeros.
  return countTrailingZeros(Val, ZB_Undefined);
}
245 | |||||||
/// Create a bitmask with the N right-most bits set to 1, and all other
/// bits set to 0.  Only unsigned types are allowed.
template <typename T> T maskTrailingOnes(unsigned N) {
  static_assert(std::is_unsigned<T>::value, "Invalid type!");
  const unsigned Bits = CHAR_BIT * sizeof(T);
  assert(N <= Bits && "Invalid bit index");
  // N == 0 must be special-cased: shifting T(-1) right by Bits would be
  // undefined behavior.
  return N == 0 ? 0 : (T(-1) >> (Bits - N));
}
254 | |||||||
255 | /// Create a bitmask with the N left-most bits set to 1, and all other | ||||||
256 | /// bits set to 0. Only unsigned types are allowed. | ||||||
257 | template <typename T> T maskLeadingOnes(unsigned N) { | ||||||
258 | return ~maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N); | ||||||
259 | } | ||||||
260 | |||||||
261 | /// Create a bitmask with the N right-most bits set to 0, and all other | ||||||
262 | /// bits set to 1. Only unsigned types are allowed. | ||||||
263 | template <typename T> T maskTrailingZeros(unsigned N) { | ||||||
264 | return maskLeadingOnes<T>(CHAR_BIT8 * sizeof(T) - N); | ||||||
265 | } | ||||||
266 | |||||||
267 | /// Create a bitmask with the N left-most bits set to 0, and all other | ||||||
268 | /// bits set to 1. Only unsigned types are allowed. | ||||||
269 | template <typename T> T maskLeadingZeros(unsigned N) { | ||||||
270 | return maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N); | ||||||
271 | } | ||||||
272 | |||||||
/// Get the index of the last set bit starting from the least
/// significant bit.
///
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
/// valid arguments.
template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
  if (ZB == ZB_Max && Val == 0)
    return std::numeric_limits<T>::max();

  // Use ^ instead of - because both gcc and llvm can remove the associated ^
  // in the __builtin_clz intrinsic on x86.
  // For clz in [0, digits - 1], clz ^ (digits - 1) == (digits - 1) - clz,
  // i.e. the index of the highest set bit.
  return countLeadingZeros(Val, ZB_Undefined) ^
         (std::numeric_limits<T>::digits - 1);
}
289 | |||||||
/// Macro compressed bit reversal table for 256 bits.
///
/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
static const unsigned char BitReverseTable256[256] = {
  // R2(n) expands to the reversals n, n+128, n+64, n+192; R4 and R6 recurse
  // over the middle and low bit pairs, covering all 256 byte values.
#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
  R6(0), R6(2), R6(1), R6(3)
#undef R2
#undef R4
#undef R6
};
302 | |||||||
303 | /// Reverse the bits in \p Val. | ||||||
304 | template <typename T> | ||||||
305 | T reverseBits(T Val) { | ||||||
306 | unsigned char in[sizeof(Val)]; | ||||||
307 | unsigned char out[sizeof(Val)]; | ||||||
308 | std::memcpy(in, &Val, sizeof(Val)); | ||||||
309 | for (unsigned i = 0; i < sizeof(Val); ++i) | ||||||
310 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; | ||||||
311 | std::memcpy(&Val, out, sizeof(Val)); | ||||||
312 | return Val; | ||||||
313 | } | ||||||
314 | |||||||
// Hardware-accelerated specializations where the compiler provides
// bit-reverse builtins.
#if __has_builtin(__builtin_bitreverse8)
template<>
inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
  return __builtin_bitreverse8(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse16)
template<>
inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
  return __builtin_bitreverse16(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse32)
template<>
inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
  return __builtin_bitreverse32(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse64)
template<>
inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
  return __builtin_bitreverse64(Val);
}
#endif
342 | |||||||
// NOTE: The following support functions use the _32/_64 extensions instead of
// type overloading so that signed and unsigned integers can be used without
// ambiguity.

/// Return the high 32 bits of a 64 bit value.
constexpr inline uint32_t Hi_32(uint64_t Value) {
  return static_cast<uint32_t>(Value >> 32);
}

/// Return the low 32 bits of a 64 bit value.
constexpr inline uint32_t Lo_32(uint64_t Value) {
  return static_cast<uint32_t>(Value);
}

/// Make a 64-bit integer from a high / low pair of 32-bit integers.
constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
  // static_cast (not C-style casts) per modern style; widen before shifting
  // so the shift happens in 64 bits.
  return (static_cast<uint64_t>(High) << 32) | static_cast<uint64_t>(Low);
}
361 | |||||||
/// Checks if an integer fits into the given bit width.
template <unsigned N> constexpr inline bool isInt(int64_t x) {
  // N >= 64 must short-circuit: every int64_t fits, and it also avoids an
  // undefined shift by N - 1 >= 63 below.
  return N >= 64 ||
         (-(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1)));
}
// Template specializations to get better code for common cases.
template <> constexpr inline bool isInt<8>(int64_t x) {
  return static_cast<int8_t>(x) == x;
}
template <> constexpr inline bool isInt<16>(int64_t x) {
  return static_cast<int16_t>(x) == x;
}
template <> constexpr inline bool isInt<32>(int64_t x) {
  return static_cast<int32_t>(x) == x;
}
376 | |||||||
377 | /// Checks if a signed integer is an N bit number shifted left by S. | ||||||
378 | template <unsigned N, unsigned S> | ||||||
379 | constexpr inline bool isShiftedInt(int64_t x) { | ||||||
380 | static_assert( | ||||||
381 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); | ||||||
382 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); | ||||||
383 | return isInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0); | ||||||
384 | } | ||||||
385 | |||||||
/// Checks if an unsigned integer fits into the given bit width.
///
/// This is written as two functions rather than as simply
///
///   return N >= 64 || X < (UINT64_C(1) << N);
///
/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
/// left too many places.
template <unsigned N>
constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
  static_assert(N > 0, "isUInt<0> doesn't make sense");
  return X < (UINT64_C(1) << (N));
}
template <unsigned N>
constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
  return true;
}

// Template specializations to get better code for common cases.
template <> constexpr inline bool isUInt<8>(uint64_t x) {
  return static_cast<uint8_t>(x) == x;
}
template <> constexpr inline bool isUInt<16>(uint64_t x) {
  return static_cast<uint16_t>(x) == x;
}
template <> constexpr inline bool isUInt<32>(uint64_t x) {
  return static_cast<uint32_t>(x) == x;
}
414 | |||||||
415 | /// Checks if a unsigned integer is an N bit number shifted left by S. | ||||||
416 | template <unsigned N, unsigned S> | ||||||
417 | constexpr inline bool isShiftedUInt(uint64_t x) { | ||||||
418 | static_assert( | ||||||
419 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); | ||||||
420 | static_assert(N + S <= 64, | ||||||
421 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); | ||||||
422 | // Per the two static_asserts above, S must be strictly less than 64. So | ||||||
423 | // 1 << S is not undefined behavior. | ||||||
424 | return isUInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0); | ||||||
425 | } | ||||||
426 | |||||||
/// Gets the maximum value for a N-bit unsigned integer.
inline uint64_t maxUIntN(uint64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // uint64_t(1) << 64 is undefined behavior, so we can't do
  //   (uint64_t(1) << N) - 1
  // without checking first that N != 64. But this works and doesn't have a
  // branch.
  return UINT64_MAX >> (64 - N);
}
437 | |||||||
/// Gets the minimum value for a N-bit signed integer.
inline int64_t minIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // 1 + ~(1 << (N - 1)) computes -(2^(N-1)) in uint64_t arithmetic, which
  // stays well-defined even at N == 64; the conversion to int64_t happens
  // only on return.
  return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
}
444 | |||||||
/// Gets the maximum value for a N-bit signed integer.
inline int64_t maxIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // This relies on two's complement wraparound when N == 64, so we convert to
  // int64_t only at the very end to avoid UB.
  return (UINT64_C(1) << (N - 1)) - 1;
}
453 | |||||||
/// Checks if an unsigned integer fits into the given (dynamic) bit width.
inline bool isUIntN(unsigned N, uint64_t x) {
  // N >= 64 must short-circuit: maxUIntN asserts N <= 64.
  return N >= 64 || x <= maxUIntN(N);
}

/// Checks if a signed integer fits into the given (dynamic) bit width.
inline bool isIntN(unsigned N, int64_t x) {
  return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
}
463 | |||||||
/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (32 bit version).
/// Ex. isMask_32(0x0000FFFFU) == true.
constexpr inline bool isMask_32(uint32_t Value) {
  // A low-bit mask is 2^k - 1, so adding one clears every set bit.
  return Value != 0 && (Value & (Value + 1)) == 0;
}

/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (64 bit version).
constexpr inline bool isMask_64(uint64_t Value) {
  return Value != 0 && (Value & (Value + 1)) == 0;
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
constexpr inline bool isShiftedMask_32(uint32_t Value) {
  // Filling in the trailing zeros turns a shifted mask into a low-bit mask.
  return Value != 0 && isMask_32(Value | (Value - 1));
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (64 bit version.)
constexpr inline bool isShiftedMask_64(uint64_t Value) {
  return Value != 0 && isMask_64(Value | (Value - 1));
}
488 | |||||||
/// Return true if the argument is a power of two > 0.
/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
constexpr inline bool isPowerOf2_32(uint32_t Value) {
  // Exactly one bit set <=> clearing the lowest set bit yields zero.
  return Value != 0 && (Value & (Value - 1)) == 0;
}

/// Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr inline bool isPowerOf2_64(uint64_t Value) {
  return Value != 0 && (Value & (Value - 1)) == 0;
}
499 | |||||||
/// Count the number of ones from the most significant bit to the first
/// zero bit.
///
/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of all ones. Only ZB_Width and
/// ZB_Undefined are valid arguments.
template <typename T>
unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Complementing turns leading ones into leading zeros.
  return countLeadingZeros<T>(~Value, ZB);
}

/// Count the number of ones from the least significant bit to the first
/// zero bit.
///
/// Ex. countTrailingOnes(0x00FF00FF) == 8.
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of all ones. Only ZB_Width and
/// ZB_Undefined are valid arguments.
template <typename T>
unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Complementing turns trailing ones into trailing zeros.
  return countTrailingZeros<T>(~Value, ZB);
}
531 | |||||||
532 | namespace detail { | ||||||
533 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { | ||||||
534 | static unsigned count(T Value) { | ||||||
535 | // Generic version, forward to 32 bits. | ||||||
536 | static_assert(SizeOfT <= 4, "Not implemented!"); | ||||||
537 | #if defined(__GNUC__4) | ||||||
538 | return __builtin_popcount(Value); | ||||||
539 | #else | ||||||
540 | uint32_t v = Value; | ||||||
541 | v = v - ((v >> 1) & 0x55555555); | ||||||
542 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); | ||||||
543 | return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; | ||||||
544 | #endif | ||||||
545 | } | ||||||
546 | }; | ||||||
547 | |||||||
548 | template <typename T> struct PopulationCounter<T, 8> { | ||||||
549 | static unsigned count(T Value) { | ||||||
550 | #if defined(__GNUC__4) | ||||||
551 | return __builtin_popcountll(Value); | ||||||
552 | #else | ||||||
553 | uint64_t v = Value; | ||||||
554 | v = v - ((v >> 1) & 0x5555555555555555ULL); | ||||||
555 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); | ||||||
556 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; | ||||||
557 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); | ||||||
558 | #endif | ||||||
559 | } | ||||||
560 | }; | ||||||
561 | } // namespace detail | ||||||
562 | |||||||
/// Count the number of set bits in a value.
/// Ex. countPopulation(0xF000F000) = 8
/// Returns 0 if the word is zero.
template <typename T>
inline unsigned countPopulation(T Value) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T): types up to 4 bytes use the 32-bit counter,
  // 8-byte types the 64-bit specialization.
  return detail::PopulationCounter<T, sizeof(T)>::count(Value);
}
573 | |||||||
/// Compile time Log2.
/// Valid only for positive powers of two.
template <size_t kValue> constexpr inline size_t CTLog2() {
  static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
                "Value is not a valid power of 2");
  // Halve the value each step; every recursion contributes 1 until the
  // kValue == 1 base case below terminates it.
  return 1 + CTLog2<kValue / 2>();
}

template <> constexpr inline size_t CTLog2<1>() { return 0; }
583 | |||||||
/// Return the log base 2 of the specified value.
inline double Log2(double Value) {
#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
  // Old Android NDK API levels lack log2(); use the identity
  // log2(x) == log(x) / log(2) instead.
  return __builtin_log(Value) / __builtin_log(2.0);
#else
  return log2(Value);
#endif
}
592 | |||||||
/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (32 bit edition.)
/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
inline unsigned Log2_32(uint32_t Value) {
  // countLeadingZeros(0) == 32 (ZB_Width), so a zero input wraps to
  // (unsigned)-1, matching the "-1" documented above.
  return 31 - countLeadingZeros(Value);
}

/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64(uint64_t Value) {
  return 63 - countLeadingZeros(Value);
}

/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
/// (32 bit edition).
/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
inline unsigned Log2_32_Ceil(uint32_t Value) {
  // 32 - clz(V - 1) == floor(log2(V - 1)) + 1 == ceil(log2(V)) for V > 1;
  // V == 1 gives clz(0) == 32 and thus 0, and V == 0 wraps to give 32.
  return 32 - countLeadingZeros(Value - 1);
}

/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64_Ceil(uint64_t Value) {
  return 64 - countLeadingZeros(Value - 1);
}
618 | |||||||
/// Return the greatest common divisor of the values using Euclid's algorithm.
template <typename T>
inline T greatestCommonDivisor(T A, T B) {
  // Iteratively replace (A, B) with (B, A mod B) until B reaches zero;
  // A then holds the GCD.
  while (B != 0) {
    T Remainder = A % B;
    A = B;
    B = Remainder;
  }
  return A;
}

/// 64-bit convenience wrapper around greatestCommonDivisor.
inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
  return greatestCommonDivisor<uint64_t>(A, B);
}
633 | |||||||
/// This function takes a 64-bit integer and returns the bit equivalent double.
inline double BitsToDouble(uint64_t Bits) {
  static_assert(sizeof(double) == sizeof(uint64_t), "Unexpected type sizes");
  // memcpy is the portable way to type-pun without violating strict aliasing.
  double Result;
  memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}
641 | |||||||
/// This function takes a 32-bit integer and returns the bit equivalent float.
inline float BitsToFloat(uint32_t Bits) {
  static_assert(sizeof(float) == sizeof(uint32_t), "Unexpected type sizes");
  // memcpy is the portable way to type-pun without violating strict aliasing.
  float Result;
  memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}
649 | |||||||
/// This function takes a double and returns the bit equivalent 64-bit integer.
/// Note that copying doubles around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint64_t DoubleToBits(double Double) {
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  // memcpy is the portable way to type-pun without violating strict aliasing.
  uint64_t Result;
  memcpy(&Result, &Double, sizeof(Result));
  return Result;
}
659 | |||||||
/// This function takes a float and returns the bit equivalent 32-bit integer.
/// Note that copying floats around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint32_t FloatToBits(float Float) {
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  // memcpy is the portable way to type-pun without violating strict aliasing.
  uint32_t Result;
  memcpy(&Result, &Float, sizeof(Result));
  return Result;
}
669 | |||||||
/// A and B are either alignments or offsets. Return the minimum alignment that
/// may be assumed after adding the two together.
constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
  // Isolate the lowest set bit of (A | B): the largest power of 2 dividing
  // both values. Written as (1 + ~V) instead of -V to sidestep MSVC warning
  // C4146 about negating an unsigned quantity.
  const uint64_t Combined = A | B;
  return Combined & (1 + ~Combined);
}
680 | |||||||
/// Returns the next power of two (in 64-bits) that is strictly greater than A.
/// Returns zero on overflow.
inline uint64_t NextPowerOf2(uint64_t A) {
  // Smear the highest set bit into every lower position (doubling the shift
  // each round covers all 64 bits), then add one. A == 0 yields 1; any
  // A >= 2^63 wraps to 0.
  for (unsigned Shift = 1; Shift < 64; Shift *= 2)
    A |= (A >> Shift);
  return A + 1;
}
692 | |||||||
693 | /// Returns the power of two which is less than or equal to the given value. | ||||||
694 | /// Essentially, it is a floor operation across the domain of powers of two. | ||||||
695 | inline uint64_t PowerOf2Floor(uint64_t A) { | ||||||
696 | if (!A) return 0; | ||||||
697 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); | ||||||
698 | } | ||||||
699 | |||||||
700 | /// Returns the power of two which is greater than or equal to the given value. | ||||||
701 | /// Essentially, it is a ceil operation across the domain of powers of two. | ||||||
702 | inline uint64_t PowerOf2Ceil(uint64_t A) { | ||||||
703 | if (!A) | ||||||
704 | return 0; | ||||||
705 | return NextPowerOf2(A - 1); | ||||||
706 | } | ||||||
707 | |||||||
/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
/// If non-zero \p Skew is specified, the return value will be a minimal
/// integer that is greater than or equal to \p Value and equal to
/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
///
/// Examples:
/// \code
///   alignTo(5, 8) = 8
///   alignTo(17, 8) = 24
///   alignTo(~0LL, 8) = 0
///   alignTo(321, 255) = 510
///
///   alignTo(5, 8, 7) = 7
///   alignTo(17, 8, 1) = 17
///   alignTo(~0LL, 8, 3) = 3
///   alignTo(321, 255, 42) = 552
/// \endcode
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  // NOTE: the extracted source carried the macro-expanded form of this assert
  // fused into the line; restored to the plain assertion.
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  // Bias by (Align - 1 - Skew) so truncating division rounds up to the next
  // multiple of Align, then re-apply the skew offset.
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}
733 | |||||||
/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
  static_assert(Align != 0u, "Align must be non-zero");
  // Round up: bias by Align - 1, truncate down to a multiple via the
  // divide-then-multiply pair.
  return ((Value + Align - 1) / Align) * Align;
}
740 | |||||||
/// Returns the integer ceil(Numerator / Denominator).
inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  // Equivalent to alignTo(Numerator, Denominator) / Denominator with the
  // helper folded in: bias before the truncating division to round up.
  // Identical mod 2**64, including the wraparound cases.
  return (Numerator + Denominator - 1) / Denominator;
}
745 | |||||||
/// Returns the integer nearest(Numerator / Denominator).
inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
  // Adding half the denominator before truncation rounds to nearest
  // (halves round up).
  const uint64_t HalfDenominator = Denominator / 2;
  return (Numerator + HalfDenominator) / Denominator;
}
750 | |||||||
/// Returns the largest uint64_t less than or equal to \p Value and is
/// \p Skew mod \p Align. \p Align must be non-zero
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  // NOTE: the extracted source carried the macro-expanded form of this assert
  // fused into the line; restored to the plain assertion.
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  // Truncate (Value - Skew) down to a multiple of Align, then restore Skew.
  return (Value - Skew) / Align * Align + Skew;
}
758 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 32, "Bit width out of range.");
  // Move the B-bit payload to the top, then arithmetic-shift it back so the
  // original sign bit fills the upper 32 - B positions.
  const unsigned Shift = 32 - B;
  return int32_t(X << Shift) >> Shift;
}
766 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
inline int32_t SignExtend32(uint32_t X, unsigned B) {
  // NOTE: the extracted source carried the macro-expanded forms of these
  // asserts fused into the lines; restored to the plain assertions.
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 32 && "Bit width out of range.");
  // Move the B-bit payload to the top, then arithmetic-shift it back so the
  // original sign bit fills the upper 32 - B positions.
  return int32_t(X << (32 - B)) >> (32 - B);
}
774 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 64, "Bit width out of range.");
  // Move the B-bit payload to the top, then arithmetic-shift it back so the
  // original sign bit fills the upper 64 - B positions.
  const unsigned Shift = 64 - B;
  return int64_t(x << Shift) >> Shift;
}
782 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
inline int64_t SignExtend64(uint64_t X, unsigned B) {
  // NOTE: the extracted source carried the macro-expanded forms of these
  // asserts fused into the lines; restored to the plain assertions.
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 64 && "Bit width out of range.");
  // Move the B-bit payload to the top, then arithmetic-shift it back so the
  // original sign bit fills the upper 64 - B positions.
  return int64_t(X << (64 - B)) >> (64 - B);
}
790 | |||||||
/// Subtract two unsigned integers, X and Y, of type T and return the absolute
/// value of the result.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
  // Order the operands so the unsigned subtraction never wraps below zero.
  if (X > Y)
    return X - Y;
  return Y - X;
}
797 | |||||||
/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool LocalFlag;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : LocalFlag;
  // Unsigned wraparound detection: the truncated sum is smaller than an
  // operand iff the addition wrapped (Hacker's Delight, p. 29).
  const T Sum = X + Y;
  Overflowed = Sum < X || Sum < Y;
  return Overflowed ? std::numeric_limits<T>::max() : Sum;
}
814 | |||||||
/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  // Alias the caller's flag when provided, otherwise a throwaway local.
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
  // because it fails for uint16_t (where multiplication can have undefined
  // behavior due to promotion to int), and requires a division in addition
  // to the multiplication.

  Overflowed = false;

  // Log2(Z) would be either Log2Z or Log2Z + 1.
  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
  // will necessarily be less than Log2Max as desired.
  int Log2Z = Log2_64(X) + Log2_64(Y);
  const T Max = std::numeric_limits<T>::max();
  int Log2Max = Log2_64(Max);
  if (Log2Z < Log2Max) {
    // The product provably fits in T: multiply directly.
    return X * Y;
  }
  if (Log2Z > Log2Max) {
    // The product provably exceeds T: saturate without multiplying.
    Overflowed = true;
    return Max;
  }

  // We're going to use the top bit, and maybe overflow one
  // bit past it. Multiply all but the bottom bit then add
  // that on at the end.
  T Z = (X >> 1) * Y;
  if (Z & ~(Max >> 1)) {
    // The doubled partial product would already overflow.
    Overflowed = true;
    return Max;
  }
  Z <<= 1;
  if (X & 1)
    // Restore the contribution of X's dropped low bit; SaturatingAdd also
    // sets the caller's overflow flag if this final addition wraps.
    return SaturatingAdd(Z, Y, ResultOverflowed);

  return Z;
}
859 | |||||||
/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
/// the product. Clamp the result to the maximum representable value of T on
/// overflow. ResultOverflowed indicates if the result is larger than the
/// maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool LocalFlag;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : LocalFlag;

  // Multiply first; if that already saturated there is no point adding, the
  // result stays clamped at max and the flag is set.
  const T Product = SaturatingMultiply(X, Y, &Overflowed);
  return Overflowed ? Product : SaturatingAdd(A, Product, &Overflowed);
}
876 | |||||||
877 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. | ||||||
878 | extern const float huge_valf; | ||||||
879 | |||||||
880 | |||||||
/// Add two signed integers, computing the two's complement truncated result,
/// returning true if overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
// NOTE: the extracted source had the evaluated value '1' fused onto this
// preprocessor line; restored to the plain feature test.
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Perform the addition in the unsigned domain, where wraparound is
  // well-defined, then reinterpret.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX + UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Adding two positive numbers should result in a positive number.
  if (X > 0 && Y > 0)
    return Result <= 0;
  // Adding two negatives should result in a negative number.
  if (X < 0 && Y < 0)
    return Result >= 0;
  // Mixed signs can never overflow.
  return false;
#endif
}
906 | |||||||
/// Subtract two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
// NOTE: the extracted source had the evaluated value '1' fused onto this
// preprocessor line; restored to the plain feature test.
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Perform the subtraction in the unsigned domain, where wraparound is
  // well-defined, then reinterpret.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX - UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Subtracting a positive number from a negative results in a negative number.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Subtracting a negative number from a positive results in a positive number.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  // Same-sign operands can never overflow on subtraction.
  return false;
#endif
}
932 | |||||||
/// Multiply two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
  // Perform the unsigned multiplication on absolute values.
  using U = std::make_unsigned_t<T>;
  // 0 - u is the well-defined unsigned negation; it also handles the
  // minimum signed value, whose magnitude is not representable in T.
  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U UResult = UX * UY;

  // Convert to signed.
  const bool IsNegative = (X < 0) ^ (Y < 0);
  Result = IsNegative ? (0 - UResult) : UResult;

  // If any of the args was 0, result is 0 and no overflow occurs.
  if (UX == 0 || UY == 0)
    return false;

  // UX and UY are in [1, 2^n], where n is the number of digits.
  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
  // positive) divided by an argument compares to the other.
  if (IsNegative)
    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
  else
    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
}
959 | |||||||
960 | } // End llvm namespace | ||||||
961 | |||||||
962 | #endif |