File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2714, column 62: The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
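
The flagged expression is at line 2714 of the file, outside the excerpt below, so it cannot be quoted here. As a hedged illustration only, the sketch that follows shows the hazard class the analyzer describes and the usual guard: in C++, shifting a value by an amount greater than or equal to its bit width is undefined behavior, so a 32-bit unsigned shifted right by a computed amount that can reach 32 must be special-cased. The function and names are hypothetical, not code from this file.

  #include <cstdint>

  // Hypothetical sketch of the reported hazard class (not the code at line
  // 2714): if Shift can reach 32, "Value >> Shift" on a 32-bit unsigned is
  // undefined behavior, so the full-width case must be handled explicitly.
  static uint32_t highBits(uint32_t Value, unsigned Shift) {
    if (Shift >= 32)        // guard keeps the shift amount strictly below 32
      return 0;
    return Value >> Shift;  // well-defined: Shift is now in [0, 31]
  }
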
1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | /// \file | ||||||
9 | /// This file implements the targeting of the MachineLegalizer class for | ||||||
10 | /// AMDGPU. | ||||||
11 | /// \todo This should be generated by TableGen. | ||||||
12 | //===----------------------------------------------------------------------===// | ||||||
13 | |||||||
14 | #include "AMDGPULegalizerInfo.h" | ||||||
15 | |||||||
16 | #include "AMDGPU.h" | ||||||
17 | #include "AMDGPUGlobalISelUtils.h" | ||||||
18 | #include "AMDGPUInstrInfo.h" | ||||||
19 | #include "AMDGPUTargetMachine.h" | ||||||
20 | #include "SIMachineFunctionInfo.h" | ||||||
21 | #include "llvm/ADT/ScopeExit.h" | ||||||
22 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | ||||||
23 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||||||
24 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||||||
25 | #include "llvm/IR/DiagnosticInfo.h" | ||||||
26 | #include "llvm/IR/IntrinsicsAMDGPU.h" | ||||||
27 | |||||||
28 | #define DEBUG_TYPE "amdgpu-legalinfo" | ||||||
29 | |||||||
30 | using namespace llvm; | ||||||
31 | using namespace LegalizeActions; | ||||||
32 | using namespace LegalizeMutations; | ||||||
33 | using namespace LegalityPredicates; | ||||||
34 | using namespace MIPatternMatch; | ||||||
35 | |||||||
36 | // Hack until load/store selection patterns support any tuple of legal types. | ||||||
37 | static cl::opt<bool> EnableNewLegality( | ||||||
38 | "amdgpu-global-isel-new-legality", | ||||||
39 | cl::desc("Use GlobalISel desired legality, rather than try to use" | ||||||
40 | "rules compatible with selection patterns"), | ||||||
41 | cl::init(false), | ||||||
42 | cl::ReallyHidden); | ||||||
43 | |||||||
44 | static constexpr unsigned MaxRegisterSize = 1024; | ||||||
45 | |||||||
46 | // Round the number of elements to the next power of two elements | ||||||
47 | static LLT getPow2VectorType(LLT Ty) { | ||||||
48 | unsigned NElts = Ty.getNumElements(); | ||||||
49 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); | ||||||
50 | return Ty.changeNumElements(Pow2NElts); | ||||||
51 | } | ||||||
52 | |||||||
53 | // Round the number of bits to the next power of two bits | ||||||
54 | static LLT getPow2ScalarType(LLT Ty) { | ||||||
55 | unsigned Bits = Ty.getSizeInBits(); | ||||||
56 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); | ||||||
57 | return LLT::scalar(Pow2Bits); | ||||||
58 | } | ||||||
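// Illustrative note added for this report (not part of the original source):
// getPow2VectorType and getPow2ScalarType above use "1 << Log2_32_Ceil(N)",
// the same shift-width hazard class as the warning at the top of this report.
// The shift would only be undefined if Log2_32_Ceil(N) returned 32, i.e.
// N > 2^31; element counts and bit widths here stay far below that, so the
// pattern is benign in practice.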
59 | |||||||
60 | /// \returns true if this is an odd sized vector which should widen by adding an | ||||||
61 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This | ||||||
62 | /// excludes s1 vectors, which should always be scalarized. | ||||||
63 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | ||||||
64 | return [=](const LegalityQuery &Query) { | ||||||
65 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
66 | if (!Ty.isVector()) | ||||||
67 | return false; | ||||||
68 | |||||||
69 | const LLT EltTy = Ty.getElementType(); | ||||||
70 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||||
71 | return Ty.getNumElements() % 2 != 0 && | ||||||
72 | EltSize > 1 && EltSize < 32 && | ||||||
73 | Ty.getSizeInBits() % 32 != 0; | ||||||
74 | }; | ||||||
75 | } | ||||||
76 | |||||||
77 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { | ||||||
78 | return [=](const LegalityQuery &Query) { | ||||||
79 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
80 | return Ty.getSizeInBits() % 32 == 0; | ||||||
81 | }; | ||||||
82 | } | ||||||
83 | |||||||
84 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | ||||||
85 | return [=](const LegalityQuery &Query) { | ||||||
86 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
87 | const LLT EltTy = Ty.getScalarType(); | ||||||
88 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | ||||||
89 | }; | ||||||
90 | } | ||||||
91 | |||||||
92 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | ||||||
93 | return [=](const LegalityQuery &Query) { | ||||||
94 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
95 | const LLT EltTy = Ty.getElementType(); | ||||||
96 | return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); | ||||||
97 | }; | ||||||
98 | } | ||||||
99 | |||||||
100 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { | ||||||
101 | return [=](const LegalityQuery &Query) { | ||||||
102 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
103 | const LLT EltTy = Ty.getElementType(); | ||||||
104 | unsigned Size = Ty.getSizeInBits(); | ||||||
105 | unsigned Pieces = (Size + 63) / 64; | ||||||
106 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; | ||||||
107 | return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); | ||||||
108 | }; | ||||||
109 | } | ||||||
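// Illustrative arithmetic (not part of the original source): for <5 x s16>
// (80 bits), Pieces = (80 + 63) / 64 = 2 and NewNumElts = (5 + 1) / 2 = 3,
// so the mutation narrows the type to <3 x s16>.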
110 | |||||||
111 | // Increase the number of vector elements so the total size reaches the next | ||||||
112 | // multiple of 32 bits. | ||||||
113 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | ||||||
114 | return [=](const LegalityQuery &Query) { | ||||||
115 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
116 | |||||||
117 | const LLT EltTy = Ty.getElementType(); | ||||||
118 | const int Size = Ty.getSizeInBits(); | ||||||
119 | const int EltSize = EltTy.getSizeInBits(); | ||||||
120 | const int NextMul32 = (Size + 31) / 32; | ||||||
121 | |||||||
122 | assert(EltSize < 32); | ||||||
123 | |||||||
124 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | ||||||
125 | return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); | ||||||
126 | }; | ||||||
127 | } | ||||||
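// Illustrative arithmetic (not part of the original source): for <3 x s8>
// (24 bits), NextMul32 = 1 and NewNumElts = (32 * 1 + 8 - 1) / 8 = 4, so
// <3 x s8> is widened to <4 x s8>, a full 32 bits.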
128 | |||||||
129 | static LLT getBitcastRegisterType(const LLT Ty) { | ||||||
130 | const unsigned Size = Ty.getSizeInBits(); | ||||||
131 | |||||||
132 | LLT CoercedTy; | ||||||
133 | if (Size <= 32) { | ||||||
134 | // <2 x s8> -> s16 | ||||||
135 | // <4 x s8> -> s32 | ||||||
136 | return LLT::scalar(Size); | ||||||
137 | } | ||||||
138 | |||||||
139 | return LLT::scalarOrVector(Size / 32, 32); | ||||||
140 | } | ||||||
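// Illustrative examples for the path above (not part of the original source):
//   <6 x s16> (96 bits) -> LLT::scalarOrVector(3, 32) = <3 x s32>
//   <4 x s16> (64 bits) -> <2 x s32>
//   s64                 -> <2 x s32>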
141 | |||||||
142 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { | ||||||
143 | return [=](const LegalityQuery &Query) { | ||||||
144 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
145 | return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); | ||||||
146 | }; | ||||||
147 | } | ||||||
148 | |||||||
149 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { | ||||||
150 | return [=](const LegalityQuery &Query) { | ||||||
151 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
152 | unsigned Size = Ty.getSizeInBits(); | ||||||
153 | assert(Size % 32 == 0); | ||||||
154 | return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); | ||||||
155 | }; | ||||||
156 | } | ||||||
157 | |||||||
158 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | ||||||
159 | return [=](const LegalityQuery &Query) { | ||||||
160 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
161 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | ||||||
162 | }; | ||||||
163 | } | ||||||
164 | |||||||
165 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | ||||||
166 | return [=](const LegalityQuery &Query) { | ||||||
167 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
168 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | ||||||
169 | }; | ||||||
170 | } | ||||||
171 | |||||||
172 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | ||||||
173 | return [=](const LegalityQuery &Query) { | ||||||
174 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
175 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | ||||||
176 | }; | ||||||
177 | } | ||||||
178 | |||||||
179 | static bool isRegisterSize(unsigned Size) { | ||||||
180 | return Size % 32 == 0 && Size <= MaxRegisterSize; | ||||||
181 | } | ||||||
182 | |||||||
183 | static bool isRegisterVectorElementType(LLT EltTy) { | ||||||
184 | const int EltSize = EltTy.getSizeInBits(); | ||||||
185 | return EltSize == 16 || EltSize % 32 == 0; | ||||||
186 | } | ||||||
187 | |||||||
188 | static bool isRegisterVectorType(LLT Ty) { | ||||||
189 | const int EltSize = Ty.getElementType().getSizeInBits(); | ||||||
190 | return EltSize == 32 || EltSize == 64 || | ||||||
191 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | ||||||
192 | EltSize == 128 || EltSize == 256; | ||||||
193 | } | ||||||
194 | |||||||
195 | static bool isRegisterType(LLT Ty) { | ||||||
196 | if (!isRegisterSize(Ty.getSizeInBits())) | ||||||
197 | return false; | ||||||
198 | |||||||
199 | if (Ty.isVector()) | ||||||
200 | return isRegisterVectorType(Ty); | ||||||
201 | |||||||
202 | return true; | ||||||
203 | } | ||||||
204 | |||||||
205 | // Any combination of 32 or 64-bit elements up to the maximum register size, and | ||||||
206 | // multiples of v2s16. | ||||||
207 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { | ||||||
208 | return [=](const LegalityQuery &Query) { | ||||||
209 | return isRegisterType(Query.Types[TypeIdx]); | ||||||
210 | }; | ||||||
211 | } | ||||||
212 | |||||||
213 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { | ||||||
214 | return [=](const LegalityQuery &Query) { | ||||||
215 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||||
216 | if (!QueryTy.isVector()) | ||||||
217 | return false; | ||||||
218 | const LLT EltTy = QueryTy.getElementType(); | ||||||
219 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; | ||||||
220 | }; | ||||||
221 | } | ||||||
222 | |||||||
223 | static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { | ||||||
224 | return [=](const LegalityQuery &Query) { | ||||||
225 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
226 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | ||||||
227 | Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); | ||||||
228 | }; | ||||||
229 | } | ||||||
230 | |||||||
231 | // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we | ||||||
232 | // handle some operations by just promoting the register during | ||||||
233 | // selection. There are also d16 loads on GFX9+ which preserve the high bits. | ||||||
234 | static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, | ||||||
235 | bool IsLoad) { | ||||||
236 | switch (AS) { | ||||||
237 | case AMDGPUAS::PRIVATE_ADDRESS: | ||||||
238 | // FIXME: Private element size. | ||||||
239 | return ST.enableFlatScratch() ? 128 : 32; | ||||||
240 | case AMDGPUAS::LOCAL_ADDRESS: | ||||||
241 | return ST.useDS128() ? 128 : 64; | ||||||
242 | case AMDGPUAS::GLOBAL_ADDRESS: | ||||||
243 | case AMDGPUAS::CONSTANT_ADDRESS: | ||||||
244 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: | ||||||
245 | // Treat constant and global as identical. SMRD loads are sometimes usable for | ||||||
246 | // global loads (ideally constant address space should be eliminated) | ||||||
247 | // depending on the context. Legality cannot be context dependent, but | ||||||
248 | // RegBankSelect can split the load as necessary depending on the pointer | ||||||
249 | // register bank/uniformity and if the memory is invariant or not written in a | ||||||
250 | // kernel. | ||||||
251 | return IsLoad ? 512 : 128; | ||||||
252 | default: | ||||||
253 | // Flat addresses may contextually need to be split to 32-bit parts if they | ||||||
254 | // may alias scratch depending on the subtarget. | ||||||
255 | return 128; | ||||||
256 | } | ||||||
257 | } | ||||||
258 | |||||||
259 | static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, | ||||||
260 | const LegalityQuery &Query, | ||||||
261 | unsigned Opcode) { | ||||||
262 | const LLT Ty = Query.Types[0]; | ||||||
263 | |||||||
264 | // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD | ||||||
265 | const bool IsLoad = Opcode != AMDGPU::G_STORE; | ||||||
266 | |||||||
267 | unsigned RegSize = Ty.getSizeInBits(); | ||||||
268 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||
269 | unsigned AlignBits = Query.MMODescrs[0].AlignInBits; | ||||||
270 | unsigned AS = Query.Types[1].getAddressSpace(); | ||||||
271 | |||||||
272 | // All of these need to be custom lowered to cast the pointer operand. | ||||||
273 | if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) | ||||||
274 | return false; | ||||||
275 | |||||||
276 | // TODO: We should be able to widen loads if the alignment is high enough, but | ||||||
277 | // we also need to modify the memory access size. | ||||||
278 | #if 0 | ||||||
279 | // Accept widening loads based on alignment. | ||||||
280 | if (IsLoad && MemSize < Size) | ||||||
281 | MemSize = std::max(MemSize, Align); | ||||||
282 | #endif | ||||||
283 | |||||||
284 | // Only 1-byte and 2-byte to 32-bit extloads are valid. | ||||||
285 | if (MemSize != RegSize && RegSize != 32) | ||||||
286 | return false; | ||||||
287 | |||||||
288 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) | ||||||
289 | return false; | ||||||
290 | |||||||
291 | switch (MemSize) { | ||||||
292 | case 8: | ||||||
293 | case 16: | ||||||
294 | case 32: | ||||||
295 | case 64: | ||||||
296 | case 128: | ||||||
297 | break; | ||||||
298 | case 96: | ||||||
299 | if (!ST.hasDwordx3LoadStores()) | ||||||
300 | return false; | ||||||
301 | break; | ||||||
302 | case 256: | ||||||
303 | case 512: | ||||||
304 | // These may contextually need to be broken down. | ||||||
305 | break; | ||||||
306 | default: | ||||||
307 | return false; | ||||||
308 | } | ||||||
309 | |||||||
310 | assert(RegSize >= MemSize); | ||||||
311 | |||||||
312 | if (AlignBits < MemSize) { | ||||||
313 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
314 | if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | ||||||
315 | Align(AlignBits / 8))) | ||||||
316 | return false; | ||||||
317 | } | ||||||
318 | |||||||
319 | return true; | ||||||
320 | } | ||||||
321 | |||||||
322 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so | ||||||
323 | // work around this. Eventually it should ignore the type for loads and only care | ||||||
324 | // about the size. Return true in cases where we will work around this for now by | ||||||
325 | // bitcasting. | ||||||
326 | static bool loadStoreBitcastWorkaround(const LLT Ty) { | ||||||
327 | if (EnableNewLegality) | ||||||
328 | return false; | ||||||
329 | |||||||
330 | const unsigned Size = Ty.getSizeInBits(); | ||||||
331 | if (Size <= 64) | ||||||
332 | return false; | ||||||
333 | if (!Ty.isVector()) | ||||||
334 | return true; | ||||||
335 | |||||||
336 | LLT EltTy = Ty.getElementType(); | ||||||
337 | if (EltTy.isPointer()) | ||||||
338 | return true; | ||||||
339 | |||||||
340 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
341 | return EltSize != 32 && EltSize != 64; | ||||||
342 | } | ||||||
343 | |||||||
344 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, | ||||||
345 | unsigned Opcode) { | ||||||
346 | const LLT Ty = Query.Types[0]; | ||||||
347 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && | ||||||
348 | !loadStoreBitcastWorkaround(Ty); | ||||||
349 | } | ||||||
350 | |||||||
351 | /// Return true if a load or store of the type should be lowered with a bitcast | ||||||
352 | /// to a different type. | ||||||
353 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, | ||||||
354 | const unsigned MemSizeInBits) { | ||||||
355 | const unsigned Size = Ty.getSizeInBits(); | ||||||
356 | if (Size != MemSizeInBits) | ||||||
357 | return Size <= 32 && Ty.isVector(); | ||||||
358 | |||||||
359 | if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) | ||||||
360 | return true; | ||||||
361 | return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && | ||||||
362 | !isRegisterVectorElementType(Ty.getElementType()); | ||||||
363 | } | ||||||
364 | |||||||
365 | /// Return true if we should legalize a load by widening an odd sized memory | ||||||
366 | /// access up to the alignment. Note that in this case the memory access itself | ||||||
367 | /// changes, not the size of the result register. | ||||||
368 | static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, | ||||||
369 | unsigned AlignInBits, unsigned AddrSpace, | ||||||
370 | unsigned Opcode) { | ||||||
371 | // We don't want to widen cases that are naturally legal. | ||||||
372 | if (isPowerOf2_32(SizeInBits)) | ||||||
373 | return false; | ||||||
374 | |||||||
375 | // If we have 96-bit memory operations, we shouldn't touch them. Note we may | ||||||
376 | // end up widening these for a scalar load during RegBankSelect, since there | ||||||
377 | // aren't 96-bit scalar loads. | ||||||
378 | if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) | ||||||
379 | return false; | ||||||
380 | |||||||
381 | if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) | ||||||
382 | return false; | ||||||
383 | |||||||
384 | // A load is known dereferenceable up to the alignment, so it's legal to widen | ||||||
385 | // to it. | ||||||
386 | // | ||||||
387 | // TODO: Could check dereferenceable for less aligned cases. | ||||||
388 | unsigned RoundedSize = NextPowerOf2(SizeInBits); | ||||||
389 | if (AlignInBits < RoundedSize) | ||||||
390 | return false; | ||||||
391 | |||||||
392 | // Do not widen if it would introduce a slow unaligned load. | ||||||
393 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
394 | bool Fast = false; | ||||||
395 | return TLI->allowsMisalignedMemoryAccessesImpl( | ||||||
396 | RoundedSize, AddrSpace, Align(AlignInBits / 8), | ||||||
397 | MachineMemOperand::MOLoad, &Fast) && | ||||||
398 | Fast; | ||||||
399 | } | ||||||
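// Illustrative case (not part of the original source): a 48-bit load from the
// global address space with 64-bit alignment is not a power of two, is below
// the address-space limit, and NextPowerOf2(48) = 64 <= AlignInBits, so it is
// widened to 64 bits provided the target reports the unaligned access as fast.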
400 | |||||||
401 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, | ||||||
402 | unsigned Opcode) { | ||||||
403 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) | ||||||
404 | return false; | ||||||
405 | |||||||
406 | return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, | ||||||
407 | Query.MMODescrs[0].AlignInBits, | ||||||
408 | Query.Types[1].getAddressSpace(), Opcode); | ||||||
409 | } | ||||||
410 | |||||||
411 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | ||||||
412 | const GCNTargetMachine &TM) | ||||||
413 | : ST(ST_) { | ||||||
414 | using namespace TargetOpcode; | ||||||
415 | |||||||
416 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | ||||||
417 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | ||||||
418 | }; | ||||||
419 | |||||||
420 | const LLT S1 = LLT::scalar(1); | ||||||
421 | const LLT S8 = LLT::scalar(8); | ||||||
422 | const LLT S16 = LLT::scalar(16); | ||||||
423 | const LLT S32 = LLT::scalar(32); | ||||||
424 | const LLT S64 = LLT::scalar(64); | ||||||
425 | const LLT S128 = LLT::scalar(128); | ||||||
426 | const LLT S256 = LLT::scalar(256); | ||||||
427 | const LLT S512 = LLT::scalar(512); | ||||||
428 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); | ||||||
429 | |||||||
430 | const LLT V2S8 = LLT::vector(2, 8); | ||||||
431 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
432 | const LLT V4S16 = LLT::vector(4, 16); | ||||||
433 | |||||||
434 | const LLT V2S32 = LLT::vector(2, 32); | ||||||
435 | const LLT V3S32 = LLT::vector(3, 32); | ||||||
436 | const LLT V4S32 = LLT::vector(4, 32); | ||||||
437 | const LLT V5S32 = LLT::vector(5, 32); | ||||||
438 | const LLT V6S32 = LLT::vector(6, 32); | ||||||
439 | const LLT V7S32 = LLT::vector(7, 32); | ||||||
440 | const LLT V8S32 = LLT::vector(8, 32); | ||||||
441 | const LLT V9S32 = LLT::vector(9, 32); | ||||||
442 | const LLT V10S32 = LLT::vector(10, 32); | ||||||
443 | const LLT V11S32 = LLT::vector(11, 32); | ||||||
444 | const LLT V12S32 = LLT::vector(12, 32); | ||||||
445 | const LLT V13S32 = LLT::vector(13, 32); | ||||||
446 | const LLT V14S32 = LLT::vector(14, 32); | ||||||
447 | const LLT V15S32 = LLT::vector(15, 32); | ||||||
448 | const LLT V16S32 = LLT::vector(16, 32); | ||||||
449 | const LLT V32S32 = LLT::vector(32, 32); | ||||||
450 | |||||||
451 | const LLT V2S64 = LLT::vector(2, 64); | ||||||
452 | const LLT V3S64 = LLT::vector(3, 64); | ||||||
453 | const LLT V4S64 = LLT::vector(4, 64); | ||||||
454 | const LLT V5S64 = LLT::vector(5, 64); | ||||||
455 | const LLT V6S64 = LLT::vector(6, 64); | ||||||
456 | const LLT V7S64 = LLT::vector(7, 64); | ||||||
457 | const LLT V8S64 = LLT::vector(8, 64); | ||||||
458 | const LLT V16S64 = LLT::vector(16, 64); | ||||||
459 | |||||||
460 | std::initializer_list<LLT> AllS32Vectors = | ||||||
461 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | ||||||
462 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | ||||||
463 | std::initializer_list<LLT> AllS64Vectors = | ||||||
464 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | ||||||
465 | |||||||
466 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | ||||||
467 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | ||||||
468 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | ||||||
469 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | ||||||
470 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | ||||||
471 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | ||||||
472 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | ||||||
473 | |||||||
474 | const LLT CodePtr = FlatPtr; | ||||||
475 | |||||||
476 | const std::initializer_list<LLT> AddrSpaces64 = { | ||||||
477 | GlobalPtr, ConstantPtr, FlatPtr | ||||||
478 | }; | ||||||
479 | |||||||
480 | const std::initializer_list<LLT> AddrSpaces32 = { | ||||||
481 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | ||||||
482 | }; | ||||||
483 | |||||||
484 | const std::initializer_list<LLT> FPTypesBase = { | ||||||
485 | S32, S64 | ||||||
486 | }; | ||||||
487 | |||||||
488 | const std::initializer_list<LLT> FPTypes16 = { | ||||||
489 | S32, S64, S16 | ||||||
490 | }; | ||||||
491 | |||||||
492 | const std::initializer_list<LLT> FPTypesPK16 = { | ||||||
493 | S32, S64, S16, V2S16 | ||||||
494 | }; | ||||||
495 | |||||||
496 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; | ||||||
497 | |||||||
498 | setAction({G_BRCOND, S1}, Legal); // VCC branches | ||||||
499 | setAction({G_BRCOND, S32}, Legal); // SCC branches | ||||||
500 | |||||||
501 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | ||||||
502 | // elements for v3s16 | ||||||
503 | getActionDefinitionsBuilder(G_PHI) | ||||||
504 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) | ||||||
505 | .legalFor(AllS32Vectors) | ||||||
506 | .legalFor(AllS64Vectors) | ||||||
507 | .legalFor(AddrSpaces64) | ||||||
508 | .legalFor(AddrSpaces32) | ||||||
509 | .legalIf(isPointer(0)) | ||||||
510 | .clampScalar(0, S16, S256) | ||||||
511 | .widenScalarToNextPow2(0, 32) | ||||||
512 | .clampMaxNumElements(0, S32, 16) | ||||||
513 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
514 | .scalarize(0); | ||||||
515 | |||||||
516 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { | ||||||
517 | // Full set of gfx9 features. | ||||||
518 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
519 | .legalFor({S32, S16, V2S16}) | ||||||
520 | .clampScalar(0, S16, S32) | ||||||
521 | .clampMaxNumElements(0, S16, 2) | ||||||
522 | .scalarize(0) | ||||||
523 | .widenScalarToNextPow2(0, 32); | ||||||
524 | |||||||
525 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) | ||||||
526 | .legalFor({S32, S16, V2S16}) // Clamp modifier | ||||||
527 | .minScalarOrElt(0, S16) | ||||||
528 | .clampMaxNumElements(0, S16, 2) | ||||||
529 | .scalarize(0) | ||||||
530 | .widenScalarToNextPow2(0, 32) | ||||||
531 | .lower(); | ||||||
532 | } else if (ST.has16BitInsts()) { | ||||||
533 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
534 | .legalFor({S32, S16}) | ||||||
535 | .clampScalar(0, S16, S32) | ||||||
536 | .scalarize(0) | ||||||
537 | .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 | ||||||
538 | |||||||
539 | // Technically the saturating operations require clamp bit support, but this | ||||||
540 | // was introduced at the same time as 16-bit operations. | ||||||
541 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||
542 | .legalFor({S32, S16}) // Clamp modifier | ||||||
543 | .minScalar(0, S16) | ||||||
544 | .scalarize(0) | ||||||
545 | .widenScalarToNextPow2(0, 16) | ||||||
546 | .lower(); | ||||||
547 | |||||||
548 | // We're just lowering this, but it helps get a better result to try to | ||||||
549 | // coerce to the desired type first. | ||||||
550 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||||
551 | .minScalar(0, S16) | ||||||
552 | .scalarize(0) | ||||||
553 | .lower(); | ||||||
554 | } else { | ||||||
555 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) | ||||||
556 | .legalFor({S32}) | ||||||
557 | .clampScalar(0, S32, S32) | ||||||
558 | .scalarize(0); | ||||||
559 | |||||||
560 | if (ST.hasIntClamp()) { | ||||||
561 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||
562 | .legalFor({S32}) // Clamp modifier. | ||||||
563 | .scalarize(0) | ||||||
564 | .minScalarOrElt(0, S32) | ||||||
565 | .lower(); | ||||||
566 | } else { | ||||||
567 | // Clamp bit support was added in VI, along with 16-bit operations. | ||||||
568 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||||
569 | .minScalar(0, S32) | ||||||
570 | .scalarize(0) | ||||||
571 | .lower(); | ||||||
572 | } | ||||||
573 | |||||||
574 | // FIXME: DAG expansion gets better results. The widening uses the smaller | ||||||
575 | // range values and goes for the min/max lowering directly. | ||||||
576 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||||
577 | .minScalar(0, S32) | ||||||
578 | .scalarize(0) | ||||||
579 | .lower(); | ||||||
580 | } | ||||||
581 | |||||||
582 | getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) | ||||||
583 | .customFor({S32, S64}) | ||||||
584 | .clampScalar(0, S32, S64) | ||||||
585 | .widenScalarToNextPow2(0, 32) | ||||||
586 | .scalarize(0); | ||||||
587 | |||||||
588 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | ||||||
589 | .legalFor({S32}) | ||||||
590 | .maxScalarOrElt(0, S32); | ||||||
591 | |||||||
592 | if (ST.hasVOP3PInsts()) { | ||||||
593 | Mulh | ||||||
594 | .clampMaxNumElements(0, S8, 2) | ||||||
595 | .lowerFor({V2S8}); | ||||||
596 | } | ||||||
597 | |||||||
598 | Mulh | ||||||
599 | .scalarize(0) | ||||||
600 | .lower(); | ||||||
601 | |||||||
602 | // Report legal for any types we can handle anywhere. For the cases only legal | ||||||
603 | // on the SALU, RegBankSelect will be able to re-legalize. | ||||||
604 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) | ||||||
605 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) | ||||||
606 | .clampScalar(0, S32, S64) | ||||||
607 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
608 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) | ||||||
609 | .widenScalarToNextPow2(0) | ||||||
610 | .scalarize(0); | ||||||
611 | |||||||
612 | getActionDefinitionsBuilder({G_UADDO, G_USUBO, | ||||||
613 | G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) | ||||||
614 | .legalFor({{S32, S1}, {S32, S32}}) | ||||||
615 | .minScalar(0, S32) | ||||||
616 | // TODO: .scalarize(0) | ||||||
617 | .lower(); | ||||||
618 | |||||||
619 | getActionDefinitionsBuilder(G_BITCAST) | ||||||
620 | // Don't worry about the size constraint. | ||||||
621 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||
622 | .lower(); | ||||||
623 | |||||||
624 | |||||||
625 | getActionDefinitionsBuilder(G_CONSTANT) | ||||||
626 | .legalFor({S1, S32, S64, S16, GlobalPtr, | ||||||
627 | LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) | ||||||
628 | .legalIf(isPointer(0)) | ||||||
629 | .clampScalar(0, S32, S64) | ||||||
630 | .widenScalarToNextPow2(0); | ||||||
631 | |||||||
632 | getActionDefinitionsBuilder(G_FCONSTANT) | ||||||
633 | .legalFor({S32, S64, S16}) | ||||||
634 | .clampScalar(0, S16, S64); | ||||||
635 | |||||||
636 | getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) | ||||||
637 | .legalIf(isRegisterType(0)) | ||||||
638 | // s1 and s16 are special cases because they have legal operations on | ||||||
639 | // them, but don't really occupy registers in the normal way. | ||||||
640 | .legalFor({S1, S16}) | ||||||
641 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
642 | .clampScalarOrElt(0, S32, MaxScalar) | ||||||
643 | .widenScalarToNextPow2(0, 32) | ||||||
644 | .clampMaxNumElements(0, S32, 16); | ||||||
645 | |||||||
646 | setAction({G_FRAME_INDEX, PrivatePtr}, Legal); | ||||||
647 | |||||||
648 | // If the amount is divergent, we have to do a wave reduction to get the | ||||||
649 | // maximum value, so this is expanded during RegBankSelect. | ||||||
650 | getActionDefinitionsBuilder(G_DYN_STACKALLOC) | ||||||
651 | .legalFor({{PrivatePtr, S32}}); | ||||||
652 | |||||||
653 | getActionDefinitionsBuilder(G_GLOBAL_VALUE) | ||||||
654 | .customIf(typeIsNot(0, PrivatePtr)); | ||||||
655 | |||||||
656 | setAction({G_BLOCK_ADDR, CodePtr}, Legal); | ||||||
657 | |||||||
658 | auto &FPOpActions = getActionDefinitionsBuilder( | ||||||
659 | { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) | ||||||
660 | .legalFor({S32, S64}); | ||||||
661 | auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) | ||||||
662 | .customFor({S32, S64}); | ||||||
663 | auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) | ||||||
664 | .customFor({S32, S64}); | ||||||
665 | |||||||
666 | if (ST.has16BitInsts()) { | ||||||
667 | if (ST.hasVOP3PInsts()) | ||||||
668 | FPOpActions.legalFor({S16, V2S16}); | ||||||
669 | else | ||||||
670 | FPOpActions.legalFor({S16}); | ||||||
671 | |||||||
672 | TrigActions.customFor({S16}); | ||||||
673 | FDIVActions.customFor({S16}); | ||||||
674 | } | ||||||
675 | |||||||
676 | auto &MinNumMaxNum = getActionDefinitionsBuilder({ | ||||||
677 | G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); | ||||||
678 | |||||||
679 | if (ST.hasVOP3PInsts()) { | ||||||
680 | MinNumMaxNum.customFor(FPTypesPK16) | ||||||
681 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
682 | .clampMaxNumElements(0, S16, 2) | ||||||
683 | .clampScalar(0, S16, S64) | ||||||
684 | .scalarize(0); | ||||||
685 | } else if (ST.has16BitInsts()) { | ||||||
686 | MinNumMaxNum.customFor(FPTypes16) | ||||||
687 | .clampScalar(0, S16, S64) | ||||||
688 | .scalarize(0); | ||||||
689 | } else { | ||||||
690 | MinNumMaxNum.customFor(FPTypesBase) | ||||||
691 | .clampScalar(0, S32, S64) | ||||||
692 | .scalarize(0); | ||||||
693 | } | ||||||
694 | |||||||
695 | if (ST.hasVOP3PInsts()) | ||||||
696 | FPOpActions.clampMaxNumElements(0, S16, 2); | ||||||
697 | |||||||
698 | FPOpActions | ||||||
699 | .scalarize(0) | ||||||
700 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
701 | |||||||
702 | TrigActions | ||||||
703 | .scalarize(0) | ||||||
704 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
705 | |||||||
706 | FDIVActions | ||||||
707 | .scalarize(0) | ||||||
708 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||||
709 | |||||||
710 | getActionDefinitionsBuilder({G_FNEG, G_FABS}) | ||||||
711 | .legalFor(FPTypesPK16) | ||||||
712 | .clampMaxNumElements(0, S16, 2) | ||||||
713 | .scalarize(0) | ||||||
714 | .clampScalar(0, S16, S64); | ||||||
715 | |||||||
716 | if (ST.has16BitInsts()) { | ||||||
717 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) | ||||||
718 | .legalFor({S32, S64, S16}) | ||||||
719 | .scalarize(0) | ||||||
720 | .clampScalar(0, S16, S64); | ||||||
721 | } else { | ||||||
722 | getActionDefinitionsBuilder(G_FSQRT) | ||||||
723 | .legalFor({S32, S64}) | ||||||
724 | .scalarize(0) | ||||||
725 | .clampScalar(0, S32, S64); | ||||||
726 | |||||||
727 | if (ST.hasFractBug()) { | ||||||
728 | getActionDefinitionsBuilder(G_FFLOOR) | ||||||
729 | .customFor({S64}) | ||||||
730 | .legalFor({S32, S64}) | ||||||
731 | .scalarize(0) | ||||||
732 | .clampScalar(0, S32, S64); | ||||||
733 | } else { | ||||||
734 | getActionDefinitionsBuilder(G_FFLOOR) | ||||||
735 | .legalFor({S32, S64}) | ||||||
736 | .scalarize(0) | ||||||
737 | .clampScalar(0, S32, S64); | ||||||
738 | } | ||||||
739 | } | ||||||
740 | |||||||
741 | getActionDefinitionsBuilder(G_FPTRUNC) | ||||||
742 | .legalFor({{S32, S64}, {S16, S32}}) | ||||||
743 | .scalarize(0) | ||||||
744 | .lower(); | ||||||
745 | |||||||
746 | getActionDefinitionsBuilder(G_FPEXT) | ||||||
747 | .legalFor({{S64, S32}, {S32, S16}}) | ||||||
748 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) | ||||||
749 | .scalarize(0); | ||||||
750 | |||||||
751 | getActionDefinitionsBuilder(G_FSUB) | ||||||
752 | // Use actual fsub instruction | ||||||
753 | .legalFor({S32}) | ||||||
754 | // Must use fadd + fneg | ||||||
755 | .lowerFor({S64, S16, V2S16}) | ||||||
756 | .scalarize(0) | ||||||
757 | .clampScalar(0, S32, S64); | ||||||
758 | |||||||
759 | // Whether this is legal depends on the floating point mode for the function. | ||||||
760 | auto &FMad = getActionDefinitionsBuilder(G_FMAD); | ||||||
761 | if (ST.hasMadF16() && ST.hasMadMacF32Insts()) | ||||||
762 | FMad.customFor({S32, S16}); | ||||||
763 | else if (ST.hasMadMacF32Insts()) | ||||||
764 | FMad.customFor({S32}); | ||||||
765 | else if (ST.hasMadF16()) | ||||||
766 | FMad.customFor({S16}); | ||||||
767 | FMad.scalarize(0) | ||||||
768 | .lower(); | ||||||
769 | |||||||
770 | auto &FRem = getActionDefinitionsBuilder(G_FREM); | ||||||
771 | if (ST.has16BitInsts()) { | ||||||
772 | FRem.customFor({S16, S32, S64}); | ||||||
773 | } else { | ||||||
774 | FRem.minScalar(0, S32) | ||||||
775 | .customFor({S32, S64}); | ||||||
776 | } | ||||||
777 | FRem.scalarize(0); | ||||||
778 | |||||||
779 | // TODO: Do we need to clamp maximum bitwidth? | ||||||
780 | getActionDefinitionsBuilder(G_TRUNC) | ||||||
781 | .legalIf(isScalar(0)) | ||||||
782 | .legalFor({{V2S16, V2S32}}) | ||||||
783 | .clampMaxNumElements(0, S16, 2) | ||||||
784 | // Avoid scalarizing in cases that should be truly illegal. In unresolvable | ||||||
785 | // situations (like an invalid implicit use), we don't want to infinite loop | ||||||
786 | // in the legalizer. | ||||||
787 | .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) | ||||||
788 | .alwaysLegal(); | ||||||
789 | |||||||
790 | getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) | ||||||
791 | .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, | ||||||
792 | {S32, S1}, {S64, S1}, {S16, S1}}) | ||||||
793 | .scalarize(0) | ||||||
794 | .clampScalar(0, S32, S64) | ||||||
795 | .widenScalarToNextPow2(1, 32); | ||||||
796 | |||||||
797 | // TODO: Split s1->s64 during regbankselect for VALU. | ||||||
798 | auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) | ||||||
799 | .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) | ||||||
800 | .lowerFor({{S32, S64}}) | ||||||
801 | .lowerIf(typeIs(1, S1)) | ||||||
802 | .customFor({{S64, S64}}); | ||||||
803 | if (ST.has16BitInsts()) | ||||||
804 | IToFP.legalFor({{S16, S16}}); | ||||||
805 | IToFP.clampScalar(1, S32, S64) | ||||||
806 | .minScalar(0, S32) | ||||||
807 | .scalarize(0) | ||||||
808 | .widenScalarToNextPow2(1); | ||||||
809 | |||||||
810 | auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) | ||||||
811 | .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) | ||||||
812 | .customFor({{S64, S64}}) | ||||||
813 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); | ||||||
814 | if (ST.has16BitInsts()) | ||||||
815 | FPToI.legalFor({{S16, S16}}); | ||||||
816 | else | ||||||
817 | FPToI.minScalar(1, S32); | ||||||
818 | |||||||
819 | FPToI.minScalar(0, S32) | ||||||
820 | .scalarize(0) | ||||||
821 | .lower(); | ||||||
822 | |||||||
823 | // Lower roundeven into G_FRINT | ||||||
824 | getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) | ||||||
825 | .scalarize(0) | ||||||
826 | .lower(); | ||||||
827 | |||||||
828 | if (ST.has16BitInsts()) { | ||||||
829 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
830 | .legalFor({S16, S32, S64}) | ||||||
831 | .clampScalar(0, S16, S64) | ||||||
832 | .scalarize(0); | ||||||
833 | } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { | ||||||
834 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
835 | .legalFor({S32, S64}) | ||||||
836 | .clampScalar(0, S32, S64) | ||||||
837 | .scalarize(0); | ||||||
838 | } else { | ||||||
839 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||||
840 | .legalFor({S32}) | ||||||
841 | .customFor({S64}) | ||||||
842 | .clampScalar(0, S32, S64) | ||||||
843 | .scalarize(0); | ||||||
844 | } | ||||||
845 | |||||||
846 | getActionDefinitionsBuilder(G_PTR_ADD) | ||||||
847 | .legalIf(all(isPointer(0), sameSize(0, 1))) | ||||||
848 | .scalarize(0) | ||||||
849 | .scalarSameSizeAs(1, 0); | ||||||
850 | |||||||
851 | getActionDefinitionsBuilder(G_PTRMASK) | ||||||
852 | .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) | ||||||
853 | .scalarSameSizeAs(1, 0) | ||||||
854 | .scalarize(0); | ||||||
855 | |||||||
856 | auto &CmpBuilder = | ||||||
857 | getActionDefinitionsBuilder(G_ICMP) | ||||||
858 | // The compare output type differs based on the register bank of the output, | ||||||
859 | // so make both s1 and s32 legal. | ||||||
860 | // | ||||||
861 | // Scalar compares producing output in scc will be promoted to s32, as that | ||||||
862 | // is the allocatable register type that will be needed for the copy from | ||||||
863 | // scc. This will be promoted during RegBankSelect, and we assume something | ||||||
864 | // before that won't try to use s32 result types. | ||||||
865 | // | ||||||
866 | // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg | ||||||
867 | // bank. | ||||||
868 | .legalForCartesianProduct( | ||||||
869 | {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) | ||||||
870 | .legalForCartesianProduct( | ||||||
871 | {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); | ||||||
872 | if (ST.has16BitInsts()) { | ||||||
873 | CmpBuilder.legalFor({{S1, S16}}); | ||||||
874 | } | ||||||
875 | |||||||
876 | CmpBuilder | ||||||
877 | .widenScalarToNextPow2(1) | ||||||
878 | .clampScalar(1, S32, S64) | ||||||
879 | .scalarize(0) | ||||||
880 | .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); | ||||||
881 | |||||||
882 | getActionDefinitionsBuilder(G_FCMP) | ||||||
883 | .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) | ||||||
884 | .widenScalarToNextPow2(1) | ||||||
885 | .clampScalar(1, S32, S64) | ||||||
886 | .scalarize(0); | ||||||
887 | |||||||
888 | // FIXME: fpow has a selection pattern that should move to custom lowering. | ||||||
889 | auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); | ||||||
890 | if (ST.has16BitInsts()) | ||||||
891 | Exp2Ops.legalFor({S32, S16}); | ||||||
892 | else | ||||||
893 | Exp2Ops.legalFor({S32}); | ||||||
894 | Exp2Ops.clampScalar(0, MinScalarFPTy, S32); | ||||||
895 | Exp2Ops.scalarize(0); | ||||||
896 | |||||||
897 | auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); | ||||||
898 | if (ST.has16BitInsts()) | ||||||
899 | ExpOps.customFor({{S32}, {S16}}); | ||||||
900 | else | ||||||
901 | ExpOps.customFor({S32}); | ||||||
902 | ExpOps.clampScalar(0, MinScalarFPTy, S32) | ||||||
903 | .scalarize(0); | ||||||
904 | |||||||
905 | getActionDefinitionsBuilder(G_FPOWI) | ||||||
906 | .clampScalar(0, MinScalarFPTy, S32) | ||||||
907 | .lower(); | ||||||
908 | |||||||
909 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||||
910 | getActionDefinitionsBuilder(G_CTPOP) | ||||||
911 | .legalFor({{S32, S32}, {S32, S64}}) | ||||||
912 | .clampScalar(0, S32, S32) | ||||||
913 | .clampScalar(1, S32, S64) | ||||||
914 | .scalarize(0) | ||||||
915 | .widenScalarToNextPow2(0, 32) | ||||||
916 | .widenScalarToNextPow2(1, 32); | ||||||
917 | |||||||
918 | // The hardware instructions return a different result on 0 than the generic | ||||||
919 | // instructions expect. The hardware produces -1, but these produce the | ||||||
920 | // bitwidth. | ||||||
921 | getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) | ||||||
922 | .scalarize(0) | ||||||
923 | .clampScalar(0, S32, S32) | ||||||
924 | .clampScalar(1, S32, S64) | ||||||
925 | .widenScalarToNextPow2(0, 32) | ||||||
926 | .widenScalarToNextPow2(1, 32) | ||||||
927 | .lower(); | ||||||
928 | |||||||
929 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||||
930 | getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) | ||||||
931 | .legalFor({{S32, S32}, {S32, S64}}) | ||||||
932 | .clampScalar(0, S32, S32) | ||||||
933 | .clampScalar(1, S32, S64) | ||||||
934 | .scalarize(0) | ||||||
935 | .widenScalarToNextPow2(0, 32) | ||||||
936 | .widenScalarToNextPow2(1, 32); | ||||||
937 | |||||||
938 | getActionDefinitionsBuilder(G_BITREVERSE) | ||||||
939 | .legalFor({S32}) | ||||||
940 | .clampScalar(0, S32, S32) | ||||||
941 | .scalarize(0); | ||||||
942 | |||||||
943 | if (ST.has16BitInsts()) { | ||||||
944 | getActionDefinitionsBuilder(G_BSWAP) | ||||||
945 | .legalFor({S16, S32, V2S16}) | ||||||
946 | .clampMaxNumElements(0, S16, 2) | ||||||
947 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | ||||||
948 | // narrowScalar limitation. | ||||||
949 | .widenScalarToNextPow2(0) | ||||||
950 | .clampScalar(0, S16, S32) | ||||||
951 | .scalarize(0); | ||||||
952 | |||||||
953 | if (ST.hasVOP3PInsts()) { | ||||||
954 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) | ||||||
955 | .legalFor({S32, S16, V2S16}) | ||||||
956 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
957 | .clampMaxNumElements(0, S16, 2) | ||||||
958 | .minScalar(0, S16) | ||||||
959 | .widenScalarToNextPow2(0) | ||||||
960 | .scalarize(0) | ||||||
961 | .lower(); | ||||||
962 | } else { | ||||||
963 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) | ||||||
964 | .legalFor({S32, S16}) | ||||||
965 | .widenScalarToNextPow2(0) | ||||||
966 | .minScalar(0, S16) | ||||||
967 | .scalarize(0) | ||||||
968 | .lower(); | ||||||
969 | } | ||||||
970 | } else { | ||||||
971 | // TODO: Should have same legality without v_perm_b32 | ||||||
972 | getActionDefinitionsBuilder(G_BSWAP) | ||||||
973 | .legalFor({S32}) | ||||||
974 | .lowerIf(scalarNarrowerThan(0, 32)) | ||||||
975 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | ||||||
976 | // narrowScalar limitation. | ||||||
977 | .widenScalarToNextPow2(0) | ||||||
978 | .maxScalar(0, S32) | ||||||
979 | .scalarize(0) | ||||||
980 | .lower(); | ||||||
981 | |||||||
982 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) | ||||||
983 | .legalFor({S32}) | ||||||
984 | .minScalar(0, S32) | ||||||
985 | .widenScalarToNextPow2(0) | ||||||
986 | .scalarize(0) | ||||||
987 | .lower(); | ||||||
988 | } | ||||||
989 | |||||||
990 | getActionDefinitionsBuilder(G_INTTOPTR) | ||||||
991 | // List the common cases | ||||||
992 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||||
993 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||||
994 | .scalarize(0) | ||||||
995 | // Accept any address space as long as the size matches | ||||||
996 | .legalIf(sameSize(0, 1)) | ||||||
997 | .widenScalarIf(smallerThan(1, 0), | ||||||
998 | [](const LegalityQuery &Query) { | ||||||
999 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||||
1000 | }) | ||||||
1001 | .narrowScalarIf(largerThan(1, 0), | ||||||
1002 | [](const LegalityQuery &Query) { | ||||||
1003 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||||
1004 | }); | ||||||
1005 | |||||||
1006 | getActionDefinitionsBuilder(G_PTRTOINT) | ||||||
1007 | // List the common cases | ||||||
1008 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||||
1009 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||||
1010 | .scalarize(0) | ||||||
1011 | // Accept any address space as long as the size matches | ||||||
1012 | .legalIf(sameSize(0, 1)) | ||||||
1013 | .widenScalarIf(smallerThan(0, 1), | ||||||
1014 | [](const LegalityQuery &Query) { | ||||||
1015 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||||
1016 | }) | ||||||
1017 | .narrowScalarIf( | ||||||
1018 | largerThan(0, 1), | ||||||
1019 | [](const LegalityQuery &Query) { | ||||||
1020 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||||
1021 | }); | ||||||
1022 | |||||||
1023 | getActionDefinitionsBuilder(G_ADDRSPACE_CAST) | ||||||
1024 | .scalarize(0) | ||||||
1025 | .custom(); | ||||||
1026 | |||||||
1027 | const auto needToSplitMemOp = [=](const LegalityQuery &Query, | ||||||
1028 | bool IsLoad) -> bool { | ||||||
1029 | const LLT DstTy = Query.Types[0]; | ||||||
1030 | |||||||
1031 | // Split vector extloads. | ||||||
1032 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||
1033 | unsigned AlignBits = Query.MMODescrs[0].AlignInBits; | ||||||
1034 | |||||||
1035 | if (MemSize < DstTy.getSizeInBits()) | ||||||
1036 | MemSize = std::max(MemSize, AlignBits); | ||||||
1037 | |||||||
1038 | if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) | ||||||
1039 | return true; | ||||||
1040 | |||||||
1041 | const LLT PtrTy = Query.Types[1]; | ||||||
1042 | unsigned AS = PtrTy.getAddressSpace(); | ||||||
1043 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) | ||||||
1044 | return true; | ||||||
1045 | |||||||
1046 | // Catch weird sized loads that don't evenly divide into the access sizes | ||||||
1047 | // TODO: May be able to widen depending on alignment etc. | ||||||
1048 | unsigned NumRegs = (MemSize + 31) / 32; | ||||||
1049 | if (NumRegs == 3) { | ||||||
1050 | if (!ST.hasDwordx3LoadStores()) | ||||||
1051 | return true; | ||||||
1052 | } else { | ||||||
1053 | // If the alignment allows, these should have been widened. | ||||||
1054 | if (!isPowerOf2_32(NumRegs)) | ||||||
1055 | return true; | ||||||
1056 | } | ||||||
1057 | |||||||
1058 | if (AlignBits < MemSize) { | ||||||
1059 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
1060 | return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | ||||||
1061 | Align(AlignBits / 8)); | ||||||
1062 | } | ||||||
1063 | |||||||
1064 | return false; | ||||||
1065 | }; | ||||||
1066 | |||||||
1067 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; | ||||||
1068 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; | ||||||
1069 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; | ||||||
1070 | |||||||
1071 | // TODO: Refine based on subtargets which support unaligned access or 128-bit | ||||||
1072 | // LDS | ||||||
1073 | // TODO: Unsupported flat for SI. | ||||||
1074 | |||||||
1075 | for (unsigned Op : {G_LOAD, G_STORE}) { | ||||||
1076 | const bool IsStore = Op == G_STORE; | ||||||
1077 | |||||||
1078 | auto &Actions = getActionDefinitionsBuilder(Op); | ||||||
1079 | // Explicitly list some common cases. | ||||||
1080 | // TODO: Does this help compile time at all? | ||||||
1081 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, | ||||||
1082 | {V2S32, GlobalPtr, 64, GlobalAlign32}, | ||||||
1083 | {V4S32, GlobalPtr, 128, GlobalAlign32}, | ||||||
1084 | {S64, GlobalPtr, 64, GlobalAlign32}, | ||||||
1085 | {V2S64, GlobalPtr, 128, GlobalAlign32}, | ||||||
1086 | {V2S16, GlobalPtr, 32, GlobalAlign32}, | ||||||
1087 | {S32, GlobalPtr, 8, GlobalAlign8}, | ||||||
1088 | {S32, GlobalPtr, 16, GlobalAlign16}, | ||||||
1089 | |||||||
1090 | {S32, LocalPtr, 32, 32}, | ||||||
1091 | {S64, LocalPtr, 64, 32}, | ||||||
1092 | {V2S32, LocalPtr, 64, 32}, | ||||||
1093 | {S32, LocalPtr, 8, 8}, | ||||||
1094 | {S32, LocalPtr, 16, 16}, | ||||||
1095 | {V2S16, LocalPtr, 32, 32}, | ||||||
1096 | |||||||
1097 | {S32, PrivatePtr, 32, 32}, | ||||||
1098 | {S32, PrivatePtr, 8, 8}, | ||||||
1099 | {S32, PrivatePtr, 16, 16}, | ||||||
1100 | {V2S16, PrivatePtr, 32, 32}, | ||||||
1101 | |||||||
1102 | {S32, ConstantPtr, 32, GlobalAlign32}, | ||||||
1103 | {V2S32, ConstantPtr, 64, GlobalAlign32}, | ||||||
1104 | {V4S32, ConstantPtr, 128, GlobalAlign32}, | ||||||
1105 | {S64, ConstantPtr, 64, GlobalAlign32}, | ||||||
1106 | {V2S32, ConstantPtr, 32, GlobalAlign32}}); | ||||||
1107 | Actions.legalIf( | ||||||
1108 | [=](const LegalityQuery &Query) -> bool { | ||||||
1109 | return isLoadStoreLegal(ST, Query, Op); | ||||||
1110 | }); | ||||||
1111 | |||||||
1112 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||||
1113 | // 64-bits. | ||||||
1114 | // | ||||||
1115 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||||
1116 | // inserting addrspacecasts. | ||||||
1117 | Actions.customIf(typeIs(1, Constant32Ptr)); | ||||||
1118 | |||||||
1119 | // Turn any illegal element vectors into something easier to deal | ||||||
1120 | // with. These will ultimately produce 32-bit scalar shifts to extract the | ||||||
1121 | // parts anyway. | ||||||
1122 | // | ||||||
1123 | // For odd 16-bit element vectors, prefer to split those into pieces with | ||||||
1124 | // 16-bit vector parts. | ||||||
1125 | Actions.bitcastIf( | ||||||
1126 | [=](const LegalityQuery &Query) -> bool { | ||||||
1127 | return shouldBitcastLoadStoreType(ST, Query.Types[0], | ||||||
1128 | Query.MMODescrs[0].SizeInBits); | ||||||
1129 | }, bitcastToRegisterType(0)); | ||||||
1130 | |||||||
1131 | if (!IsStore) { | ||||||
1132 | // Widen suitably aligned loads by loading extra bytes. The standard | ||||||
1133 | // legalization actions can't properly express widening memory operands. | ||||||
1134 | Actions.customIf([=](const LegalityQuery &Query) -> bool { | ||||||
1135 | return shouldWidenLoad(ST, Query, G_LOAD); | ||||||
1136 | }); | ||||||
1137 | } | ||||||
1138 | |||||||
1139 | // FIXME: load/store narrowing should be moved to lower action | ||||||
1140 | Actions | ||||||
1141 | .narrowScalarIf( | ||||||
1142 | [=](const LegalityQuery &Query) -> bool { | ||||||
1143 | return !Query.Types[0].isVector() && | ||||||
1144 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||
1145 | }, | ||||||
1146 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||
1147 | const LLT DstTy = Query.Types[0]; | ||||||
1148 | const LLT PtrTy = Query.Types[1]; | ||||||
1149 | |||||||
1150 | const unsigned DstSize = DstTy.getSizeInBits(); | ||||||
1151 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||
1152 | |||||||
1153 | // Split extloads. | ||||||
1154 | if (DstSize > MemSize) | ||||||
1155 | return std::make_pair(0, LLT::scalar(MemSize)); | ||||||
1156 | |||||||
1157 | if (!isPowerOf2_32(DstSize)) { | ||||||
1158 | // We're probably decomposing an odd sized store. Try to split | ||||||
1159 | // to the widest type. TODO: Account for alignment. As-is it | ||||||
1160 | // should be OK, since the new parts will be further legalized. | ||||||
1161 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||
1162 | return std::make_pair(0, LLT::scalar(FloorSize)); | ||||||
1163 | } | ||||||
1164 | |||||||
1165 | if (DstSize > 32 && (DstSize % 32 != 0)) { | ||||||
1166 | // FIXME: Need a way to specify non-extload of larger size if | ||||||
1167 | // suitably aligned. | ||||||
1168 | return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); | ||||||
1169 | } | ||||||
1170 | |||||||
1171 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||
1172 | PtrTy.getAddressSpace(), | ||||||
1173 | Op == G_LOAD); | ||||||
1174 | if (MemSize > MaxSize) | ||||||
1175 | return std::make_pair(0, LLT::scalar(MaxSize)); | ||||||
1176 | |||||||
1177 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
1178 | return std::make_pair(0, LLT::scalar(Align)); | ||||||
1179 | }) | ||||||
1180 | .fewerElementsIf( | ||||||
1181 | [=](const LegalityQuery &Query) -> bool { | ||||||
1182 | return Query.Types[0].isVector() && | ||||||
1183 | needToSplitMemOp(Query, Op == G_LOAD); | ||||||
1184 | }, | ||||||
1185 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||||
1186 | const LLT DstTy = Query.Types[0]; | ||||||
1187 | const LLT PtrTy = Query.Types[1]; | ||||||
1188 | |||||||
1189 | LLT EltTy = DstTy.getElementType(); | ||||||
1190 | unsigned MaxSize = maxSizeForAddrSpace(ST, | ||||||
1191 | PtrTy.getAddressSpace(), | ||||||
1192 | Op == G_LOAD); | ||||||
1193 | |||||||
1194 | // FIXME: Handle widened to power of 2 results better. This ends | ||||||
1195 | // up scalarizing. | ||||||
1196 | // FIXME: 3 element stores scalarized on SI | ||||||
1197 | |||||||
1198 | // Split if it's too large for the address space. | ||||||
1199 | if (Query.MMODescrs[0].SizeInBits > MaxSize) { | ||||||
1200 | unsigned NumElts = DstTy.getNumElements(); | ||||||
1201 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1202 | |||||||
1203 | if (MaxSize % EltSize == 0) { | ||||||
1204 | return std::make_pair( | ||||||
1205 | 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); | ||||||
1206 | } | ||||||
1207 | |||||||
1208 | unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; | ||||||
1209 | |||||||
1210 | // FIXME: Refine when odd breakdowns handled | ||||||
1211 | // The scalars will need to be re-legalized. | ||||||
1212 | if (NumPieces == 1 || NumPieces >= NumElts || | ||||||
1213 | NumElts % NumPieces != 0) | ||||||
1214 | return std::make_pair(0, EltTy); | ||||||
1215 | |||||||
1216 | return std::make_pair(0, | ||||||
1217 | LLT::vector(NumElts / NumPieces, EltTy)); | ||||||
1218 | } | ||||||
1219 | |||||||
1220 | // FIXME: We could probably handle weird extending loads better. | ||||||
1221 | unsigned MemSize = Query.MMODescrs[0].SizeInBits; | ||||||
1222 | if (DstTy.getSizeInBits() > MemSize) | ||||||
1223 | return std::make_pair(0, EltTy); | ||||||
1224 | |||||||
1225 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1226 | unsigned DstSize = DstTy.getSizeInBits(); | ||||||
1227 | if (!isPowerOf2_32(DstSize)) { | ||||||
1228 | // We're probably decomposing an odd sized store. Try to split | ||||||
1229 | // to the widest type. TODO: Account for alignment. As-is it | ||||||
1230 | // should be OK, since the new parts will be further legalized. | ||||||
1231 | unsigned FloorSize = PowerOf2Floor(DstSize); | ||||||
1232 | return std::make_pair( | ||||||
1233 | 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); | ||||||
1234 | } | ||||||
1235 | |||||||
1236 | // Need to split because of alignment. | ||||||
1237 | unsigned Align = Query.MMODescrs[0].AlignInBits; | ||||||
1238 | if (EltSize > Align && | ||||||
1239 | (EltSize / Align < DstTy.getNumElements())) { | ||||||
1240 | return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); | ||||||
1241 | } | ||||||
1242 | |||||||
1243 | // May need relegalization for the scalars. | ||||||
1244 | return std::make_pair(0, EltTy); | ||||||
1245 | }) | ||||||
1246 | .lowerIfMemSizeNotPow2() | ||||||
1247 | .minScalar(0, S32); | ||||||
1248 | |||||||
1249 | if (IsStore) | ||||||
1250 | Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); | ||||||
1251 | |||||||
1252 | Actions | ||||||
1253 | .widenScalarToNextPow2(0) | ||||||
1254 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) | ||||||
1255 | .lower(); | ||||||
1256 | } | ||||||
1257 | |||||||
1258 | auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) | ||||||
1259 | .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, | ||||||
1260 | {S32, GlobalPtr, 16, 2 * 8}, | ||||||
1261 | {S32, LocalPtr, 8, 8}, | ||||||
1262 | {S32, LocalPtr, 16, 16}, | ||||||
1263 | {S32, PrivatePtr, 8, 8}, | ||||||
1264 | {S32, PrivatePtr, 16, 16}, | ||||||
1265 | {S32, ConstantPtr, 8, 8}, | ||||||
1266 | {S32, ConstantPtr, 16, 2 * 8}}); | ||||||
1267 | if (ST.hasFlatAddressSpace()) { | ||||||
1268 | ExtLoads.legalForTypesWithMemDesc( | ||||||
1269 | {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); | ||||||
1270 | } | ||||||
1271 | |||||||
1272 | ExtLoads.clampScalar(0, S32, S32) | ||||||
1273 | .widenScalarToNextPow2(0) | ||||||
1274 | .unsupportedIfMemSizeNotPow2() | ||||||
1275 | .lower(); | ||||||
1276 | |||||||
1277 | auto &Atomics = getActionDefinitionsBuilder( | ||||||
1278 | {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, | ||||||
1279 | G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, | ||||||
1280 | G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, | ||||||
1281 | G_ATOMICRMW_UMIN}) | ||||||
1282 | .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, | ||||||
1283 | {S64, GlobalPtr}, {S64, LocalPtr}, | ||||||
1284 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||||
1285 | if (ST.hasFlatAddressSpace()) { | ||||||
1286 | Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); | ||||||
1287 | } | ||||||
1288 | |||||||
1289 | if (ST.hasLDSFPAtomics()) { | ||||||
1290 | getActionDefinitionsBuilder(G_ATOMICRMW_FADD) | ||||||
1291 | .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); | ||||||
1292 | } | ||||||
1293 | |||||||
1294 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and | ||||||
1295 | // output demarshalling. | ||||||
1296 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) | ||||||
1297 | .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, | ||||||
1298 | {S32, FlatPtr}, {S64, FlatPtr}}) | ||||||
1299 | .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, | ||||||
1300 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||||
1301 | // TODO: Pointer types, any 32-bit or 64-bit vector | ||||||
1302 | |||||||
1303 | // Condition should be s32 for scalar, s1 for vector. | ||||||
1304 | getActionDefinitionsBuilder(G_SELECT) | ||||||
1305 | .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, | ||||||
1306 | GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, | ||||||
1307 | LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) | ||||||
1308 | .clampScalar(0, S16, S64) | ||||||
1309 | .scalarize(1) | ||||||
1310 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||||
1311 | .fewerElementsIf(numElementsNotEven(0), scalarize(0)) | ||||||
1312 | .clampMaxNumElements(0, S32, 2) | ||||||
1313 | .clampMaxNumElements(0, LocalPtr, 2) | ||||||
1314 | .clampMaxNumElements(0, PrivatePtr, 2) | ||||||
1315 | .scalarize(0) | ||||||
1316 | .widenScalarToNextPow2(0) | ||||||
1317 | .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); | ||||||
1318 | |||||||
1319 | // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can | ||||||
1320 | // be more flexible with the shift amount type. | ||||||
1321 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) | ||||||
1322 | .legalFor({{S32, S32}, {S64, S32}}); | ||||||
1323 | if (ST.has16BitInsts()) { | ||||||
1324 | if (ST.hasVOP3PInsts()) { | ||||||
1325 | Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) | ||||||
1326 | .clampMaxNumElements(0, S16, 2); | ||||||
1327 | } else | ||||||
1328 | Shifts.legalFor({{S16, S16}}); | ||||||
1329 | |||||||
1330 | // TODO: Support 16-bit shift amounts for all types | ||||||
1331 | Shifts.widenScalarIf( | ||||||
1332 | [=](const LegalityQuery &Query) { | ||||||
1333 | // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a | ||||||
1334 | // 32-bit amount. | ||||||
1335 | const LLT ValTy = Query.Types[0]; | ||||||
1336 | const LLT AmountTy = Query.Types[1]; | ||||||
1337 | return ValTy.getSizeInBits() <= 16 && | ||||||
1338 | AmountTy.getSizeInBits() < 16; | ||||||
1339 | }, changeTo(1, S16)); | ||||||
1340 | Shifts.maxScalarIf(typeIs(0, S16), 1, S16); | ||||||
1341 | Shifts.clampScalar(1, S32, S32); | ||||||
1342 | Shifts.clampScalar(0, S16, S64); | ||||||
1343 | Shifts.widenScalarToNextPow2(0, 16); | ||||||
1344 | |||||||
1345 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||||
1346 | .minScalar(0, S16) | ||||||
1347 | .scalarize(0) | ||||||
1348 | .lower(); | ||||||
1349 | } else { | ||||||
1350 | // Make sure we legalize the shift amount type first, as the general | ||||||
1351 | // expansion for the shifted type will produce much worse code if it hasn't | ||||||
1352 | // been truncated already. | ||||||
1353 | Shifts.clampScalar(1, S32, S32); | ||||||
1354 | Shifts.clampScalar(0, S32, S64); | ||||||
1355 | Shifts.widenScalarToNextPow2(0, 32); | ||||||
1356 | |||||||
1357 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||||
1358 | .minScalar(0, S32) | ||||||
1359 | .scalarize(0) | ||||||
1360 | .lower(); | ||||||
1361 | } | ||||||
1362 | Shifts.scalarize(0); | ||||||
1363 | |||||||
1364 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | ||||||
1365 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | ||||||
1366 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | ||||||
1367 | unsigned IdxTypeIdx = 2; | ||||||
1368 | |||||||
1369 | getActionDefinitionsBuilder(Op) | ||||||
1370 | .customIf([=](const LegalityQuery &Query) { | ||||||
1371 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||||
1372 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||||
1373 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | ||||||
1374 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||||
1375 | return (EltSize == 32 || EltSize == 64) && | ||||||
1376 | VecTy.getSizeInBits() % 32 == 0 && | ||||||
1377 | VecTy.getSizeInBits() <= MaxRegisterSize && | ||||||
1378 | IdxTy.getSizeInBits() == 32; | ||||||
1379 | }) | ||||||
1380 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), | ||||||
1381 | bitcastToVectorElement32(VecTypeIdx)) | ||||||
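| // e.g. a dynamic extract/insert on <8 x s16> is done on a <4 x s32> bitcast | ||||||
| // of the vector, so the dynamic index works on full 32-bit lanes. | ||||||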
1382 | //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) | ||||||
1383 | .bitcastIf( | ||||||
1384 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), | ||||||
1385 | [=](const LegalityQuery &Query) { | ||||||
1386 | // For > 64-bit element types, try to turn this into a 64-bit | ||||||
1387 | // element vector since we may be able to do better indexing | ||||||
1388 | // if this is scalar. If not, fall back to 32. | ||||||
1389 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||||
1390 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||||
1391 | const unsigned DstEltSize = EltTy.getSizeInBits(); | ||||||
1392 | const unsigned VecSize = VecTy.getSizeInBits(); | ||||||
1393 | |||||||
1394 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; | ||||||
1395 | return std::make_pair( | ||||||
1396 | VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); | ||||||
1397 | }) | ||||||
1398 | .clampScalar(EltTypeIdx, S32, S64) | ||||||
1399 | .clampScalar(VecTypeIdx, S32, S64) | ||||||
1400 | .clampScalar(IdxTypeIdx, S32, S32) | ||||||
1401 | .clampMaxNumElements(VecTypeIdx, S32, 32) | ||||||
1402 | // TODO: Clamp elements for 64-bit vectors? | ||||||
1403 | // It should only be necessary with variable indexes. | ||||||
1404 | // As a last resort, lower to the stack | ||||||
1405 | .lower(); | ||||||
1406 | } | ||||||
1407 | |||||||
1408 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | ||||||
1409 | .unsupportedIf([=](const LegalityQuery &Query) { | ||||||
1410 | const LLT &EltTy = Query.Types[1].getElementType(); | ||||||
1411 | return Query.Types[0] != EltTy; | ||||||
1412 | }); | ||||||
1413 | |||||||
1414 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | ||||||
1415 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | ||||||
1416 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | ||||||
1417 | |||||||
1418 | // FIXME: Doesn't handle extract of illegal sizes. | ||||||
1419 | getActionDefinitionsBuilder(Op) | ||||||
1420 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | ||||||
1421 | // FIXME: Multiples of 16 should not be legal. | ||||||
1422 | .legalIf([=](const LegalityQuery &Query) { | ||||||
1423 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
1424 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||
1425 | return (BigTy.getSizeInBits() % 32 == 0) && | ||||||
1426 | (LitTy.getSizeInBits() % 16 == 0); | ||||||
1427 | }) | ||||||
1428 | .widenScalarIf( | ||||||
1429 | [=](const LegalityQuery &Query) { | ||||||
1430 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
1431 | return (BigTy.getScalarSizeInBits() < 16); | ||||||
1432 | }, | ||||||
1433 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | ||||||
1434 | .widenScalarIf( | ||||||
1435 | [=](const LegalityQuery &Query) { | ||||||
1436 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||||
1437 | return (LitTy.getScalarSizeInBits() < 16); | ||||||
1438 | }, | ||||||
1439 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | ||||||
1440 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||
1441 | .widenScalarToNextPow2(BigTyIdx, 32); | ||||||
1442 | |||||||
1443 | } | ||||||
1444 | |||||||
1445 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||||||
1446 | .legalForCartesianProduct(AllS32Vectors, {S32}) | ||||||
1447 | .legalForCartesianProduct(AllS64Vectors, {S64}) | ||||||
1448 | .clampNumElements(0, V16S32, V32S32) | ||||||
1449 | .clampNumElements(0, V2S64, V16S64) | ||||||
1450 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); | ||||||
1451 | |||||||
1452 | if (ST.hasScalarPackInsts()) { | ||||||
1453 | BuildVector | ||||||
1454 | // FIXME: Should probably widen s1 vectors straight to s32 | ||||||
1455 | .minScalarOrElt(0, S16) | ||||||
1456 | // Widen source elements and produce a G_BUILD_VECTOR_TRUNC | ||||||
1457 | .minScalar(1, S32); | ||||||
1458 | |||||||
1459 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||
1460 | .legalFor({V2S16, S32}) | ||||||
1461 | .lower(); | ||||||
1462 | BuildVector.minScalarOrElt(0, S32); | ||||||
1463 | } else { | ||||||
1464 | BuildVector.customFor({V2S16, S16}); | ||||||
1465 | BuildVector.minScalarOrElt(0, S32); | ||||||
1466 | |||||||
1467 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||||
1468 | .customFor({V2S16, S32}) | ||||||
1469 | .lower(); | ||||||
1470 | } | ||||||
1471 | |||||||
1472 | BuildVector.legalIf(isRegisterType(0)); | ||||||
1473 | |||||||
1474 | // FIXME: Clamp maximum size | ||||||
1475 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | ||||||
1476 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||
1477 | .clampMaxNumElements(0, S32, 32) | ||||||
1478 | .clampMaxNumElements(1, S16, 2) // TODO: Make 4? | ||||||
1479 | .clampMaxNumElements(0, S16, 64); | ||||||
1480 | |||||||
1481 | // TODO: Don't fully scalarize v2s16 pieces? Or combine out those | ||||||
1482 | // pre-legalize. | ||||||
1483 | if (ST.hasVOP3PInsts()) { | ||||||
1484 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) | ||||||
1485 | .customFor({V2S16, V2S16}) | ||||||
1486 | .lower(); | ||||||
1487 | } else | ||||||
1488 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | ||||||
1489 | |||||||
1490 | // Merge/Unmerge | ||||||
1491 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | ||||||
1492 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | ||||||
1493 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | ||||||
1494 | |||||||
1495 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | ||||||
1496 | const LLT Ty = Query.Types[TypeIdx]; | ||||||
1497 | if (Ty.isVector()) { | ||||||
1498 | const LLT &EltTy = Ty.getElementType(); | ||||||
1499 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) | ||||||
1500 | return true; | ||||||
1501 | if (!isPowerOf2_32(EltTy.getSizeInBits())) | ||||||
1502 | return true; | ||||||
1503 | } | ||||||
1504 | return false; | ||||||
1505 | }; | ||||||
1506 | |||||||
1507 | auto &Builder = getActionDefinitionsBuilder(Op) | ||||||
1508 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||||
1509 | .lowerFor({{S16, V2S16}}) | ||||||
1510 | .lowerIf([=](const LegalityQuery &Query) { | ||||||
1511 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||||
1512 | return BigTy.getSizeInBits() == 32; | ||||||
1513 | }) | ||||||
1514 | // Try to widen to s16 first for small types. | ||||||
1515 | // TODO: Only do this on targets with legal s16 shifts | ||||||
1516 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) | ||||||
1517 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | ||||||
1518 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||||
1519 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | ||||||
1520 | elementTypeIs(1, S16)), | ||||||
1521 | changeTo(1, V2S16)) | ||||||
1522 | // Clamp the little scalar to s8-s256 and make it a power of 2. It's not | ||||||
1523 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | ||||||
1524 | // valid. | ||||||
1525 | .clampScalar(LitTyIdx, S32, S512) | ||||||
1526 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | ||||||
1527 | // Break up vectors with weird elements into scalars | ||||||
1528 | .fewerElementsIf( | ||||||
1529 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, | ||||||
1530 | scalarize(0)) | ||||||
1531 | .fewerElementsIf( | ||||||
1532 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, | ||||||
1533 | scalarize(1)) | ||||||
1534 | .clampScalar(BigTyIdx, S32, MaxScalar); | ||||||
1535 | |||||||
1536 | if (Op == G_MERGE_VALUES) { | ||||||
1537 | Builder.widenScalarIf( | ||||||
1538 | // TODO: Use 16-bit shifts if legal for 8-bit values? | ||||||
1539 | [=](const LegalityQuery &Query) { | ||||||
1540 | const LLT Ty = Query.Types[LitTyIdx]; | ||||||
1541 | return Ty.getSizeInBits() < 32; | ||||||
1542 | }, | ||||||
1543 | changeTo(LitTyIdx, S32)); | ||||||
1544 | } | ||||||
1545 | |||||||
1546 | Builder.widenScalarIf( | ||||||
1547 | [=](const LegalityQuery &Query) { | ||||||
1548 | const LLT Ty = Query.Types[BigTyIdx]; | ||||||
1549 | return !isPowerOf2_32(Ty.getSizeInBits()) && | ||||||
1550 | Ty.getSizeInBits() % 16 != 0; | ||||||
1551 | }, | ||||||
1552 | [=](const LegalityQuery &Query) { | ||||||
1553 | // Pick the next power of 2, or a multiple of 64 over 128, | ||||||
1554 | // whichever is smaller. | ||||||
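| // For example, s65 is widened to the next power of 2 (s128), while s257 is | ||||||
| // widened to the multiple of 64 (s320) instead of all the way to s512. | ||||||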
1555 | const LLT &Ty = Query.Types[BigTyIdx]; | ||||||
1556 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | ||||||
1557 | if (NewSizeInBits >= 256) { | ||||||
1558 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | ||||||
1559 | if (RoundedTo < NewSizeInBits) | ||||||
1560 | NewSizeInBits = RoundedTo; | ||||||
1561 | } | ||||||
1562 | return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | ||||||
1563 | }) | ||||||
1564 | // Any vectors left are the wrong size. Scalarize them. | ||||||
1565 | .scalarize(0) | ||||||
1566 | .scalarize(1); | ||||||
1567 | } | ||||||
1568 | |||||||
1569 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||||
1570 | // RegBankSelect. | ||||||
1571 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | ||||||
1572 | .legalFor({{S32}, {S64}}); | ||||||
1573 | |||||||
1574 | if (ST.hasVOP3PInsts()) { | ||||||
1575 | SextInReg.lowerFor({{V2S16}}) | ||||||
1576 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | ||||||
1577 | // get more vector shift opportunities, since we'll get those when | ||||||
1578 | // expanded. | ||||||
1579 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); | ||||||
1580 | } else if (ST.has16BitInsts()) { | ||||||
1581 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | ||||||
1582 | } else { | ||||||
1583 | // Prefer to promote to s32 before lowering if we don't have 16-bit | ||||||
1584 | // shifts. This avoids a lot of intermediate truncate and extend operations. | ||||||
1585 | SextInReg.lowerFor({{S32}, {S64}}); | ||||||
1586 | } | ||||||
1587 | |||||||
1588 | SextInReg | ||||||
1589 | .scalarize(0) | ||||||
1590 | .clampScalar(0, S32, S64) | ||||||
1591 | .lower(); | ||||||
1592 | |||||||
1593 | getActionDefinitionsBuilder(G_FSHR) | ||||||
1594 | .legalFor({{S32, S32}}) | ||||||
1595 | .scalarize(0) | ||||||
1596 | .lower(); | ||||||
1597 | |||||||
1598 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | ||||||
1599 | .legalFor({S64}); | ||||||
1600 | |||||||
1601 | getActionDefinitionsBuilder(G_FENCE) | ||||||
1602 | .alwaysLegal(); | ||||||
1603 | |||||||
1604 | getActionDefinitionsBuilder({ | ||||||
1605 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | ||||||
1606 | G_FCOPYSIGN, | ||||||
1607 | |||||||
1608 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | ||||||
1609 | G_ATOMICRMW_NAND, | ||||||
1610 | G_ATOMICRMW_FSUB, | ||||||
1611 | G_READ_REGISTER, | ||||||
1612 | G_WRITE_REGISTER, | ||||||
1613 | |||||||
1614 | G_SADDO, G_SSUBO, | ||||||
1615 | |||||||
1616 | // TODO: Implement | ||||||
1617 | G_FMINIMUM, G_FMAXIMUM, | ||||||
1618 | G_FSHL | ||||||
1619 | }).lower(); | ||||||
1620 | |||||||
1621 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | ||||||
1622 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | ||||||
1623 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | ||||||
1624 | .unsupported(); | ||||||
1625 | |||||||
1626 | computeTables(); | ||||||
1627 | verify(*ST.getInstrInfo()); | ||||||
1628 | } | ||||||
1629 | |||||||
1630 | bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, | ||||||
1631 | MachineInstr &MI) const { | ||||||
1632 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
1633 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
1634 | |||||||
1635 | switch (MI.getOpcode()) { | ||||||
1636 | case TargetOpcode::G_ADDRSPACE_CAST: | ||||||
1637 | return legalizeAddrSpaceCast(MI, MRI, B); | ||||||
1638 | case TargetOpcode::G_FRINT: | ||||||
1639 | return legalizeFrint(MI, MRI, B); | ||||||
1640 | case TargetOpcode::G_FCEIL: | ||||||
1641 | return legalizeFceil(MI, MRI, B); | ||||||
1642 | case TargetOpcode::G_FREM: | ||||||
1643 | return legalizeFrem(MI, MRI, B); | ||||||
1644 | case TargetOpcode::G_INTRINSIC_TRUNC: | ||||||
1645 | return legalizeIntrinsicTrunc(MI, MRI, B); | ||||||
1646 | case TargetOpcode::G_SITOFP: | ||||||
1647 | return legalizeITOFP(MI, MRI, B, true); | ||||||
1648 | case TargetOpcode::G_UITOFP: | ||||||
1649 | return legalizeITOFP(MI, MRI, B, false); | ||||||
1650 | case TargetOpcode::G_FPTOSI: | ||||||
1651 | return legalizeFPTOI(MI, MRI, B, true); | ||||||
1652 | case TargetOpcode::G_FPTOUI: | ||||||
1653 | return legalizeFPTOI(MI, MRI, B, false); | ||||||
1654 | case TargetOpcode::G_FMINNUM: | ||||||
1655 | case TargetOpcode::G_FMAXNUM: | ||||||
1656 | case TargetOpcode::G_FMINNUM_IEEE: | ||||||
1657 | case TargetOpcode::G_FMAXNUM_IEEE: | ||||||
1658 | return legalizeMinNumMaxNum(Helper, MI); | ||||||
1659 | case TargetOpcode::G_EXTRACT_VECTOR_ELT: | ||||||
1660 | return legalizeExtractVectorElt(MI, MRI, B); | ||||||
1661 | case TargetOpcode::G_INSERT_VECTOR_ELT: | ||||||
1662 | return legalizeInsertVectorElt(MI, MRI, B); | ||||||
1663 | case TargetOpcode::G_SHUFFLE_VECTOR: | ||||||
1664 | return legalizeShuffleVector(MI, MRI, B); | ||||||
1665 | case TargetOpcode::G_FSIN: | ||||||
1666 | case TargetOpcode::G_FCOS: | ||||||
1667 | return legalizeSinCos(MI, MRI, B); | ||||||
1668 | case TargetOpcode::G_GLOBAL_VALUE: | ||||||
1669 | return legalizeGlobalValue(MI, MRI, B); | ||||||
1670 | case TargetOpcode::G_LOAD: | ||||||
1671 | return legalizeLoad(Helper, MI); | ||||||
1672 | case TargetOpcode::G_FMAD: | ||||||
1673 | return legalizeFMad(MI, MRI, B); | ||||||
1674 | case TargetOpcode::G_FDIV: | ||||||
1675 | return legalizeFDIV(MI, MRI, B); | ||||||
1676 | case TargetOpcode::G_UDIV: | ||||||
1677 | case TargetOpcode::G_UREM: | ||||||
1678 | return legalizeUDIV_UREM(MI, MRI, B); | ||||||
1679 | case TargetOpcode::G_SDIV: | ||||||
1680 | case TargetOpcode::G_SREM: | ||||||
1681 | return legalizeSDIV_SREM(MI, MRI, B); | ||||||
1682 | case TargetOpcode::G_ATOMIC_CMPXCHG: | ||||||
1683 | return legalizeAtomicCmpXChg(MI, MRI, B); | ||||||
1684 | case TargetOpcode::G_FLOG: | ||||||
1685 | return legalizeFlog(MI, B, numbers::ln2f); | ||||||
1686 | case TargetOpcode::G_FLOG10: | ||||||
1687 | return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); | ||||||
1688 | case TargetOpcode::G_FEXP: | ||||||
1689 | return legalizeFExp(MI, B); | ||||||
1690 | case TargetOpcode::G_FPOW: | ||||||
1691 | return legalizeFPow(MI, B); | ||||||
1692 | case TargetOpcode::G_FFLOOR: | ||||||
1693 | return legalizeFFloor(MI, MRI, B); | ||||||
1694 | case TargetOpcode::G_BUILD_VECTOR: | ||||||
1695 | return legalizeBuildVector(MI, MRI, B); | ||||||
1696 | default: | ||||||
1697 | return false; | ||||||
1698 | } | ||||||
1699 | |||||||
1700 | llvm_unreachable("expected switch to return"); | ||||||
1701 | } | ||||||
1702 | |||||||
1703 | Register AMDGPULegalizerInfo::getSegmentAperture( | ||||||
1704 | unsigned AS, | ||||||
1705 | MachineRegisterInfo &MRI, | ||||||
1706 | MachineIRBuilder &B) const { | ||||||
1707 | MachineFunction &MF = B.getMF(); | ||||||
1708 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||||
1709 | const LLT S32 = LLT::scalar(32); | ||||||
1710 | |||||||
1711 | assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); | ||||||
1712 | |||||||
1713 | if (ST.hasApertureRegs()) { | ||||||
1714 | // FIXME: Use inline constants (src_{shared, private}_base) instead of | ||||||
1715 | // getreg. | ||||||
1716 | unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1717 | AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : | ||||||
1718 | AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; | ||||||
1719 | unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? | ||||||
1720 | AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : | ||||||
1721 | AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; | ||||||
1722 | unsigned Encoding = | ||||||
1723 | AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | | ||||||
1724 | Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | | ||||||
1725 | WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; | ||||||
1726 | |||||||
1727 | Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | ||||||
1728 | |||||||
1729 | B.buildInstr(AMDGPU::S_GETREG_B32) | ||||||
1730 | .addDef(GetReg) | ||||||
1731 | .addImm(Encoding); | ||||||
1732 | MRI.setType(GetReg, S32); | ||||||
1733 | |||||||
1734 | auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); | ||||||
1735 | return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); | ||||||
1736 | } | ||||||
1737 | |||||||
1738 | Register QueuePtr = MRI.createGenericVirtualRegister( | ||||||
1739 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||
1740 | |||||||
1741 | if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||||
1742 | return Register(); | ||||||
1743 | |||||||
1744 | // Offset into amd_queue_t for group_segment_aperture_base_hi / | ||||||
1745 | // private_segment_aperture_base_hi. | ||||||
1746 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; | ||||||
1747 | |||||||
1748 | // TODO: can we be smarter about machine pointer info? | ||||||
1749 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||||
1750 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
1751 | PtrInfo, | ||||||
1752 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
1753 | MachineMemOperand::MOInvariant, | ||||||
1754 | 4, commonAlignment(Align(64), StructOffset)); | ||||||
1755 | |||||||
1756 | Register LoadAddr; | ||||||
1757 | |||||||
1758 | B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); | ||||||
1759 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | ||||||
1760 | } | ||||||
1761 | |||||||
1762 | bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( | ||||||
1763 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1764 | MachineIRBuilder &B) const { | ||||||
1765 | MachineFunction &MF = B.getMF(); | ||||||
1766 | |||||||
1767 | const LLT S32 = LLT::scalar(32); | ||||||
1768 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1769 | Register Src = MI.getOperand(1).getReg(); | ||||||
1770 | |||||||
1771 | LLT DstTy = MRI.getType(Dst); | ||||||
1772 | LLT SrcTy = MRI.getType(Src); | ||||||
1773 | unsigned DestAS = DstTy.getAddressSpace(); | ||||||
1774 | unsigned SrcAS = SrcTy.getAddressSpace(); | ||||||
1775 | |||||||
1776 | // TODO: Avoid reloading from the queue ptr for each cast, or at least each | ||||||
1777 | // vector element. | ||||||
1778 | assert(!DstTy.isVector()); | ||||||
1779 | |||||||
1780 | const AMDGPUTargetMachine &TM | ||||||
1781 | = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); | ||||||
1782 | |||||||
1783 | if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { | ||||||
1784 | MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); | ||||||
1785 | return true; | ||||||
1786 | } | ||||||
1787 | |||||||
1788 | if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
1789 | // Truncate. | ||||||
1790 | B.buildExtract(Dst, Src, 0); | ||||||
1791 | MI.eraseFromParent(); | ||||||
1792 | return true; | ||||||
1793 | } | ||||||
1794 | |||||||
1795 | if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
1796 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
1797 | uint32_t AddrHiVal = Info->get32BitAddressHighBits(); | ||||||
1798 | |||||||
1799 | // FIXME: This is a bit ugly due to creating a merge of 2 pointers to | ||||||
1800 | // another pointer. Merge operands are required to be the same type, but | ||||||
1801 | // creating an extra ptrtoint would be kind of pointless. | ||||||
1802 | auto HighAddr = B.buildConstant( | ||||||
1803 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); | ||||||
1804 | B.buildMerge(Dst, {Src, HighAddr}); | ||||||
1805 | MI.eraseFromParent(); | ||||||
1806 | return true; | ||||||
1807 | } | ||||||
1808 | |||||||
1809 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { | ||||||
1810 | assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || | ||||||
1811 | DestAS == AMDGPUAS::PRIVATE_ADDRESS); | ||||||
1812 | unsigned NullVal = TM.getNullPointerValue(DestAS); | ||||||
1813 | |||||||
1814 | auto SegmentNull = B.buildConstant(DstTy, NullVal); | ||||||
1815 | auto FlatNull = B.buildConstant(SrcTy, 0); | ||||||
1816 | |||||||
1817 | // Extract low 32-bits of the pointer. | ||||||
1818 | auto PtrLo32 = B.buildExtract(DstTy, Src, 0); | ||||||
1819 | |||||||
1820 | auto CmpRes = | ||||||
1821 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); | ||||||
1822 | B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); | ||||||
1823 | |||||||
1824 | MI.eraseFromParent(); | ||||||
1825 | return true; | ||||||
1826 | } | ||||||
1827 | |||||||
1828 | if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) | ||||||
1829 | return false; | ||||||
1830 | |||||||
1831 | if (!ST.hasFlatAddressSpace()) | ||||||
1832 | return false; | ||||||
1833 | |||||||
1834 | auto SegmentNull = | ||||||
1835 | B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); | ||||||
1836 | auto FlatNull = | ||||||
1837 | B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); | ||||||
1838 | |||||||
1839 | Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); | ||||||
1840 | if (!ApertureReg.isValid()) | ||||||
1841 | return false; | ||||||
1842 | |||||||
1843 | auto CmpRes = | ||||||
1844 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); | ||||||
1845 | |||||||
1846 | // Coerce the type of the low half of the result so we can use merge_values. | ||||||
1847 | Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); | ||||||
1848 | |||||||
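| // A flat pointer is formed as {aperture base : segment offset}: the 32-bit | ||||||
| // segment offset becomes the low half and the aperture register the high | ||||||
| // half, with the segment null value mapped to the flat null pointer. | ||||||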
1849 | // TODO: Should we allow mismatched types but matching sizes in merges to | ||||||
1850 | // avoid the ptrtoint? | ||||||
1851 | auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); | ||||||
1852 | B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); | ||||||
1853 | |||||||
1854 | MI.eraseFromParent(); | ||||||
1855 | return true; | ||||||
1856 | } | ||||||
1857 | |||||||
1858 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||||
1859 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1860 | MachineIRBuilder &B) const { | ||||||
1861 | Register Src = MI.getOperand(1).getReg(); | ||||||
1862 | LLT Ty = MRI.getType(Src); | ||||||
1863 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64); | ||||||
1864 | |||||||
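| // Classic 2^52 trick: adding and then subtracting copysign(2^52, src) rounds | ||||||
| // src to an integer in the current rounding mode. Magnitudes above | ||||||
| // 0x1.fffffffffffffp+51 are already integral, so the final select keeps the | ||||||
| // original value for those. | ||||||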
1865 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||||
1866 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||||
1867 | |||||||
1868 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||||
1869 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||||
1870 | |||||||
1871 | // TODO: Should this propagate fast-math-flags? | ||||||
1872 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||||
1873 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||||
1874 | |||||||
1875 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||||
1876 | auto Fabs = B.buildFAbs(Ty, Src); | ||||||
1877 | |||||||
1878 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||||
1879 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||||
1880 | MI.eraseFromParent(); | ||||||
1881 | return true; | ||||||
1882 | } | ||||||
1883 | |||||||
1884 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||||
1885 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1886 | MachineIRBuilder &B) const { | ||||||
1887 | |||||||
1888 | const LLT S1 = LLT::scalar(1); | ||||||
1889 | const LLT S64 = LLT::scalar(64); | ||||||
1890 | |||||||
1891 | Register Src = MI.getOperand(1).getReg(); | ||||||
1892 | assert(MRI.getType(Src) == S64); | ||||||
1893 | |||||||
1894 | // result = trunc(src) | ||||||
1895 | // if (src > 0.0 && src != result) | ||||||
1896 | // result += 1.0 | ||||||
1897 | |||||||
1898 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | ||||||
1899 | |||||||
1900 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||||
1901 | const auto One = B.buildFConstant(S64, 1.0); | ||||||
1902 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | ||||||
1903 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | ||||||
1904 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | ||||||
1905 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||||
1906 | |||||||
1907 | // TODO: Should this propagate fast-math-flags? | ||||||
1908 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||||
1909 | return true; | ||||||
1910 | } | ||||||
1911 | |||||||
1912 | bool AMDGPULegalizerInfo::legalizeFrem( | ||||||
1913 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1914 | MachineIRBuilder &B) const { | ||||||
1915 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
1916 | Register Src0Reg = MI.getOperand(1).getReg(); | ||||||
1917 | Register Src1Reg = MI.getOperand(2).getReg(); | ||||||
1918 | auto Flags = MI.getFlags(); | ||||||
1919 | LLT Ty = MRI.getType(DstReg); | ||||||
1920 | |||||||
1921 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); | ||||||
1922 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); | ||||||
1923 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); | ||||||
1924 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); | ||||||
1925 | MI.eraseFromParent(); | ||||||
1926 | return true; | ||||||
1927 | } | ||||||
1928 | |||||||
1929 | static MachineInstrBuilder extractF64Exponent(Register Hi, | ||||||
1930 | MachineIRBuilder &B) { | ||||||
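| // The exponent of an IEEE f64 lives in bits [62:52], i.e. bits [30:20] of the | ||||||
| // high dword. Extract those 11 bits with ubfe and subtract the bias (1023) to | ||||||
| // get the unbiased exponent. | ||||||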
1931 | const unsigned FractBits = 52; | ||||||
1932 | const unsigned ExpBits = 11; | ||||||
1933 | LLT S32 = LLT::scalar(32); | ||||||
1934 | |||||||
1935 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||||
1936 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||||
1937 | |||||||
1938 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | ||||||
1939 | .addUse(Hi) | ||||||
1940 | .addUse(Const0.getReg(0)) | ||||||
1941 | .addUse(Const1.getReg(0)); | ||||||
1942 | |||||||
1943 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||||
1944 | } | ||||||
1945 | |||||||
1946 | bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( | ||||||
1947 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1948 | MachineIRBuilder &B) const { | ||||||
1949 | const LLT S1 = LLT::scalar(1); | ||||||
1950 | const LLT S32 = LLT::scalar(32); | ||||||
1951 | const LLT S64 = LLT::scalar(64); | ||||||
1952 | |||||||
1953 | Register Src = MI.getOperand(1).getReg(); | ||||||
1954 | assert(MRI.getType(Src) == S64); | ||||||
1955 | |||||||
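| // trunc(x) is implemented by clearing the fraction bits that sit below the | ||||||
| // exponent: result = src & ~(fract_mask >> exp). An exponent below 0 means | ||||||
| // |x| < 1, so the result is a signed zero; an exponent above 51 means x is | ||||||
| // already integral. | ||||||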
1956 | // TODO: Should this use extract since the low half is unused? | ||||||
1957 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||
1958 | Register Hi = Unmerge.getReg(1); | ||||||
1959 | |||||||
1960 | // Extract the upper half, since this is where we will find the sign and | ||||||
1961 | // exponent. | ||||||
1962 | auto Exp = extractF64Exponent(Hi, B); | ||||||
1963 | |||||||
1964 | const unsigned FractBits = 52; | ||||||
1965 | |||||||
1966 | // Extract the sign bit. | ||||||
1967 | const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); | ||||||
1968 | auto SignBit = B.buildAnd(S32, Hi, SignBitMask); | ||||||
1969 | |||||||
1970 | const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); | ||||||
1971 | |||||||
1972 | const auto Zero32 = B.buildConstant(S32, 0); | ||||||
1973 | |||||||
1974 | // Extend back to 64-bits. | ||||||
1975 | auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); | ||||||
1976 | |||||||
1977 | auto Shr = B.buildAShr(S64, FractMask, Exp); | ||||||
1978 | auto Not = B.buildNot(S64, Shr); | ||||||
1979 | auto Tmp0 = B.buildAnd(S64, Src, Not); | ||||||
1980 | auto FiftyOne = B.buildConstant(S32, FractBits - 1); | ||||||
1981 | |||||||
1982 | auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); | ||||||
1983 | auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); | ||||||
1984 | |||||||
1985 | auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); | ||||||
1986 | B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); | ||||||
1987 | MI.eraseFromParent(); | ||||||
1988 | return true; | ||||||
1989 | } | ||||||
1990 | |||||||
1991 | bool AMDGPULegalizerInfo::legalizeITOFP( | ||||||
1992 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
1993 | MachineIRBuilder &B, bool Signed) const { | ||||||
1994 | |||||||
1995 | Register Dst = MI.getOperand(0).getReg(); | ||||||
1996 | Register Src = MI.getOperand(1).getReg(); | ||||||
1997 | |||||||
1998 | const LLT S64 = LLT::scalar(64); | ||||||
1999 | const LLT S32 = LLT::scalar(32); | ||||||
2000 | |||||||
2001 | assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); | ||||||
2002 | |||||||
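| // Convert the two 32-bit halves separately and recombine: the result is | ||||||
| // ldexp((double)hi, 32) + (double)(uint32_t)lo. Only the high half carries | ||||||
| // the sign, so the low half is always converted unsigned. | ||||||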
2003 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||||
2004 | |||||||
2005 | auto CvtHi = Signed ? | ||||||
2006 | B.buildSITOFP(S64, Unmerge.getReg(1)) : | ||||||
2007 | B.buildUITOFP(S64, Unmerge.getReg(1)); | ||||||
2008 | |||||||
2009 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | ||||||
2010 | |||||||
2011 | auto ThirtyTwo = B.buildConstant(S32, 32); | ||||||
2012 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | ||||||
2013 | .addUse(CvtHi.getReg(0)) | ||||||
2014 | .addUse(ThirtyTwo.getReg(0)); | ||||||
2015 | |||||||
2016 | // TODO: Should this propagate fast-math-flags? | ||||||
2017 | B.buildFAdd(Dst, LdExp, CvtLo); | ||||||
2018 | MI.eraseFromParent(); | ||||||
2019 | return true; | ||||||
2020 | } | ||||||
2021 | |||||||
2022 | // TODO: Copied from DAG implementation. Verify logic and document how this | ||||||
2023 | // actually works. | ||||||
2024 | bool AMDGPULegalizerInfo::legalizeFPTOI( | ||||||
2025 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2026 | MachineIRBuilder &B, bool Signed) const { | ||||||
2027 | |||||||
2028 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2029 | Register Src = MI.getOperand(1).getReg(); | ||||||
2030 | |||||||
2031 | const LLT S64 = LLT::scalar(64); | ||||||
2032 | const LLT S32 = LLT::scalar(32); | ||||||
2033 | |||||||
2034 | assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); | ||||||
2035 | |||||||
2036 | unsigned Flags = MI.getFlags(); | ||||||
2037 | |||||||
2038 | auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); | ||||||
2039 | auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); | ||||||
2040 | auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); | ||||||
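| // K0 is 2^-32 and K1 is -2^32, written as raw IEEE-754 bit patterns. The | ||||||
| // high word is computed as floor(trunc * 2^-32); the fma then reconstructs | ||||||
| // the low word as trunc - hi * 2^32. | ||||||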
2041 | |||||||
2042 | auto Mul = B.buildFMul(S64, Trunc, K0, Flags); | ||||||
2043 | auto FloorMul = B.buildFFloor(S64, Mul, Flags); | ||||||
2044 | auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); | ||||||
2045 | |||||||
2046 | auto Hi = Signed ? | ||||||
2047 | B.buildFPTOSI(S32, FloorMul) : | ||||||
2048 | B.buildFPTOUI(S32, FloorMul); | ||||||
2049 | auto Lo = B.buildFPTOUI(S32, Fma); | ||||||
2050 | |||||||
2051 | B.buildMerge(Dst, { Lo, Hi }); | ||||||
2052 | MI.eraseFromParent(); | ||||||
2053 | |||||||
2054 | return true; | ||||||
2055 | } | ||||||
2056 | |||||||
2057 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, | ||||||
2058 | MachineInstr &MI) const { | ||||||
2059 | MachineFunction &MF = Helper.MIRBuilder.getMF(); | ||||||
2060 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2061 | |||||||
2062 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||||
2063 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||||
2064 | |||||||
2065 | // With ieee_mode disabled, the instructions have the correct behavior | ||||||
2066 | // already for G_FMINNUM/G_FMAXNUM | ||||||
2067 | if (!MFI->getMode().IEEE) | ||||||
2068 | return !IsIEEEOp; | ||||||
2069 | |||||||
2070 | if (IsIEEEOp) | ||||||
2071 | return true; | ||||||
2072 | |||||||
2073 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||||
2074 | } | ||||||
2075 | |||||||
2076 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||||
2077 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2078 | MachineIRBuilder &B) const { | ||||||
2079 | // TODO: Should move some of this into LegalizerHelper. | ||||||
2080 | |||||||
2081 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
2082 | |||||||
2083 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||
2084 | // constant before this, so we shouldn't need | ||||||
2085 | // getConstantVRegValWithLookThrough. | ||||||
2086 | Optional<ValueAndVReg> MaybeIdxVal = | ||||||
2087 | getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | ||||||
2088 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||||
2089 | return true; | ||||||
2090 | const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | ||||||
2091 | |||||||
2092 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2093 | Register Vec = MI.getOperand(1).getReg(); | ||||||
2094 | |||||||
2095 | LLT VecTy = MRI.getType(Vec); | ||||||
2096 | LLT EltTy = VecTy.getElementType(); | ||||||
2097 | assert(EltTy == MRI.getType(Dst)); | ||||||
2098 | |||||||
2099 | if (IdxVal < VecTy.getNumElements()) | ||||||
2100 | B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); | ||||||
2101 | else | ||||||
2102 | B.buildUndef(Dst); | ||||||
2103 | |||||||
2104 | MI.eraseFromParent(); | ||||||
2105 | return true; | ||||||
2106 | } | ||||||
2107 | |||||||
2108 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||||
2109 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2110 | MachineIRBuilder &B) const { | ||||||
2111 | // TODO: Should move some of this into LegalizerHelper. | ||||||
2112 | |||||||
2113 | // TODO: Promote dynamic indexing of s16 to s32 | ||||||
2114 | |||||||
2115 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||||
2116 | // constant before this, so we shouldn't need | ||||||
2117 | // getConstantVRegValWithLookThrough. | ||||||
2118 | Optional<ValueAndVReg> MaybeIdxVal = | ||||||
2119 | getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); | ||||||
2120 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||||
2121 | return true; | ||||||
2122 | |||||||
2123 | int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); | ||||||
2124 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2125 | Register Vec = MI.getOperand(1).getReg(); | ||||||
2126 | Register Ins = MI.getOperand(2).getReg(); | ||||||
2127 | |||||||
2128 | LLT VecTy = MRI.getType(Vec); | ||||||
2129 | LLT EltTy = VecTy.getElementType(); | ||||||
2130 | assert(EltTy == MRI.getType(Ins)); | ||||||
2131 | |||||||
2132 | if (IdxVal < VecTy.getNumElements()) | ||||||
2133 | B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); | ||||||
2134 | else | ||||||
2135 | B.buildUndef(Dst); | ||||||
2136 | |||||||
2137 | MI.eraseFromParent(); | ||||||
2138 | return true; | ||||||
2139 | } | ||||||
2140 | |||||||
2141 | bool AMDGPULegalizerInfo::legalizeShuffleVector( | ||||||
2142 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2143 | MachineIRBuilder &B) const { | ||||||
2144 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
2145 | |||||||
2146 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2147 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2148 | LLT DstTy = MRI.getType(Dst); | ||||||
2149 | LLT SrcTy = MRI.getType(Src0); | ||||||
2150 | |||||||
2151 | if (SrcTy == V2S16 && DstTy == V2S16 && | ||||||
2152 | AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) | ||||||
2153 | return true; | ||||||
2154 | |||||||
2155 | MachineIRBuilder HelperBuilder(MI); | ||||||
2156 | GISelObserverWrapper DummyObserver; | ||||||
2157 | LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); | ||||||
2158 | return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; | ||||||
2159 | } | ||||||
2160 | |||||||
2161 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||||
2162 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2163 | MachineIRBuilder &B) const { | ||||||
2164 | |||||||
2165 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2166 | Register SrcReg = MI.getOperand(1).getReg(); | ||||||
2167 | LLT Ty = MRI.getType(DstReg); | ||||||
2168 | unsigned Flags = MI.getFlags(); | ||||||
2169 | |||||||
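| // The amdgcn sin/cos intrinsics take the angle in revolutions, hence the | ||||||
| // scale by 1/(2*pi). Subtargets with a reduced valid input range also need a | ||||||
| // fract() to wrap the scaled value into [0, 1). | ||||||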
2170 | Register TrigVal; | ||||||
2171 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); | ||||||
2172 | if (ST.hasTrigReducedRange()) { | ||||||
2173 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||||
2174 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||||
2175 | .addUse(MulVal.getReg(0)) | ||||||
2176 | .setMIFlags(Flags).getReg(0); | ||||||
2177 | } else | ||||||
2178 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||||
2179 | |||||||
2180 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||||
2181 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||||
2182 | B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) | ||||||
2183 | .addUse(TrigVal) | ||||||
2184 | .setMIFlags(Flags); | ||||||
2185 | MI.eraseFromParent(); | ||||||
2186 | return true; | ||||||
2187 | } | ||||||
2188 | |||||||
2189 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, | ||||||
2190 | MachineIRBuilder &B, | ||||||
2191 | const GlobalValue *GV, | ||||||
2192 | int64_t Offset, | ||||||
2193 | unsigned GAFlags) const { | ||||||
2194 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); | ||||||
2195 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | ||||||
2196 | // to the following code sequence: | ||||||
2197 | // | ||||||
2198 | // For constant address space: | ||||||
2199 | // s_getpc_b64 s[0:1] | ||||||
2200 | // s_add_u32 s0, s0, $symbol | ||||||
2201 | // s_addc_u32 s1, s1, 0 | ||||||
2202 | // | ||||||
2203 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
2204 | // a fixup or relocation is emitted to replace $symbol with a literal | ||||||
2205 | // constant, which is a pc-relative offset from the encoding of the $symbol | ||||||
2206 | // operand to the global variable. | ||||||
2207 | // | ||||||
2208 | // For global address space: | ||||||
2209 | // s_getpc_b64 s[0:1] | ||||||
2210 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | ||||||
2211 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | ||||||
2212 | // | ||||||
2213 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||||
2214 | // fixups or relocations are emitted to replace $symbol@*@lo and | ||||||
2215 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | ||||||
2216 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | ||||||
2217 | // operand to the global variable. | ||||||
2218 | // | ||||||
2219 | // What we want here is an offset from the value returned by s_getpc | ||||||
2220 | // (which is the address of the s_add_u32 instruction) to the global | ||||||
2221 | // variable, but since the encoding of $symbol starts 4 bytes after the start | ||||||
2222 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | ||||||
2223 | // small. This requires us to add 4 to the global variable offset in order to | ||||||
2224 | // compute the correct address. Similarly for the s_addc_u32 instruction, the | ||||||
2225 | // encoding of $symbol starts 12 bytes after the start of the s_add_u32 | ||||||
2226 | // instruction. | ||||||
2227 | |||||||
2228 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
2229 | |||||||
2230 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | ||||||
2231 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | ||||||
2232 | |||||||
2233 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | ||||||
2234 | .addDef(PCReg); | ||||||
2235 | |||||||
2236 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | ||||||
2237 | if (GAFlags == SIInstrInfo::MO_NONE) | ||||||
2238 | MIB.addImm(0); | ||||||
2239 | else | ||||||
2240 | MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); | ||||||
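| // GAFlags + 1 relies on each *_HI target flag immediately following its | ||||||
| // *_LO counterpart (e.g. MO_GOTPCREL32_HI follows MO_GOTPCREL32_LO). | ||||||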
2241 | |||||||
2242 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||||
2243 | |||||||
2244 | if (PtrTy.getSizeInBits() == 32) | ||||||
2245 | B.buildExtract(DstReg, PCReg, 0); | ||||||
2246 | return true; | ||||||
2247 | } | ||||||
2248 | |||||||
2249 | bool AMDGPULegalizerInfo::legalizeGlobalValue( | ||||||
2250 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2251 | MachineIRBuilder &B) const { | ||||||
2252 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2253 | LLT Ty = MRI.getType(DstReg); | ||||||
2254 | unsigned AS = Ty.getAddressSpace(); | ||||||
2255 | |||||||
2256 | const GlobalValue *GV = MI.getOperand(1).getGlobal(); | ||||||
2257 | MachineFunction &MF = B.getMF(); | ||||||
2258 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2259 | |||||||
2260 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { | ||||||
2261 | if (!MFI->isModuleEntryFunction()) { | ||||||
2262 | const Function &Fn = MF.getFunction(); | ||||||
2263 | DiagnosticInfoUnsupported BadLDSDecl( | ||||||
2264 | Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), | ||||||
2265 | DS_Warning); | ||||||
2266 | Fn.getContext().diagnose(BadLDSDecl); | ||||||
2267 | |||||||
2268 | // We currently don't have a way to correctly allocate LDS objects that | ||||||
2269 | // aren't directly associated with a kernel. We do force inlining of | ||||||
2270 | // functions that use local objects. However, if these dead functions are | ||||||
2271 | // not eliminated, we don't want a compile time error. Just emit a warning | ||||||
2272 | // and a trap, since there should be no callable path here. | ||||||
2273 | B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); | ||||||
2274 | B.buildUndef(DstReg); | ||||||
2275 | MI.eraseFromParent(); | ||||||
2276 | return true; | ||||||
2277 | } | ||||||
2278 | |||||||
2279 | // TODO: We could emit code to handle the initialization somewhere. | ||||||
2280 | if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { | ||||||
2281 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
2282 | if (!TLI->shouldUseLDSConstAddress(GV)) { | ||||||
2283 | MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); | ||||||
2284 | return true; // Leave in place; | ||||||
2285 | } | ||||||
2286 | |||||||
2287 | if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { | ||||||
2288 | Type *Ty = GV->getValueType(); | ||||||
2289 | // HIP uses an unsized array `extern __shared__ T s[]`, or a similar | ||||||
2290 | // zero-sized type in other languages, to declare dynamic shared memory | ||||||
2291 | // whose size is not known at compile time. It is allocated by the | ||||||
2292 | // runtime and placed directly after the statically allocated LDS, so | ||||||
2293 | // all such declarations share the same offset. | ||||||
2294 | if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { | ||||||
2295 | // Adjust alignment for that dynamic shared memory array. | ||||||
2296 | MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); | ||||||
2297 | LLT S32 = LLT::scalar(32); | ||||||
2298 | auto Sz = | ||||||
2299 | B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); | ||||||
2300 | B.buildIntToPtr(DstReg, Sz); | ||||||
2301 | MI.eraseFromParent(); | ||||||
2302 | return true; | ||||||
2303 | } | ||||||
2304 | } | ||||||
2305 | |||||||
2306 | B.buildConstant( | ||||||
2307 | DstReg, | ||||||
2308 | MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); | ||||||
2309 | MI.eraseFromParent(); | ||||||
2310 | return true; | ||||||
2311 | } | ||||||
2312 | |||||||
2313 | const Function &Fn = MF.getFunction(); | ||||||
2314 | DiagnosticInfoUnsupported BadInit( | ||||||
2315 | Fn, "unsupported initializer for address space", MI.getDebugLoc()); | ||||||
2316 | Fn.getContext().diagnose(BadInit); | ||||||
2317 | return true; | ||||||
2318 | } | ||||||
2319 | |||||||
2320 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||||
2321 | |||||||
2322 | if (TLI->shouldEmitFixup(GV)) { | ||||||
2323 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); | ||||||
2324 | MI.eraseFromParent(); | ||||||
2325 | return true; | ||||||
2326 | } | ||||||
2327 | |||||||
2328 | if (TLI->shouldEmitPCReloc(GV)) { | ||||||
2329 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); | ||||||
2330 | MI.eraseFromParent(); | ||||||
2331 | return true; | ||||||
2332 | } | ||||||
2333 | |||||||
2334 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
2335 | Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); | ||||||
2336 | |||||||
2337 | MachineMemOperand *GOTMMO = MF.getMachineMemOperand( | ||||||
2338 | MachinePointerInfo::getGOT(MF), | ||||||
2339 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
2340 | MachineMemOperand::MOInvariant, | ||||||
2341 | 8 /*Size*/, Align(8)); | ||||||
2342 | |||||||
2343 | buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); | ||||||
2344 | |||||||
2345 | if (Ty.getSizeInBits() == 32) { | ||||||
2346 | // Truncate if this is a 32-bit constant address. | ||||||
2347 | auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); | ||||||
2348 | B.buildExtract(DstReg, Load, 0); | ||||||
2349 | } else | ||||||
2350 | B.buildLoad(DstReg, GOTAddr, *GOTMMO); | ||||||
2351 | |||||||
2352 | MI.eraseFromParent(); | ||||||
2353 | return true; | ||||||
2354 | } | ||||||
2355 | |||||||
2356 | static LLT widenToNextPowerOf2(LLT Ty) { | ||||||
2357 | if (Ty.isVector()) | ||||||
2358 | return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); | ||||||
2359 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | ||||||
2360 | } | ||||||
2361 | |||||||
2362 | bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, | ||||||
2363 | MachineInstr &MI) const { | ||||||
2364 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
2365 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
2366 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
2367 | |||||||
2368 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||
2369 | LLT PtrTy = MRI.getType(PtrReg); | ||||||
2370 | unsigned AddrSpace = PtrTy.getAddressSpace(); | ||||||
2371 | |||||||
2372 | if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||||
2373 | LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||||
2374 | auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); | ||||||
2375 | Observer.changingInstr(MI); | ||||||
2376 | MI.getOperand(1).setReg(Cast.getReg(0)); | ||||||
2377 | Observer.changedInstr(MI); | ||||||
2378 | return true; | ||||||
2379 | } | ||||||
2380 | |||||||
2381 | Register ValReg = MI.getOperand(0).getReg(); | ||||||
2382 | LLT ValTy = MRI.getType(ValReg); | ||||||
2383 | |||||||
2384 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
2385 | const unsigned ValSize = ValTy.getSizeInBits(); | ||||||
2386 | const unsigned MemSize = 8 * MMO->getSize(); | ||||||
2387 | const Align MemAlign = MMO->getAlign(); | ||||||
2388 | const unsigned AlignInBits = 8 * MemAlign.value(); | ||||||
2389 | |||||||
2390 | // Widen non-power-of-2 loads to the alignment if needed | ||||||
2391 | if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { | ||||||
2392 | const unsigned WideMemSize = PowerOf2Ceil(MemSize); | ||||||
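// For example, a <3 x s32> load reads 96 bits of memory; PowerOf2Ceil widens
// that to a 128-bit access, and the code below either just grows the memory
// operand or widens the result type and extracts the original value from it.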
2393 | |||||||
2394 | // This was already the correct extending load result type, so just adjust | ||||||
2395 | // the memory type. | ||||||
2396 | if (WideMemSize == ValSize) { | ||||||
2397 | MachineFunction &MF = B.getMF(); | ||||||
2398 | |||||||
2399 | MachineMemOperand *WideMMO = | ||||||
2400 | MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); | ||||||
2401 | Observer.changingInstr(MI); | ||||||
2402 | MI.setMemRefs(MF, {WideMMO}); | ||||||
2403 | Observer.changedInstr(MI); | ||||||
2404 | return true; | ||||||
2405 | } | ||||||
2406 | |||||||
2407 | // Don't bother handling this edge case, which should probably never be produced. | ||||||
2408 | if (ValSize > WideMemSize) | ||||||
2409 | return false; | ||||||
2410 | |||||||
2411 | LLT WideTy = widenToNextPowerOf2(ValTy); | ||||||
2412 | |||||||
2413 | Register WideLoad; | ||||||
2414 | if (!WideTy.isVector()) { | ||||||
2415 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||||
2416 | B.buildTrunc(ValReg, WideLoad).getReg(0); | ||||||
2417 | } else { | ||||||
2418 | // Extract the subvector. | ||||||
2419 | |||||||
2420 | if (isRegisterType(ValTy)) { | ||||||
2421 | // If this a case where G_EXTRACT is legal, use it. | ||||||
2422 | // (e.g. <3 x s32> -> <4 x s32>) | ||||||
2423 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||||
2424 | B.buildExtract(ValReg, WideLoad, 0); | ||||||
2425 | } else { | ||||||
2426 | // For cases where the widened type isn't a nice register value, unmerge | ||||||
2427 | // from a widened register (e.g. <3 x s16> -> <4 x s16>) | ||||||
2428 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||||
2429 | WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); | ||||||
2430 | B.setInsertPt(B.getMBB(), MI.getIterator()); | ||||||
2431 | B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); | ||||||
2432 | } | ||||||
2433 | } | ||||||
2434 | |||||||
2435 | MI.eraseFromParent(); | ||||||
2436 | return true; | ||||||
2437 | } | ||||||
2438 | |||||||
2439 | return false; | ||||||
2440 | } | ||||||
2441 | |||||||
2442 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||||
2443 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||||
2444 | MachineIRBuilder &B) const { | ||||||
2445 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||||
2446 | assert(Ty.isScalar()); | ||||||
2447 | |||||||
2448 | MachineFunction &MF = B.getMF(); | ||||||
2449 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||||
2450 | |||||||
2451 | // TODO: Always legal with future ftz flag. | ||||||
2452 | // FIXME: Do we need just output? | ||||||
2453 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||||
2454 | return true; | ||||||
2455 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||||
2456 | return true; | ||||||
2457 | |||||||
2458 | MachineIRBuilder HelperBuilder(MI); | ||||||
2459 | GISelObserverWrapper DummyObserver; | ||||||
2460 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||||
2461 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||||
2462 | } | ||||||
2463 | |||||||
2464 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||||
2465 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2466 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2467 | Register PtrReg = MI.getOperand(1).getReg(); | ||||||
2468 | Register CmpVal = MI.getOperand(2).getReg(); | ||||||
2469 | Register NewVal = MI.getOperand(3).getReg(); | ||||||
2470 | |||||||
2471 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && | ||||||
2472 | "this should not have been custom lowered"); | ||||||
2473 | |||||||
2474 | LLT ValTy = MRI.getType(CmpVal); | ||||||
2475 | LLT VecTy = LLT::vector(2, ValTy); | ||||||
2476 | |||||||
2477 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||||
2478 | |||||||
2479 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||||
2480 | .addDef(DstReg) | ||||||
2481 | .addUse(PtrReg) | ||||||
2482 | .addUse(PackedVal) | ||||||
2483 | .setMemRefs(MI.memoperands()); | ||||||
2484 | |||||||
2485 | MI.eraseFromParent(); | ||||||
2486 | return true; | ||||||
2487 | } | ||||||
2488 | |||||||
2489 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||||
2490 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||||
2491 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2492 | Register Src = MI.getOperand(1).getReg(); | ||||||
2493 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2494 | unsigned Flags = MI.getFlags(); | ||||||
2495 | |||||||
2496 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||||
2497 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||||
2498 | |||||||
2499 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||||
2500 | MI.eraseFromParent(); | ||||||
2501 | return true; | ||||||
2502 | } | ||||||
2503 | |||||||
2504 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||||
2505 | MachineIRBuilder &B) const { | ||||||
2506 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2507 | Register Src = MI.getOperand(1).getReg(); | ||||||
2508 | unsigned Flags = MI.getFlags(); | ||||||
2509 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2510 | |||||||
2511 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||||
2512 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||||
2513 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2514 | MI.eraseFromParent(); | ||||||
2515 | return true; | ||||||
2516 | } | ||||||
2517 | |||||||
2518 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||||
2519 | MachineIRBuilder &B) const { | ||||||
2520 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2521 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2522 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2523 | unsigned Flags = MI.getFlags(); | ||||||
2524 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
2525 | const LLT S16 = LLT::scalar(16); | ||||||
2526 | const LLT S32 = LLT::scalar(32); | ||||||
2527 | |||||||
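// pow(x, y) is expanded as exp2(y * log2(x)). fmul_legacy treats 0 * anything
// (including infinities) as 0, which keeps the y == 0 case well defined even
// when log2(x) is infinite.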
2528 | if (Ty == S32) { | ||||||
2529 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||||
2530 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2531 | .addUse(Log.getReg(0)) | ||||||
2532 | .addUse(Src1) | ||||||
2533 | .setMIFlags(Flags); | ||||||
2534 | B.buildFExp2(Dst, Mul, Flags); | ||||||
2535 | } else if (Ty == S16) { | ||||||
2536 | // There's no f16 fmul_legacy, so we need to convert for it. | ||||||
2537 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||||
2538 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||||
2539 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||||
2540 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||||
2541 | .addUse(Ext0.getReg(0)) | ||||||
2542 | .addUse(Ext1.getReg(0)) | ||||||
2543 | .setMIFlags(Flags); | ||||||
2544 | |||||||
2545 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||||
2546 | } else | ||||||
2547 | return false; | ||||||
2548 | |||||||
2549 | MI.eraseFromParent(); | ||||||
2550 | return true; | ||||||
2551 | } | ||||||
2552 | |||||||
2553 | // Find a source register, ignoring any possible source modifiers. | ||||||
2554 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||||
2555 | Register ModSrc = OrigSrc; | ||||||
2556 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||||
2557 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||||
2558 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2559 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2560 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||||
2561 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||||
2562 | return ModSrc; | ||||||
2563 | } | ||||||
2564 | |||||||
2565 | bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, | ||||||
2566 | MachineRegisterInfo &MRI, | ||||||
2567 | MachineIRBuilder &B) const { | ||||||
2568 | |||||||
2569 | const LLT S1 = LLT::scalar(1); | ||||||
2570 | const LLT S64 = LLT::scalar(64); | ||||||
2571 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2572 | Register OrigSrc = MI.getOperand(1).getReg(); | ||||||
2573 | unsigned Flags = MI.getFlags(); | ||||||
2574 | assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && | ||||||
2575 | "this should not have been custom lowered"); | ||||||
2576 | |||||||
2577 | // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) | ||||||
2578 | // is used instead. However, SI doesn't have V_FLOOR_F64, so the most | ||||||
2579 | // efficient way to implement it is using V_FRACT_F64. The workaround for the | ||||||
2580 | // V_FRACT bug is: | ||||||
2581 | // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) | ||||||
2582 | // | ||||||
2583 | // Convert floor(x) to (x - fract(x)) | ||||||
2584 | |||||||
2585 | auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) | ||||||
2586 | .addUse(OrigSrc) | ||||||
2587 | .setMIFlags(Flags); | ||||||
2588 | |||||||
2589 | // Give source modifier matching some assistance before obscuring a foldable | ||||||
2590 | // pattern. | ||||||
2591 | |||||||
2592 | // TODO: We can avoid the neg on the fract? The input sign to fract | ||||||
2593 | // shouldn't matter? | ||||||
2594 | Register ModSrc = stripAnySourceMods(OrigSrc, MRI); | ||||||
2595 | |||||||
2596 | auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); | ||||||
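// 0x3fefffffffffffff is the largest double strictly less than 1.0 (about
// 0.99999999999999989), i.e. the 0.99999999999999999 in the formula above.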
2597 | |||||||
2598 | Register Min = MRI.createGenericVirtualRegister(S64); | ||||||
2599 | |||||||
2600 | // We don't need to concern ourselves with the snan handling difference, so | ||||||
2601 | // use the one which will directly select. | ||||||
2602 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2603 | if (MFI->getMode().IEEE) | ||||||
2604 | B.buildFMinNumIEEE(Min, Fract, Const, Flags); | ||||||
2605 | else | ||||||
2606 | B.buildFMinNum(Min, Fract, Const, Flags); | ||||||
2607 | |||||||
2608 | Register CorrectedFract = Min; | ||||||
2609 | if (!MI.getFlag(MachineInstr::FmNoNans)) { | ||||||
2610 | auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); | ||||||
2611 | CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); | ||||||
2612 | } | ||||||
2613 | |||||||
2614 | auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); | ||||||
2615 | B.buildFAdd(Dst, OrigSrc, NegFract, Flags); | ||||||
2616 | |||||||
2617 | MI.eraseFromParent(); | ||||||
2618 | return true; | ||||||
2619 | } | ||||||
2620 | |||||||
2621 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||||
2622 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||||
2623 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||||
2624 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
2625 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2626 | const LLT S32 = LLT::scalar(32); | ||||||
2627 | assert(MRI.getType(Dst) == LLT::vector(2, 16)); | ||||||
2628 | |||||||
2629 | Register Src0 = MI.getOperand(1).getReg(); | ||||||
2630 | Register Src1 = MI.getOperand(2).getReg(); | ||||||
2631 | assert(MRI.getType(Src0) == LLT::scalar(16)); | ||||||
2632 | |||||||
2633 | auto Merge = B.buildMerge(S32, {Src0, Src1}); | ||||||
2634 | B.buildBitcast(Dst, Merge); | ||||||
2635 | |||||||
2636 | MI.eraseFromParent(); | ||||||
2637 | return true; | ||||||
2638 | } | ||||||
2639 | |||||||
2640 | // Check that this is a G_XOR x, -1 | ||||||
2641 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | ||||||
2642 | if (MI.getOpcode() != TargetOpcode::G_XOR) | ||||||
2643 | return false; | ||||||
2644 | auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | ||||||
2645 | return ConstVal && *ConstVal == -1; | ||||||
2646 | } | ||||||
2647 | |||||||
2648 | // Return the branch instruction that uses this value, or null if the usage is invalid. | ||||||
2649 | static MachineInstr * | ||||||
2650 | verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, | ||||||
2651 | MachineBasicBlock *&UncondBrTarget, bool &Negated) { | ||||||
2652 | Register CondDef = MI.getOperand(0).getReg(); | ||||||
2653 | if (!MRI.hasOneNonDBGUse(CondDef)) | ||||||
2654 | return nullptr; | ||||||
2655 | |||||||
2656 | MachineBasicBlock *Parent = MI.getParent(); | ||||||
2657 | MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); | ||||||
2658 | |||||||
2659 | if (isNot(MRI, *UseMI)) { | ||||||
2660 | Register NegatedCond = UseMI->getOperand(0).getReg(); | ||||||
2661 | if (!MRI.hasOneNonDBGUse(NegatedCond)) | ||||||
2662 | return nullptr; | ||||||
2663 | |||||||
2664 | // We're deleting the def of this value, so we need to remove it. | ||||||
2665 | UseMI->eraseFromParent(); | ||||||
2666 | |||||||
2667 | UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); | ||||||
2668 | Negated = true; | ||||||
2669 | } | ||||||
2670 | |||||||
2671 | if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) | ||||||
2672 | return nullptr; | ||||||
2673 | |||||||
2674 | // Make sure the cond br is followed by a G_BR, or is the last instruction. | ||||||
2675 | MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); | ||||||
2676 | if (Next == Parent->end()) { | ||||||
2677 | MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); | ||||||
2678 | if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. | ||||||
2679 | return nullptr; | ||||||
2680 | UncondBrTarget = &*NextMBB; | ||||||
2681 | } else { | ||||||
2682 | if (Next->getOpcode() != AMDGPU::G_BR) | ||||||
2683 | return nullptr; | ||||||
2684 | Br = &*Next; | ||||||
2685 | UncondBrTarget = Br->getOperand(0).getMBB(); | ||||||
2686 | } | ||||||
2687 | |||||||
2688 | return UseMI; | ||||||
2689 | } | ||||||
2690 | |||||||
2691 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||||
2692 | const ArgDescriptor *Arg, | ||||||
2693 | const TargetRegisterClass *ArgRC, | ||||||
2694 | LLT ArgTy) const { | ||||||
2695 | MCRegister SrcReg = Arg->getRegister(); | ||||||
2696 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); | ||||||
2697 | assert(DstReg.isVirtual() && "Virtual register expected"); | ||||||
2698 | |||||||
2699 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, | ||||||
2700 | ArgTy); | ||||||
2701 | if (Arg->isMasked()) { | ||||||
2702 | // TODO: Should we try to emit this once in the entry block? | ||||||
2703 | const LLT S32 = LLT::scalar(32); | ||||||
2704 | const unsigned Mask = Arg->getMask(); | ||||||
2705 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); | ||||||
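// Note that countTrailingZeros<unsigned>(0) returns the full bit width (32),
// so if Mask can be zero here, the `Mask >> Shift` below shifts a 32-bit
// value by 32, which is undefined behaviour in C++.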
2706 | |||||||
2707 | Register AndMaskSrc = LiveIn; | ||||||
2708 | |||||||
2709 | if (Shift != 0) { | ||||||
2710 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||||
2711 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||||
2712 | } | ||||||
2713 | |||||||
2714 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||||
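// A minimal sketch of a guarded form, assuming a zero mask should simply
// produce a zero AND operand (ShiftedMask is an illustrative name, and this
// is not necessarily how the upstream fix would look):
//
//   const unsigned ShiftedMask = Shift < 32 ? (Mask >> Shift) : 0;
//   B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, ShiftedMask));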
2715 | } else { | ||||||
2716 | B.buildCopy(DstReg, LiveIn); | ||||||
2717 | } | ||||||
2718 | |||||||
2719 | return true; | ||||||
2720 | } | ||||||
2721 | |||||||
2722 | bool AMDGPULegalizerInfo::loadInputValue( | ||||||
2723 | Register DstReg, MachineIRBuilder &B, | ||||||
2724 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
2725 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
2726 | const ArgDescriptor *Arg; | ||||||
2727 | const TargetRegisterClass *ArgRC; | ||||||
2728 | LLT ArgTy; | ||||||
2729 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||||
2730 | |||||||
2731 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||||
2732 | return false; // TODO: Handle these | ||||||
2733 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | ||||||
2734 | } | ||||||
2735 | |||||||
2736 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||||
2737 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||||
2738 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||||
2739 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | ||||||
2740 | return false; | ||||||
2741 | |||||||
2742 | MI.eraseFromParent(); | ||||||
2743 | return true; | ||||||
2744 | } | ||||||
2745 | |||||||
2746 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||||
2747 | MachineRegisterInfo &MRI, | ||||||
2748 | MachineIRBuilder &B) const { | ||||||
2749 | Register Dst = MI.getOperand(0).getReg(); | ||||||
2750 | LLT DstTy = MRI.getType(Dst); | ||||||
2751 | LLT S16 = LLT::scalar(16); | ||||||
2752 | LLT S32 = LLT::scalar(32); | ||||||
2753 | LLT S64 = LLT::scalar(64); | ||||||
2754 | |||||||
2755 | if (DstTy == S16) | ||||||
2756 | return legalizeFDIV16(MI, MRI, B); | ||||||
2757 | if (DstTy == S32) | ||||||
2758 | return legalizeFDIV32(MI, MRI, B); | ||||||
2759 | if (DstTy == S64) | ||||||
2760 | return legalizeFDIV64(MI, MRI, B); | ||||||
2761 | |||||||
2762 | return false; | ||||||
2763 | } | ||||||
2764 | |||||||
2765 | void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, | ||||||
2766 | Register DstReg, | ||||||
2767 | Register X, | ||||||
2768 | Register Y, | ||||||
2769 | bool IsDiv) const { | ||||||
2770 | const LLT S1 = LLT::scalar(1); | ||||||
2771 | const LLT S32 = LLT::scalar(32); | ||||||
2772 | |||||||
2773 | // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the | ||||||
2774 | // algorithm used here. | ||||||
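// In short: Z approximates 2^32 / Y via the hardware reciprocal, one
// Newton-Raphson style step refines it, and Q = umulh(X, Z) may undershoot
// the true quotient by a couple of units, which the two conditional
// refinement steps below correct.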
2775 | |||||||
2776 | // Initial estimate of inv(y). | ||||||
2777 | auto FloatY = B.buildUITOFP(S32, Y); | ||||||
2778 | auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); | ||||||
2779 | auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); | ||||||
2780 | auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); | ||||||
2781 | auto Z = B.buildFPTOUI(S32, ScaledY); | ||||||
2782 | |||||||
2783 | // One round of UNR. | ||||||
2784 | auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); | ||||||
2785 | auto NegYZ = B.buildMul(S32, NegY, Z); | ||||||
2786 | Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); | ||||||
2787 | |||||||
2788 | // Quotient/remainder estimate. | ||||||
2789 | auto Q = B.buildUMulH(S32, X, Z); | ||||||
2790 | auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); | ||||||
2791 | |||||||
2792 | // First quotient/remainder refinement. | ||||||
2793 | auto One = B.buildConstant(S32, 1); | ||||||
2794 | auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||||
2795 | if (IsDiv) | ||||||
2796 | Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); | ||||||
2797 | R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); | ||||||
2798 | |||||||
2799 | // Second quotient/remainder refinement. | ||||||
2800 | Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||||
2801 | if (IsDiv) | ||||||
2802 | B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); | ||||||
2803 | else | ||||||
2804 | B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); | ||||||
2805 | } | ||||||
2806 | |||||||
2807 | bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, | ||||||
2808 | MachineRegisterInfo &MRI, | ||||||
2809 | MachineIRBuilder &B) const { | ||||||
2810 | const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; | ||||||
2811 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2812 | Register Num = MI.getOperand(1).getReg(); | ||||||
2813 | Register Den = MI.getOperand(2).getReg(); | ||||||
2814 | legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); | ||||||
2815 | MI.eraseFromParent(); | ||||||
2816 | return true; | ||||||
2817 | } | ||||||
2818 | |||||||
2819 | // Build integer reciprocal sequence around V_RCP_IFLAG_F32 | ||||||
2820 | // | ||||||
2821 | // Return lo, hi of result | ||||||
2822 | // | ||||||
2823 | // %cvt.lo = G_UITOFP Val.lo | ||||||
2824 | // %cvt.hi = G_UITOFP Val.hi | ||||||
2825 | // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo | ||||||
2826 | // %rcp = G_AMDGPU_RCP_IFLAG %mad | ||||||
2827 | // %mul1 = G_FMUL %rcp, 0x5f7ffffc | ||||||
2828 | // %mul2 = G_FMUL %mul1, 2**(-32) | ||||||
2829 | // %trunc = G_INTRINSIC_TRUNC %mul2 | ||||||
2830 | // %mad2 = G_FMAD %trunc, -(2**32), %mul1 | ||||||
2831 | // return {G_FPTOUI %mad2, G_FPTOUI %trunc} | ||||||
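// In other words, {lo, hi} is a 64-bit integer approximation of 2^64 / Val,
// used as the starting reciprocal for the 64-bit division expansion below.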
2832 | static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, | ||||||
2833 | Register Val) { | ||||||
2834 | const LLT S32 = LLT::scalar(32); | ||||||
2835 | auto Unmerge = B.buildUnmerge(S32, Val); | ||||||
2836 | |||||||
2837 | auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); | ||||||
2838 | auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); | ||||||
2839 | |||||||
2840 | auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 | ||||||
2841 | B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); | ||||||
2842 | |||||||
2843 | auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); | ||||||
2844 | auto Mul1 = | ||||||
2845 | B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); | ||||||
2846 | |||||||
2847 | // 2**(-32) | ||||||
2848 | auto Mul2 = | ||||||
2849 | B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); | ||||||
2850 | auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); | ||||||
2851 | |||||||
2852 | // -(2**32) | ||||||
2853 | auto Mad2 = B.buildFMAD(S32, Trunc, | ||||||
2854 | B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); | ||||||
2855 | |||||||
2856 | auto ResultLo = B.buildFPTOUI(S32, Mad2); | ||||||
2857 | auto ResultHi = B.buildFPTOUI(S32, Trunc); | ||||||
2858 | |||||||
2859 | return {ResultLo.getReg(0), ResultHi.getReg(0)}; | ||||||
2860 | } | ||||||
2861 | |||||||
2862 | void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, | ||||||
2863 | Register DstReg, | ||||||
2864 | Register Numer, | ||||||
2865 | Register Denom, | ||||||
2866 | bool IsDiv) const { | ||||||
2867 | const LLT S32 = LLT::scalar(32); | ||||||
2868 | const LLT S64 = LLT::scalar(64); | ||||||
2869 | const LLT S1 = LLT::scalar(1); | ||||||
2870 | Register RcpLo, RcpHi; | ||||||
2871 | |||||||
2872 | std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); | ||||||
2873 | |||||||
2874 | auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); | ||||||
2875 | |||||||
2876 | auto Zero64 = B.buildConstant(S64, 0); | ||||||
2877 | auto NegDenom = B.buildSub(S64, Zero64, Denom); | ||||||
2878 | |||||||
2879 | auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); | ||||||
2880 | auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); | ||||||
2881 | |||||||
2882 | auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); | ||||||
2883 | Register MulHi1_Lo = UnmergeMulHi1.getReg(0); | ||||||
2884 | Register MulHi1_Hi = UnmergeMulHi1.getReg(1); | ||||||
2885 | |||||||
2886 | auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); | ||||||
2887 | auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); | ||||||
2888 | auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); | ||||||
2889 | auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); | ||||||
2890 | |||||||
2891 | auto MulLo2 = B.buildMul(S64, NegDenom, Add1); | ||||||
2892 | auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); | ||||||
2893 | auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); | ||||||
2894 | Register MulHi2_Lo = UnmergeMulHi2.getReg(0); | ||||||
2895 | Register MulHi2_Hi = UnmergeMulHi2.getReg(1); | ||||||
2896 | |||||||
2897 | auto Zero32 = B.buildConstant(S32, 0); | ||||||
2898 | auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); | ||||||
2899 | auto Add2_HiC = | ||||||
2900 | B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); | ||||||
2901 | auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); | ||||||
2902 | auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); | ||||||
2903 | |||||||
2904 | auto UnmergeNumer = B.buildUnmerge(S32, Numer); | ||||||
2905 | Register NumerLo = UnmergeNumer.getReg(0); | ||||||
2906 | Register NumerHi = UnmergeNumer.getReg(1); | ||||||
2907 | |||||||
2908 | auto MulHi3 = B.buildUMulH(S64, Numer, Add2); | ||||||
2909 | auto Mul3 = B.buildMul(S64, Denom, MulHi3); | ||||||
2910 | auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); | ||||||
2911 | Register Mul3_Lo = UnmergeMul3.getReg(0); | ||||||
2912 | Register Mul3_Hi = UnmergeMul3.getReg(1); | ||||||
2913 | auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); | ||||||
2914 | auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); | ||||||
2915 | auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); | ||||||
2916 | auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); | ||||||
2917 | |||||||
2918 | auto UnmergeDenom = B.buildUnmerge(S32, Denom); | ||||||
2919 | Register DenomLo = UnmergeDenom.getReg(0); | ||||||
2920 | Register DenomHi = UnmergeDenom.getReg(1); | ||||||
2921 | |||||||
2922 | auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); | ||||||
2923 | auto C1 = B.buildSExt(S32, CmpHi); | ||||||
2924 | |||||||
2925 | auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); | ||||||
2926 | auto C2 = B.buildSExt(S32, CmpLo); | ||||||
2927 | |||||||
2928 | auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); | ||||||
2929 | auto C3 = B.buildSelect(S32, CmpEq, C2, C1); | ||||||
2930 | |||||||
2931 | // TODO: Here and below portions of the code can be enclosed into if/endif. | ||||||
2932 | // Currently control flow is unconditional and we have 4 selects after | ||||||
2933 | // potential endif to substitute PHIs. | ||||||
2934 | |||||||
2935 | // if C3 != 0 ... | ||||||
2936 | auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); | ||||||
2937 | auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); | ||||||
2938 | auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); | ||||||
2939 | auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); | ||||||
2940 | |||||||
2941 | auto One64 = B.buildConstant(S64, 1); | ||||||
2942 | auto Add3 = B.buildAdd(S64, MulHi3, One64); | ||||||
2943 | |||||||
2944 | auto C4 = | ||||||
2945 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); | ||||||
2946 | auto C5 = | ||||||
2947 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); | ||||||
2948 | auto C6 = B.buildSelect( | ||||||
2949 | S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); | ||||||
2950 | |||||||
2951 | // if (C6 != 0) | ||||||
2952 | auto Add4 = B.buildAdd(S64, Add3, One64); | ||||||
2953 | auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); | ||||||
2954 | |||||||
2955 | auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); | ||||||
2956 | auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); | ||||||
2957 | auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); | ||||||
2958 | |||||||
2959 | // endif C6 | ||||||
2960 | // endif C3 | ||||||
2961 | |||||||
2962 | if (IsDiv) { | ||||||
2963 | auto Sel1 = B.buildSelect( | ||||||
2964 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); | ||||||
2965 | B.buildSelect(DstReg, | ||||||
2966 | B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); | ||||||
2967 | } else { | ||||||
2968 | auto Sel2 = B.buildSelect( | ||||||
2969 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); | ||||||
2970 | B.buildSelect(DstReg, | ||||||
2971 | B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); | ||||||
2972 | } | ||||||
2973 | } | ||||||
2974 | |||||||
2975 | bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, | ||||||
2976 | MachineRegisterInfo &MRI, | ||||||
2977 | MachineIRBuilder &B) const { | ||||||
2978 | const LLT S64 = LLT::scalar(64); | ||||||
2979 | const LLT S32 = LLT::scalar(32); | ||||||
2980 | const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; | ||||||
2981 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
2982 | Register Num = MI.getOperand(1).getReg(); | ||||||
2983 | Register Den = MI.getOperand(2).getReg(); | ||||||
2984 | LLT Ty = MRI.getType(DstReg); | ||||||
2985 | |||||||
2986 | if (Ty == S32) | ||||||
2987 | legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); | ||||||
2988 | else if (Ty == S64) | ||||||
2989 | legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); | ||||||
2990 | else | ||||||
2991 | return false; | ||||||
2992 | |||||||
2993 | MI.eraseFromParent(); | ||||||
2994 | return true; | ||||||
2995 | |||||||
2996 | } | ||||||
2997 | |||||||
2998 | bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, | ||||||
2999 | MachineRegisterInfo &MRI, | ||||||
3000 | MachineIRBuilder &B) const { | ||||||
3001 | const LLT S64 = LLT::scalar(64); | ||||||
3002 | const LLT S32 = LLT::scalar(32); | ||||||
3003 | |||||||
3004 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
3005 | const LLT Ty = MRI.getType(DstReg); | ||||||
3006 | if (Ty != S32 && Ty != S64) | ||||||
3007 | return false; | ||||||
3008 | |||||||
3009 | const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; | ||||||
3010 | |||||||
3011 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3012 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3013 | |||||||
3014 | auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); | ||||||
3015 | auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); | ||||||
3016 | auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); | ||||||
3017 | |||||||
3018 | LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); | ||||||
3019 | RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); | ||||||
3020 | |||||||
3021 | LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); | ||||||
3022 | RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); | ||||||
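// With Sign = X >> (bits - 1), (X + Sign) ^ Sign computes |X|: for negative X
// it is the two's-complement negation, for non-negative X it is a no-op.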
3023 | |||||||
3024 | Register UDivRem = MRI.createGenericVirtualRegister(Ty); | ||||||
3025 | if (Ty == S32) | ||||||
3026 | legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); | ||||||
3027 | else | ||||||
3028 | legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); | ||||||
3029 | |||||||
3030 | Register Sign; | ||||||
3031 | if (IsDiv) | ||||||
3032 | Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); | ||||||
3033 | else | ||||||
3034 | Sign = LHSign.getReg(0); // Remainder sign is the same as LHS | ||||||
3035 | |||||||
3036 | UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); | ||||||
3037 | B.buildSub(DstReg, UDivRem, Sign); | ||||||
3038 | |||||||
3039 | MI.eraseFromParent(); | ||||||
3040 | return true; | ||||||
3041 | } | ||||||
3042 | |||||||
3043 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | ||||||
3044 | MachineRegisterInfo &MRI, | ||||||
3045 | MachineIRBuilder &B) const { | ||||||
3046 | Register Res = MI.getOperand(0).getReg(); | ||||||
3047 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3048 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3049 | uint16_t Flags = MI.getFlags(); | ||||||
3050 | LLT ResTy = MRI.getType(Res); | ||||||
3051 | |||||||
3052 | const MachineFunction &MF = B.getMF(); | ||||||
3053 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||||
3054 | MI.getFlag(MachineInstr::FmAfn); | ||||||
3055 | |||||||
3056 | if (!AllowInaccurateRcp) | ||||||
3057 | return false; | ||||||
3058 | |||||||
3059 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | ||||||
3060 | // 1 / x -> RCP(x) | ||||||
3061 | if (CLHS->isExactlyValue(1.0)) { | ||||||
3062 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
3063 | .addUse(RHS) | ||||||
3064 | .setMIFlags(Flags); | ||||||
3065 | |||||||
3066 | MI.eraseFromParent(); | ||||||
3067 | return true; | ||||||
3068 | } | ||||||
3069 | |||||||
3070 | // -1 / x -> RCP( FNEG(x) ) | ||||||
3071 | if (CLHS->isExactlyValue(-1.0)) { | ||||||
3072 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | ||||||
3073 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||||
3074 | .addUse(FNeg.getReg(0)) | ||||||
3075 | .setMIFlags(Flags); | ||||||
3076 | |||||||
3077 | MI.eraseFromParent(); | ||||||
3078 | return true; | ||||||
3079 | } | ||||||
3080 | } | ||||||
3081 | |||||||
3082 | // x / y -> x * (1.0 / y) | ||||||
3083 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
3084 | .addUse(RHS) | ||||||
3085 | .setMIFlags(Flags); | ||||||
3086 | B.buildFMul(Res, LHS, RCP, Flags); | ||||||
3087 | |||||||
3088 | MI.eraseFromParent(); | ||||||
3089 | return true; | ||||||
3090 | } | ||||||
3091 | |||||||
3092 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, | ||||||
3093 | MachineRegisterInfo &MRI, | ||||||
3094 | MachineIRBuilder &B) const { | ||||||
3095 | Register Res = MI.getOperand(0).getReg(); | ||||||
3096 | Register X = MI.getOperand(1).getReg(); | ||||||
3097 | Register Y = MI.getOperand(2).getReg(); | ||||||
3098 | uint16_t Flags = MI.getFlags(); | ||||||
3099 | LLT ResTy = MRI.getType(Res); | ||||||
3100 | |||||||
3101 | const MachineFunction &MF = B.getMF(); | ||||||
3102 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||||
3103 | MI.getFlag(MachineInstr::FmAfn); | ||||||
3104 | |||||||
3105 | if (!AllowInaccurateRcp) | ||||||
3106 | return false; | ||||||
3107 | |||||||
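// Newton-Raphson refinement of 1/Y: starting from R = rcp(Y), each step
// computes R = R + R * (1 - Y * R) as two FMAs. Two steps are performed, then
// the quotient X * R gets one final residual correction with another FMA.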
3108 | auto NegY = B.buildFNeg(ResTy, Y); | ||||||
3109 | auto One = B.buildFConstant(ResTy, 1.0); | ||||||
3110 | |||||||
3111 | auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||||
3112 | .addUse(Y) | ||||||
3113 | .setMIFlags(Flags); | ||||||
3114 | |||||||
3115 | auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); | ||||||
3116 | R = B.buildFMA(ResTy, Tmp0, R, R); | ||||||
3117 | |||||||
3118 | auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); | ||||||
3119 | R = B.buildFMA(ResTy, Tmp1, R, R); | ||||||
3120 | |||||||
3121 | auto Ret = B.buildFMul(ResTy, X, R); | ||||||
3122 | auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); | ||||||
3123 | |||||||
3124 | B.buildFMA(Res, Tmp2, R, Ret); | ||||||
3125 | MI.eraseFromParent(); | ||||||
3126 | return true; | ||||||
3127 | } | ||||||
3128 | |||||||
3129 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | ||||||
3130 | MachineRegisterInfo &MRI, | ||||||
3131 | MachineIRBuilder &B) const { | ||||||
3132 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||||
3133 | return true; | ||||||
3134 | |||||||
3135 | Register Res = MI.getOperand(0).getReg(); | ||||||
3136 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3137 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3138 | |||||||
3139 | uint16_t Flags = MI.getFlags(); | ||||||
3140 | |||||||
3141 | LLT S16 = LLT::scalar(16); | ||||||
3142 | LLT S32 = LLT::scalar(32); | ||||||
3143 | |||||||
3144 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | ||||||
3145 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | ||||||
3146 | |||||||
3147 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3148 | .addUse(RHSExt.getReg(0)) | ||||||
3149 | .setMIFlags(Flags); | ||||||
3150 | |||||||
3151 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | ||||||
3152 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | ||||||
3153 | |||||||
3154 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
3155 | .addUse(RDst.getReg(0)) | ||||||
3156 | .addUse(RHS) | ||||||
3157 | .addUse(LHS) | ||||||
3158 | .setMIFlags(Flags); | ||||||
3159 | |||||||
3160 | MI.eraseFromParent(); | ||||||
3161 | return true; | ||||||
3162 | } | ||||||
3163 | |||||||
3164 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | ||||||
3165 | // to enable denorm mode. When 'Enable' is false, disable denorm mode. | ||||||
3166 | static void toggleSPDenormMode(bool Enable, | ||||||
3167 | MachineIRBuilder &B, | ||||||
3168 | const GCNSubtarget &ST, | ||||||
3169 | AMDGPU::SIModeRegisterDefaults Mode) { | ||||||
3170 | // Set SP denorm mode to this value. | ||||||
3171 | unsigned SPDenormMode = | ||||||
3172 | Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); | ||||||
3173 | |||||||
3174 | if (ST.hasDenormModeInst()) { | ||||||
3175 | // Preserve default FP64FP16 denorm mode while updating FP32 mode. | ||||||
3176 | uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); | ||||||
3177 | |||||||
3178 | uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | ||||||
3179 | B.buildInstr(AMDGPU::S_DENORM_MODE) | ||||||
3180 | .addImm(NewDenormModeValue); | ||||||
3181 | |||||||
3182 | } else { | ||||||
3183 | // Select FP32 bit field in mode register. | ||||||
3184 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | ||||||
3185 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | ||||||
3186 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | ||||||
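// i.e. hwreg(HW_REG_MODE, 4, 2): the 2-bit FP32 denorm field starts at bit 4
// of the MODE register, and the field width is encoded as width - 1.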
3187 | |||||||
3188 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | ||||||
3189 | .addImm(SPDenormMode) | ||||||
3190 | .addImm(SPDenormModeBitField); | ||||||
3191 | } | ||||||
3192 | } | ||||||
3193 | |||||||
3194 | bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, | ||||||
3195 | MachineRegisterInfo &MRI, | ||||||
3196 | MachineIRBuilder &B) const { | ||||||
3197 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||||
3198 | return true; | ||||||
3199 | |||||||
3200 | Register Res = MI.getOperand(0).getReg(); | ||||||
3201 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3202 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3203 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3204 | AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); | ||||||
3205 | |||||||
3206 | uint16_t Flags = MI.getFlags(); | ||||||
3207 | |||||||
3208 | LLT S32 = LLT::scalar(32); | ||||||
3209 | LLT S1 = LLT::scalar(1); | ||||||
3210 | |||||||
3211 | auto One = B.buildFConstant(S32, 1.0f); | ||||||
3212 | |||||||
3213 | auto DenominatorScaled = | ||||||
3214 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
3215 | .addUse(LHS) | ||||||
3216 | .addUse(RHS) | ||||||
3217 | .addImm(0) | ||||||
3218 | .setMIFlags(Flags); | ||||||
3219 | auto NumeratorScaled = | ||||||
3220 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||||
3221 | .addUse(LHS) | ||||||
3222 | .addUse(RHS) | ||||||
3223 | .addImm(1) | ||||||
3224 | .setMIFlags(Flags); | ||||||
3225 | |||||||
3226 | auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3227 | .addUse(DenominatorScaled.getReg(0)) | ||||||
3228 | .setMIFlags(Flags); | ||||||
3229 | auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); | ||||||
3230 | |||||||
3231 | // FIXME: Doesn't correctly model the FP mode switch, and the FP operations | ||||||
3232 | // aren't modeled as reading it. | ||||||
3233 | if (!Mode.allFP32Denormals()) | ||||||
3234 | toggleSPDenormMode(true, B, ST, Mode); | ||||||
3235 | |||||||
3236 | auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); | ||||||
3237 | auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); | ||||||
3238 | auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); | ||||||
3239 | auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); | ||||||
3240 | auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); | ||||||
3241 | auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); | ||||||
3242 | |||||||
3243 | if (!Mode.allFP32Denormals()) | ||||||
3244 | toggleSPDenormMode(false, B, ST, Mode); | ||||||
3245 | |||||||
3246 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) | ||||||
3247 | .addUse(Fma4.getReg(0)) | ||||||
3248 | .addUse(Fma1.getReg(0)) | ||||||
3249 | .addUse(Fma3.getReg(0)) | ||||||
3250 | .addUse(NumeratorScaled.getReg(1)) | ||||||
3251 | .setMIFlags(Flags); | ||||||
3252 | |||||||
3253 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||||
3254 | .addUse(Fmas.getReg(0)) | ||||||
3255 | .addUse(RHS) | ||||||
3256 | .addUse(LHS) | ||||||
3257 | .setMIFlags(Flags); | ||||||
3258 | |||||||
3259 | MI.eraseFromParent(); | ||||||
3260 | return true; | ||||||
3261 | } | ||||||
3262 | |||||||
3263 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | ||||||
3264 | MachineRegisterInfo &MRI, | ||||||
3265 | MachineIRBuilder &B) const { | ||||||
3266 | if (legalizeFastUnsafeFDIV64(MI, MRI, B)) | ||||||
3267 | return true; | ||||||
3268 | |||||||
3269 | Register Res = MI.getOperand(0).getReg(); | ||||||
3270 | Register LHS = MI.getOperand(1).getReg(); | ||||||
3271 | Register RHS = MI.getOperand(2).getReg(); | ||||||
3272 | |||||||
3273 | uint16_t Flags = MI.getFlags(); | ||||||
3274 | |||||||
3275 | LLT S64 = LLT::scalar(64); | ||||||
3276 | LLT S1 = LLT::scalar(1); | ||||||
3277 | |||||||
3278 | auto One = B.buildFConstant(S64, 1.0); | ||||||
3279 | |||||||
3280 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
3281 | .addUse(LHS) | ||||||
3282 | .addUse(RHS) | ||||||
3283 | .addImm(0) | ||||||
3284 | .setMIFlags(Flags); | ||||||
3285 | |||||||
3286 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | ||||||
3287 | |||||||
3288 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | ||||||
3289 | .addUse(DivScale0.getReg(0)) | ||||||
3290 | .setMIFlags(Flags); | ||||||
3291 | |||||||
3292 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | ||||||
3293 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | ||||||
3294 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | ||||||
3295 | |||||||
3296 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||||
3297 | .addUse(LHS) | ||||||
3298 | .addUse(RHS) | ||||||
3299 | .addImm(1) | ||||||
3300 | .setMIFlags(Flags); | ||||||
3301 | |||||||
3302 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | ||||||
3303 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | ||||||
3304 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | ||||||
3305 | |||||||
3306 | Register Scale; | ||||||
3307 | if (!ST.hasUsableDivScaleConditionOutput()) { | ||||||
3308 | // Workaround a hardware bug on SI where the condition output from div_scale | ||||||
3309 | // is not usable. | ||||||
3310 | |||||||
3311 | LLT S32 = LLT::scalar(32); | ||||||
3312 | |||||||
3313 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | ||||||
3314 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | ||||||
3315 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | ||||||
3316 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | ||||||
3317 | |||||||
3318 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | ||||||
3319 | Scale1Unmerge.getReg(1)); | ||||||
3320 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | ||||||
3321 | Scale0Unmerge.getReg(1)); | ||||||
3322 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | ||||||
3323 | } else { | ||||||
3324 | Scale = DivScale1.getReg(1); | ||||||
3325 | } | ||||||
3326 | |||||||
3327 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | ||||||
3328 | .addUse(Fma4.getReg(0)) | ||||||
3329 | .addUse(Fma3.getReg(0)) | ||||||
3330 | .addUse(Mul.getReg(0)) | ||||||
3331 | .addUse(Scale) | ||||||
3332 | .setMIFlags(Flags); | ||||||
3333 | |||||||
3334 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) | ||||||
3335 | .addUse(Fmas.getReg(0)) | ||||||
3336 | .addUse(RHS) | ||||||
3337 | .addUse(LHS) | ||||||
3338 | .setMIFlags(Flags); | ||||||
3339 | |||||||
3340 | MI.eraseFromParent(); | ||||||
3341 | return true; | ||||||
3342 | } | ||||||
3343 | |||||||
3344 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||||
3345 | MachineRegisterInfo &MRI, | ||||||
3346 | MachineIRBuilder &B) const { | ||||||
3347 | Register Res = MI.getOperand(0).getReg(); | ||||||
3348 | Register LHS = MI.getOperand(2).getReg(); | ||||||
3349 | Register RHS = MI.getOperand(3).getReg(); | ||||||
3350 | uint16_t Flags = MI.getFlags(); | ||||||
3351 | |||||||
3352 | LLT S32 = LLT::scalar(32); | ||||||
3353 | LLT S1 = LLT::scalar(1); | ||||||
3354 | |||||||
3355 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||||
3356 | const APFloat C0Val(1.0f); | ||||||
3357 | |||||||
3358 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||||
3359 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||||
3360 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); | ||||||
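// C0 is 2^96 and C1 is 2^-32: if |RHS| is very large, the denominator is
// pre-scaled by 2^-32 so the reciprocal stays in range, and the final multiply
// by Sel below cancels the scaling (Sel * LHS * rcp(RHS * Sel) == LHS / RHS).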
3361 | |||||||
3362 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||||
3363 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||||
3364 | |||||||
3365 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||||
3366 | |||||||
3367 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||||
3368 | .addUse(Mul0.getReg(0)) | ||||||
3369 | .setMIFlags(Flags); | ||||||
3370 | |||||||
3371 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||||
3372 | |||||||
3373 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||||
3374 | |||||||
3375 | MI.eraseFromParent(); | ||||||
3376 | return true; | ||||||
3377 | } | ||||||
3378 | |||||||
3379 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | ||||||
3380 | // FIXME: Why do we handle this one but not other removed instructions? | ||||||
3381 | // | ||||||
3382 | // Reciprocal square root. The clamp prevents infinite results, clamping | ||||||
3383 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | ||||||
3384 | // +-max_float. | ||||||
3385 | bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, | ||||||
3386 | MachineRegisterInfo &MRI, | ||||||
3387 | MachineIRBuilder &B) const { | ||||||
3388 | if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) | ||||||
3389 | return true; | ||||||
3390 | |||||||
3391 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3392 | Register Src = MI.getOperand(2).getReg(); | ||||||
3393 | auto Flags = MI.getFlags(); | ||||||
3394 | |||||||
3395 | LLT Ty = MRI.getType(Dst); | ||||||
3396 | |||||||
3397 | const fltSemantics *FltSemantics; | ||||||
3398 | if (Ty == LLT::scalar(32)) | ||||||
3399 | FltSemantics = &APFloat::IEEEsingle(); | ||||||
3400 | else if (Ty == LLT::scalar(64)) | ||||||
3401 | FltSemantics = &APFloat::IEEEdouble(); | ||||||
3402 | else | ||||||
3403 | return false; | ||||||
3404 | |||||||
3405 | auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) | ||||||
3406 | .addUse(Src) | ||||||
3407 | .setMIFlags(Flags); | ||||||
3408 | |||||||
3409 | // We don't need to concern ourselves with the snan handling difference, since | ||||||
3410 | // rsq already quieted (or not) the input; use the form that directly selects. | ||||||
3411 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3412 | const bool UseIEEE = MFI->getMode().IEEE; | ||||||
3413 | |||||||
3414 | auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); | ||||||
3415 | auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : | ||||||
3416 | B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); | ||||||
3417 | |||||||
3418 | auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); | ||||||
3419 | |||||||
3420 | if (UseIEEE) | ||||||
3421 | B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); | ||||||
3422 | else | ||||||
3423 | B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); | ||||||
3424 | MI.eraseFromParent(); | ||||||
3425 | return true; | ||||||
3426 | } | ||||||
3427 | |||||||
3428 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||||
3429 | switch (IID) { | ||||||
3430 | case Intrinsic::amdgcn_ds_fadd: | ||||||
3431 | return AMDGPU::G_ATOMICRMW_FADD; | ||||||
3432 | case Intrinsic::amdgcn_ds_fmin: | ||||||
3433 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||||
3434 | case Intrinsic::amdgcn_ds_fmax: | ||||||
3435 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||||
3436 | default: | ||||||
3437 | llvm_unreachable("not a DS FP intrinsic"); | ||||||
3438 | } | ||||||
3439 | } | ||||||
3440 | |||||||
3441 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | ||||||
3442 | MachineInstr &MI, | ||||||
3443 | Intrinsic::ID IID) const { | ||||||
3444 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
3445 | Observer.changingInstr(MI); | ||||||
3446 | |||||||
3447 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | ||||||
3448 | |||||||
3449 | // The remaining operands were used to set fields in the MemOperand on | ||||||
3450 | // construction. | ||||||
3451 | for (int I = 6; I > 3; --I) | ||||||
3452 | MI.RemoveOperand(I); | ||||||
3453 | |||||||
3454 | MI.RemoveOperand(1); // Remove the intrinsic ID. | ||||||
3455 | Observer.changedInstr(MI); | ||||||
3456 | return true; | ||||||
3457 | } | ||||||
3458 | |||||||
3459 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | ||||||
3460 | MachineRegisterInfo &MRI, | ||||||
3461 | MachineIRBuilder &B) const { | ||||||
3462 | uint64_t Offset = | ||||||
3463 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||||
3464 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||||
3465 | LLT DstTy = MRI.getType(DstReg); | ||||||
3466 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||||
3467 | |||||||
3468 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||||
3469 | if (!loadInputValue(KernargPtrReg, B, | ||||||
3470 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||||
3471 | return false; | ||||||
3472 | |||||||
3473 | // FIXME: This should be nuw | ||||||
3474 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||||
3475 | return true; | ||||||
3476 | } | ||||||
3477 | |||||||
3478 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||||
3479 | MachineRegisterInfo &MRI, | ||||||
3480 | MachineIRBuilder &B) const { | ||||||
3481 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||||
3482 | if (!MFI->isEntryFunction()) { | ||||||
3483 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
3484 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||||
3485 | } | ||||||
3486 | |||||||
3487 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
3488 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||||
3489 | return false; | ||||||
3490 | |||||||
3491 | MI.eraseFromParent(); | ||||||
3492 | return true; | ||||||
3493 | } | ||||||
3494 | |||||||
3495 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||||
3496 | MachineRegisterInfo &MRI, | ||||||
3497 | MachineIRBuilder &B, | ||||||
3498 | unsigned AddrSpace) const { | ||||||
3499 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||||
3500 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||||
3501 | Register Hi32 = Unmerge.getReg(1); | ||||||
3502 | |||||||
3503 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||||
3504 | MI.eraseFromParent(); | ||||||
3505 | return true; | ||||||
3506 | } | ||||||
3507 | |||||||
3508 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||||
3509 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||||
3510 | // split between the instruction's voffset and immoffset fields) and soffset | ||||||
3511 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||||
3512 | // the instruction's soffset field). This function takes the first kind of | ||||||
3513 | // offset and figures out how to split it between voffset and immoffset. | ||||||
3514 | std::tuple<Register, unsigned, unsigned> | ||||||
3515 | AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, | ||||||
3516 | Register OrigOffset) const { | ||||||
3517 | const unsigned MaxImm = 4095; | ||||||
3518 | Register BaseReg; | ||||||
3519 | unsigned TotalConstOffset; | ||||||
3520 | const LLT S32 = LLT::scalar(32); | ||||||
3521 | |||||||
3522 | std::tie(BaseReg, TotalConstOffset) = | ||||||
3523 | AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); | ||||||
3524 | |||||||
3525 | unsigned ImmOffset = TotalConstOffset; | ||||||
3526 | |||||||
3527 | // If the immediate value is too big for the immoffset field, keep only the | ||||||
3528 | // low 12 bits (value & 4095) in the immoffset field, so that the part that is | ||||||
3529 | // copied/added for the voffset field is a multiple of 4096 and stands more | ||||||
3530 | // chance of being CSEd with the copy/add for another similar load/store. | ||||||
3531 | // However, do not do that rounding down to a multiple of 4096 if that is a | ||||||
3532 | // negative number, as it appears to be illegal to have a negative offset | ||||||
3533 | // in the vgpr, even if adding the immediate offset makes it positive. | ||||||
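// For example, with a constant offset of 5000: Overflow = 5000 & ~4095 = 4096
// is added into the voffset below, while ImmOffset = 904 stays in the
// immediate field.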
3534 | unsigned Overflow = ImmOffset & ~MaxImm; | ||||||
3535 | ImmOffset -= Overflow; | ||||||
3536 | if ((int32_t)Overflow < 0) { | ||||||
3537 | Overflow += ImmOffset; | ||||||
3538 | ImmOffset = 0; | ||||||
3539 | } | ||||||
3540 | |||||||
3541 | if (Overflow != 0) { | ||||||
3542 | if (!BaseReg) { | ||||||
3543 | BaseReg = B.buildConstant(S32, Overflow).getReg(0); | ||||||
3544 | } else { | ||||||
3545 | auto OverflowVal = B.buildConstant(S32, Overflow); | ||||||
3546 | BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); | ||||||
3547 | } | ||||||
3548 | } | ||||||
3549 | |||||||
3550 | if (!BaseReg) | ||||||
3551 | BaseReg = B.buildConstant(S32, 0).getReg(0); | ||||||
3552 | |||||||
3553 | return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); | ||||||
3554 | } | ||||||
3555 | |||||||
3556 | /// Handle register layout difference for f16 images for some subtargets. | ||||||
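// [Editor's note] Sketch of the layouts handled below (illustrative, not from
// the original source): on subtargets with unpacked D16 VMEM, a <4 x s16>
// store value is unmerged into four s16 pieces, each any-extended to s32,
// giving a <4 x s32> source; on packed subtargets the value is returned
// unchanged, except on subtargets with the image-store D16 bug, where it is
// padded/bitcast up to a whole number of dwords.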
3557 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||||
3558 | MachineRegisterInfo &MRI, | ||||||
3559 | Register Reg, | ||||||
3560 | bool ImageStore) const { | ||||||
3561 | const LLT S16 = LLT::scalar(16); | ||||||
3562 | const LLT S32 = LLT::scalar(32); | ||||||
3563 | LLT StoreVT = MRI.getType(Reg); | ||||||
3564 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16); | ||||||
3565 | |||||||
3566 | if (ST.hasUnpackedD16VMem()) { | ||||||
3567 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
3568 | |||||||
3569 | SmallVector<Register, 4> WideRegs; | ||||||
3570 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3571 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||||
3572 | |||||||
3573 | int NumElts = StoreVT.getNumElements(); | ||||||
3574 | |||||||
3575 | return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); | ||||||
3576 | } | ||||||
3577 | |||||||
3578 | if (ImageStore && ST.hasImageStoreD16Bug()) { | ||||||
3579 | if (StoreVT.getNumElements() == 2) { | ||||||
3580 | SmallVector<Register, 4> PackedRegs; | ||||||
3581 | Reg = B.buildBitcast(S32, Reg).getReg(0); | ||||||
3582 | PackedRegs.push_back(Reg); | ||||||
3583 | PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); | ||||||
3584 | return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); | ||||||
3585 | } | ||||||
3586 | |||||||
3587 | if (StoreVT.getNumElements() == 3) { | ||||||
3588 | SmallVector<Register, 4> PackedRegs; | ||||||
3589 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||||
3590 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3591 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||
3592 | PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); | ||||||
3593 | Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); | ||||||
3594 | return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); | ||||||
3595 | } | ||||||
3596 | |||||||
3597 | if (StoreVT.getNumElements() == 4) { | ||||||
3598 | SmallVector<Register, 4> PackedRegs; | ||||||
3599 | Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); | ||||||
3600 | auto Unmerge = B.buildUnmerge(S32, Reg); | ||||||
3601 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||||
3602 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||||
3603 | PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); | ||||||
3604 | return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); | ||||||
3605 | } | ||||||
3606 | |||||||
3607 | llvm_unreachable("invalid data type"); | ||||||
3608 | } | ||||||
3609 | |||||||
3610 | return Reg; | ||||||
3611 | } | ||||||
3612 | |||||||
3613 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||||
3614 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||||
3615 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
3616 | LLT Ty = MRI->getType(VData); | ||||||
3617 | |||||||
3618 | const LLT S16 = LLT::scalar(16); | ||||||
3619 | |||||||
3620 | // Fixup illegal register types for i8 stores. | ||||||
3621 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||||
3622 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||||
3623 | return AnyExt; | ||||||
3624 | } | ||||||
3625 | |||||||
3626 | if (Ty.isVector()) { | ||||||
3627 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||||
3628 | if (IsFormat) | ||||||
3629 | return handleD16VData(B, *MRI, VData); | ||||||
3630 | } | ||||||
3631 | } | ||||||
3632 | |||||||
3633 | return VData; | ||||||
3634 | } | ||||||
3635 | |||||||
3636 | bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, | ||||||
3637 | MachineRegisterInfo &MRI, | ||||||
3638 | MachineIRBuilder &B, | ||||||
3639 | bool IsTyped, | ||||||
3640 | bool IsFormat) const { | ||||||
3641 | Register VData = MI.getOperand(1).getReg(); | ||||||
3642 | LLT Ty = MRI.getType(VData); | ||||||
3643 | LLT EltTy = Ty.getScalarType(); | ||||||
3644 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||
3645 | const LLT S32 = LLT::scalar(32); | ||||||
3646 | |||||||
3647 | VData = fixStoreSourceType(B, VData, IsFormat); | ||||||
3648 | Register RSrc = MI.getOperand(2).getReg(); | ||||||
3649 | |||||||
3650 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
3651 | const int MemSize = MMO->getSize(); | ||||||
3652 | |||||||
3653 | unsigned ImmOffset; | ||||||
3654 | unsigned TotalOffset; | ||||||
3655 | |||||||
3656 | // The typed intrinsics add an immediate after the registers. | ||||||
3657 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||
3658 | |||||||
3659 | // The struct intrinsic variants add one additional operand over raw. | ||||||
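// [Editor's note] Operand layout as inferred from the indices used in this
// function (illustrative summary, not authoritative): operand 0 is the
// intrinsic ID, 1 = vdata, 2 = rsrc, then an optional vindex for the struct
// variants, followed by voffset, soffset, an optional format immediate for the
// typed variants, and finally the auxiliary/cachepolicy immediate.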
3660 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3661 | Register VIndex; | ||||||
3662 | int OpOffset = 0; | ||||||
3663 | if (HasVIndex) { | ||||||
3664 | VIndex = MI.getOperand(3).getReg(); | ||||||
3665 | OpOffset = 1; | ||||||
3666 | } | ||||||
3667 | |||||||
3668 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3669 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3670 | |||||||
3671 | unsigned Format = 0; | ||||||
3672 | if (IsTyped) { | ||||||
3673 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3674 | ++OpOffset; | ||||||
3675 | } | ||||||
3676 | |||||||
3677 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3678 | |||||||
3679 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||
3680 | if (TotalOffset != 0) | ||||||
3681 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | ||||||
3682 | |||||||
3683 | unsigned Opc; | ||||||
3684 | if (IsTyped) { | ||||||
3685 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : | ||||||
3686 | AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; | ||||||
3687 | } else if (IsFormat) { | ||||||
3688 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : | ||||||
3689 | AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; | ||||||
3690 | } else { | ||||||
3691 | switch (MemSize) { | ||||||
3692 | case 1: | ||||||
3693 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; | ||||||
3694 | break; | ||||||
3695 | case 2: | ||||||
3696 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; | ||||||
3697 | break; | ||||||
3698 | default: | ||||||
3699 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; | ||||||
3700 | break; | ||||||
3701 | } | ||||||
3702 | } | ||||||
3703 | |||||||
3704 | if (!VIndex) | ||||||
3705 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||
3706 | |||||||
3707 | auto MIB = B.buildInstr(Opc) | ||||||
3708 | .addUse(VData) // vdata | ||||||
3709 | .addUse(RSrc) // rsrc | ||||||
3710 | .addUse(VIndex) // vindex | ||||||
3711 | .addUse(VOffset) // voffset | ||||||
3712 | .addUse(SOffset) // soffset | ||||||
3713 | .addImm(ImmOffset); // offset(imm) | ||||||
3714 | |||||||
3715 | if (IsTyped) | ||||||
3716 | MIB.addImm(Format); | ||||||
3717 | |||||||
3718 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
3719 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
3720 | .addMemOperand(MMO); | ||||||
3721 | |||||||
3722 | MI.eraseFromParent(); | ||||||
3723 | return true; | ||||||
3724 | } | ||||||
3725 | |||||||
3726 | bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, | ||||||
3727 | MachineRegisterInfo &MRI, | ||||||
3728 | MachineIRBuilder &B, | ||||||
3729 | bool IsFormat, | ||||||
3730 | bool IsTyped) const { | ||||||
3731 | // FIXME: Verifier should enforce 1 MMO for these intrinsics. | ||||||
3732 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
3733 | const int MemSize = MMO->getSize(); | ||||||
3734 | const LLT S32 = LLT::scalar(32); | ||||||
3735 | |||||||
3736 | Register Dst = MI.getOperand(0).getReg(); | ||||||
3737 | Register RSrc = MI.getOperand(2).getReg(); | ||||||
3738 | |||||||
3739 | // The typed intrinsics add an immediate after the registers. | ||||||
3740 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||||
3741 | |||||||
3742 | // The struct intrinsic variants add one additional operand over raw. | ||||||
3743 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3744 | Register VIndex; | ||||||
3745 | int OpOffset = 0; | ||||||
3746 | if (HasVIndex) { | ||||||
3747 | VIndex = MI.getOperand(3).getReg(); | ||||||
3748 | OpOffset = 1; | ||||||
3749 | } | ||||||
3750 | |||||||
3751 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3752 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3753 | |||||||
3754 | unsigned Format = 0; | ||||||
3755 | if (IsTyped) { | ||||||
3756 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3757 | ++OpOffset; | ||||||
3758 | } | ||||||
3759 | |||||||
3760 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||||
3761 | unsigned ImmOffset; | ||||||
3762 | unsigned TotalOffset; | ||||||
3763 | |||||||
3764 | LLT Ty = MRI.getType(Dst); | ||||||
3765 | LLT EltTy = Ty.getScalarType(); | ||||||
3766 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||||
3767 | const bool Unpacked = ST.hasUnpackedD16VMem(); | ||||||
3768 | |||||||
3769 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||
3770 | if (TotalOffset != 0) | ||||||
3771 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); | ||||||
3772 | |||||||
3773 | unsigned Opc; | ||||||
3774 | |||||||
3775 | if (IsTyped) { | ||||||
3776 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : | ||||||
3777 | AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; | ||||||
3778 | } else if (IsFormat) { | ||||||
3779 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : | ||||||
3780 | AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; | ||||||
3781 | } else { | ||||||
3782 | switch (MemSize) { | ||||||
3783 | case 1: | ||||||
3784 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; | ||||||
3785 | break; | ||||||
3786 | case 2: | ||||||
3787 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; | ||||||
3788 | break; | ||||||
3789 | default: | ||||||
3790 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; | ||||||
3791 | break; | ||||||
3792 | } | ||||||
3793 | } | ||||||
3794 | |||||||
3795 | Register LoadDstReg; | ||||||
3796 | |||||||
3797 | bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); | ||||||
3798 | LLT UnpackedTy = Ty.changeElementSize(32); | ||||||
3799 | |||||||
3800 | if (IsExtLoad) | ||||||
3801 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); | ||||||
3802 | else if (Unpacked && IsD16 && Ty.isVector()) | ||||||
3803 | LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); | ||||||
3804 | else | ||||||
3805 | LoadDstReg = Dst; | ||||||
3806 | |||||||
3807 | if (!VIndex) | ||||||
3808 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||||
3809 | |||||||
3810 | auto MIB = B.buildInstr(Opc) | ||||||
3811 | .addDef(LoadDstReg) // vdata | ||||||
3812 | .addUse(RSrc) // rsrc | ||||||
3813 | .addUse(VIndex) // vindex | ||||||
3814 | .addUse(VOffset) // voffset | ||||||
3815 | .addUse(SOffset) // soffset | ||||||
3816 | .addImm(ImmOffset); // offset(imm) | ||||||
3817 | |||||||
3818 | if (IsTyped) | ||||||
3819 | MIB.addImm(Format); | ||||||
3820 | |||||||
3821 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
3822 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
3823 | .addMemOperand(MMO); | ||||||
3824 | |||||||
3825 | if (LoadDstReg != Dst) { | ||||||
3826 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||||
3827 | |||||||
3828 | // The result of an extending load was widened; truncate back to the original type. | ||||||
3829 | if (IsExtLoad) | ||||||
3830 | B.buildTrunc(Dst, LoadDstReg); | ||||||
3831 | else { | ||||||
3832 | // Repack to original 16-bit vector result | ||||||
3833 | // FIXME: G_TRUNC should work, but legalization currently fails | ||||||
3834 | auto Unmerge = B.buildUnmerge(S32, LoadDstReg); | ||||||
3835 | SmallVector<Register, 4> Repack; | ||||||
3836 | for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) | ||||||
3837 | Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); | ||||||
3838 | B.buildMerge(Dst, Repack); | ||||||
3839 | } | ||||||
3840 | } | ||||||
3841 | |||||||
3842 | MI.eraseFromParent(); | ||||||
3843 | return true; | ||||||
3844 | } | ||||||
3845 | |||||||
3846 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||||
3847 | MachineIRBuilder &B, | ||||||
3848 | bool IsInc) const { | ||||||
3849 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : | ||||||
3850 | AMDGPU::G_AMDGPU_ATOMIC_DEC; | ||||||
3851 | B.buildInstr(Opc) | ||||||
3852 | .addDef(MI.getOperand(0).getReg()) | ||||||
3853 | .addUse(MI.getOperand(2).getReg()) | ||||||
3854 | .addUse(MI.getOperand(3).getReg()) | ||||||
3855 | .cloneMemRefs(MI); | ||||||
3856 | MI.eraseFromParent(); | ||||||
3857 | return true; | ||||||
3858 | } | ||||||
3859 | |||||||
3860 | static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { | ||||||
3861 | switch (IntrID) { | ||||||
3862 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||
3863 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||
3864 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; | ||||||
3865 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||
3866 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||
3867 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; | ||||||
3868 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||
3869 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||
3870 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; | ||||||
3871 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||
3872 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||
3873 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; | ||||||
3874 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||
3875 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||
3876 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; | ||||||
3877 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||
3878 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||
3879 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; | ||||||
3880 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||
3881 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||
3882 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; | ||||||
3883 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||
3884 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||
3885 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; | ||||||
3886 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||
3887 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||
3888 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; | ||||||
3889 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||
3890 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||
3891 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; | ||||||
3892 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||
3893 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||
3894 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; | ||||||
3895 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||
3896 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||
3897 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; | ||||||
3898 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||
3899 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||
3900 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; | ||||||
3901 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||||
3902 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||||
3903 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; | ||||||
3904 | default: | ||||||
3905 | llvm_unreachable("unhandled atomic opcode"); | ||||||
3906 | } | ||||||
3907 | } | ||||||
3908 | |||||||
3909 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | ||||||
3910 | MachineIRBuilder &B, | ||||||
3911 | Intrinsic::ID IID) const { | ||||||
3912 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | ||||||
3913 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | ||||||
3914 | const bool HasReturn = MI.getNumExplicitDefs() != 0; | ||||||
3915 | |||||||
3916 | Register Dst; | ||||||
3917 | |||||||
3918 | int OpOffset = 0; | ||||||
3919 | if (HasReturn) { | ||||||
3920 | // A few FP atomics do not support return values. | ||||||
3921 | Dst = MI.getOperand(0).getReg(); | ||||||
3922 | } else { | ||||||
3923 | OpOffset = -1; | ||||||
3924 | } | ||||||
3925 | |||||||
3926 | Register VData = MI.getOperand(2 + OpOffset).getReg(); | ||||||
3927 | Register CmpVal; | ||||||
3928 | |||||||
3929 | if (IsCmpSwap) { | ||||||
3930 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3931 | ++OpOffset; | ||||||
3932 | } | ||||||
3933 | |||||||
3934 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | ||||||
3935 | const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; | ||||||
3936 | |||||||
3937 | // The struct intrinsic variants add one additional operand over raw. | ||||||
3938 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||||
3939 | Register VIndex; | ||||||
3940 | if (HasVIndex) { | ||||||
3941 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3942 | ++OpOffset; | ||||||
3943 | } | ||||||
3944 | |||||||
3945 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||||
3946 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | ||||||
3947 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | ||||||
3948 | |||||||
3949 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||||
3950 | |||||||
3951 | unsigned ImmOffset; | ||||||
3952 | unsigned TotalOffset; | ||||||
3953 | std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); | ||||||
3954 | if (TotalOffset != 0) | ||||||
3955 | MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); | ||||||
3956 | |||||||
3957 | if (!VIndex) | ||||||
3958 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | ||||||
3959 | |||||||
3960 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); | ||||||
3961 | |||||||
3962 | if (HasReturn) | ||||||
3963 | MIB.addDef(Dst); | ||||||
3964 | |||||||
3965 | MIB.addUse(VData); // vdata | ||||||
3966 | |||||||
3967 | if (IsCmpSwap) | ||||||
3968 | MIB.addReg(CmpVal); | ||||||
3969 | |||||||
3970 | MIB.addUse(RSrc) // rsrc | ||||||
3971 | .addUse(VIndex) // vindex | ||||||
3972 | .addUse(VOffset) // voffset | ||||||
3973 | .addUse(SOffset) // soffset | ||||||
3974 | .addImm(ImmOffset) // offset(imm) | ||||||
3975 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||||
3976 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||||
3977 | .addMemOperand(MMO); | ||||||
3978 | |||||||
3979 | MI.eraseFromParent(); | ||||||
3980 | return true; | ||||||
3981 | } | ||||||
3982 | |||||||
3983 | /// Pack the s16-typed address operands of \p MI into dword-sized <2 x s16> | ||||||
3984 | /// registers, appending them to \p PackedAddrs. | ||||||
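// [Editor's note] Illustrative example (not in the original source): for a 2D
// sample with s16 coordinates u and v, the else-branch below packs them into
// a single <2 x s16> register via G_BUILD_VECTOR; a left-over odd coordinate
// or gradient component is instead paired with an undef s16.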
3985 | static void packImageA16AddressToDwords( | ||||||
3986 | MachineIRBuilder &B, MachineInstr &MI, | ||||||
3987 | SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, | ||||||
3988 | const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { | ||||||
3989 | const LLT S16 = LLT::scalar(16); | ||||||
3990 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
3991 | |||||||
3992 | for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { | ||||||
3993 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||
3994 | if (!SrcOp.isReg()) | ||||||
3995 | continue; // _L to _LZ may have eliminated this. | ||||||
3996 | |||||||
3997 | Register AddrReg = SrcOp.getReg(); | ||||||
3998 | |||||||
3999 | if (I < Intr->GradientStart) { | ||||||
4000 | AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); | ||||||
4001 | PackedAddrs.push_back(AddrReg); | ||||||
4002 | } else { | ||||||
4003 | // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, | ||||||
4004 | // derivatives dx/dh and dx/dv are packed with undef. | ||||||
4005 | if (((I + 1) >= EndIdx) || | ||||||
4006 | ((Intr->NumGradients / 2) % 2 == 1 && | ||||||
4007 | (I == static_cast<unsigned>(Intr->GradientStart + | ||||||
4008 | (Intr->NumGradients / 2) - 1) || | ||||||
4009 | I == static_cast<unsigned>(Intr->GradientStart + | ||||||
4010 | Intr->NumGradients - 1))) || | ||||||
4011 | // Check for _L to _LZ optimization | ||||||
4012 | !MI.getOperand(ArgOffset + I + 1).isReg()) { | ||||||
4013 | PackedAddrs.push_back( | ||||||
4014 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||||
4015 | .getReg(0)); | ||||||
4016 | } else { | ||||||
4017 | PackedAddrs.push_back( | ||||||
4018 | B.buildBuildVector( | ||||||
4019 | V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) | ||||||
4020 | .getReg(0)); | ||||||
4021 | ++I; | ||||||
4022 | } | ||||||
4023 | } | ||||||
4024 | } | ||||||
4025 | } | ||||||
4026 | |||||||
4027 | /// Convert from separate vaddr components to a single vector address register, | ||||||
4028 | /// and replace the remaining operands with $noreg. | ||||||
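// [Editor's note] Worked example, added for illustration: with five s32
// address operands, the list is padded with undef up to the next power of two
// (eight), a single <8 x s32> G_BUILD_VECTOR replaces the first address
// operand, and the remaining four original operands are cleared to $noreg.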
4029 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||||
4030 | int DimIdx, int NumVAddrs) { | ||||||
4031 | const LLT S32 = LLT::scalar(32); | ||||||
4032 | |||||||
4033 | SmallVector<Register, 8> AddrRegs; | ||||||
4034 | for (int I = 0; I != NumVAddrs; ++I) { | ||||||
4035 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||
4036 | if (SrcOp.isReg()) { | ||||||
4037 | AddrRegs.push_back(SrcOp.getReg()); | ||||||
4038 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32); | ||||||
4039 | } | ||||||
4040 | } | ||||||
4041 | |||||||
4042 | int NumAddrRegs = AddrRegs.size(); | ||||||
4043 | if (NumAddrRegs != 1) { | ||||||
4044 | // Round up to 8 elements for v5-v7 | ||||||
4045 | // FIXME: Missing intermediate sized register classes and instructions. | ||||||
4046 | if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { | ||||||
4047 | const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); | ||||||
4048 | auto Undef = B.buildUndef(S32); | ||||||
4049 | AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); | ||||||
4050 | NumAddrRegs = RoundedNumRegs; | ||||||
4051 | } | ||||||
4052 | |||||||
4053 | auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); | ||||||
4054 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||||
4055 | } | ||||||
4056 | |||||||
4057 | for (int I = 1; I != NumVAddrs; ++I) { | ||||||
4058 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||||
4059 | if (SrcOp.isReg()) | ||||||
4060 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||||
4061 | } | ||||||
4062 | } | ||||||
4063 | |||||||
4064 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | ||||||
4065 | /// | ||||||
4066 | /// Depending on the subtarget, loads/stores with 16-bit element data need to be | ||||||
4067 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | ||||||
4068 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | ||||||
4069 | /// registers. | ||||||
4070 | /// | ||||||
4071 | /// We don't want to directly select image instructions just yet, but also want | ||||||
4072 | /// to expose all register repacking to the legalizer/combiners. We also don't | ||||||
4073 | /// want a selected instruction entering RegBankSelect. In order to avoid | ||||||
4074 | /// defining a multitude of intermediate image instructions, directly hack on | ||||||
4075 | /// the intrinsic's arguments. In cases like a16 addresses, this requires padding | ||||||
4076 | /// now unnecessary arguments with $noreg. | ||||||
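// [Editor's note] Hedged example of the kind of rewrite performed here, for
// illustration only: an image load of <4 x s16> data on a subtarget with
// unpacked D16 VMEM is rewritten to produce a <4 x s32> result, which is then
// truncated and repacked into the original 16-bit vector after the
// instruction; similarly, s16 ("a16") addresses are paired up into <2 x s16>
// registers before being passed to the instruction.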
4077 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||||
4078 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | ||||||
4079 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | ||||||
4080 | |||||||
4081 | const unsigned NumDefs = MI.getNumExplicitDefs(); | ||||||
4082 | const unsigned ArgOffset = NumDefs + 1; | ||||||
4083 | bool IsTFE = NumDefs == 2; | ||||||
4084 | // We are only processing the operands of d16 image operations on subtargets | ||||||
4085 | // that use the unpacked register layout, or need to repack the TFE result. | ||||||
4086 | |||||||
4087 | // TODO: Do we need to guard against already legalized intrinsics? | ||||||
4088 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||||
4089 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | ||||||
4090 | |||||||
4091 | MachineRegisterInfo *MRI = B.getMRI(); | ||||||
4092 | const LLT S32 = LLT::scalar(32); | ||||||
4093 | const LLT S16 = LLT::scalar(16); | ||||||
4094 | const LLT V2S16 = LLT::vector(2, 16); | ||||||
4095 | |||||||
4096 | unsigned DMask = 0; | ||||||
4097 | |||||||
4098 | // Check for 16 bit addresses and pack if true. | ||||||
4099 | LLT GradTy = | ||||||
4100 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | ||||||
4101 | LLT AddrTy = | ||||||
4102 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | ||||||
4103 | const bool IsG16 = GradTy == S16; | ||||||
4104 | const bool IsA16 = AddrTy == S16; | ||||||
4105 | |||||||
4106 | int DMaskLanes = 0; | ||||||
4107 | if (!BaseOpcode->Atomic) { | ||||||
4108 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | ||||||
4109 | if (BaseOpcode->Gather4) { | ||||||
4110 | DMaskLanes = 4; | ||||||
4111 | } else if (DMask != 0) { | ||||||
4112 | DMaskLanes = countPopulation(DMask); | ||||||
4113 | } else if (!IsTFE && !BaseOpcode->Store) { | ||||||
4114 | // If dmask is 0, this is a no-op load. This can be eliminated. | ||||||
4115 | B.buildUndef(MI.getOperand(0)); | ||||||
4116 | MI.eraseFromParent(); | ||||||
4117 | return true; | ||||||
4118 | } | ||||||
4119 | } | ||||||
4120 | |||||||
4121 | Observer.changingInstr(MI); | ||||||
4122 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | ||||||
4123 | |||||||
4124 | unsigned NewOpcode = NumDefs == 0 ? | ||||||
4125 | AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | ||||||
4126 | |||||||
4127 | // Track that we legalized this | ||||||
4128 | MI.setDesc(B.getTII().get(NewOpcode)); | ||||||
4129 | |||||||
4130 | // We expect to get an error flag since TFC is on and dmask is 0. Force | ||||||
4131 | // dmask to be at least 1, otherwise the instruction will fail. | ||||||
4132 | if (IsTFE && DMask == 0) { | ||||||
4133 | DMask = 0x1; | ||||||
4134 | DMaskLanes = 1; | ||||||
4135 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | ||||||
4136 | } | ||||||
4137 | |||||||
4138 | if (BaseOpcode->Atomic) { | ||||||
4139 | Register VData0 = MI.getOperand(2).getReg(); | ||||||
4140 | LLT Ty = MRI->getType(VData0); | ||||||
4141 | |||||||
4142 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | ||||||
4143 | if (Ty.isVector()) | ||||||
4144 | return false; | ||||||
4145 | |||||||
4146 | if (BaseOpcode->AtomicX2) { | ||||||
4147 | Register VData1 = MI.getOperand(3).getReg(); | ||||||
4148 | // The two values are packed in one register. | ||||||
4149 | LLT PackedTy = LLT::vector(2, Ty); | ||||||
4150 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | ||||||
4151 | MI.getOperand(2).setReg(Concat.getReg(0)); | ||||||
4152 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | ||||||
4153 | } | ||||||
4154 | } | ||||||
4155 | |||||||
4156 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | ||||||
4157 | |||||||
4158 | // Optimize _L to _LZ when _L is zero | ||||||
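// [Editor's note] For example (illustrative): image.sample.l with a constant
// non-positive lod (e.g. 0.0) is rewritten below to the equivalent
// image.sample.lz intrinsic and the now-redundant lod operand is removed.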
4159 | if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = | ||||||
4160 | AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { | ||||||
4161 | const ConstantFP *ConstantLod; | ||||||
4162 | |||||||
4163 | if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, | ||||||
4164 | m_GFCst(ConstantLod))) { | ||||||
4165 | if (ConstantLod->isZero() || ConstantLod->isNegative()) { | ||||||
4166 | // Set new opcode to _lz variant of _l, and change the intrinsic ID. | ||||||
4167 | const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = | ||||||
4168 | AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, | ||||||
4169 | Intr->Dim); | ||||||
4170 | |||||||
4171 | // The starting indexes should remain in the same place. | ||||||
4172 | --CorrectedNumVAddrs; | ||||||
4173 | |||||||
4174 | MI.getOperand(MI.getNumExplicitDefs()) | ||||||
4175 | .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); | ||||||
4176 | MI.RemoveOperand(ArgOffset + Intr->LodIndex); | ||||||
4177 | Intr = NewImageDimIntr; | ||||||
4178 | } | ||||||
4179 | } | ||||||
4180 | } | ||||||
4181 | |||||||
4182 | // Optimize _mip away, when 'lod' is zero | ||||||
4183 | if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { | ||||||
4184 | int64_t ConstantLod; | ||||||
4185 | if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, | ||||||
4186 | m_ICst(ConstantLod))) { | ||||||
4187 | if (ConstantLod == 0) { | ||||||
4188 | // TODO: Change intrinsic opcode and remove operand instead of replacing | ||||||
4189 | // it with 0, as the _L to _LZ handling is done above. | ||||||
4190 | MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); | ||||||
4191 | --CorrectedNumVAddrs; | ||||||
4192 | } | ||||||
4193 | } | ||||||
4194 | } | ||||||
4195 | |||||||
4196 | // Rewrite the addressing register layout before doing anything else. | ||||||
4197 | if (IsA16 || IsG16) { | ||||||
4198 | if (IsA16) { | ||||||
4199 | // Target must support the feature and gradients need to be 16 bit too | ||||||
4200 | if (!ST.hasA16() || !IsG16) | ||||||
4201 | return false; | ||||||
4202 | } else if (!ST.hasG16()) | ||||||
4203 | return false; | ||||||
4204 | |||||||
4205 | if (Intr->NumVAddrs > 1) { | ||||||
4206 | SmallVector<Register, 4> PackedRegs; | ||||||
4207 | // Don't compress addresses for G16 | ||||||
4208 | const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; | ||||||
4209 | packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, | ||||||
4210 | PackEndIdx); | ||||||
4211 | |||||||
4212 | if (!IsA16) { | ||||||
4213 | // Add uncompressed address | ||||||
4214 | for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { | ||||||
4215 | int AddrReg = MI.getOperand(ArgOffset + I).getReg(); | ||||||
4216 | assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); | ||||||
4217 | PackedRegs.push_back(AddrReg); | ||||||
4218 | } | ||||||
4219 | } | ||||||
4220 | |||||||
4221 | // See also below in the non-a16 branch | ||||||
4222 | const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); | ||||||
4223 | |||||||
4224 | if (!UseNSA && PackedRegs.size() > 1) { | ||||||
4225 | LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); | ||||||
4226 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | ||||||
4227 | PackedRegs[0] = Concat.getReg(0); | ||||||
4228 | PackedRegs.resize(1); | ||||||
4229 | } | ||||||
4230 | |||||||
4231 | const unsigned NumPacked = PackedRegs.size(); | ||||||
4232 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | ||||||
4233 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||||
4234 | if (!SrcOp.isReg()) { | ||||||
4235 | assert(SrcOp.isImm() && SrcOp.getImm() == 0); | ||||||
4236 | continue; | ||||||
4237 | } | ||||||
4238 | |||||||
4239 | assert(SrcOp.getReg() != AMDGPU::NoRegister); | ||||||
4240 | |||||||
4241 | if (I - Intr->VAddrStart < NumPacked) | ||||||
4242 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | ||||||
4243 | else | ||||||
4244 | SrcOp.setReg(AMDGPU::NoRegister); | ||||||
4245 | } | ||||||
4246 | } | ||||||
4247 | } else { | ||||||
4248 | // If the register allocator cannot place the address registers contiguously | ||||||
4249 | // without introducing moves, then using the non-sequential address encoding | ||||||
4250 | // is always preferable, since it saves VALU instructions and is usually a | ||||||
4251 | // wash in terms of code size or even better. | ||||||
4252 | // | ||||||
4253 | // However, we currently have no way of hinting to the register allocator | ||||||
4254 | // that MIMG addresses should be placed contiguously when it is possible to | ||||||
4255 | // do so, so force non-NSA for the common 2-address case as a heuristic. | ||||||
4256 | // | ||||||
4257 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | ||||||
4258 | // allocation when possible. | ||||||
4259 | const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); | ||||||
4260 | |||||||
4261 | if (!UseNSA && Intr->NumVAddrs > 1) | ||||||
4262 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | ||||||
4263 | Intr->NumVAddrs); | ||||||
4264 | } | ||||||
4265 | |||||||
4266 | int Flags = 0; | ||||||
4267 | if (IsA16) | ||||||
4268 | Flags |= 1; | ||||||
4269 | if (IsG16) | ||||||
4270 | Flags |= 2; | ||||||
4271 | MI.addOperand(MachineOperand::CreateImm(Flags)); | ||||||
4272 | |||||||
4273 | if (BaseOpcode->Store) { // No TFE for stores? | ||||||
4274 | // TODO: Handle dmask trim | ||||||
4275 | Register VData = MI.getOperand(1).getReg(); | ||||||
4276 | LLT Ty = MRI->getType(VData); | ||||||
4277 | if (!Ty.isVector() || Ty.getElementType() != S16) | ||||||
4278 | return true; | ||||||
4279 | |||||||
4280 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | ||||||
4281 | if (RepackedReg != VData) { | ||||||
4282 | MI.getOperand(1).setReg(RepackedReg); | ||||||
4283 | } | ||||||
4284 | |||||||
4285 | return true; | ||||||
4286 | } | ||||||
4287 | |||||||
4288 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
4289 | LLT Ty = MRI->getType(DstReg); | ||||||
4290 | const LLT EltTy = Ty.getScalarType(); | ||||||
4291 | const bool IsD16 = Ty.getScalarType() == S16; | ||||||
4292 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||||
4293 | |||||||
4294 | // Confirm that the return type is large enough for the dmask specified | ||||||
4295 | if (NumElts < DMaskLanes) | ||||||
4296 | return false; | ||||||
4297 | |||||||
4298 | if (NumElts > 4 || DMaskLanes > 4) | ||||||
4299 | return false; | ||||||
4300 | |||||||
4301 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; | ||||||
4302 | const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); | ||||||
4303 | |||||||
4304 | // The raw dword aligned data component of the load. The only legal cases | ||||||
4305 | // where this matters should be when using the packed D16 format, for | ||||||
4306 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. | ||||||
4307 | LLT RoundedTy; | ||||||
4308 | |||||||
4309 | // S32 vector to cover all data, plus the TFE result element. | ||||||
4310 | LLT TFETy; | ||||||
4311 | |||||||
4312 | // Register type to use for each loaded component. Will be S32 or V2S16. | ||||||
4313 | LLT RegTy; | ||||||
4314 | |||||||
4315 | if (IsD16 && ST.hasUnpackedD16VMem()) { | ||||||
4316 | RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); | ||||||
4317 | TFETy = LLT::vector(AdjustedNumElts + 1, 32); | ||||||
4318 | RegTy = S32; | ||||||
4319 | } else { | ||||||
4320 | unsigned EltSize = EltTy.getSizeInBits(); | ||||||
4321 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | ||||||
4322 | unsigned RoundedSize = 32 * RoundedElts; | ||||||
4323 | RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); | ||||||
4324 | TFETy = LLT::vector(RoundedSize / 32 + 1, S32); | ||||||
4325 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | ||||||
4326 | } | ||||||
4327 | |||||||
4328 | // The return type does not need adjustment. | ||||||
4329 | // TODO: Should we change s16 case to s32 or <2 x s16>? | ||||||
4330 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | ||||||
4331 | return true; | ||||||
4332 | |||||||
4333 | Register Dst1Reg; | ||||||
4334 | |||||||
4335 | // Insert after the instruction. | ||||||
4336 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||||
4337 | |||||||
4338 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | ||||||
4339 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | ||||||
4340 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | ||||||
4341 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | ||||||
4342 | |||||||
4343 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | ||||||
4344 | |||||||
4345 | MI.getOperand(0).setReg(NewResultReg); | ||||||
4346 | |||||||
4347 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||||
4348 | // type. The instruction really returns these two values in one contiguous | ||||||
4349 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||||
4350 | // return type to use a single register result. | ||||||
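// [Editor's note] Illustrative example, not original source text: a TFE load
// declared in the IR as returning { <4 x float>, i32 } is rewritten so the
// instruction defines one <5 x s32> register; the G_UNMERGE_VALUES emitted
// below then splits off the four data dwords, while the trailing TFE status
// dword is written directly into the original error-flag result.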
4351 | |||||||
4352 | if (IsTFE) { | ||||||
4353 | Dst1Reg = MI.getOperand(1).getReg(); | ||||||
4354 | if (MRI->getType(Dst1Reg) != S32) | ||||||
4355 | return false; | ||||||
4356 | |||||||
4357 | // TODO: Make sure the TFE operand bit is set. | ||||||
4358 | MI.RemoveOperand(1); | ||||||
4359 | |||||||
4360 | // Handle the easy case that requires no repack instructions. | ||||||
4361 | if (Ty == S32) { | ||||||
4362 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | ||||||
4363 | return true; | ||||||
4364 | } | ||||||
4365 | } | ||||||
4366 | |||||||
4367 | // Now figure out how to copy the new result register back into the old | ||||||
4368 | // result. | ||||||
4369 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | ||||||
4370 | |||||||
4371 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | ||||||
4372 | |||||||
4373 | if (ResultNumRegs == 1) { | ||||||
4374 | assert(!IsTFE); | ||||||
4375 | ResultRegs[0] = NewResultReg; | ||||||
4376 | } else { | ||||||
4377 | // We have to repack into a new vector of some kind. | ||||||
4378 | for (int I = 0; I != NumDataRegs; ++I) | ||||||
4379 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | ||||||
4380 | B.buildUnmerge(ResultRegs, NewResultReg); | ||||||
4381 | |||||||
4382 | // Drop the final TFE element to get the data part. The TFE result is | ||||||
4383 | // directly written to the right place already. | ||||||
4384 | if (IsTFE) | ||||||
4385 | ResultRegs.resize(NumDataRegs); | ||||||
4386 | } | ||||||
4387 | |||||||
4388 | // For an s16 scalar result, we form an s32 result with a truncate regardless | ||||||
4389 | // of packed vs. unpacked. | ||||||
4390 | if (IsD16 && !Ty.isVector()) { | ||||||
4391 | B.buildTrunc(DstReg, ResultRegs[0]); | ||||||
4392 | return true; | ||||||
4393 | } | ||||||
4394 | |||||||
4395 | // Avoid a build/concat_vector of 1 entry. | ||||||
4396 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | ||||||
4397 | B.buildBitcast(DstReg, ResultRegs[0]); | ||||||
4398 | return true; | ||||||
4399 | } | ||||||
4400 | |||||||
4401 | assert(Ty.isVector()); | ||||||
4402 | |||||||
4403 | if (IsD16) { | ||||||
4404 | // For packed D16 results with TFE enabled, all the data components are | ||||||
4405 | // S32. Cast back to the expected type. | ||||||
4406 | // | ||||||
4407 | // TODO: We don't really need to load s32 elements. We would only need one | ||||||
4408 | // cast for the TFE result if a multiple of v2s16 was used. | ||||||
4409 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | ||||||
4410 | for (Register &Reg : ResultRegs) | ||||||
4411 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | ||||||
4412 | } else if (ST.hasUnpackedD16VMem()) { | ||||||
4413 | for (Register &Reg : ResultRegs) | ||||||
4414 | Reg = B.buildTrunc(S16, Reg).getReg(0); | ||||||
4415 | } | ||||||
4416 | } | ||||||
4417 | |||||||
4418 | auto padWithUndef = [&](LLT Ty, int NumElts) { | ||||||
4419 | if (NumElts == 0) | ||||||
4420 | return; | ||||||
4421 | Register Undef = B.buildUndef(Ty).getReg(0); | ||||||
4422 | for (int I = 0; I != NumElts; ++I) | ||||||
4423 | ResultRegs.push_back(Undef); | ||||||
4424 | }; | ||||||
4425 | |||||||
4426 | // Pad out any elements eliminated due to the dmask. | ||||||
4427 | LLT ResTy = MRI->getType(ResultRegs[0]); | ||||||
4428 | if (!ResTy.isVector()) { | ||||||
4429 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | ||||||
4430 | B.buildBuildVector(DstReg, ResultRegs); | ||||||
4431 | return true; | ||||||
4432 | } | ||||||
4433 | |||||||
4434 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); | ||||||
4435 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | ||||||
4436 | |||||||
4437 | // Deal with the one annoying legal case. | ||||||
4438 | const LLT V3S16 = LLT::vector(3, 16); | ||||||
4439 | if (Ty == V3S16) { | ||||||
4440 | padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); | ||||||
4441 | auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); | ||||||
4442 | B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); | ||||||
4443 | return true; | ||||||
4444 | } | ||||||
4445 | |||||||
4446 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | ||||||
4447 | B.buildConcatVectors(DstReg, ResultRegs); | ||||||
4448 | return true; | ||||||
4449 | } | ||||||
4450 | |||||||
4451 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||||
4452 | LegalizerHelper &Helper, MachineInstr &MI) const { | ||||||
4453 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
4454 | GISelChangeObserver &Observer = Helper.Observer; | ||||||
4455 | |||||||
4456 | Register Dst = MI.getOperand(0).getReg(); | ||||||
4457 | LLT Ty = B.getMRI()->getType(Dst); | ||||||
4458 | unsigned Size = Ty.getSizeInBits(); | ||||||
4459 | MachineFunction &MF = B.getMF(); | ||||||
4460 | |||||||
4461 | Observer.changingInstr(MI); | ||||||
4462 | |||||||
4463 | if (shouldBitcastLoadStoreType(ST, Ty, Size)) { | ||||||
4464 | Ty = getBitcastRegisterType(Ty); | ||||||
4465 | Helper.bitcastDst(MI, Ty, 0); | ||||||
4466 | Dst = MI.getOperand(0).getReg(); | ||||||
4467 | B.setInsertPt(B.getMBB(), MI); | ||||||
4468 | } | ||||||
4469 | |||||||
4470 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||||
4471 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||||
4472 | // allowed to add one. | ||||||
4473 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||||
4474 | MI.RemoveOperand(1); // Remove intrinsic ID | ||||||
4475 | |||||||
4476 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||||
4477 | // TODO: Should this use datalayout alignment? | ||||||
4478 | const unsigned MemSize = (Size + 7) / 8; | ||||||
4479 | const Align MemAlign(4); | ||||||
4480 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||||
4481 | MachinePointerInfo(), | ||||||
4482 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||||
4483 | MachineMemOperand::MOInvariant, | ||||||
4484 | MemSize, MemAlign); | ||||||
4485 | MI.addMemOperand(MF, MMO); | ||||||
4486 | |||||||
4487 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||||
4488 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||||
4489 | // out this needs to be converted to a vector load during RegBankSelect. | ||||||
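// [Editor's note] Illustrative example (assumption, not from the source): an
// s_buffer_load returning <3 x s32> (96 bits) has its result widened here to
// <4 x s32>, and a scalar s96 result is widened to s128, since the comment
// above notes there are no 96-bit scalar load results to select.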
4490 | if (!isPowerOf2_32(Size)) { | ||||||
4491 | if (Ty.isVector()) | ||||||
4492 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||||
4493 | else | ||||||
4494 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||||
4495 | } | ||||||
4496 | |||||||
4497 | Observer.changedInstr(MI); | ||||||
4498 | return true; | ||||||
4499 | } | ||||||
4500 | |||||||
4501 | // TODO: Move to selection | ||||||
4502 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||||
4503 | MachineRegisterInfo &MRI, | ||||||
4504 | MachineIRBuilder &B) const { | ||||||
4505 | // If this is a non-HSA path or the trap handler is disabled, insert an s_endpgm instruction. | ||||||
4506 | if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || | ||||||
4507 | !ST.isTrapHandlerEnabled()) { | ||||||
4508 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | ||||||
4509 | } else { | ||||||
4510 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||||
4511 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||||
4512 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
4513 | |||||||
4514 | Register LiveIn = | ||||||
4515 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||||
4516 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||||
4517 | return false; | ||||||
4518 | |||||||
4519 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||||
4520 | B.buildCopy(SGPR01, LiveIn); | ||||||
4521 | B.buildInstr(AMDGPU::S_TRAP) | ||||||
4522 | .addImm(GCNSubtarget::TrapIDLLVMTrap) | ||||||
4523 | .addReg(SGPR01, RegState::Implicit); | ||||||
4524 | } | ||||||
4525 | |||||||
4526 | MI.eraseFromParent(); | ||||||
4527 | return true; | ||||||
4528 | } | ||||||
4529 | |||||||
4530 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||||
4531 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||||
4532 | // If this is a non-HSA path or the trap handler is disabled, report a | ||||||
4533 | // warning accordingly. | ||||||
4534 | if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || | ||||||
4535 | !ST.isTrapHandlerEnabled()) { | ||||||
4536 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||||
4537 | "debugtrap handler not supported", | ||||||
4538 | MI.getDebugLoc(), DS_Warning); | ||||||
4539 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||||
4540 | Ctx.diagnose(NoTrap); | ||||||
4541 | } else { | ||||||
4542 | // Insert debug-trap instruction | ||||||
4543 | B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); | ||||||
4544 | } | ||||||
4545 | |||||||
4546 | MI.eraseFromParent(); | ||||||
4547 | return true; | ||||||
4548 | } | ||||||
4549 | |||||||
4550 | bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, | ||||||
4551 | MachineIRBuilder &B) const { | ||||||
4552 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
4553 | const LLT S16 = LLT::scalar(16); | ||||||
4554 | const LLT S32 = LLT::scalar(32); | ||||||
4555 | |||||||
4556 | Register DstReg = MI.getOperand(0).getReg(); | ||||||
4557 | Register NodePtr = MI.getOperand(2).getReg(); | ||||||
4558 | Register RayExtent = MI.getOperand(3).getReg(); | ||||||
4559 | Register RayOrigin = MI.getOperand(4).getReg(); | ||||||
4560 | Register RayDir = MI.getOperand(5).getReg(); | ||||||
4561 | Register RayInvDir = MI.getOperand(6).getReg(); | ||||||
4562 | Register TDescr = MI.getOperand(7).getReg(); | ||||||
4563 | |||||||
4564 | bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; | ||||||
4565 | bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; | ||||||
4566 | unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa | ||||||
4567 | : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa | ||||||
4568 | : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa | ||||||
4569 | : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; | ||||||
4570 | |||||||
4571 | SmallVector<Register, 12> Ops; | ||||||
4572 | if (Is64) { | ||||||
4573 | auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); | ||||||
4574 | Ops.push_back(Unmerge.getReg(0)); | ||||||
4575 | Ops.push_back(Unmerge.getReg(1)); | ||||||
4576 | } else { | ||||||
4577 | Ops.push_back(NodePtr); | ||||||
4578 | } | ||||||
4579 | Ops.push_back(RayExtent); | ||||||
4580 | |||||||
4581 | auto packLanes = [&Ops, &S32, &B] (Register Src) { | ||||||
4582 | auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); | ||||||
4583 | Ops.push_back(Unmerge.getReg(0)); | ||||||
4584 | Ops.push_back(Unmerge.getReg(1)); | ||||||
4585 | Ops.push_back(Unmerge.getReg(2)); | ||||||
4586 | }; | ||||||
4587 | |||||||
4588 | packLanes(RayOrigin); | ||||||
4589 | if (IsA16) { | ||||||
4590 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); | ||||||
4591 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); | ||||||
4592 | Register R1 = MRI.createGenericVirtualRegister(S32); | ||||||
4593 | Register R2 = MRI.createGenericVirtualRegister(S32); | ||||||
4594 | Register R3 = MRI.createGenericVirtualRegister(S32); | ||||||
4595 | B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); | ||||||
4596 | B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); | ||||||
4597 | B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); | ||||||
4598 | Ops.push_back(R1); | ||||||
4599 | Ops.push_back(R2); | ||||||
4600 | Ops.push_back(R3); | ||||||
4601 | } else { | ||||||
4602 | packLanes(RayDir); | ||||||
4603 | packLanes(RayInvDir); | ||||||
4604 | } | ||||||
4605 | |||||||
4606 | auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) | ||||||
4607 | .addDef(DstReg) | ||||||
4608 | .addImm(Opcode); | ||||||
4609 | |||||||
4610 | for (Register R : Ops) { | ||||||
4611 | MIB.addUse(R); | ||||||
4612 | } | ||||||
4613 | |||||||
4614 | MIB.addUse(TDescr) | ||||||
4615 | .addImm(IsA16 ? 1 : 0) | ||||||
4616 | .cloneMemRefs(MI); | ||||||
4617 | |||||||
4618 | MI.eraseFromParent(); | ||||||
4619 | return true; | ||||||
4620 | } | ||||||
4621 | |||||||
4622 | bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||||||
4623 | MachineInstr &MI) const { | ||||||
4624 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||||
4625 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||||
4626 | |||||||
4627 | // Replace the G_BRCOND use with the exec-manipulating branch pseudos. | ||||||
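// [Editor's note] Rough sketch, inferred from the code below and offered only
// as illustration: a G_INTRINSIC llvm.amdgcn.if feeding a G_BRCOND is replaced
// by an SI_IF pseudo, which both manipulates the exec mask and carries the
// branch to the "skip" block; the following unconditional control flow is then
// redirected (or a G_BR is inserted) to the original conditional target.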
4628 | auto IntrID = MI.getIntrinsicID(); | ||||||
4629 | switch (IntrID) { | ||||||
| |||||||
4630 | case Intrinsic::amdgcn_if: | ||||||
4631 | case Intrinsic::amdgcn_else: { | ||||||
4632 | MachineInstr *Br = nullptr; | ||||||
4633 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||
4634 | bool Negated = false; | ||||||
4635 | if (MachineInstr *BrCond = | ||||||
4636 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||
4637 | const SIRegisterInfo *TRI | ||||||
4638 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
4639 | |||||||
4640 | Register Def = MI.getOperand(1).getReg(); | ||||||
4641 | Register Use = MI.getOperand(3).getReg(); | ||||||
4642 | |||||||
4643 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||
4644 | |||||||
4645 | if (Negated) | ||||||
4646 | std::swap(CondBrTarget, UncondBrTarget); | ||||||
4647 | |||||||
4648 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||
4649 | if (IntrID == Intrinsic::amdgcn_if) { | ||||||
4650 | B.buildInstr(AMDGPU::SI_IF) | ||||||
4651 | .addDef(Def) | ||||||
4652 | .addUse(Use) | ||||||
4653 | .addMBB(UncondBrTarget); | ||||||
4654 | } else { | ||||||
4655 | B.buildInstr(AMDGPU::SI_ELSE) | ||||||
4656 | .addDef(Def) | ||||||
4657 | .addUse(Use) | ||||||
4658 | .addMBB(UncondBrTarget); | ||||||
4659 | } | ||||||
4660 | |||||||
4661 | if (Br) { | ||||||
4662 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||
4663 | } else { | ||||||
4664 | // The IRTranslator skips inserting the G_BR for fallthrough cases, but | ||||||
4665 | // since we're swapping branch targets it needs to be reinserted. | ||||||
4666 | // FIXME: IRTranslator should probably not do this | ||||||
4667 | B.buildBr(*CondBrTarget); | ||||||
4668 | } | ||||||
4669 | |||||||
4670 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||||
4671 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||||
4672 | MI.eraseFromParent(); | ||||||
4673 | BrCond->eraseFromParent(); | ||||||
4674 | return true; | ||||||
4675 | } | ||||||
4676 | |||||||
4677 | return false; | ||||||
4678 | } | ||||||
4679 | case Intrinsic::amdgcn_loop: { | ||||||
4680 | MachineInstr *Br = nullptr; | ||||||
4681 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||||
4682 | bool Negated = false; | ||||||
4683 | if (MachineInstr *BrCond = | ||||||
4684 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||||
4685 | const SIRegisterInfo *TRI | ||||||
4686 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||||
4687 | |||||||
4688 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||||
4689 | Register Reg = MI.getOperand(2).getReg(); | ||||||
4690 | |||||||
4691 | if (Negated) | ||||||
4692 | std::swap(CondBrTarget, UncondBrTarget); | ||||||
4693 | |||||||
4694 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||||
4695 | B.buildInstr(AMDGPU::SI_LOOP) | ||||||
4696 | .addUse(Reg) | ||||||
4697 | .addMBB(UncondBrTarget); | ||||||
4698 | |||||||
4699 | if (Br) | ||||||
4700 | Br->getOperand(0).setMBB(CondBrTarget); | ||||||
4701 | else | ||||||
4702 | B.buildBr(*CondBrTarget); | ||||||
4703 | |||||||
4704 | MI.eraseFromParent(); | ||||||
4705 | BrCond->eraseFromParent(); | ||||||
4706 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||||
4707 | return true; | ||||||
4708 | } | ||||||
4709 | |||||||
4710 | return false; | ||||||
4711 | } | ||||||
4712 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||||
4713 | if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { | ||||||
4714 | // This only makes sense to call in a kernel, so just lower to null. | ||||||
4715 | B.buildConstant(MI.getOperand(0).getReg(), 0); | ||||||
4716 | MI.eraseFromParent(); | ||||||
4717 | return true; | ||||||
4718 | } | ||||||
4719 | |||||||
4720 | return legalizePreloadedArgIntrin( | ||||||
4721 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||||
4722 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||||
4723 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||||
4724 | case Intrinsic::amdgcn_workitem_id_x: | ||||||
4725 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4726 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||||
4727 | case Intrinsic::amdgcn_workitem_id_y: | ||||||
4728 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4729 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||||
4730 | case Intrinsic::amdgcn_workitem_id_z: | ||||||
4731 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4732 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||||
4733 | case Intrinsic::amdgcn_workgroup_id_x: | ||||||
4734 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4735 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||||
4736 | case Intrinsic::amdgcn_workgroup_id_y: | ||||||
4737 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4738 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||||
4739 | case Intrinsic::amdgcn_workgroup_id_z: | ||||||
4740 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4741 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||||
4742 | case Intrinsic::amdgcn_dispatch_ptr: | ||||||
4743 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4744 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||||
4745 | case Intrinsic::amdgcn_queue_ptr: | ||||||
4746 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4747 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||||
4748 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||||
4749 | return legalizePreloadedArgIntrin( | ||||||
4750 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||||
4751 | case Intrinsic::amdgcn_dispatch_id: | ||||||
4752 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||||
4753 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||||
4754 | case Intrinsic::amdgcn_fdiv_fast: | ||||||
4755 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||||
4756 | case Intrinsic::amdgcn_is_shared: | ||||||
4757 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||||
4758 | case Intrinsic::amdgcn_is_private: | ||||||
4759 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||||
4760 | case Intrinsic::amdgcn_wavefrontsize: { | ||||||
4761 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||||
4762 | MI.eraseFromParent(); | ||||||
4763 | return true; | ||||||
4764 | } | ||||||
4765 | case Intrinsic::amdgcn_s_buffer_load: | ||||||
4766 | return legalizeSBufferLoad(Helper, MI); | ||||||
4767 | case Intrinsic::amdgcn_raw_buffer_store: | ||||||
4768 | case Intrinsic::amdgcn_struct_buffer_store: | ||||||
4769 | return legalizeBufferStore(MI, MRI, B, false, false); | ||||||
4770 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||||
4771 | case Intrinsic::amdgcn_struct_buffer_store_format: | ||||||
4772 | return legalizeBufferStore(MI, MRI, B, false, true); | ||||||
4773 | case Intrinsic::amdgcn_raw_tbuffer_store: | ||||||
4774 | case Intrinsic::amdgcn_struct_tbuffer_store: | ||||||
4775 | return legalizeBufferStore(MI, MRI, B, true, true); | ||||||
4776 | case Intrinsic::amdgcn_raw_buffer_load: | ||||||
4777 | case Intrinsic::amdgcn_struct_buffer_load: | ||||||
4778 | return legalizeBufferLoad(MI, MRI, B, false, false); | ||||||
4779 | case Intrinsic::amdgcn_raw_buffer_load_format: | ||||||
4780 | case Intrinsic::amdgcn_struct_buffer_load_format: | ||||||
4781 | return legalizeBufferLoad(MI, MRI, B, true, false); | ||||||
4782 | case Intrinsic::amdgcn_raw_tbuffer_load: | ||||||
4783 | case Intrinsic::amdgcn_struct_tbuffer_load: | ||||||
4784 | return legalizeBufferLoad(MI, MRI, B, true, true); | ||||||
4785 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||||
4786 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||||
4787 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||||
4788 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||||
4789 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||||
4790 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||||
4791 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||||
4792 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||||
4793 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||||
4794 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||||
4795 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||||
4796 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||||
4797 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||||
4798 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||||
4799 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||||
4800 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||||
4801 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||||
4802 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||||
4803 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||||
4804 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||||
4805 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||||
4806 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||||
4807 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||||
4808 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||||
4809 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||||
4810 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||||
4811 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||||
4812 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||||
4813 | return legalizeBufferAtomic(MI, B, IntrID); | ||||||
4814 | case Intrinsic::amdgcn_atomic_inc: | ||||||
4815 | return legalizeAtomicIncDec(MI, B, true); | ||||||
4816 | case Intrinsic::amdgcn_atomic_dec: | ||||||
4817 | return legalizeAtomicIncDec(MI, B, false); | ||||||
4818 | case Intrinsic::trap: | ||||||
4819 | return legalizeTrapIntrinsic(MI, MRI, B); | ||||||
4820 | case Intrinsic::debugtrap: | ||||||
4821 | return legalizeDebugTrapIntrinsic(MI, MRI, B); | ||||||
4822 | case Intrinsic::amdgcn_rsq_clamp: | ||||||
4823 | return legalizeRsqClampIntrinsic(MI, MRI, B); | ||||||
4824 | case Intrinsic::amdgcn_ds_fadd: | ||||||
4825 | case Intrinsic::amdgcn_ds_fmin: | ||||||
4826 | case Intrinsic::amdgcn_ds_fmax: | ||||||
4827 | return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); | ||||||
4828 | case Intrinsic::amdgcn_image_bvh_intersect_ray: | ||||||
4829 | return legalizeBVHIntrinsic(MI, B); | ||||||
4830 | default: { | ||||||
4831 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||||||
4832 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | ||||||
4833 | return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); | ||||||
4834 | return true; | ||||||
4835 | } | ||||||
4836 | } | ||||||
4837 | |||||||
4838 | return true; | ||||||
4839 | } |
1 | //==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
10 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H |
11 | |
12 | #include "llvm/CodeGen/Register.h" |
13 | #include "llvm/Pass.h" |
14 | |
15 | namespace llvm { |
16 | |
17 | class Function; |
18 | class LLT; |
19 | class raw_ostream; |
20 | class TargetRegisterClass; |
21 | class TargetRegisterInfo; |
22 | |
23 | struct ArgDescriptor { |
24 | private: |
25 | friend struct AMDGPUFunctionArgInfo; |
26 | friend class AMDGPUArgumentUsageInfo; |
27 | |
28 | union { |
29 | MCRegister Reg; |
30 | unsigned StackOffset; |
31 | }; |
32 | |
33 | // Bitmask to locate argument within the register. |
34 | unsigned Mask; |
35 | |
36 | bool IsStack : 1; |
37 | bool IsSet : 1; |
38 | |
39 | public: |
40 | constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, |
41 | bool IsStack = false, bool IsSet = false) |
42 | : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} |
43 | |
44 | static constexpr ArgDescriptor createRegister(Register Reg, |
45 | unsigned Mask = ~0u) { |
46 | return ArgDescriptor(Reg, Mask, false, true); |
47 | } |
48 | |
49 | static constexpr ArgDescriptor createStack(unsigned Offset, |
50 | unsigned Mask = ~0u) { |
51 | return ArgDescriptor(Offset, Mask, true, true); |
52 | } |
53 | |
54 | static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, |
55 | unsigned Mask) { |
56 | return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); |
57 | } |
58 | |
59 | bool isSet() const { |
60 | return IsSet; |
61 | } |
62 | |
63 | explicit operator bool() const { |
64 | return isSet(); |
65 | } |
66 | |
67 | bool isRegister() const { |
68 | return !IsStack; |
69 | } |
70 | |
71 | MCRegister getRegister() const { |
72 | assert(!IsStack);
73 | return Reg; |
74 | } |
75 | |
76 | unsigned getStackOffset() const { |
77 | assert(IsStack);
78 | return StackOffset; |
79 | } |
80 | |
81 | unsigned getMask() const { |
82 | return Mask; |
83 | } |
84 | |
85 | bool isMasked() const { |
86 | return Mask != ~0u; |
87 | } |
88 | |
89 | void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; |
90 | }; |
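// A minimal usage sketch, assuming only the declarations above; the helper
// name is illustrative, not part of the LLVM API. It shows the intended query
// order: test isSet() (via operator bool) before reading the union members.
inline bool isMaskedRegisterArg(const ArgDescriptor &Arg) {
  return Arg && Arg.isRegister() && Arg.isMasked();
}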
91 | |
92 | inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { |
93 | Arg.print(OS); |
94 | return OS; |
95 | } |
96 | |
97 | struct AMDGPUFunctionArgInfo { |
98 | enum PreloadedValue { |
99 | // SGPRS: |
100 | PRIVATE_SEGMENT_BUFFER = 0, |
101 | DISPATCH_PTR = 1, |
102 | QUEUE_PTR = 2, |
103 | KERNARG_SEGMENT_PTR = 3, |
104 | DISPATCH_ID = 4, |
105 | FLAT_SCRATCH_INIT = 5, |
106 | WORKGROUP_ID_X = 10, |
107 | WORKGROUP_ID_Y = 11, |
108 | WORKGROUP_ID_Z = 12, |
109 | PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, |
110 | IMPLICIT_BUFFER_PTR = 15, |
111 | IMPLICIT_ARG_PTR = 16, |
112 | |
113 | // VGPRS: |
114 | WORKITEM_ID_X = 17, |
115 | WORKITEM_ID_Y = 18, |
116 | WORKITEM_ID_Z = 19, |
117 | FIRST_VGPR_VALUE = WORKITEM_ID_X |
118 | }; |
119 | |
120 | // Kernel input registers setup for the HSA ABI in allocation order. |
121 | |
122 | // User SGPRs in kernels |
123 | // XXX - Can these require argument spills? |
124 | ArgDescriptor PrivateSegmentBuffer; |
125 | ArgDescriptor DispatchPtr; |
126 | ArgDescriptor QueuePtr; |
127 | ArgDescriptor KernargSegmentPtr; |
128 | ArgDescriptor DispatchID; |
129 | ArgDescriptor FlatScratchInit; |
130 | ArgDescriptor PrivateSegmentSize; |
131 | |
132 | // System SGPRs in kernels. |
133 | ArgDescriptor WorkGroupIDX; |
134 | ArgDescriptor WorkGroupIDY; |
135 | ArgDescriptor WorkGroupIDZ; |
136 | ArgDescriptor WorkGroupInfo; |
137 | ArgDescriptor PrivateSegmentWaveByteOffset; |
138 | |
139 | // Pointer with offset from kernargsegmentptr to where special ABI arguments |
140 | // are passed to callable functions. |
141 | ArgDescriptor ImplicitArgPtr; |
142 | |
143 | // Input registers for non-HSA ABI |
144 | ArgDescriptor ImplicitBufferPtr; |
145 | |
146 | // VGPRs inputs. These are always v0, v1 and v2 for entry functions. |
147 | ArgDescriptor WorkItemIDX; |
148 | ArgDescriptor WorkItemIDY; |
149 | ArgDescriptor WorkItemIDZ; |
150 | |
151 | std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> |
152 | getPreloadedValue(PreloadedValue Value) const; |
153 | |
154 | static constexpr AMDGPUFunctionArgInfo fixedABILayout(); |
155 | }; |
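// A minimal consumption sketch, assuming a .cpp user where <tuple> and the
// full LLT definition are available; the function name is illustrative.
// std::tie unpacks the descriptor, register class, and value type returned
// by getPreloadedValue().
inline const ArgDescriptor *
lookupWorkItemIDXArg(const AMDGPUFunctionArgInfo &ArgInfo) {
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  LLT ArgTy;
  std::tie(Arg, RC, ArgTy) =
      ArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  (void)RC;
  (void)ArgTy;
  return Arg;
}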
156 | |
157 | class AMDGPUArgumentUsageInfo : public ImmutablePass { |
158 | private: |
159 | DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; |
160 | |
161 | public: |
162 | static char ID; |
163 | |
164 | static const AMDGPUFunctionArgInfo ExternFunctionInfo; |
165 | static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; |
166 | |
167 | AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } |
168 | |
169 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
170 | AU.setPreservesAll(); |
171 | } |
172 | |
173 | bool doInitialization(Module &M) override; |
174 | bool doFinalization(Module &M) override; |
175 | |
176 | void print(raw_ostream &OS, const Module *M = nullptr) const override; |
177 | |
178 | void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { |
179 | ArgInfoMap[&F] = ArgInfo; |
180 | } |
181 | |
182 | const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; |
183 | }; |
184 | |
185 | } // end namespace llvm |
186 | |
187 | #endif |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file contains some functions that are useful for math stuff. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||||
14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||||
15 | |||||||
16 | #include "llvm/Support/Compiler.h" | ||||||
17 | #include <algorithm> | ||||||
18 | #include <cassert> | ||||||
19 | #include <climits> | ||||||
20 | #include <cmath> | ||||||
21 | #include <cstdint> | ||||||
22 | #include <cstring> | ||||||
23 | #include <limits> | ||||||
24 | #include <type_traits> | ||||||
25 | |||||||
26 | #ifdef __ANDROID_NDK__ | ||||||
27 | #include <android/api-level.h> | ||||||
28 | #endif | ||||||
29 | |||||||
30 | #ifdef _MSC_VER | ||||||
31 | // Declare these intrinsics manually rather including intrin.h. It's very | ||||||
32 | // expensive, and MathExtras.h is popular. | ||||||
33 | // #include <intrin.h> | ||||||
34 | extern "C" { | ||||||
35 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||||
36 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
37 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||||
38 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
39 | } | ||||||
40 | #endif | ||||||
41 | |||||||
42 | namespace llvm { | ||||||
43 | |||||||
44 | /// The behavior an operation has on an input of 0. | ||||||
45 | enum ZeroBehavior { | ||||||
46 | /// The returned value is undefined. | ||||||
47 | ZB_Undefined, | ||||||
48 | /// The returned value is numeric_limits<T>::max() | ||||||
49 | ZB_Max, | ||||||
50 | /// The returned value is numeric_limits<T>::digits | ||||||
51 | ZB_Width | ||||||
52 | }; | ||||||
53 | |||||||
54 | /// Mathematical constants. | ||||||
55 | namespace numbers { | ||||||
56 | // TODO: Track C++20 std::numbers. | ||||||
57 | // TODO: Favor using the hexadecimal FP constants (requires C++17). | ||||||
58 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 | ||||||
59 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 | ||||||
60 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 | ||||||
61 | ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 | ||||||
62 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) | ||||||
63 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) | ||||||
64 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 | ||||||
65 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 | ||||||
66 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 | ||||||
67 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 | ||||||
68 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193 | ||||||
69 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) | ||||||
70 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 | ||||||
71 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) | ||||||
72 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 | ||||||
73 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 | ||||||
74 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 | ||||||
75 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 | ||||||
76 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 | ||||||
77 | log2ef = 1.44269504F, // (0x1.715476P+0) | ||||||
78 | log10ef = .434294482F, // (0x1.bcb7b2P-2) | ||||||
79 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 | ||||||
80 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 | ||||||
81 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 | ||||||
82 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 | ||||||
83 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 | ||||||
84 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) | ||||||
85 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 | ||||||
86 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) | ||||||
87 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 | ||||||
88 | } // namespace numbers | ||||||
89 | |||||||
90 | namespace detail { | ||||||
91 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||||
92 | static unsigned count(T Val, ZeroBehavior) { | ||||||
93 | if (!Val) | ||||||
94 | return std::numeric_limits<T>::digits; | ||||||
95 | if (Val & 0x1) | ||||||
96 | return 0; | ||||||
97 | |||||||
98 | // Bisection method. | ||||||
99 | unsigned ZeroBits = 0; | ||||||
100 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||||
101 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||||
102 | while (Shift) { | ||||||
103 | if ((Val & Mask) == 0) { | ||||||
104 | Val >>= Shift; | ||||||
105 | ZeroBits |= Shift; | ||||||
106 | } | ||||||
107 | Shift >>= 1; | ||||||
108 | Mask >>= Shift; | ||||||
109 | } | ||||||
110 | return ZeroBits; | ||||||
111 | } | ||||||
112 | }; | ||||||
113 | |||||||
114 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
115 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||||
116 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
117 | if (ZB != ZB_Undefined && Val == 0) | ||||||
118 | return 32; | ||||||
119 | |||||||
120 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__) | ||||||
121 | return __builtin_ctz(Val); | ||||||
122 | #elif defined(_MSC_VER) | ||||||
123 | unsigned long Index; | ||||||
124 | _BitScanForward(&Index, Val); | ||||||
125 | return Index; | ||||||
126 | #endif | ||||||
127 | } | ||||||
128 | }; | ||||||
129 | |||||||
130 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
131 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||||
132 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
133 | if (ZB != ZB_Undefined && Val == 0) | ||||||
134 | return 64; | ||||||
135 | |||||||
136 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) | ||||||
137 | return __builtin_ctzll(Val); | ||||||
138 | #elif defined(_MSC_VER) | ||||||
139 | unsigned long Index; | ||||||
140 | _BitScanForward64(&Index, Val); | ||||||
141 | return Index; | ||||||
142 | #endif | ||||||
143 | } | ||||||
144 | }; | ||||||
145 | #endif | ||||||
146 | #endif | ||||||
147 | } // namespace detail | ||||||
148 | |||||||
149 | /// Count number of 0's from the least significant bit to the most | ||||||
150 | /// stopping at the first 1. | ||||||
151 | /// | ||||||
152 | /// Only unsigned integral types are allowed. | ||||||
153 | /// | ||||||
154 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
155 | /// valid arguments. | ||||||
156 | template <typename T> | ||||||
157 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
158 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
159 | !std::numeric_limits<T>::is_signed, | ||||||
160 | "Only unsigned integral types are allowed."); | ||||||
161 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
162 | } | ||||||
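// A small sanity sketch (assuming assert() is enabled; the function name is
// illustrative): the bisection fallback and the builtin paths above agree on
// these values, and the ZB_Width default maps a zero input to the bit width.
inline void exampleCountTrailingZeros() {
  assert(countTrailingZeros(0x10u) == 4);        // 0b1'0000 -> four zeros
  assert(countTrailingZeros(0u) == 32);          // ZB_Width for uint32_t
  assert(countTrailingZeros(uint64_t(0)) == 64); // and for uint64_t
}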
163 | |||||||
164 | namespace detail { | ||||||
165 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { | ||||||
166 | static unsigned count(T Val, ZeroBehavior) { | ||||||
167 | if (!Val) | ||||||
168 | return std::numeric_limits<T>::digits; | ||||||
169 | |||||||
170 | // Bisection method. | ||||||
171 | unsigned ZeroBits = 0; | ||||||
172 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { | ||||||
173 | T Tmp = Val >> Shift; | ||||||
174 | if (Tmp) | ||||||
175 | Val = Tmp; | ||||||
176 | else | ||||||
177 | ZeroBits |= Shift; | ||||||
178 | } | ||||||
179 | return ZeroBits; | ||||||
180 | } | ||||||
181 | }; | ||||||
182 | |||||||
183 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
184 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||||
185 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
186 | if (ZB != ZB_Undefined && Val == 0) | ||||||
187 | return 32; | ||||||
188 | |||||||
189 | #if __has_builtin(__builtin_clz) || defined(__GNUC__) | ||||||
190 | return __builtin_clz(Val); | ||||||
191 | #elif defined(_MSC_VER) | ||||||
192 | unsigned long Index; | ||||||
193 | _BitScanReverse(&Index, Val); | ||||||
194 | return Index ^ 31; | ||||||
195 | #endif | ||||||
196 | } | ||||||
197 | }; | ||||||
198 | |||||||
199 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
200 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||||
201 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
202 | if (ZB != ZB_Undefined && Val == 0) | ||||||
203 | return 64; | ||||||
204 | |||||||
205 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__) | ||||||
206 | return __builtin_clzll(Val); | ||||||
207 | #elif defined(_MSC_VER) | ||||||
208 | unsigned long Index; | ||||||
209 | _BitScanReverse64(&Index, Val); | ||||||
210 | return Index ^ 63; | ||||||
211 | #endif | ||||||
212 | } | ||||||
213 | }; | ||||||
214 | #endif | ||||||
215 | #endif | ||||||
216 | } // namespace detail | ||||||
217 | |||||||
218 | /// Count number of 0's from the most significant bit to the least | ||||||
219 | /// stopping at the first 1. | ||||||
220 | /// | ||||||
221 | /// Only unsigned integral types are allowed. | ||||||
222 | /// | ||||||
223 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
224 | /// valid arguments. | ||||||
225 | template <typename T> | ||||||
226 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
227 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
228 | !std::numeric_limits<T>::is_signed, | ||||||
229 | "Only unsigned integral types are allowed."); | ||||||
230 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
231 | } | ||||||
232 | |||||||
233 | /// Get the index of the first set bit starting from the least | ||||||
234 | /// significant bit. | ||||||
235 | /// | ||||||
236 | /// Only unsigned integral types are allowed. | ||||||
237 | /// | ||||||
238 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
239 | /// valid arguments. | ||||||
240 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
241 | if (ZB == ZB_Max && Val == 0) | ||||||
242 | return std::numeric_limits<T>::max(); | ||||||
243 | |||||||
244 | return countTrailingZeros(Val, ZB_Undefined); | ||||||
245 | } | ||||||
246 | |||||||
247 | /// Create a bitmask with the N right-most bits set to 1, and all other | ||||||
248 | /// bits set to 0. Only unsigned types are allowed. | ||||||
249 | template <typename T> T maskTrailingOnes(unsigned N) { | ||||||
250 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); | ||||||
251 | const unsigned Bits = CHAR_BIT * sizeof(T); | ||||||
252 | assert(N <= Bits && "Invalid bit index"); | ||||||
253 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); | ||||||
254 | } | ||||||
255 | |||||||
256 | /// Create a bitmask with the N left-most bits set to 1, and all other | ||||||
257 | /// bits set to 0. Only unsigned types are allowed. | ||||||
258 | template <typename T> T maskLeadingOnes(unsigned N) { | ||||||
259 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
260 | } | ||||||
261 | |||||||
262 | /// Create a bitmask with the N right-most bits set to 0, and all other | ||||||
263 | /// bits set to 1. Only unsigned types are allowed. | ||||||
264 | template <typename T> T maskTrailingZeros(unsigned N) { | ||||||
265 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
266 | } | ||||||
267 | |||||||
268 | /// Create a bitmask with the N left-most bits set to 0, and all other | ||||||
269 | /// bits set to 1. Only unsigned types are allowed. | ||||||
270 | template <typename T> T maskLeadingZeros(unsigned N) { | ||||||
271 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
272 | } | ||||||
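// Worked values for the four mask helpers above (assuming assert() is
// enabled; the function name is illustrative): each mask is derived from
// maskTrailingOnes by complementing and/or flipping N to Bits - N.
inline void exampleMaskHelpers() {
  assert(maskTrailingOnes<uint32_t>(8)  == 0x000000FFu);
  assert(maskLeadingOnes<uint32_t>(8)   == 0xFF000000u);
  assert(maskTrailingZeros<uint32_t>(8) == 0xFFFFFF00u);
  assert(maskLeadingZeros<uint32_t>(8)  == 0x00FFFFFFu);
}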
273 | |||||||
274 | /// Get the index of the last set bit starting from the least | ||||||
275 | /// significant bit. | ||||||
276 | /// | ||||||
277 | /// Only unsigned integral types are allowed. | ||||||
278 | /// | ||||||
279 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
280 | /// valid arguments. | ||||||
281 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
282 | if (ZB == ZB_Max && Val == 0) | ||||||
283 | return std::numeric_limits<T>::max(); | ||||||
284 | |||||||
285 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ | ||||||
286 | // in the __builtin_clz intrinsic on x86. | ||||||
287 | return countLeadingZeros(Val, ZB_Undefined) ^ | ||||||
288 | (std::numeric_limits<T>::digits - 1); | ||||||
289 | } | ||||||
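// A worked example of the XOR trick above (assuming assert() is enabled;
// the function name is illustrative): for a 32-bit value, clz is in [0, 31],
// so clz ^ 31 equals 31 - clz, i.e. the index of the highest set bit.
inline void exampleFindLastSet() {
  assert(findLastSet(0x00000004u) == 2u);  // clz = 29, 29 ^ 31 = 2
  assert(findLastSet(0x80000000u) == 31u); // clz = 0,  0 ^ 31 = 31
}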
290 | |||||||
291 | /// Macro compressed bit reversal table for 256 bits. | ||||||
292 | /// | ||||||
293 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable | ||||||
294 | static const unsigned char BitReverseTable256[256] = { | ||||||
295 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 | ||||||
296 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) | ||||||
297 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) | ||||||
298 | R6(0), R6(2), R6(1), R6(3) | ||||||
299 | #undef R2 | ||||||
300 | #undef R4 | ||||||
301 | #undef R6 | ||||||
302 | }; | ||||||
303 | |||||||
304 | /// Reverse the bits in \p Val. | ||||||
305 | template <typename T> | ||||||
306 | T reverseBits(T Val) { | ||||||
307 | unsigned char in[sizeof(Val)]; | ||||||
308 | unsigned char out[sizeof(Val)]; | ||||||
309 | std::memcpy(in, &Val, sizeof(Val)); | ||||||
310 | for (unsigned i = 0; i < sizeof(Val); ++i) | ||||||
311 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; | ||||||
312 | std::memcpy(&Val, out, sizeof(Val)); | ||||||
313 | return Val; | ||||||
314 | } | ||||||
315 | |||||||
316 | #if __has_builtin(__builtin_bitreverse8) | ||||||
317 | template<> | ||||||
318 | inline uint8_t reverseBits<uint8_t>(uint8_t Val) { | ||||||
319 | return __builtin_bitreverse8(Val); | ||||||
320 | } | ||||||
321 | #endif | ||||||
322 | |||||||
323 | #if __has_builtin(__builtin_bitreverse16) | ||||||
324 | template<> | ||||||
325 | inline uint16_t reverseBits<uint16_t>(uint16_t Val) { | ||||||
326 | return __builtin_bitreverse16(Val); | ||||||
327 | } | ||||||
328 | #endif | ||||||
329 | |||||||
330 | #if __has_builtin(__builtin_bitreverse32) | ||||||
331 | template<> | ||||||
332 | inline uint32_t reverseBits<uint32_t>(uint32_t Val) { | ||||||
333 | return __builtin_bitreverse32(Val); | ||||||
334 | } | ||||||
335 | #endif | ||||||
336 | |||||||
337 | #if __has_builtin(__builtin_bitreverse64) | ||||||
338 | template<> | ||||||
339 | inline uint64_t reverseBits<uint64_t>(uint64_t Val) { | ||||||
340 | return __builtin_bitreverse64(Val); | ||||||
341 | } | ||||||
342 | #endif | ||||||
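// Worked examples for the byte-table path and the builtin specializations
// above (assuming assert() is enabled; the function name is illustrative):
// each byte is reversed via BitReverseTable256 and the bytes are then
// written back in reverse order, reversing the whole word.
inline void exampleReverseBits() {
  assert(reverseBits(uint8_t(0x01)) == uint8_t(0x80));
  assert(reverseBits(uint16_t(0x0001)) == uint16_t(0x8000));
  assert(reverseBits(uint32_t(0x12345678)) == 0x1E6A2C48u);
}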
343 | |||||||
344 | // NOTE: The following support functions use the _32/_64 extensions instead of | ||||||
345 | // type overloading so that signed and unsigned integers can be used without | ||||||
346 | // ambiguity. | ||||||
347 | |||||||
348 | /// Return the high 32 bits of a 64 bit value. | ||||||
349 | constexpr inline uint32_t Hi_32(uint64_t Value) { | ||||||
350 | return static_cast<uint32_t>(Value >> 32); | ||||||
351 | } | ||||||
352 | |||||||
353 | /// Return the low 32 bits of a 64 bit value. | ||||||
354 | constexpr inline uint32_t Lo_32(uint64_t Value) { | ||||||
355 | return static_cast<uint32_t>(Value); | ||||||
356 | } | ||||||
357 | |||||||
358 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. | ||||||
359 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { | ||||||
360 | return ((uint64_t)High << 32) | (uint64_t)Low; | ||||||
361 | } | ||||||
362 | |||||||
363 | /// Checks if an integer fits into the given bit width. | ||||||
364 | template <unsigned N> constexpr inline bool isInt(int64_t x) { | ||||||
365 | return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); | ||||||
366 | } | ||||||
367 | // Template specializations to get better code for common cases. | ||||||
368 | template <> constexpr inline bool isInt<8>(int64_t x) { | ||||||
369 | return static_cast<int8_t>(x) == x; | ||||||
370 | } | ||||||
371 | template <> constexpr inline bool isInt<16>(int64_t x) { | ||||||
372 | return static_cast<int16_t>(x) == x; | ||||||
373 | } | ||||||
374 | template <> constexpr inline bool isInt<32>(int64_t x) { | ||||||
375 | return static_cast<int32_t>(x) == x; | ||||||
376 | } | ||||||
377 | |||||||
378 | /// Checks if a signed integer is an N bit number shifted left by S. | ||||||
379 | template <unsigned N, unsigned S> | ||||||
380 | constexpr inline bool isShiftedInt(int64_t x) { | ||||||
381 | static_assert( | ||||||
382 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); | ||||||
383 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); | ||||||
384 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
385 | } | ||||||
386 | |||||||
387 | /// Checks if an unsigned integer fits into the given bit width. | ||||||
388 | /// | ||||||
389 | /// This is written as two functions rather than as simply | ||||||
390 | /// | ||||||
391 | /// return N >= 64 || X < (UINT64_C(1) << N); | ||||||
392 | /// | ||||||
393 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting | ||||||
394 | /// left too many places. | ||||||
395 | template <unsigned N> | ||||||
396 | constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { | ||||||
397 | static_assert(N > 0, "isUInt<0> doesn't make sense"); | ||||||
398 | return X < (UINT64_C(1) << (N)); | ||||||
399 | } | ||||||
400 | template <unsigned N> | ||||||
401 | constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t X) { | ||||||
402 | return true; | ||||||
403 | } | ||||||
404 | |||||||
405 | // Template specializations to get better code for common cases. | ||||||
406 | template <> constexpr inline bool isUInt<8>(uint64_t x) { | ||||||
407 | return static_cast<uint8_t>(x) == x; | ||||||
408 | } | ||||||
409 | template <> constexpr inline bool isUInt<16>(uint64_t x) { | ||||||
410 | return static_cast<uint16_t>(x) == x; | ||||||
411 | } | ||||||
412 | template <> constexpr inline bool isUInt<32>(uint64_t x) { | ||||||
413 | return static_cast<uint32_t>(x) == x; | ||||||
414 | } | ||||||
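// A worked example (assuming assert() is enabled; the function name is
// illustrative): the (N < 64) overload performs the shift, the N >= 64
// overload is trivially true, so isUInt<64> never evaluates a 64-bit shift.
inline void exampleIsUInt() {
  assert(isUInt<16>(65535) && !isUInt<16>(65536));
  assert(isUInt<64>(UINT64_MAX));
}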
415 | |||||||
416 | /// Checks if an unsigned integer is an N bit number shifted left by S. | ||||||
417 | template <unsigned N, unsigned S> | ||||||
418 | constexpr inline bool isShiftedUInt(uint64_t x) { | ||||||
419 | static_assert( | ||||||
420 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); | ||||||
421 | static_assert(N + S <= 64, | ||||||
422 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); | ||||||
423 | // Per the two static_asserts above, S must be strictly less than 64. So | ||||||
424 | // 1 << S is not undefined behavior. | ||||||
425 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
426 | } | ||||||
427 | |||||||
428 | /// Gets the maximum value for a N-bit unsigned integer. | ||||||
429 | inline uint64_t maxUIntN(uint64_t N) { | ||||||
430 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
431 | |||||||
432 | // uint64_t(1) << 64 is undefined behavior, so we can't do | ||||||
433 | // (uint64_t(1) << N) - 1 | ||||||
434 | // without checking first that N != 64. But this works and doesn't have a | ||||||
435 | // branch. | ||||||
436 | return UINT64_MAX >> (64 - N); | ||||||
437 | } | ||||||
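// A worked example of the branch-free trick above (assuming assert() is
// enabled; the function name is illustrative): UINT64_MAX >> (64 - N) is
// well defined for every N in [1, 64], unlike (UINT64_C(1) << N) - 1 at
// N == 64.
inline void exampleMaxUIntN() {
  assert(maxUIntN(8) == 255);
  assert(maxUIntN(64) == UINT64_MAX);
}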
438 | |||||||
439 | /// Gets the minimum value for a N-bit signed integer. | ||||||
440 | inline int64_t minIntN(int64_t N) { | ||||||
441 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
442 | |||||||
443 | return UINT64_C(1) + ~(UINT64_C(1) << (N - 1)); | ||||||
444 | } | ||||||
445 | |||||||
446 | /// Gets the maximum value for a N-bit signed integer. | ||||||
447 | inline int64_t maxIntN(int64_t N) { | ||||||
448 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||||
449 | |||||||
450 | // This relies on two's complement wraparound when N == 64, so we convert to | ||||||
451 | // int64_t only at the very end to avoid UB. | ||||||
452 | return (UINT64_C(1) << (N - 1)) - 1; | ||||||
453 | } | ||||||
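// Companion checks for the signed bounds above (assuming assert() is
// enabled; the function name is illustrative), including the N == 64 case
// that relies on unsigned wraparound before the final signed conversion.
inline void exampleIntNBounds() {
  assert(minIntN(8) == -128 && maxIntN(8) == 127);
  assert(minIntN(64) == INT64_MIN && maxIntN(64) == INT64_MAX);
}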
454 | |||||||
455 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. | ||||||
456 | inline bool isUIntN(unsigned N, uint64_t x) { | ||||||
457 | return N >= 64 || x <= maxUIntN(N); | ||||||
458 | } | ||||||
459 | |||||||
461 | /// Checks if a signed integer fits into the given (dynamic) bit width. | ||||||
461 | inline bool isIntN(unsigned N, int64_t x) { | ||||||
462 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); | ||||||
463 | } | ||||||
464 | |||||||
465 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
466 | /// least significant bit with the remainder zero (32 bit version). | ||||||
467 | /// Ex. isMask_32(0x0000FFFFU) == true. | ||||||
468 | constexpr inline bool isMask_32(uint32_t Value) { | ||||||
469 | return Value && ((Value + 1) & Value) == 0; | ||||||
470 | } | ||||||
471 | |||||||
472 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||||
473 | /// least significant bit with the remainder zero (64 bit version). | ||||||
474 | constexpr inline bool isMask_64(uint64_t Value) { | ||||||
475 | return Value && ((Value + 1) & Value) == 0; | ||||||
476 | } | ||||||
477 | |||||||
478 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
479 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. | ||||||
480 | constexpr inline bool isShiftedMask_32(uint32_t Value) { | ||||||
481 | return Value && isMask_32((Value - 1) | Value); | ||||||
482 | } | ||||||
483 | |||||||
484 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||||
485 | /// remainder zero (64 bit version.) | ||||||
486 | constexpr inline bool isShiftedMask_64(uint64_t Value) { | ||||||
487 | return Value && isMask_64((Value - 1) | Value); | ||||||
488 | } | ||||||
489 | |||||||
490 | /// Return true if the argument is a power of two > 0. | ||||||
491 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) | ||||||
492 | constexpr inline bool isPowerOf2_32(uint32_t Value) { | ||||||
493 | return Value && !(Value & (Value - 1)); | ||||||
494 | } | ||||||
495 | |||||||
496 | /// Return true if the argument is a power of two > 0 (64 bit edition.) | ||||||
497 | constexpr inline bool isPowerOf2_64(uint64_t Value) { | ||||||
498 | return Value && !(Value & (Value - 1)); | ||||||
499 | } | ||||||
500 | |||||||
501 | /// Count the number of ones from the most significant bit to the first | ||||||
502 | /// zero bit. | ||||||
503 | /// | ||||||
504 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. | ||||||
505 | /// Only unsigned integral types are allowed. | ||||||
506 | /// | ||||||
507 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
508 | /// ZB_Undefined are valid arguments. | ||||||
509 | template <typename T> | ||||||
510 | unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
511 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
512 | !std::numeric_limits<T>::is_signed, | ||||||
513 | "Only unsigned integral types are allowed."); | ||||||
514 | return countLeadingZeros<T>(~Value, ZB); | ||||||
515 | } | ||||||
516 | |||||||
517 | /// Count the number of ones from the least significant bit to the first | ||||||
518 | /// zero bit. | ||||||
519 | /// | ||||||
520 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. | ||||||
521 | /// Only unsigned integral types are allowed. | ||||||
522 | /// | ||||||
523 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||||
524 | /// ZB_Undefined are valid arguments. | ||||||
525 | template <typename T> | ||||||
526 | unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||||
527 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
528 | !std::numeric_limits<T>::is_signed, | ||||||
529 | "Only unsigned integral types are allowed."); | ||||||
530 | return countTrailingZeros<T>(~Value, ZB); | ||||||
531 | } | ||||||
532 | |||||||
533 | namespace detail { | ||||||
534 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { | ||||||
535 | static unsigned count(T Value) { | ||||||
536 | // Generic version, forward to 32 bits. | ||||||
537 | static_assert(SizeOfT <= 4, "Not implemented!"); | ||||||
538 | #if defined(__GNUC__) | ||||||
539 | return __builtin_popcount(Value); | ||||||
540 | #else | ||||||
541 | uint32_t v = Value; | ||||||
542 | v = v - ((v >> 1) & 0x55555555); | ||||||
543 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); | ||||||
544 | return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; | ||||||
545 | #endif | ||||||
546 | } | ||||||
547 | }; | ||||||
548 | |||||||
549 | template <typename T> struct PopulationCounter<T, 8> { | ||||||
550 | static unsigned count(T Value) { | ||||||
551 | #if defined(__GNUC__) | ||||||
552 | return __builtin_popcountll(Value); | ||||||
553 | #else | ||||||
554 | uint64_t v = Value; | ||||||
555 | v = v - ((v >> 1) & 0x5555555555555555ULL); | ||||||
556 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); | ||||||
557 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; | ||||||
558 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); | ||||||
559 | #endif | ||||||
560 | } | ||||||
561 | }; | ||||||
562 | } // namespace detail | ||||||
563 | |||||||
564 | /// Count the number of set bits in a value. | ||||||
565 | /// Ex. countPopulation(0xF000F000) = 8 | ||||||
566 | /// Returns 0 if the word is zero. | ||||||
567 | template <typename T> | ||||||
568 | inline unsigned countPopulation(T Value) { | ||||||
569 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
570 | !std::numeric_limits<T>::is_signed, | ||||||
571 | "Only unsigned integral types are allowed."); | ||||||
572 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); | ||||||
573 | } | ||||||
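// A worked example (assuming assert() is enabled; the function name is
// illustrative): the SWAR fallback above sums bits in 2- and then 4-bit
// groups, then multiplies by 0x01010101 so the byte sums accumulate in the
// top byte; the builtin path gives the same results.
inline void exampleCountPopulation() {
  assert(countPopulation(0xF000F000u) == 8);
  assert(countPopulation(UINT64_C(0xFFFFFFFFFFFFFFFF)) == 64);
}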
574 | |||||||
575 | /// Compile time Log2. | ||||||
576 | /// Valid only for positive powers of two. | ||||||
577 | template <size_t kValue> constexpr inline size_t CTLog2() { | ||||||
578 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), | ||||||
579 | "Value is not a valid power of 2"); | ||||||
580 | return 1 + CTLog2<kValue / 2>(); | ||||||
581 | } | ||||||
582 | |||||||
583 | template <> constexpr inline size_t CTLog2<1>() { return 0; } | ||||||
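// A couple of compile-time checks of the recursion above: each non-base step
// adds one and halves the argument, so CTLog2 counts the trailing zero bits
// of the power of two.
static_assert(CTLog2<1>() == 0, "log2(1) == 0");
static_assert(CTLog2<64>() == 6, "log2(64) == 6");
static_assert(CTLog2<4096>() == 12, "log2(4096) == 12");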
584 | |||||||
585 | /// Return the log base 2 of the specified value. | ||||||
586 | inline double Log2(double Value) { | ||||||
587 | #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 | ||||||
588 | return __builtin_log(Value) / __builtin_log(2.0); | ||||||
589 | #else | ||||||
590 | return log2(Value); | ||||||
591 | #endif | ||||||
592 | } | ||||||
593 | |||||||
594 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||||
595 | /// (32 bit edition.) | ||||||
596 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 | ||||||
597 | inline unsigned Log2_32(uint32_t Value) { | ||||||
598 | return 31 - countLeadingZeros(Value); | ||||||
599 | } | ||||||
600 | |||||||
601 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||||
602 | /// (64 bit edition.) | ||||||
603 | inline unsigned Log2_64(uint64_t Value) { | ||||||
604 | return 63 - countLeadingZeros(Value); | ||||||
605 | } | ||||||
606 | |||||||
607 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. | ||||||
608 | /// (32 bit edition). | ||||||
609 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 | ||||||
610 | inline unsigned Log2_32_Ceil(uint32_t Value) { | ||||||
611 | return 32 - countLeadingZeros(Value - 1); | ||||||
612 | } | ||||||
613 | |||||||
614 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. | ||||||
615 | /// (64 bit edition.) | ||||||
616 | inline unsigned Log2_64_Ceil(uint64_t Value) { | ||||||
617 | return 64 - countLeadingZeros(Value - 1); | ||||||
618 | } | ||||||
619 | |||||||
620 | /// Return the greatest common divisor of the values using Euclid's algorithm. | ||||||
621 | template <typename T> | ||||||
622 | inline T greatestCommonDivisor(T A, T B) { | ||||||
623 | while (B) { | ||||||
624 | T Tmp = B; | ||||||
625 | B = A % B; | ||||||
626 | A = Tmp; | ||||||
627 | } | ||||||
628 | return A; | ||||||
629 | } | ||||||
630 | |||||||
631 | inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { | ||||||
632 | return greatestCommonDivisor<uint64_t>(A, B); | ||||||
633 | } | ||||||
634 | |||||||
635 | /// This function takes a 64-bit integer and returns the bit equivalent double. | ||||||
636 | inline double BitsToDouble(uint64_t Bits) { | ||||||
637 | double D; | ||||||
638 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||||
639 | memcpy(&D, &Bits, sizeof(Bits)); | ||||||
640 | return D; | ||||||
641 | } | ||||||
642 | |||||||
643 | /// This function takes a 32-bit integer and returns the bit equivalent float. | ||||||
644 | inline float BitsToFloat(uint32_t Bits) { | ||||||
645 | float F; | ||||||
646 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||||
647 | memcpy(&F, &Bits, sizeof(Bits)); | ||||||
648 | return F; | ||||||
649 | } | ||||||
650 | |||||||
651 | /// This function takes a double and returns the bit equivalent 64-bit integer. | ||||||
652 | /// Note that copying doubles around changes the bits of NaNs on some hosts, | ||||||
653 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||||
654 | inline uint64_t DoubleToBits(double Double) { | ||||||
655 | uint64_t Bits; | ||||||
656 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||||
657 | memcpy(&Bits, &Double, sizeof(Double)); | ||||||
658 | return Bits; | ||||||
659 | } | ||||||
660 | |||||||
661 | /// This function takes a float and returns the bit equivalent 32-bit integer. | ||||||
662 | /// Note that copying floats around changes the bits of NaNs on some hosts, | ||||||
663 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||||
664 | inline uint32_t FloatToBits(float Float) { | ||||||
665 | uint32_t Bits; | ||||||
666 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||||
667 | memcpy(&Bits, &Float, sizeof(Float)); | ||||||
668 | return Bits; | ||||||
669 | } | ||||||
670 | |||||||
671 | /// A and B are either alignments or offsets. Return the minimum alignment that | ||||||
672 | /// may be assumed after adding the two together. | ||||||
673 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { | ||||||
674 | // The largest power of 2 that divides both A and B. | ||||||
675 | // | ||||||
676 | // Replace "-Value" by "1+~Value" in the following commented code to avoid | ||||||
677 | // MSVC warning C4146 | ||||||
678 | // return (A | B) & -(A | B); | ||||||
679 | return (A | B) & (1 + ~(A | B)); | ||||||
680 | } | ||||||
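// A worked example of the identity above (assuming assert() is enabled; the
// function name is illustrative): (A | B) & -(A | B) isolates the lowest set
// bit of A | B, the largest power of two dividing both inputs, and 1 + ~x is
// -x spelled without the unary minus that MSVC warns about.
inline void exampleMinAlign() {
  assert(MinAlign(8, 12) == 4);  // lowest set bit of 0b1100
  assert(MinAlign(16, 16) == 16);
  assert(MinAlign(16, 0) == 16); // zero contributes no set bits
}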
681 | |||||||
682 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. | ||||||
683 | /// Returns zero on overflow. | ||||||
684 | inline uint64_t NextPowerOf2(uint64_t A) { | ||||||
685 | A |= (A >> 1); | ||||||
686 | A |= (A >> 2); | ||||||
687 | A |= (A >> 4); | ||||||
688 | A |= (A >> 8); | ||||||
689 | A |= (A >> 16); | ||||||
690 | A |= (A >> 32); | ||||||
691 | return A + 1; | ||||||
692 | } | ||||||
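// A worked example (assuming assert() is enabled; the function name is
// illustrative): the shift/or cascade above smears the highest set bit into
// every lower position, giving 2^k - 1, and the final + 1 yields the next
// power of two strictly greater than A.
inline void exampleNextPowerOf2() {
  assert(NextPowerOf2(5) == 8);
  assert(NextPowerOf2(8) == 16); // strictly greater, so 8 -> 16
  assert(NextPowerOf2(0) == 1);
}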
693 | |||||||
694 | /// Returns the power of two which is less than or equal to the given value. | ||||||
695 | /// Essentially, it is a floor operation across the domain of powers of two. | ||||||
696 | inline uint64_t PowerOf2Floor(uint64_t A) { | ||||||
697 | if (!A) return 0; | ||||||
698 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); | ||||||
699 | } | ||||||
700 | |||||||
701 | /// Returns the power of two which is greater than or equal to the given value. | ||||||
702 | /// Essentially, it is a ceil operation across the domain of powers of two. | ||||||
703 | inline uint64_t PowerOf2Ceil(uint64_t A) { | ||||||
704 | if (!A) | ||||||
705 | return 0; | ||||||
706 | return NextPowerOf2(A - 1); | ||||||
707 | } | ||||||
708 | |||||||
709 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||||
710 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. | ||||||
711 | /// | ||||||
712 | /// If non-zero \p Skew is specified, the return value will be a minimal | ||||||
713 | /// integer that is greater than or equal to \p Value and equal to | ||||||
714 | /// \p Align * N + \p Skew for some integer N. If \p Skew is larger than | ||||||
715 | /// \p Align, its value is adjusted to '\p Skew mod \p Align'. | ||||||
716 | /// | ||||||
717 | /// Examples: | ||||||
718 | /// \code | ||||||
719 | /// alignTo(5, 8) = 8 | ||||||
720 | /// alignTo(17, 8) = 24 | ||||||
721 | /// alignTo(~0LL, 8) = 0 | ||||||
722 | /// alignTo(321, 255) = 510 | ||||||
723 | /// | ||||||
724 | /// alignTo(5, 8, 7) = 7 | ||||||
725 | /// alignTo(17, 8, 1) = 17 | ||||||
726 | /// alignTo(~0LL, 8, 3) = 3 | ||||||
727 | /// alignTo(321, 255, 42) = 552 | ||||||
728 | /// \endcode | ||||||
729 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||||
730 | assert(Align != 0u && "Align can't be 0."); | ||||||
731 | Skew %= Align; | ||||||
732 | return (Value + Align - 1 - Skew) / Align * Align + Skew; | ||||||
733 | } | ||||||
734 | |||||||
735 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||||
736 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. | ||||||
737 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { | ||||||
738 | static_assert(Align != 0u, "Align must be non-zero"); | ||||||
739 | return (Value + Align - 1) / Align * Align; | ||||||
740 | } | ||||||
741 | |||||||
742 | /// Returns the integer ceil(Numerator / Denominator). | ||||||
743 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { | ||||||
744 | return alignTo(Numerator, Denominator) / Denominator; | ||||||
745 | } | ||||||
746 | |||||||
747 | /// Returns the integer nearest(Numerator / Denominator). | ||||||
748 | inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { | ||||||
749 | return (Numerator + (Denominator / 2)) / Denominator; | ||||||
750 | } | ||||||
751 | |||||||
752 | /// Returns the largest uint64_t less than or equal to \p Value that is | ||||||
753 | /// \p Skew mod \p Align. \p Align must be non-zero. | ||||||
754 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||||
755 | assert(Align != 0u && "Align can't be 0."); | ||||||
756 | Skew %= Align; | ||||||
757 | return (Value - Skew) / Align * Align + Skew; | ||||||
758 | } | ||||||
759 | |||||||
760 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||||
761 | /// Requires 0 < B <= 32. | ||||||
762 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { | ||||||
763 | static_assert(B > 0, "Bit width can't be 0."); | ||||||
764 | static_assert(B <= 32, "Bit width out of range."); | ||||||
765 | return int32_t(X << (32 - B)) >> (32 - B); | ||||||
766 | } | ||||||
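// A worked example (assuming assert() is enabled; the function name is
// illustrative): shifting the B low bits up to the top and arithmetic-
// shifting back replicates bit B-1 into the upper bits; with 0 < B <= 32
// neither shift amount reaches 32.
inline void exampleSignExtend32() {
  assert(SignExtend32<8>(0x000000FFu) == -1);
  assert(SignExtend32<8>(0x0000007Fu) == 127);
  assert(SignExtend32<16>(0x00008000u) == -32768);
}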
767 | |||||||
768 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||||
769 | /// Requires 0 < B <= 32. | ||||||
770 | inline int32_t SignExtend32(uint32_t X, unsigned B) { | ||||||
771 | assert(B > 0 && "Bit width can't be 0."); | ||||||
772 | assert(B <= 32 && "Bit width out of range."); | ||||||
773 | return int32_t(X << (32 - B)) >> (32 - B); | ||||||
774 | } | ||||||
775 | |||||||
776 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||||
777 | /// Requires 0 < B <= 64. | ||||||
778 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { | ||||||
779 | static_assert(B > 0, "Bit width can't be 0."); | ||||||
780 | static_assert(B <= 64, "Bit width out of range."); | ||||||
781 | return int64_t(x << (64 - B)) >> (64 - B); | ||||||
782 | } | ||||||
783 | |||||||
784 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||||
785 | /// Requires 0 < B <= 64. | ||||||
786 | inline int64_t SignExtend64(uint64_t X, unsigned B) { | ||||||
787 | assert(B > 0 && "Bit width can't be 0."); | ||||||
788 | assert(B <= 64 && "Bit width out of range."); | ||||||
789 | return int64_t(X << (64 - B)) >> (64 - B); | ||||||
790 | } | ||||||
791 | |||||||
792 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute | ||||||
793 | /// value of the result. | ||||||
794 | template <typename T> | ||||||
795 | std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) { | ||||||
796 | return std::max(X, Y) - std::min(X, Y); | ||||||
797 | } | ||||||
798 | |||||||
799 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the | ||||||
800 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||||
801 | /// the result is larger than the maximum representable value of type T. | ||||||
802 | template <typename T> | ||||||
803 | std::enable_if_t<std::is_unsigned<T>::value, T> | ||||||
804 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||||
805 | bool Dummy; | ||||||
806 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||||
807 | // Hacker's Delight, p. 29 | ||||||
808 | T Z = X + Y; | ||||||
809 | Overflowed = (Z < X || Z < Y); | ||||||
810 | if (Overflowed) | ||||||
811 | return std::numeric_limits<T>::max(); | ||||||
812 | else | ||||||
813 | return Z; | ||||||
814 | } | ||||||
815 | |||||||
816 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the | ||||||
817 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||||
818 | /// the result is larger than the maximum representable value of type T. | ||||||
819 | template <typename T> | ||||||
820 | std::enable_if_t<std::is_unsigned<T>::value, T> | ||||||
821 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||||
822 | bool Dummy; | ||||||
823 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||||
824 | |||||||
825 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that | ||||||
826 | // because it fails for uint16_t (where multiplication can have undefined | ||||||
827 | // behavior due to promotion to int), and requires a division in addition | ||||||
828 | // to the multiplication. | ||||||
829 | |||||||
830 | Overflowed = false; | ||||||
831 | |||||||
832 | // Log2(Z) would be either Log2Z or Log2Z + 1. | ||||||
833 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z | ||||||
834 | // will necessarily be less than Log2Max as desired. | ||||||
835 | int Log2Z = Log2_64(X) + Log2_64(Y); | ||||||
836 | const T Max = std::numeric_limits<T>::max(); | ||||||
837 | int Log2Max = Log2_64(Max); | ||||||
838 | if (Log2Z < Log2Max) { | ||||||
839 | return X * Y; | ||||||
840 | } | ||||||
841 | if (Log2Z > Log2Max) { | ||||||
842 | Overflowed = true; | ||||||
843 | return Max; | ||||||
844 | } | ||||||
845 | |||||||
846 | // We're going to use the top bit, and maybe overflow one | ||||||
847 | // bit past it. Multiply all but the bottom bit then add | ||||||
848 | // that on at the end. | ||||||
849 | T Z = (X >> 1) * Y; | ||||||
850 | if (Z & ~(Max >> 1)) { | ||||||
851 | Overflowed = true; | ||||||
852 | return Max; | ||||||
853 | } | ||||||
854 | Z <<= 1; | ||||||
855 | if (X & 1) | ||||||
856 | return SaturatingAdd(Z, Y, ResultOverflowed); | ||||||
857 | |||||||
858 | return Z; | ||||||
859 | } | ||||||
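// A minimal usage sketch (assuming assert() is enabled; the function name is
// illustrative): the optional out-parameter reports saturation, which the
// Log2-based screen above detects without forming the overflowing product.
inline void exampleSaturatingMultiply() {
  bool Overflowed = false;
  assert(SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 && Overflowed);
  assert(SaturatingMultiply<uint8_t>(15, 15, &Overflowed) == 225 && !Overflowed);
  (void)Overflowed;
}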

/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
/// the product. Clamp the result to the maximum representable value of T on
/// overflow. ResultOverflowed indicates if the result is larger than the
/// maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  T Product = SaturatingMultiply(X, Y, &Overflowed);
  if (Overflowed)
    return Product;

  return SaturatingAdd(A, Product, &Overflowed);
}
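
// Usage sketch (illustrative): the clamp can come from either the multiply or
// the final add, e.g. for uint8_t:
//   bool Ov;
//   uint8_t R = llvm::SaturatingMultiplyAdd<uint8_t>(10, 20, 100, &Ov);
//   // 10 * 20 == 200 fits, but 100 + 200 does not, so R == 255 and Ov == true.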

/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
extern const float huge_valf;

/// Add two signed integers, computing the two's complement truncated result,
/// returning true if overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Perform the unsigned addition.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX + UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Adding two positive numbers should result in a positive number.
  if (X > 0 && Y > 0)
    return Result <= 0;
  // Adding two negatives should result in a negative number.
  if (X < 0 && Y < 0)
    return Result >= 0;
  return false;
#endif
}
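
// Usage sketch (illustrative, assuming two's complement truncation): the
// return value is the overflow flag and Result always receives the truncated
// sum, mirroring __builtin_add_overflow.
//   int8_t R;
//   bool Ov = llvm::AddOverflow<int8_t>(100, 100, R);
//   // 100 + 100 == 200 does not fit in int8_t: Ov == true, R == -56.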

/// Subtract two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Perform the unsigned subtraction.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX - UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Subtracting a positive number from a negative results in a negative number.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Subtracting a negative number from a positive results in a positive number.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  return false;
#endif
}
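
// Usage sketch (illustrative, assuming two's complement truncation): only the
// two mixed-sign cases can overflow a subtraction, which is why same-sign
// operands fall through to `return false`.
//   int8_t R;
//   bool Ov = llvm::SubOverflow<int8_t>(-100, 100, R);
//   // -100 - 100 == -200 does not fit in int8_t: Ov == true, R == 56.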

/// Multiply two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
  // Perform the unsigned multiplication on absolute values.
  using U = std::make_unsigned_t<T>;
  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U UResult = UX * UY;

  // Convert to signed.
  const bool IsNegative = (X < 0) ^ (Y < 0);
  Result = IsNegative ? (0 - UResult) : UResult;

  // If any of the args was 0, result is 0 and no overflow occurs.
  if (UX == 0 || UY == 0)
    return false;

  // UX and UY are in [1, 2^n], where n is the number of digits.
  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
  // positive) divided by an argument compares to the other.
  if (IsNegative)
    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
  else
    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
}
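
// Usage sketch (illustrative, assuming two's complement truncation): the
// overflow check divides the largest allowed magnitude by one operand rather
// than inspecting the (possibly wrapped) product.
//   int8_t R;
//   bool Ov = llvm::MulOverflow<int8_t>(16, 16, R);
//   // |16 * 16| == 256 > 127: Ov == true, R holds the truncated value 0.
//   bool Ok = llvm::MulOverflow<int8_t>(-8, 16, R);
//   // -128 is representable: Ok == false, R == -128.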

} // End llvm namespace

#endif