Bug Summary

File: /build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 3299, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
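
The flagged shift at line 3299 is outside this excerpt, so the snippet below is only a hypothetical illustration of the defect class: in C++, shifting a 32-bit operand by an amount greater than or equal to 32 is undefined behavior, and the usual fix is to guard the shift count (or widen the operand) so every executed shift uses a count strictly less than the operand's width. The helper names here are invented for the example and do not appear in AMDGPULegalizerInfo.cpp.

#include <cstdint>

// Undefined behavior when Shift >= 32: the shift count must be strictly
// less than the bit width of the (promoted) left operand.
uint32_t shiftUnsafe(uint32_t Value, unsigned Shift) {
  return Value >> Shift;
}

// Guarded variant: handle the out-of-range count explicitly so the shift
// count is always in [0, 31].
uint32_t shiftSafe(uint32_t Value, unsigned Shift) {
  return Shift >= 32 ? 0u : Value >> Shift;
}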

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1679443490 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-03-22-005342-16304-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
30
31#define DEBUG_TYPE"amdgpu-legalinfo" "amdgpu-legalinfo"
32
33using namespace llvm;
34using namespace LegalizeActions;
35using namespace LegalizeMutations;
36using namespace LegalityPredicates;
37using namespace MIPatternMatch;
38
39// Hack until load/store selection patterns support any tuple of legal types.
40static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use"
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47static constexpr unsigned MaxRegisterSize = 1024;
48
49// Round the number of elements to the next power of two elements
50static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54}
55
56// Round the number of bits to the next power of two bits
57static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61}
62
63/// \returns true if this is an odd sized vector which should widen by adding an
64/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65/// excludes s1 vectors, which should always be scalarized.
66static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78}
79
80static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85}
86
87static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93}
94
95static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::pair(TypeIdx,
100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101 };
102}
103
104static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::pair(TypeIdx, LLT::scalarOrVector(
112 ElementCount::getFixed(NewNumElts), EltTy));
113 };
114}
115
116// Increase the number of vector elements to reach the next multiple of 32-bit
117// type.
118static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
 127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132}
133
134static LLT getBitcastRegisterType(const LLT Ty) {
135 const unsigned Size = Ty.getSizeInBits();
136
137 if (Size <= 32) {
138 // <2 x s8> -> s16
139 // <4 x s8> -> s32
140 return LLT::scalar(Size);
141 }
142
143 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
144}
145
146static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147 return [=](const LegalityQuery &Query) {
148 const LLT Ty = Query.Types[TypeIdx];
149 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
150 };
151}
152
153static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154 return [=](const LegalityQuery &Query) {
155 const LLT Ty = Query.Types[TypeIdx];
156 unsigned Size = Ty.getSizeInBits();
 157 assert(Size % 32 == 0);
158 return std::pair(
159 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
160 };
161}
162
163static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
164 return [=](const LegalityQuery &Query) {
165 const LLT QueryTy = Query.Types[TypeIdx];
166 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
167 };
168}
169
170static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
171 return [=](const LegalityQuery &Query) {
172 const LLT QueryTy = Query.Types[TypeIdx];
173 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
174 };
175}
176
177static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
178 return [=](const LegalityQuery &Query) {
179 const LLT QueryTy = Query.Types[TypeIdx];
180 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
181 };
182}
183
184static bool isRegisterSize(unsigned Size) {
185 return Size % 32 == 0 && Size <= MaxRegisterSize;
186}
187
188static bool isRegisterVectorElementType(LLT EltTy) {
189 const int EltSize = EltTy.getSizeInBits();
190 return EltSize == 16 || EltSize % 32 == 0;
191}
192
193static bool isRegisterVectorType(LLT Ty) {
194 const int EltSize = Ty.getElementType().getSizeInBits();
195 return EltSize == 32 || EltSize == 64 ||
196 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
197 EltSize == 128 || EltSize == 256;
198}
199
200static bool isRegisterType(LLT Ty) {
201 if (!isRegisterSize(Ty.getSizeInBits()))
202 return false;
203
204 if (Ty.isVector())
205 return isRegisterVectorType(Ty);
206
207 return true;
208}
209
 210// Any combination of 32 or 64-bit elements up to the maximum register size, and
211// multiples of v2s16.
212static LegalityPredicate isRegisterType(unsigned TypeIdx) {
213 return [=](const LegalityQuery &Query) {
214 return isRegisterType(Query.Types[TypeIdx]);
215 };
216}
217
218static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
219 return [=](const LegalityQuery &Query) {
220 const LLT QueryTy = Query.Types[TypeIdx];
221 if (!QueryTy.isVector())
222 return false;
223 const LLT EltTy = QueryTy.getElementType();
224 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
225 };
226}
227
228// If we have a truncating store or an extending load with a data size larger
229// than 32-bits, we need to reduce to a 32-bit type.
230static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT Ty = Query.Types[TypeIdx];
233 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
234 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
235 };
236}
237
238// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
239// handle some operations by just promoting the register during
240// selection. There are also d16 loads on GFX9+ which preserve the high bits.
241static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
242 bool IsLoad, bool IsAtomic) {
243 switch (AS) {
244 case AMDGPUAS::PRIVATE_ADDRESS:
245 // FIXME: Private element size.
246 return ST.enableFlatScratch() ? 128 : 32;
247 case AMDGPUAS::LOCAL_ADDRESS:
248 return ST.useDS128() ? 128 : 64;
249 case AMDGPUAS::GLOBAL_ADDRESS:
250 case AMDGPUAS::CONSTANT_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
252 // Treat constant and global as identical. SMRD loads are sometimes usable for
253 // global loads (ideally constant address space should be eliminated)
254 // depending on the context. Legality cannot be context dependent, but
255 // RegBankSelect can split the load as necessary depending on the pointer
256 // register bank/uniformity and if the memory is invariant or not written in a
257 // kernel.
258 return IsLoad ? 512 : 128;
259 default:
260 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
261 // if they may alias scratch depending on the subtarget. This needs to be
262 // moved to custom handling to use addressMayBeAccessedAsPrivate
263 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
264 }
265}
266
267static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289#if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293#endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
300 Query.MMODescrs[0].Ordering !=
301 AtomicOrdering::NotAtomic))
302 return false;
303
304 switch (MemSize) {
305 case 8:
306 case 16:
307 case 32:
308 case 64:
309 case 128:
310 break;
311 case 96:
312 if (!ST.hasDwordx3LoadStores())
313 return false;
314 break;
315 case 256:
316 case 512:
317 // These may contextually need to be broken down.
318 break;
319 default:
320 return false;
321 }
322
 323 assert(RegSize >= MemSize);
324
325 if (AlignBits < MemSize) {
326 const SITargetLowering *TLI = ST.getTargetLowering();
327 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
328 Align(AlignBits / 8)))
329 return false;
330 }
331
332 return true;
333}
334
335// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
336// workaround this. Eventually it should ignore the type for loads and only care
337// about the size. Return true in cases where we will workaround this for now by
338// bitcasting.
339static bool loadStoreBitcastWorkaround(const LLT Ty) {
340 if (EnableNewLegality)
341 return false;
342
343 const unsigned Size = Ty.getSizeInBits();
344 if (Size <= 64)
345 return false;
346 if (!Ty.isVector())
347 return true;
348
349 LLT EltTy = Ty.getElementType();
350 if (EltTy.isPointer())
351 return true;
352
353 unsigned EltSize = EltTy.getSizeInBits();
354 return EltSize != 32 && EltSize != 64;
355}
356
357static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
358 const LLT Ty = Query.Types[0];
359 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
360 !loadStoreBitcastWorkaround(Ty);
361}
362
363/// Return true if a load or store of the type should be lowered with a bitcast
364/// to a different type.
365static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
366 const LLT MemTy) {
367 const unsigned MemSizeInBits = MemTy.getSizeInBits();
368 const unsigned Size = Ty.getSizeInBits();
369 if (Size != MemSizeInBits)
370 return Size <= 32 && Ty.isVector();
371
372 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
373 return true;
374
375 // Don't try to handle bitcasting vector ext loads for now.
376 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
377 (Size <= 32 || isRegisterSize(Size)) &&
378 !isRegisterVectorElementType(Ty.getElementType());
379}
380
381/// Return true if we should legalize a load by widening an odd sized memory
382/// access up to the alignment. Note this case when the memory access itself
383/// changes, not the size of the result register.
384static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
385 uint64_t AlignInBits, unsigned AddrSpace,
386 unsigned Opcode) {
387 unsigned SizeInBits = MemoryTy.getSizeInBits();
388 // We don't want to widen cases that are naturally legal.
389 if (isPowerOf2_32(SizeInBits))
390 return false;
391
392 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
393 // end up widening these for a scalar load during RegBankSelect, since there
394 // aren't 96-bit scalar loads.
395 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
396 return false;
397
398 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
399 return false;
400
401 // A load is known dereferenceable up to the alignment, so it's legal to widen
402 // to it.
403 //
404 // TODO: Could check dereferenceable for less aligned cases.
405 unsigned RoundedSize = NextPowerOf2(SizeInBits);
406 if (AlignInBits < RoundedSize)
407 return false;
408
409 // Do not widen if it would introduce a slow unaligned load.
410 const SITargetLowering *TLI = ST.getTargetLowering();
411 unsigned Fast = 0;
412 return TLI->allowsMisalignedMemoryAccessesImpl(
413 RoundedSize, AddrSpace, Align(AlignInBits / 8),
414 MachineMemOperand::MOLoad, &Fast) &&
415 Fast;
416}
417
418static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
419 unsigned Opcode) {
420 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
421 return false;
422
423 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
424 Query.MMODescrs[0].AlignInBits,
425 Query.Types[1].getAddressSpace(), Opcode);
426}
427
428AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
429 const GCNTargetMachine &TM)
430 : ST(ST_) {
431 using namespace TargetOpcode;
432
433 auto GetAddrSpacePtr = [&TM](unsigned AS) {
434 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
435 };
436
437 const LLT S1 = LLT::scalar(1);
438 const LLT S8 = LLT::scalar(8);
439 const LLT S16 = LLT::scalar(16);
440 const LLT S32 = LLT::scalar(32);
441 const LLT S64 = LLT::scalar(64);
442 const LLT S128 = LLT::scalar(128);
443 const LLT S256 = LLT::scalar(256);
444 const LLT S512 = LLT::scalar(512);
445 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
446
447 const LLT V2S8 = LLT::fixed_vector(2, 8);
448 const LLT V2S16 = LLT::fixed_vector(2, 16);
449 const LLT V4S16 = LLT::fixed_vector(4, 16);
450
451 const LLT V2S32 = LLT::fixed_vector(2, 32);
452 const LLT V3S32 = LLT::fixed_vector(3, 32);
453 const LLT V4S32 = LLT::fixed_vector(4, 32);
454 const LLT V5S32 = LLT::fixed_vector(5, 32);
455 const LLT V6S32 = LLT::fixed_vector(6, 32);
456 const LLT V7S32 = LLT::fixed_vector(7, 32);
457 const LLT V8S32 = LLT::fixed_vector(8, 32);
458 const LLT V9S32 = LLT::fixed_vector(9, 32);
459 const LLT V10S32 = LLT::fixed_vector(10, 32);
460 const LLT V11S32 = LLT::fixed_vector(11, 32);
461 const LLT V12S32 = LLT::fixed_vector(12, 32);
462 const LLT V13S32 = LLT::fixed_vector(13, 32);
463 const LLT V14S32 = LLT::fixed_vector(14, 32);
464 const LLT V15S32 = LLT::fixed_vector(15, 32);
465 const LLT V16S32 = LLT::fixed_vector(16, 32);
466 const LLT V32S32 = LLT::fixed_vector(32, 32);
467
468 const LLT V2S64 = LLT::fixed_vector(2, 64);
469 const LLT V3S64 = LLT::fixed_vector(3, 64);
470 const LLT V4S64 = LLT::fixed_vector(4, 64);
471 const LLT V5S64 = LLT::fixed_vector(5, 64);
472 const LLT V6S64 = LLT::fixed_vector(6, 64);
473 const LLT V7S64 = LLT::fixed_vector(7, 64);
474 const LLT V8S64 = LLT::fixed_vector(8, 64);
475 const LLT V16S64 = LLT::fixed_vector(16, 64);
476
477 std::initializer_list<LLT> AllS32Vectors =
478 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
479 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
480 std::initializer_list<LLT> AllS64Vectors =
481 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
482
483 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
484 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
485 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
486 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
487 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
488 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
489 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
490
491 const LLT CodePtr = FlatPtr;
492
493 const std::initializer_list<LLT> AddrSpaces64 = {
494 GlobalPtr, ConstantPtr, FlatPtr
495 };
496
497 const std::initializer_list<LLT> AddrSpaces32 = {
498 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
499 };
500
501 const std::initializer_list<LLT> FPTypesBase = {
502 S32, S64
503 };
504
505 const std::initializer_list<LLT> FPTypes16 = {
506 S32, S64, S16
507 };
508
509 const std::initializer_list<LLT> FPTypesPK16 = {
510 S32, S64, S16, V2S16
511 };
512
513 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
514
515 // s1 for VCC branches, s32 for SCC branches.
516 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
517
518 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
519 // elements for v3s16
520 getActionDefinitionsBuilder(G_PHI)
521 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
522 .legalFor(AllS32Vectors)
523 .legalFor(AllS64Vectors)
524 .legalFor(AddrSpaces64)
525 .legalFor(AddrSpaces32)
526 .legalIf(isPointer(0))
527 .clampScalar(0, S16, S256)
528 .widenScalarToNextPow2(0, 32)
529 .clampMaxNumElements(0, S32, 16)
530 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
531 .scalarize(0);
532
533 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
534 // Full set of gfx9 features.
535 getActionDefinitionsBuilder({G_ADD, G_SUB})
536 .legalFor({S32, S16, V2S16})
537 .clampMaxNumElementsStrict(0, S16, 2)
538 .scalarize(0)
539 .minScalar(0, S16)
540 .widenScalarToNextMultipleOf(0, 32)
541 .maxScalar(0, S32);
542
543 getActionDefinitionsBuilder(G_MUL)
544 .legalFor({S32, S16, V2S16})
545 .clampMaxNumElementsStrict(0, S16, 2)
546 .scalarize(0)
547 .minScalar(0, S16)
548 .widenScalarToNextMultipleOf(0, 32)
549 .custom();
 550 assert(ST.hasMad64_32());
551
552 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
553 .legalFor({S32, S16, V2S16}) // Clamp modifier
554 .minScalarOrElt(0, S16)
555 .clampMaxNumElementsStrict(0, S16, 2)
556 .scalarize(0)
557 .widenScalarToNextPow2(0, 32)
558 .lower();
559 } else if (ST.has16BitInsts()) {
560 getActionDefinitionsBuilder({G_ADD, G_SUB})
561 .legalFor({S32, S16})
562 .minScalar(0, S16)
563 .widenScalarToNextMultipleOf(0, 32)
564 .maxScalar(0, S32)
565 .scalarize(0);
566
567 getActionDefinitionsBuilder(G_MUL)
568 .legalFor({S32, S16})
569 .scalarize(0)
570 .minScalar(0, S16)
571 .widenScalarToNextMultipleOf(0, 32)
572 .custom();
 573 assert(ST.hasMad64_32());
574
575 // Technically the saturating operations require clamp bit support, but this
576 // was introduced at the same time as 16-bit operations.
577 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578 .legalFor({S32, S16}) // Clamp modifier
579 .minScalar(0, S16)
580 .scalarize(0)
581 .widenScalarToNextPow2(0, 16)
582 .lower();
583
584 // We're just lowering this, but it helps get a better result to try to
585 // coerce to the desired type first.
586 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
587 .minScalar(0, S16)
588 .scalarize(0)
589 .lower();
590 } else {
591 getActionDefinitionsBuilder({G_ADD, G_SUB})
592 .legalFor({S32})
593 .widenScalarToNextMultipleOf(0, 32)
594 .clampScalar(0, S32, S32)
595 .scalarize(0);
596
597 auto &Mul = getActionDefinitionsBuilder(G_MUL)
598 .legalFor({S32})
599 .scalarize(0)
600 .minScalar(0, S32)
601 .widenScalarToNextMultipleOf(0, 32);
602
603 if (ST.hasMad64_32())
604 Mul.custom();
605 else
606 Mul.maxScalar(0, S32);
607
608 if (ST.hasIntClamp()) {
609 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
610 .legalFor({S32}) // Clamp modifier.
611 .scalarize(0)
612 .minScalarOrElt(0, S32)
613 .lower();
614 } else {
615 // Clamp bit support was added in VI, along with 16-bit operations.
616 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
617 .minScalar(0, S32)
618 .scalarize(0)
619 .lower();
620 }
621
622 // FIXME: DAG expansion gets better results. The widening uses the smaller
623 // range values and goes for the min/max lowering directly.
624 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
625 .minScalar(0, S32)
626 .scalarize(0)
627 .lower();
628 }
629
630 getActionDefinitionsBuilder(
631 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
632 .customFor({S32, S64})
633 .clampScalar(0, S32, S64)
634 .widenScalarToNextPow2(0, 32)
635 .scalarize(0);
636
637 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
638 .legalFor({S32})
639 .maxScalar(0, S32);
640
641 if (ST.hasVOP3PInsts()) {
642 Mulh
643 .clampMaxNumElements(0, S8, 2)
644 .lowerFor({V2S8});
645 }
646
647 Mulh
648 .scalarize(0)
649 .lower();
650
651 // Report legal for any types we can handle anywhere. For the cases only legal
652 // on the SALU, RegBankSelect will be able to re-legalize.
653 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
654 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
655 .clampScalar(0, S32, S64)
656 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
657 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
658 .widenScalarToNextPow2(0)
659 .scalarize(0);
660
661 getActionDefinitionsBuilder(
662 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
663 .legalFor({{S32, S1}, {S32, S32}})
664 .clampScalar(0, S32, S32)
665 .scalarize(0);
666
667 getActionDefinitionsBuilder(G_BITCAST)
668 // Don't worry about the size constraint.
669 .legalIf(all(isRegisterType(0), isRegisterType(1)))
670 .lower();
671
672
673 getActionDefinitionsBuilder(G_CONSTANT)
674 .legalFor({S1, S32, S64, S16, GlobalPtr,
675 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
676 .legalIf(isPointer(0))
677 .clampScalar(0, S32, S64)
678 .widenScalarToNextPow2(0);
679
680 getActionDefinitionsBuilder(G_FCONSTANT)
681 .legalFor({S32, S64, S16})
682 .clampScalar(0, S16, S64);
683
684 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
685 .legalIf(isRegisterType(0))
686 // s1 and s16 are special cases because they have legal operations on
687 // them, but don't really occupy registers in the normal way.
688 .legalFor({S1, S16})
689 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
690 .clampScalarOrElt(0, S32, MaxScalar)
691 .widenScalarToNextPow2(0, 32)
692 .clampMaxNumElements(0, S32, 16);
693
694 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
695
696 // If the amount is divergent, we have to do a wave reduction to get the
697 // maximum value, so this is expanded during RegBankSelect.
698 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
699 .legalFor({{PrivatePtr, S32}});
700
701 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
702 .customIf(typeIsNot(0, PrivatePtr));
703
704 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
705
706 auto &FPOpActions = getActionDefinitionsBuilder(
707 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
708 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
709 .legalFor({S32, S64});
710 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
711 .customFor({S32, S64});
712 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
713 .customFor({S32, S64});
714
715 if (ST.has16BitInsts()) {
716 if (ST.hasVOP3PInsts())
717 FPOpActions.legalFor({S16, V2S16});
718 else
719 FPOpActions.legalFor({S16});
720
721 TrigActions.customFor({S16});
722 FDIVActions.customFor({S16});
723 }
724
725 auto &MinNumMaxNum = getActionDefinitionsBuilder({
726 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
727
728 if (ST.hasVOP3PInsts()) {
729 MinNumMaxNum.customFor(FPTypesPK16)
730 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
731 .clampMaxNumElements(0, S16, 2)
732 .clampScalar(0, S16, S64)
733 .scalarize(0);
734 } else if (ST.has16BitInsts()) {
735 MinNumMaxNum.customFor(FPTypes16)
736 .clampScalar(0, S16, S64)
737 .scalarize(0);
738 } else {
739 MinNumMaxNum.customFor(FPTypesBase)
740 .clampScalar(0, S32, S64)
741 .scalarize(0);
742 }
743
744 if (ST.hasVOP3PInsts())
745 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
746
747 FPOpActions
748 .scalarize(0)
749 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
750
751 TrigActions
752 .scalarize(0)
753 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
754
755 FDIVActions
756 .scalarize(0)
757 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
758
759 getActionDefinitionsBuilder({G_FNEG, G_FABS})
760 .legalFor(FPTypesPK16)
761 .clampMaxNumElementsStrict(0, S16, 2)
762 .scalarize(0)
763 .clampScalar(0, S16, S64);
764
765 if (ST.has16BitInsts()) {
766 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
767 .legalFor({S32, S64, S16})
768 .scalarize(0)
769 .clampScalar(0, S16, S64);
770 } else {
771 getActionDefinitionsBuilder(G_FSQRT)
772 .legalFor({S32, S64})
773 .scalarize(0)
774 .clampScalar(0, S32, S64);
775
776 if (ST.hasFractBug()) {
777 getActionDefinitionsBuilder(G_FFLOOR)
778 .customFor({S64})
779 .legalFor({S32, S64})
780 .scalarize(0)
781 .clampScalar(0, S32, S64);
782 } else {
783 getActionDefinitionsBuilder(G_FFLOOR)
784 .legalFor({S32, S64})
785 .scalarize(0)
786 .clampScalar(0, S32, S64);
787 }
788 }
789
790 getActionDefinitionsBuilder(G_FPTRUNC)
791 .legalFor({{S32, S64}, {S16, S32}})
792 .scalarize(0)
793 .lower();
794
795 getActionDefinitionsBuilder(G_FPEXT)
796 .legalFor({{S64, S32}, {S32, S16}})
797 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
798 .scalarize(0);
799
800 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
801 if (ST.has16BitInsts()) {
802 FSubActions
803 // Use actual fsub instruction
804 .legalFor({S32, S16})
805 // Must use fadd + fneg
806 .lowerFor({S64, V2S16});
807 } else {
808 FSubActions
809 // Use actual fsub instruction
810 .legalFor({S32})
811 // Must use fadd + fneg
812 .lowerFor({S64, S16, V2S16});
813 }
814
815 FSubActions
816 .scalarize(0)
817 .clampScalar(0, S32, S64);
818
819 // Whether this is legal depends on the floating point mode for the function.
820 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
821 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
822 FMad.customFor({S32, S16});
823 else if (ST.hasMadMacF32Insts())
824 FMad.customFor({S32});
825 else if (ST.hasMadF16())
826 FMad.customFor({S16});
827 FMad.scalarize(0)
828 .lower();
829
830 auto &FRem = getActionDefinitionsBuilder(G_FREM);
831 if (ST.has16BitInsts()) {
832 FRem.customFor({S16, S32, S64});
833 } else {
834 FRem.minScalar(0, S32)
835 .customFor({S32, S64});
836 }
837 FRem.scalarize(0);
838
839 // TODO: Do we need to clamp maximum bitwidth?
840 getActionDefinitionsBuilder(G_TRUNC)
841 .legalIf(isScalar(0))
842 .legalFor({{V2S16, V2S32}})
843 .clampMaxNumElements(0, S16, 2)
844 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
845 // situations (like an invalid implicit use), we don't want to infinite loop
846 // in the legalizer.
847 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
848 .alwaysLegal();
849
850 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
851 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
852 {S32, S1}, {S64, S1}, {S16, S1}})
853 .scalarize(0)
854 .clampScalar(0, S32, S64)
855 .widenScalarToNextPow2(1, 32);
856
857 // TODO: Split s1->s64 during regbankselect for VALU.
858 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
859 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
860 .lowerIf(typeIs(1, S1))
861 .customFor({{S32, S64}, {S64, S64}});
862 if (ST.has16BitInsts())
863 IToFP.legalFor({{S16, S16}});
864 IToFP.clampScalar(1, S32, S64)
865 .minScalar(0, S32)
866 .scalarize(0)
867 .widenScalarToNextPow2(1);
868
869 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
870 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
871 .customFor({{S64, S32}, {S64, S64}})
872 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
873 if (ST.has16BitInsts())
874 FPToI.legalFor({{S16, S16}});
875 else
876 FPToI.minScalar(1, S32);
877
878 FPToI.minScalar(0, S32)
879 .widenScalarToNextPow2(0, 32)
880 .scalarize(0)
881 .lower();
882
883 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
884 .customFor({S16, S32})
885 .scalarize(0)
886 .lower();
887
888 // Lower roundeven into G_FRINT
889 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
890 .scalarize(0)
891 .lower();
892
893 if (ST.has16BitInsts()) {
894 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
895 .legalFor({S16, S32, S64})
896 .clampScalar(0, S16, S64)
897 .scalarize(0);
898 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
899 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
900 .legalFor({S32, S64})
901 .clampScalar(0, S32, S64)
902 .scalarize(0);
903 } else {
904 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
905 .legalFor({S32})
906 .customFor({S64})
907 .clampScalar(0, S32, S64)
908 .scalarize(0);
909 }
910
911 getActionDefinitionsBuilder(G_PTR_ADD)
912 .legalIf(all(isPointer(0), sameSize(0, 1)))
913 .scalarize(0)
914 .scalarSameSizeAs(1, 0);
915
916 getActionDefinitionsBuilder(G_PTRMASK)
917 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
918 .scalarSameSizeAs(1, 0)
919 .scalarize(0);
920
921 auto &CmpBuilder =
922 getActionDefinitionsBuilder(G_ICMP)
923 // The compare output type differs based on the register bank of the output,
924 // so make both s1 and s32 legal.
925 //
926 // Scalar compares producing output in scc will be promoted to s32, as that
927 // is the allocatable register type that will be needed for the copy from
928 // scc. This will be promoted during RegBankSelect, and we assume something
929 // before that won't try to use s32 result types.
930 //
931 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
932 // bank.
933 .legalForCartesianProduct(
934 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
935 .legalForCartesianProduct(
936 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
937 if (ST.has16BitInsts()) {
938 CmpBuilder.legalFor({{S1, S16}});
939 }
940
941 CmpBuilder
942 .widenScalarToNextPow2(1)
943 .clampScalar(1, S32, S64)
944 .scalarize(0)
945 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
946
947 getActionDefinitionsBuilder(G_FCMP)
948 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
949 .widenScalarToNextPow2(1)
950 .clampScalar(1, S32, S64)
951 .scalarize(0);
952
953 // FIXME: fpow has a selection pattern that should move to custom lowering.
954 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
955 if (ST.has16BitInsts())
956 Exp2Ops.legalFor({S32, S16});
957 else
958 Exp2Ops.legalFor({S32});
959 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
960 Exp2Ops.scalarize(0);
961
962 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
963 if (ST.has16BitInsts())
964 ExpOps.customFor({{S32}, {S16}});
965 else
966 ExpOps.customFor({S32});
967 ExpOps.clampScalar(0, MinScalarFPTy, S32)
968 .scalarize(0);
969
970 getActionDefinitionsBuilder(G_FPOWI)
971 .clampScalar(0, MinScalarFPTy, S32)
972 .lower();
973
974 // The 64-bit versions produce 32-bit results, but only on the SALU.
975 getActionDefinitionsBuilder(G_CTPOP)
976 .legalFor({{S32, S32}, {S32, S64}})
977 .clampScalar(0, S32, S32)
978 .widenScalarToNextPow2(1, 32)
979 .clampScalar(1, S32, S64)
980 .scalarize(0)
981 .widenScalarToNextPow2(0, 32);
982
983 // If no 16 bit instr is available, lower into different instructions.
984 if (ST.has16BitInsts())
985 getActionDefinitionsBuilder(G_IS_FPCLASS)
986 .legalForCartesianProduct({S1}, FPTypes16)
987 .widenScalarToNextPow2(1)
988 .scalarize(0)
989 .lower();
990 else
991 getActionDefinitionsBuilder(G_IS_FPCLASS)
992 .legalForCartesianProduct({S1}, FPTypesBase)
993 .lowerFor({S1, S16})
994 .widenScalarToNextPow2(1)
995 .scalarize(0)
996 .lower();
997
998 // The hardware instructions return a different result on 0 than the generic
999 // instructions expect. The hardware produces -1, but these produce the
1000 // bitwidth.
1001 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1002 .scalarize(0)
1003 .clampScalar(0, S32, S32)
1004 .clampScalar(1, S32, S64)
1005 .widenScalarToNextPow2(0, 32)
1006 .widenScalarToNextPow2(1, 32)
1007 .custom();
1008
1009 // The 64-bit versions produce 32-bit results, but only on the SALU.
1010 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1011 .legalFor({{S32, S32}, {S32, S64}})
1012 .clampScalar(0, S32, S32)
1013 .clampScalar(1, S32, S64)
1014 .scalarize(0)
1015 .widenScalarToNextPow2(0, 32)
1016 .widenScalarToNextPow2(1, 32);
1017
1018 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1019 // RegBankSelect.
1020 getActionDefinitionsBuilder(G_BITREVERSE)
1021 .legalFor({S32, S64})
1022 .clampScalar(0, S32, S64)
1023 .scalarize(0)
1024 .widenScalarToNextPow2(0);
1025
1026 if (ST.has16BitInsts()) {
1027 getActionDefinitionsBuilder(G_BSWAP)
1028 .legalFor({S16, S32, V2S16})
1029 .clampMaxNumElementsStrict(0, S16, 2)
1030 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1031 // narrowScalar limitation.
1032 .widenScalarToNextPow2(0)
1033 .clampScalar(0, S16, S32)
1034 .scalarize(0);
1035
1036 if (ST.hasVOP3PInsts()) {
1037 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1038 .legalFor({S32, S16, V2S16})
1039 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1040 .clampMaxNumElements(0, S16, 2)
1041 .minScalar(0, S16)
1042 .widenScalarToNextPow2(0)
1043 .scalarize(0)
1044 .lower();
1045 } else {
1046 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1047 .legalFor({S32, S16})
1048 .widenScalarToNextPow2(0)
1049 .minScalar(0, S16)
1050 .scalarize(0)
1051 .lower();
1052 }
1053 } else {
1054 // TODO: Should have same legality without v_perm_b32
1055 getActionDefinitionsBuilder(G_BSWAP)
1056 .legalFor({S32})
1057 .lowerIf(scalarNarrowerThan(0, 32))
1058 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1059 // narrowScalar limitation.
1060 .widenScalarToNextPow2(0)
1061 .maxScalar(0, S32)
1062 .scalarize(0)
1063 .lower();
1064
1065 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1066 .legalFor({S32})
1067 .minScalar(0, S32)
1068 .widenScalarToNextPow2(0)
1069 .scalarize(0)
1070 .lower();
1071 }
1072
1073 getActionDefinitionsBuilder(G_INTTOPTR)
1074 // List the common cases
1075 .legalForCartesianProduct(AddrSpaces64, {S64})
1076 .legalForCartesianProduct(AddrSpaces32, {S32})
1077 .scalarize(0)
1078 // Accept any address space as long as the size matches
1079 .legalIf(sameSize(0, 1))
1080 .widenScalarIf(smallerThan(1, 0),
1081 [](const LegalityQuery &Query) {
1082 return std::pair(
1083 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1084 })
1085 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1086 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1087 });
1088
1089 getActionDefinitionsBuilder(G_PTRTOINT)
1090 // List the common cases
1091 .legalForCartesianProduct(AddrSpaces64, {S64})
1092 .legalForCartesianProduct(AddrSpaces32, {S32})
1093 .scalarize(0)
1094 // Accept any address space as long as the size matches
1095 .legalIf(sameSize(0, 1))
1096 .widenScalarIf(smallerThan(0, 1),
1097 [](const LegalityQuery &Query) {
1098 return std::pair(
1099 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1100 })
1101 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1102 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1103 });
1104
1105 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1106 .scalarize(0)
1107 .custom();
1108
1109 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1110 bool IsLoad) -> bool {
1111 const LLT DstTy = Query.Types[0];
1112
1113 // Split vector extloads.
1114 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1115
1116 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1117 return true;
1118
1119 const LLT PtrTy = Query.Types[1];
1120 unsigned AS = PtrTy.getAddressSpace();
1121 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1122 Query.MMODescrs[0].Ordering !=
1123 AtomicOrdering::NotAtomic))
1124 return true;
1125
1126 // Catch weird sized loads that don't evenly divide into the access sizes
1127 // TODO: May be able to widen depending on alignment etc.
1128 unsigned NumRegs = (MemSize + 31) / 32;
1129 if (NumRegs == 3) {
1130 if (!ST.hasDwordx3LoadStores())
1131 return true;
1132 } else {
1133 // If the alignment allows, these should have been widened.
1134 if (!isPowerOf2_32(NumRegs))
1135 return true;
1136 }
1137
1138 return false;
1139 };
1140
1141 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1142 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1143 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1144
1145 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1146 // LDS
1147 // TODO: Unsupported flat for SI.
1148
1149 for (unsigned Op : {G_LOAD, G_STORE}) {
1150 const bool IsStore = Op == G_STORE;
1151
1152 auto &Actions = getActionDefinitionsBuilder(Op);
1153 // Explicitly list some common cases.
1154 // TODO: Does this help compile time at all?
1155 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1156 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1157 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1158 {S64, GlobalPtr, S64, GlobalAlign32},
1159 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1160 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1161 {S32, GlobalPtr, S8, GlobalAlign8},
1162 {S32, GlobalPtr, S16, GlobalAlign16},
1163
1164 {S32, LocalPtr, S32, 32},
1165 {S64, LocalPtr, S64, 32},
1166 {V2S32, LocalPtr, V2S32, 32},
1167 {S32, LocalPtr, S8, 8},
1168 {S32, LocalPtr, S16, 16},
1169 {V2S16, LocalPtr, S32, 32},
1170
1171 {S32, PrivatePtr, S32, 32},
1172 {S32, PrivatePtr, S8, 8},
1173 {S32, PrivatePtr, S16, 16},
1174 {V2S16, PrivatePtr, S32, 32},
1175
1176 {S32, ConstantPtr, S32, GlobalAlign32},
1177 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1178 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1179 {S64, ConstantPtr, S64, GlobalAlign32},
1180 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1181 Actions.legalIf(
1182 [=](const LegalityQuery &Query) -> bool {
1183 return isLoadStoreLegal(ST, Query);
1184 });
1185
1186 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1187 // 64-bits.
1188 //
1189 // TODO: Should generalize bitcast action into coerce, which will also cover
1190 // inserting addrspacecasts.
1191 Actions.customIf(typeIs(1, Constant32Ptr));
1192
1193 // Turn any illegal element vectors into something easier to deal
1194 // with. These will ultimately produce 32-bit scalar shifts to extract the
1195 // parts anyway.
1196 //
1197 // For odd 16-bit element vectors, prefer to split those into pieces with
1198 // 16-bit vector parts.
1199 Actions.bitcastIf(
1200 [=](const LegalityQuery &Query) -> bool {
1201 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1202 Query.MMODescrs[0].MemoryTy);
1203 }, bitcastToRegisterType(0));
1204
1205 if (!IsStore) {
1206 // Widen suitably aligned loads by loading extra bytes. The standard
1207 // legalization actions can't properly express widening memory operands.
1208 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1209 return shouldWidenLoad(ST, Query, G_LOAD);
1210 });
1211 }
1212
1213 // FIXME: load/store narrowing should be moved to lower action
1214 Actions
1215 .narrowScalarIf(
1216 [=](const LegalityQuery &Query) -> bool {
1217 return !Query.Types[0].isVector() &&
1218 needToSplitMemOp(Query, Op == G_LOAD);
1219 },
1220 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1221 const LLT DstTy = Query.Types[0];
1222 const LLT PtrTy = Query.Types[1];
1223
1224 const unsigned DstSize = DstTy.getSizeInBits();
1225 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1226
1227 // Split extloads.
1228 if (DstSize > MemSize)
1229 return std::pair(0, LLT::scalar(MemSize));
1230
1231 unsigned MaxSize = maxSizeForAddrSpace(
1232 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1233 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1234 if (MemSize > MaxSize)
1235 return std::pair(0, LLT::scalar(MaxSize));
1236
1237 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1238 return std::pair(0, LLT::scalar(Align));
1239 })
1240 .fewerElementsIf(
1241 [=](const LegalityQuery &Query) -> bool {
1242 return Query.Types[0].isVector() &&
1243 needToSplitMemOp(Query, Op == G_LOAD);
1244 },
1245 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1246 const LLT DstTy = Query.Types[0];
1247 const LLT PtrTy = Query.Types[1];
1248
1249 LLT EltTy = DstTy.getElementType();
1250 unsigned MaxSize = maxSizeForAddrSpace(
1251 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1252 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1253
1254 // FIXME: Handle widened to power of 2 results better. This ends
1255 // up scalarizing.
1256 // FIXME: 3 element stores scalarized on SI
1257
1258 // Split if it's too large for the address space.
1259 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1260 if (MemSize > MaxSize) {
1261 unsigned NumElts = DstTy.getNumElements();
1262 unsigned EltSize = EltTy.getSizeInBits();
1263
1264 if (MaxSize % EltSize == 0) {
1265 return std::pair(
1266 0, LLT::scalarOrVector(
1267 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1268 }
1269
1270 unsigned NumPieces = MemSize / MaxSize;
1271
1272 // FIXME: Refine when odd breakdowns handled
1273 // The scalars will need to be re-legalized.
1274 if (NumPieces == 1 || NumPieces >= NumElts ||
1275 NumElts % NumPieces != 0)
1276 return std::pair(0, EltTy);
1277
1278 return std::pair(0,
1279 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1280 }
1281
1282 // FIXME: We could probably handle weird extending loads better.
1283 if (DstTy.getSizeInBits() > MemSize)
1284 return std::pair(0, EltTy);
1285
1286 unsigned EltSize = EltTy.getSizeInBits();
1287 unsigned DstSize = DstTy.getSizeInBits();
1288 if (!isPowerOf2_32(DstSize)) {
1289 // We're probably decomposing an odd sized store. Try to split
1290 // to the widest type. TODO: Account for alignment. As-is it
1291 // should be OK, since the new parts will be further legalized.
1292 unsigned FloorSize = llvm::bit_floor(DstSize);
1293 return std::pair(
1294 0, LLT::scalarOrVector(
1295 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1296 }
1297
1298 // May need relegalization for the scalars.
1299 return std::pair(0, EltTy);
1300 })
1301 .minScalar(0, S32)
1302 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1303 .widenScalarToNextPow2(0)
1304 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1305 .lower();
1306 }
1307
1308 // FIXME: Unaligned accesses not lowered.
1309 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1310 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1311 {S32, GlobalPtr, S16, 2 * 8},
1312 {S32, LocalPtr, S8, 8},
1313 {S32, LocalPtr, S16, 16},
1314 {S32, PrivatePtr, S8, 8},
1315 {S32, PrivatePtr, S16, 16},
1316 {S32, ConstantPtr, S8, 8},
1317 {S32, ConstantPtr, S16, 2 * 8}})
1318 .legalIf(
1319 [=](const LegalityQuery &Query) -> bool {
1320 return isLoadStoreLegal(ST, Query);
1321 });
1322
1323 if (ST.hasFlatAddressSpace()) {
1324 ExtLoads.legalForTypesWithMemDesc(
1325 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1326 }
1327
1328 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1329 // 64-bits.
1330 //
1331 // TODO: Should generalize bitcast action into coerce, which will also cover
1332 // inserting addrspacecasts.
1333 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1334
1335 ExtLoads.clampScalar(0, S32, S32)
1336 .widenScalarToNextPow2(0)
1337 .lower();
1338
1339 auto &Atomics = getActionDefinitionsBuilder(
1340 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1341 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1342 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1343 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1344 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1345 {S64, GlobalPtr}, {S64, LocalPtr},
1346 {S32, RegionPtr}, {S64, RegionPtr}});
1347 if (ST.hasFlatAddressSpace()) {
1348 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1349 }
1350
1351 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1352 if (ST.hasLDSFPAtomicAdd()) {
1353 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1354 if (ST.hasGFX90AInsts())
1355 Atomic.legalFor({{S64, LocalPtr}});
1356 if (ST.hasGFX940Insts())
1357 Atomic.legalFor({{V2S16, LocalPtr}});
1358 }
1359 if (ST.hasAtomicFaddInsts())
1360 Atomic.legalFor({{S32, GlobalPtr}});
1361 if (ST.hasFlatAtomicFaddF32Inst())
1362 Atomic.legalFor({{S32, FlatPtr}});
1363
1364 if (ST.hasGFX90AInsts()) {
1365 // These are legal with some caveats, and should have undergone expansion in
1366 // the IR in most situations
1367 // TODO: Move atomic expansion into legalizer
1368 Atomic.legalFor({
1369 {S32, GlobalPtr},
1370 {S64, GlobalPtr},
1371 {S64, FlatPtr}
1372 });
1373 }
1374
1375 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1376 // demarshalling
1377 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1378 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1379 {S32, FlatPtr}, {S64, FlatPtr}})
1380 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1381 {S32, RegionPtr}, {S64, RegionPtr}});
1382 // TODO: Pointer types, any 32-bit or 64-bit vector
1383
1384 // Condition should be s32 for scalar, s1 for vector.
1385 getActionDefinitionsBuilder(G_SELECT)
1386 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1387 LocalPtr, FlatPtr, PrivatePtr,
1388 LLT::fixed_vector(2, LocalPtr),
1389 LLT::fixed_vector(2, PrivatePtr)},
1390 {S1, S32})
1391 .clampScalar(0, S16, S64)
1392 .scalarize(1)
1393 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1394 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1395 .clampMaxNumElements(0, S32, 2)
1396 .clampMaxNumElements(0, LocalPtr, 2)
1397 .clampMaxNumElements(0, PrivatePtr, 2)
1398 .scalarize(0)
1399 .widenScalarToNextPow2(0)
1400 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1401
1402 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1403 // be more flexible with the shift amount type.
1404 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1405 .legalFor({{S32, S32}, {S64, S32}});
1406 if (ST.has16BitInsts()) {
1407 if (ST.hasVOP3PInsts()) {
1408 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1409 .clampMaxNumElements(0, S16, 2);
1410 } else
1411 Shifts.legalFor({{S16, S16}});
1412
1413 // TODO: Support 16-bit shift amounts for all types
1414 Shifts.widenScalarIf(
1415 [=](const LegalityQuery &Query) {
1416 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1417 // 32-bit amount.
1418 const LLT ValTy = Query.Types[0];
1419 const LLT AmountTy = Query.Types[1];
1420 return ValTy.getSizeInBits() <= 16 &&
1421 AmountTy.getSizeInBits() < 16;
1422 }, changeTo(1, S16));
1423 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1424 Shifts.clampScalar(1, S32, S32);
1425 Shifts.widenScalarToNextPow2(0, 16);
1426 Shifts.clampScalar(0, S16, S64);
1427
1428 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1429 .minScalar(0, S16)
1430 .scalarize(0)
1431 .lower();
1432 } else {
1433 // Make sure we legalize the shift amount type first, as the general
1434 // expansion for the shifted type will produce much worse code if it hasn't
1435 // been truncated already.
1436 Shifts.clampScalar(1, S32, S32);
1437 Shifts.widenScalarToNextPow2(0, 32);
1438 Shifts.clampScalar(0, S32, S64);
1439
1440 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1441 .minScalar(0, S32)
1442 .scalarize(0)
1443 .lower();
1444 }
1445 Shifts.scalarize(0);
1446
1447 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1448 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1449 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1450 unsigned IdxTypeIdx = 2;
1451
1452 getActionDefinitionsBuilder(Op)
1453 .customIf([=](const LegalityQuery &Query) {
1454 const LLT EltTy = Query.Types[EltTypeIdx];
1455 const LLT VecTy = Query.Types[VecTypeIdx];
1456 const LLT IdxTy = Query.Types[IdxTypeIdx];
1457 const unsigned EltSize = EltTy.getSizeInBits();
1458 return (EltSize == 32 || EltSize == 64) &&
1459 VecTy.getSizeInBits() % 32 == 0 &&
1460 VecTy.getSizeInBits() <= MaxRegisterSize &&
1461 IdxTy.getSizeInBits() == 32;
1462 })
1463 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1464 bitcastToVectorElement32(VecTypeIdx))
1465 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1466 .bitcastIf(
1467 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1468 [=](const LegalityQuery &Query) {
1469 // For > 64-bit element types, try to turn this into a 64-bit
1470 // element vector since we may be able to do better indexing
1471 // if this is scalar. If not, fall back to 32.
1472 const LLT EltTy = Query.Types[EltTypeIdx];
1473 const LLT VecTy = Query.Types[VecTypeIdx];
1474 const unsigned DstEltSize = EltTy.getSizeInBits();
1475 const unsigned VecSize = VecTy.getSizeInBits();
1476
1477 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1478 return std::pair(
1479 VecTypeIdx,
1480 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1481 })
1482 .clampScalar(EltTypeIdx, S32, S64)
1483 .clampScalar(VecTypeIdx, S32, S64)
1484 .clampScalar(IdxTypeIdx, S32, S32)
1485 .clampMaxNumElements(VecTypeIdx, S32, 32)
1486 // TODO: Clamp elements for 64-bit vectors?
1487 // It should only be necessary with variable indexes.
1488 // As a last resort, lower to the stack
1489 .lower();
1490 }
1491
1492 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1493 .unsupportedIf([=](const LegalityQuery &Query) {
1494 const LLT &EltTy = Query.Types[1].getElementType();
1495 return Query.Types[0] != EltTy;
1496 });
1497
1498 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1499 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1500 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1501
1502 // FIXME: Doesn't handle extract of illegal sizes.
1503 getActionDefinitionsBuilder(Op)
1504 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1505 .lowerIf([=](const LegalityQuery &Query) {
1506 // Sub-vector(or single element) insert and extract.
1507 // TODO: verify immediate offset here since lower only works with
1508 // whole elements.
1509 const LLT BigTy = Query.Types[BigTyIdx];
1510 return BigTy.isVector();
1511 })
1512 // FIXME: Multiples of 16 should not be legal.
1513 .legalIf([=](const LegalityQuery &Query) {
1514 const LLT BigTy = Query.Types[BigTyIdx];
1515 const LLT LitTy = Query.Types[LitTyIdx];
1516 return (BigTy.getSizeInBits() % 32 == 0) &&
1517 (LitTy.getSizeInBits() % 16 == 0);
1518 })
1519 .widenScalarIf(
1520 [=](const LegalityQuery &Query) {
1521 const LLT BigTy = Query.Types[BigTyIdx];
1522 return (BigTy.getScalarSizeInBits() < 16);
1523 },
1524 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1525 .widenScalarIf(
1526 [=](const LegalityQuery &Query) {
1527 const LLT LitTy = Query.Types[LitTyIdx];
1528 return (LitTy.getScalarSizeInBits() < 16);
1529 },
1530 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1531 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1532 .widenScalarToNextPow2(BigTyIdx, 32);
1533
1534 }
1535
1536 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1537 .legalForCartesianProduct(AllS32Vectors, {S32})
1538 .legalForCartesianProduct(AllS64Vectors, {S64})
1539 .clampNumElements(0, V16S32, V32S32)
1540 .clampNumElements(0, V2S64, V16S64)
1541 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1542
1543 if (ST.hasScalarPackInsts()) {
1544 BuildVector
1545 // FIXME: Should probably widen s1 vectors straight to s32
1546 .minScalarOrElt(0, S16)
1547 .minScalar(1, S16);
1548
1549 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1550 .legalFor({V2S16, S32})
1551 .lower();
1552 } else {
1553 BuildVector.customFor({V2S16, S16});
1554 BuildVector.minScalarOrElt(0, S32);
1555
1556 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1557 .customFor({V2S16, S32})
1558 .lower();
1559 }
1560
1561 BuildVector.legalIf(isRegisterType(0));
1562
1563 // FIXME: Clamp maximum size
1564 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1565 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1566 .clampMaxNumElements(0, S32, 32)
1567 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1568 .clampMaxNumElements(0, S16, 64);
1569
1570 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1571
1572 // Merge/Unmerge
1573 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1574 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1575 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1576
1577 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1578 const LLT Ty = Query.Types[TypeIdx];
1579 if (Ty.isVector()) {
1580 const LLT &EltTy = Ty.getElementType();
1581 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1582 return true;
1583 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1584 return true;
1585 }
1586 return false;
1587 };
1588
1589 auto &Builder = getActionDefinitionsBuilder(Op)
1590 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1591 .lowerFor({{S16, V2S16}})
1592 .lowerIf([=](const LegalityQuery &Query) {
1593 const LLT BigTy = Query.Types[BigTyIdx];
1594 return BigTy.getSizeInBits() == 32;
1595 })
1596 // Try to widen to s16 first for small types.
1597 // TODO: Only do this on targets with legal s16 shifts
1598 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1599 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1600 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1601 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1602 elementTypeIs(1, S16)),
1603 changeTo(1, V2S16))
1604 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1605 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1606 // valid.
1607 .clampScalar(LitTyIdx, S32, S512)
1608 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1609 // Break up vectors with weird elements into scalars
1610 .fewerElementsIf(
1611 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1612 scalarize(0))
1613 .fewerElementsIf(
1614 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1615 scalarize(1))
1616 .clampScalar(BigTyIdx, S32, MaxScalar);
1617
1618 if (Op == G_MERGE_VALUES) {
1619 Builder.widenScalarIf(
1620 // TODO: Use 16-bit shifts if legal for 8-bit values?
1621 [=](const LegalityQuery &Query) {
1622 const LLT Ty = Query.Types[LitTyIdx];
1623 return Ty.getSizeInBits() < 32;
1624 },
1625 changeTo(LitTyIdx, S32));
1626 }
1627
1628 Builder.widenScalarIf(
1629 [=](const LegalityQuery &Query) {
1630 const LLT Ty = Query.Types[BigTyIdx];
1631 return Ty.getSizeInBits() % 16 != 0;
1632 },
1633 [=](const LegalityQuery &Query) {
1634 // Pick the next power of 2, or a multiple of 64 over 128,
1635 // whichever is smaller.
1636 const LLT &Ty = Query.Types[BigTyIdx];
1637 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1638 if (NewSizeInBits >= 256) {
1639 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1640 if (RoundedTo < NewSizeInBits)
1641 NewSizeInBits = RoundedTo;
1642 }
1643 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1644 })
1645 // Any vectors left are the wrong size. Scalarize them.
1646 .scalarize(0)
1647 .scalarize(1);
1648 }
1649
1650 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1651 // RegBankSelect.
1652 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1653 .legalFor({{S32}, {S64}});
1654
1655 if (ST.hasVOP3PInsts()) {
1656 SextInReg.lowerFor({{V2S16}})
1657 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1658 // get more vector shift opportunities, since we'll get those when
1659 // expanded.
1660 .clampMaxNumElementsStrict(0, S16, 2);
1661 } else if (ST.has16BitInsts()) {
1662 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1663 } else {
1664 // Prefer to promote to s32 before lowering if we don't have 16-bit
1665 // shifts. This avoids a lot of intermediate truncate and extend operations.
1666 SextInReg.lowerFor({{S32}, {S64}});
1667 }
1668
1669 SextInReg
1670 .scalarize(0)
1671 .clampScalar(0, S32, S64)
1672 .lower();
1673
1674 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1675 .scalarize(0)
1676 .lower();
1677
1678 // TODO: Only try to form v2s16 with legal packed instructions.
1679 getActionDefinitionsBuilder(G_FSHR)
1680 .legalFor({{S32, S32}})
1681 .lowerFor({{V2S16, V2S16}})
1682 .clampMaxNumElementsStrict(0, S16, 2)
1683 .scalarize(0)
1684 .lower();
1685
1686 if (ST.hasVOP3PInsts()) {
1687 getActionDefinitionsBuilder(G_FSHL)
1688 .lowerFor({{V2S16, V2S16}})
1689 .clampMaxNumElementsStrict(0, S16, 2)
1690 .scalarize(0)
1691 .lower();
1692 } else {
1693 getActionDefinitionsBuilder(G_FSHL)
1694 .scalarize(0)
1695 .lower();
1696 }
1697
1698 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1699 .legalFor({S64});
1700
1701 getActionDefinitionsBuilder(G_FENCE)
1702 .alwaysLegal();
1703
1704 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1705 .scalarize(0)
1706 .minScalar(0, S32)
1707 .lower();
1708
1709 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1710 .legalFor({{S32, S32}, {S64, S32}})
1711 .clampScalar(1, S32, S32)
1712 .clampScalar(0, S32, S64)
1713 .widenScalarToNextPow2(0)
1714 .scalarize(0);
1715
1716 getActionDefinitionsBuilder({
1717 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1718 G_FCOPYSIGN,
1719
1720 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1721 G_ATOMICRMW_NAND,
1722 G_ATOMICRMW_FSUB,
1723 G_READ_REGISTER,
1724 G_WRITE_REGISTER,
1725
1726 G_SADDO, G_SSUBO,
1727
1728 // TODO: Implement
1729 G_FMINIMUM, G_FMAXIMUM}).lower();
1730
1731 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1732 .lower();
1733
1734 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1735 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1736 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1737 .unsupported();
1738
1739 getLegacyLegalizerInfo().computeTables();
1740 verify(*ST.getInstrInfo());
1741}
1742
1743bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1744 MachineInstr &MI) const {
1745 MachineIRBuilder &B = Helper.MIRBuilder;
1746 MachineRegisterInfo &MRI = *B.getMRI();
1747
1748 switch (MI.getOpcode()) {
1749 case TargetOpcode::G_ADDRSPACE_CAST:
1750 return legalizeAddrSpaceCast(MI, MRI, B);
1751 case TargetOpcode::G_FRINT:
1752 return legalizeFrint(MI, MRI, B);
1753 case TargetOpcode::G_FCEIL:
1754 return legalizeFceil(MI, MRI, B);
1755 case TargetOpcode::G_FREM:
1756 return legalizeFrem(MI, MRI, B);
1757 case TargetOpcode::G_INTRINSIC_TRUNC:
1758 return legalizeIntrinsicTrunc(MI, MRI, B);
1759 case TargetOpcode::G_SITOFP:
1760 return legalizeITOFP(MI, MRI, B, true);
1761 case TargetOpcode::G_UITOFP:
1762 return legalizeITOFP(MI, MRI, B, false);
1763 case TargetOpcode::G_FPTOSI:
1764 return legalizeFPTOI(MI, MRI, B, true);
1765 case TargetOpcode::G_FPTOUI:
1766 return legalizeFPTOI(MI, MRI, B, false);
1767 case TargetOpcode::G_FMINNUM:
1768 case TargetOpcode::G_FMAXNUM:
1769 case TargetOpcode::G_FMINNUM_IEEE:
1770 case TargetOpcode::G_FMAXNUM_IEEE:
1771 return legalizeMinNumMaxNum(Helper, MI);
1772 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1773 return legalizeExtractVectorElt(MI, MRI, B);
1774 case TargetOpcode::G_INSERT_VECTOR_ELT:
1775 return legalizeInsertVectorElt(MI, MRI, B);
1776 case TargetOpcode::G_FSIN:
1777 case TargetOpcode::G_FCOS:
1778 return legalizeSinCos(MI, MRI, B);
1779 case TargetOpcode::G_GLOBAL_VALUE:
1780 return legalizeGlobalValue(MI, MRI, B);
1781 case TargetOpcode::G_LOAD:
1782 case TargetOpcode::G_SEXTLOAD:
1783 case TargetOpcode::G_ZEXTLOAD:
1784 return legalizeLoad(Helper, MI);
1785 case TargetOpcode::G_FMAD:
1786 return legalizeFMad(MI, MRI, B);
1787 case TargetOpcode::G_FDIV:
1788 return legalizeFDIV(MI, MRI, B);
1789 case TargetOpcode::G_UDIV:
1790 case TargetOpcode::G_UREM:
1791 case TargetOpcode::G_UDIVREM:
1792 return legalizeUnsignedDIV_REM(MI, MRI, B);
1793 case TargetOpcode::G_SDIV:
1794 case TargetOpcode::G_SREM:
1795 case TargetOpcode::G_SDIVREM:
1796 return legalizeSignedDIV_REM(MI, MRI, B);
1797 case TargetOpcode::G_ATOMIC_CMPXCHG:
1798 return legalizeAtomicCmpXChg(MI, MRI, B);
1799 case TargetOpcode::G_FLOG:
1800 return legalizeFlog(MI, B, numbers::ln2f);
1801 case TargetOpcode::G_FLOG10:
1802 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1803 case TargetOpcode::G_FEXP:
1804 return legalizeFExp(MI, B);
1805 case TargetOpcode::G_FPOW:
1806 return legalizeFPow(MI, B);
1807 case TargetOpcode::G_FFLOOR:
1808 return legalizeFFloor(MI, MRI, B);
1809 case TargetOpcode::G_BUILD_VECTOR:
1810 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
1811 return legalizeBuildVector(MI, MRI, B);
1812 case TargetOpcode::G_MUL:
1813 return legalizeMul(Helper, MI);
1814 case TargetOpcode::G_CTLZ:
1815 case TargetOpcode::G_CTTZ:
1816 return legalizeCTLZ_CTTZ(MI, MRI, B);
1817 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1818 return legalizeFPTruncRound(MI, B);
1819 default:
1820 return false;
1821 }
1822
1823 llvm_unreachable("expected switch to return");
1824}
1825
1826Register AMDGPULegalizerInfo::getSegmentAperture(
1827 unsigned AS,
1828 MachineRegisterInfo &MRI,
1829 MachineIRBuilder &B) const {
1830 MachineFunction &MF = B.getMF();
1831 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1832 const LLT S32 = LLT::scalar(32);
1833 const LLT S64 = LLT::scalar(64);
1834
1835 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1836
1837 if (ST.hasApertureRegs()) {
1838 // Note: this register is somewhat broken. When used as a 32-bit operand,
1839 // it only returns zeroes. The real value is in the upper 32 bits.
1840 // Thus, we must extract the high 32 bits.
1841 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
1842 ? AMDGPU::SRC_SHARED_BASE
1843 : AMDGPU::SRC_PRIVATE_BASE;
1844 // FIXME: It would be more natural to emit a COPY here, but then copy
1845 // coalescing would kick in and it would think it's okay to use the "HI"
1846 // subregister (instead of extracting the HI 32 bits) which is an artificial
1847 // (unusable) register.
1848 // Register TableGen definitions would need an overhaul to get rid of the
1849 // artificial "HI" aperture registers and prevent this kind of issue from
1850 // happening.
1851 Register Dst = MRI.createGenericVirtualRegister(S64);
1852 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
1853 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
1854 return B.buildUnmerge(S32, Dst).getReg(1);
1855 }
1856
1857 // TODO: can we be smarter about machine pointer info?
1858 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1859 Register LoadAddr = MRI.createGenericVirtualRegister(
1860 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1861 // For code object version 5, private_base and shared_base are passed through
1862 // implicit kernargs.
1863 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
1864 AMDGPU::AMDHSA_COV5) {
1865 AMDGPUTargetLowering::ImplicitParameter Param =
1866 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1867 : AMDGPUTargetLowering::PRIVATE_BASE;
1868 uint64_t Offset =
1869 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1870
1871 Register KernargPtrReg = MRI.createGenericVirtualRegister(
1872 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1873
1874 if (!loadInputValue(KernargPtrReg, B,
1875 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1876 return Register();
1877
1878 MachineMemOperand *MMO = MF.getMachineMemOperand(
1879 PtrInfo,
1880 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1881 MachineMemOperand::MOInvariant,
1882 LLT::scalar(32), commonAlignment(Align(64), Offset));
1883
1884 // Pointer address
1885 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1886 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1887 // Load address
1888 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1889 }
1890
1891 Register QueuePtr = MRI.createGenericVirtualRegister(
1892 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1893
1894 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1895 return Register();
1896
1897 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1898 // private_segment_aperture_base_hi.
1899 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1900
1901 MachineMemOperand *MMO = MF.getMachineMemOperand(
1902 PtrInfo,
1903 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1904 MachineMemOperand::MOInvariant,
1905 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1906
1907 B.buildPtrAdd(LoadAddr, QueuePtr,
1908 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1909 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1910}
1911
1912/// Return true if the value is a known valid address, such that a null check is
1913/// not necessary.
1914static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1915 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1916 MachineInstr *Def = MRI.getVRegDef(Val);
1917 switch (Def->getOpcode()) {
1918 case AMDGPU::G_FRAME_INDEX:
1919 case AMDGPU::G_GLOBAL_VALUE:
1920 case AMDGPU::G_BLOCK_ADDR:
1921 return true;
1922 case AMDGPU::G_CONSTANT: {
1923 const ConstantInt *CI = Def->getOperand(1).getCImm();
1924 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1925 }
1926 default:
1927 return false;
1928 }
1929
1930 return false;
1931}
1932
1933bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1934 MachineInstr &MI, MachineRegisterInfo &MRI,
1935 MachineIRBuilder &B) const {
1936 MachineFunction &MF = B.getMF();
1937
1938 const LLT S32 = LLT::scalar(32);
1939 Register Dst = MI.getOperand(0).getReg();
1940 Register Src = MI.getOperand(1).getReg();
1941
1942 LLT DstTy = MRI.getType(Dst);
1943 LLT SrcTy = MRI.getType(Src);
1944 unsigned DestAS = DstTy.getAddressSpace();
1945 unsigned SrcAS = SrcTy.getAddressSpace();
1946
1947 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1948 // vector element.
1949 assert(!DstTy.isVector());
1950
1951 const AMDGPUTargetMachine &TM
1952 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1953
1954 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1955 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1956 return true;
1957 }
1958
1959 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1960 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1961 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1962 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1963 // Extract low 32-bits of the pointer.
1964 B.buildExtract(Dst, Src, 0);
1965 MI.eraseFromParent();
1966 return true;
1967 }
1968
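// A null flat pointer must map to the segment's null value; any other flat
// pointer is cast by simply taking its low 32 bits (the select below).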
1969 unsigned NullVal = TM.getNullPointerValue(DestAS);
1970
1971 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1972 auto FlatNull = B.buildConstant(SrcTy, 0);
1973
1974 // Extract low 32-bits of the pointer.
1975 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1976
1977 auto CmpRes =
1978 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1979 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1980
1981 MI.eraseFromParent();
1982 return true;
1983 }
1984
1985 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1986 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1987 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1988 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1989 if (!ApertureReg.isValid())
1990 return false;
1991
1992 // Coerce the type of the low half of the result so we can use merge_values.
1993 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1994
1995 // TODO: Should we allow mismatched types but matching sizes in merges to
1996 // avoid the ptrtoint?
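// The 64-bit flat pointer is assembled from the 32-bit segment offset in the
// low half and the aperture base in the high half.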
1997 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
1998
1999 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2000 B.buildCopy(Dst, BuildPtr);
2001 MI.eraseFromParent();
2002 return true;
2003 }
2004
2005 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2006 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2007
2008 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2009 SegmentNull.getReg(0));
2010
2011 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2012
2013 MI.eraseFromParent();
2014 return true;
2015 }
2016
2017 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2018 SrcTy.getSizeInBits() == 64) {
2019 // Truncate.
2020 B.buildExtract(Dst, Src, 0);
2021 MI.eraseFromParent();
2022 return true;
2023 }
2024
2025 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2026 DstTy.getSizeInBits() == 64) {
2027 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2028 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2029 auto PtrLo = B.buildPtrToInt(S32, Src);
2030 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2031 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2032 MI.eraseFromParent();
2033 return true;
2034 }
2035
2036 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2037 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2038
2039 LLVMContext &Ctx = MF.getFunction().getContext();
2040 Ctx.diagnose(InvalidAddrSpaceCast);
2041 B.buildUndef(Dst);
2042 MI.eraseFromParent();
2043 return true;
2044}
2045
2046bool AMDGPULegalizerInfo::legalizeFrint(
2047 MachineInstr &MI, MachineRegisterInfo &MRI,
2048 MachineIRBuilder &B) const {
2049 Register Src = MI.getOperand(1).getReg();
2050 LLT Ty = MRI.getType(Src);
2051 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2052
2053 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2054 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2055
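// Rounding trick: adding copysign(2^52, src) discards the fractional bits
// (doubles with magnitude >= 2^52 have none) and subtracting it back yields
// the nearest integer; the compare against C2Val below returns inputs that
// are already too large to carry a fraction unchanged.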
2056 auto C1 = B.buildFConstant(Ty, C1Val);
2057 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2058
2059 // TODO: Should this propagate fast-math-flags?
2060 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2061 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2062
2063 auto C2 = B.buildFConstant(Ty, C2Val);
2064 auto Fabs = B.buildFAbs(Ty, Src);
2065
2066 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2067 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2068 MI.eraseFromParent();
2069 return true;
2070}
2071
2072bool AMDGPULegalizerInfo::legalizeFceil(
2073 MachineInstr &MI, MachineRegisterInfo &MRI,
2074 MachineIRBuilder &B) const {
2075
2076 const LLT S1 = LLT::scalar(1);
2077 const LLT S64 = LLT::scalar(64);
2078
2079 Register Src = MI.getOperand(1).getReg();
2080 assert(MRI.getType(Src) == S64);
2081
2082 // result = trunc(src)
2083 // if (src > 0.0 && src != result)
2084 // result += 1.0
2085
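// For example, ceil(1.25): trunc = 1.0, and since src > 0 and src != trunc,
// the result is 1.0 + 1.0 = 2.0. For ceil(-1.25): trunc = -1.0 and src > 0
// fails, so the result stays -1.0.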
2086 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2087
2088 const auto Zero = B.buildFConstant(S64, 0.0);
2089 const auto One = B.buildFConstant(S64, 1.0);
2090 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2091 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2092 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2093 auto Add = B.buildSelect(S64, And, One, Zero);
2094
2095 // TODO: Should this propagate fast-math-flags?
2096 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2097 MI.eraseFromParent();
2098 return true;
2099}
2100
2101bool AMDGPULegalizerInfo::legalizeFrem(
2102 MachineInstr &MI, MachineRegisterInfo &MRI,
2103 MachineIRBuilder &B) const {
2104 Register DstReg = MI.getOperand(0).getReg();
2105 Register Src0Reg = MI.getOperand(1).getReg();
2106 Register Src1Reg = MI.getOperand(2).getReg();
2107 auto Flags = MI.getFlags();
2108 LLT Ty = MRI.getType(DstReg);
2109
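// frem(x, y) = x - trunc(x / y) * y, built below as fma(-trunc(x / y), y, x).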
2110 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2111 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2112 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2113 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2114 MI.eraseFromParent();
2115 return true;
2116}
2117
2118static MachineInstrBuilder extractF64Exponent(Register Hi,
2119 MachineIRBuilder &B) {
2120 const unsigned FractBits = 52;
2121 const unsigned ExpBits = 11;
2122 LLT S32 = LLT::scalar(32);
2123
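// The biased exponent of an IEEE-754 double sits in bits [52, 62] of the
// value, i.e. bits [20, 30] of the high word, so ubfe(Hi, FractBits - 32,
// ExpBits) extracts it and subtracting 1023 removes the bias.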
2124 auto Const0 = B.buildConstant(S32, FractBits - 32);
2125 auto Const1 = B.buildConstant(S32, ExpBits);
2126
2127 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2128 .addUse(Hi)
2129 .addUse(Const0.getReg(0))
2130 .addUse(Const1.getReg(0));
2131
2132 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2133}
2134
2135bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2136 MachineInstr &MI, MachineRegisterInfo &MRI,
2137 MachineIRBuilder &B) const {
2138 const LLT S1 = LLT::scalar(1);
2139 const LLT S32 = LLT::scalar(32);
2140 const LLT S64 = LLT::scalar(64);
2141
2142 Register Src = MI.getOperand(1).getReg();
2143 assert(MRI.getType(Src) == S64);
2144
2145 // TODO: Should this use extract since the low half is unused?
2146 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2147 Register Hi = Unmerge.getReg(1);
2148
2149 // Extract the upper half, since this is where we will find the sign and
2150 // exponent.
2151 auto Exp = extractF64Exponent(Hi, B);
2152
2153 const unsigned FractBits = 52;
2154
2155 // Extract the sign bit.
2156 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2157 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2158
2159 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2160
2161 const auto Zero32 = B.buildConstant(S32, 0);
2162
2163 // Extend back to 64-bits.
2164 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2165
2166 auto Shr = B.buildAShr(S64, FractMask, Exp);
2167 auto Not = B.buildNot(S64, Shr);
2168 auto Tmp0 = B.buildAnd(S64, Src, Not);
2169 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2170
2171 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2172 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2173
2174 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2175 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2176 MI.eraseFromParent();
2177 return true;
2178}
2179
2180bool AMDGPULegalizerInfo::legalizeITOFP(
2181 MachineInstr &MI, MachineRegisterInfo &MRI,
2182 MachineIRBuilder &B, bool Signed) const {
2183
2184 Register Dst = MI.getOperand(0).getReg();
2185 Register Src = MI.getOperand(1).getReg();
2186
2187 const LLT S64 = LLT::scalar(64);
2188 const LLT S32 = LLT::scalar(32);
2189
2190 assert(MRI.getType(Src) == S64);
2191
2192 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2193 auto ThirtyTwo = B.buildConstant(S32, 32);
2194
2195 if (MRI.getType(Dst) == S64) {
2196 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2197 : B.buildUITOFP(S64, Unmerge.getReg(1));
2198
2199 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2200 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2201 .addUse(CvtHi.getReg(0))
2202 .addUse(ThirtyTwo.getReg(0));
2203
2204 // TODO: Should this propagate fast-math-flags?
2205 B.buildFAdd(Dst, LdExp, CvtLo);
2206 MI.eraseFromParent();
2207 return true;
2208 }
2209
2210 assert(MRI.getType(Dst) == S32);
2211
2212 auto One = B.buildConstant(S32, 1);
2213
2214 MachineInstrBuilder ShAmt;
2215 if (Signed) {
2216 auto ThirtyOne = B.buildConstant(S32, 31);
2217 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2218 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2219 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2220 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2221 /*HasSideEffects=*/false)
2222 .addUse(Unmerge.getReg(1));
2223 auto LS2 = B.buildSub(S32, LS, One);
2224 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2225 } else
2226 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
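// Normalize the 64-bit value into the high word so the 32-bit conversion
// keeps the significant bits, then rescale with ldexp by 32 - ShAmt. The
// umin/or below fold any nonzero low bits into a sticky bit so they still
// affect rounding.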
2227 auto Norm = B.buildShl(S64, Src, ShAmt);
2228 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2229 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2230 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2231 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2232 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2233 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2234 /*HasSideEffects=*/false)
2235 .addUse(FVal.getReg(0))
2236 .addUse(Scale.getReg(0));
2237 MI.eraseFromParent();
2238 return true;
2239}
2240
2241// TODO: Copied from DAG implementation. Verify logic and document how this
2242// actually works.
2243bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2244 MachineRegisterInfo &MRI,
2245 MachineIRBuilder &B,
2246 bool Signed) const {
2247
2248 Register Dst = MI.getOperand(0).getReg();
2249 Register Src = MI.getOperand(1).getReg();
2250
2251 const LLT S64 = LLT::scalar(64);
2252 const LLT S32 = LLT::scalar(32);
2253
2254 const LLT SrcLT = MRI.getType(Src);
2255 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2256
2257 unsigned Flags = MI.getFlags();
2258
2259 // The basic idea of converting a floating point number into a pair of 32-bit
2260 // integers is illustrated as follows:
2261 //
2262 // tf := trunc(val);
2263 // hif := floor(tf * 2^-32);
2264 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2265 // hi := fptoi(hif);
2266 // lo := fptoi(lof);
2267 //
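// For example, with val = 2^33 + 5: tf = 8589934597.0, hif = 2.0, lof = 5.0,
// so hi = 2 and lo = 5, and {lo, hi} reassembles the original value.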
2268 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2269 MachineInstrBuilder Sign;
2270 if (Signed && SrcLT == S32) {
2271 // However, a 32-bit floating point number has only 23 bits mantissa and
2272 // it's not enough to hold all the significant bits of `lof` if val is
2273 // negative. To avoid the loss of precision, we need to take the absolute
2274 // value after truncating and flip the result back based on the original
2275 // signedness.
2276 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2277 Trunc = B.buildFAbs(S32, Trunc, Flags);
2278 }
2279 MachineInstrBuilder K0, K1;
2280 if (SrcLT == S64) {
2281 K0 = B.buildFConstant(
2282 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2283 K1 = B.buildFConstant(
2284 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2285 } else {
2286 K0 = B.buildFConstant(
2287 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2288 K1 = B.buildFConstant(
2289 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2290 }
2291
2292 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2293 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2294 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2295
2296 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2297 : B.buildFPTOUI(S32, FloorMul);
2298 auto Lo = B.buildFPTOUI(S32, Fma);
2299
2300 if (Signed && SrcLT == S32) {
2301 // Flip the result based on the signedness, which is either all 0s or 1s.
2302 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2303 // r := xor({lo, hi}, sign) - sign;
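// When the input was negative, sign is all ones, so the xor/sub pair below
// performs a two's-complement negation of the 64-bit {lo, hi} value; for a
// positive input sign is zero and the result is unchanged.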
2304 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2305 Sign);
2306 } else
2307 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2308 MI.eraseFromParent();
2309
2310 return true;
2311}
2312
2313bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2314 MachineInstr &MI) const {
2315 MachineFunction &MF = Helper.MIRBuilder.getMF();
2316 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2317
2318 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2319 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2320
2321 // With ieee_mode disabled, the instructions have the correct behavior
2322 // already for G_FMINNUM/G_FMAXNUM
2323 if (!MFI->getMode().IEEE)
2324 return !IsIEEEOp;
2325
2326 if (IsIEEEOp)
2327 return true;
2328
2329 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2330}
2331
2332bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2333 MachineInstr &MI, MachineRegisterInfo &MRI,
2334 MachineIRBuilder &B) const {
2335 // TODO: Should move some of this into LegalizerHelper.
2336
2337 // TODO: Promote dynamic indexing of s16 to s32
2338
2339 // FIXME: Artifact combiner probably should have replaced the truncated
2340 // constant before this, so we shouldn't need
2341 // getIConstantVRegValWithLookThrough.
2342 std::optional<ValueAndVReg> MaybeIdxVal =
2343 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2344 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2345 return true;
2346 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2347
2348 Register Dst = MI.getOperand(0).getReg();
2349 Register Vec = MI.getOperand(1).getReg();
2350
2351 LLT VecTy = MRI.getType(Vec);
2352 LLT EltTy = VecTy.getElementType();
2353 assert(EltTy == MRI.getType(Dst));
2354
2355 if (IdxVal < VecTy.getNumElements()) {
2356 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2357 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2358 } else {
2359 B.buildUndef(Dst);
2360 }
2361
2362 MI.eraseFromParent();
2363 return true;
2364}
2365
2366bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2367 MachineInstr &MI, MachineRegisterInfo &MRI,
2368 MachineIRBuilder &B) const {
2369 // TODO: Should move some of this into LegalizerHelper.
2370
2371 // TODO: Promote dynamic indexing of s16 to s32
2372
2373 // FIXME: Artifact combiner probably should have replaced the truncated
2374 // constant before this, so we shouldn't need
2375 // getIConstantVRegValWithLookThrough.
2376 std::optional<ValueAndVReg> MaybeIdxVal =
2377 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2378 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2379 return true;
2380
2381 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2382 Register Dst = MI.getOperand(0).getReg();
2383 Register Vec = MI.getOperand(1).getReg();
2384 Register Ins = MI.getOperand(2).getReg();
2385
2386 LLT VecTy = MRI.getType(Vec);
2387 LLT EltTy = VecTy.getElementType();
2388 assert(EltTy == MRI.getType(Ins));
2389 (void)Ins;
2390
2391 unsigned NumElts = VecTy.getNumElements();
2392 if (IdxVal < NumElts) {
2393 SmallVector<Register, 8> SrcRegs;
2394 for (unsigned i = 0; i < NumElts; ++i)
2395 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2396 B.buildUnmerge(SrcRegs, Vec);
2397
2398 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2399 B.buildMergeLikeInstr(Dst, SrcRegs);
2400 } else {
2401 B.buildUndef(Dst);
2402 }
2403
2404 MI.eraseFromParent();
2405 return true;
2406}
2407
2408bool AMDGPULegalizerInfo::legalizeSinCos(
2409 MachineInstr &MI, MachineRegisterInfo &MRI,
2410 MachineIRBuilder &B) const {
2411
2412 Register DstReg = MI.getOperand(0).getReg();
2413 Register SrcReg = MI.getOperand(1).getReg();
2414 LLT Ty = MRI.getType(DstReg);
2415 unsigned Flags = MI.getFlags();
2416
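// The amdgcn.sin/cos intrinsics take their input pre-scaled by 1/(2*pi), so
// multiply first; subtargets with a reduced trig range additionally need a
// fract() of the scaled value.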
2417 Register TrigVal;
2418 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2419 if (ST.hasTrigReducedRange()) {
2420 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2421 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2422 .addUse(MulVal.getReg(0))
2423 .setMIFlags(Flags).getReg(0);
2424 } else
2425 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2426
2427 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2428 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2429 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
2430 .addUse(TrigVal)
2431 .setMIFlags(Flags);
2432 MI.eraseFromParent();
2433 return true;
2434}
2435
2436bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2437 MachineIRBuilder &B,
2438 const GlobalValue *GV,
2439 int64_t Offset,
2440 unsigned GAFlags) const {
2441 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2442 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2443 // to the following code sequence:
2444 //
2445 // For constant address space:
2446 // s_getpc_b64 s[0:1]
2447 // s_add_u32 s0, s0, $symbol
2448 // s_addc_u32 s1, s1, 0
2449 //
2450 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2451 // a fixup or relocation is emitted to replace $symbol with a literal
2452 // constant, which is a pc-relative offset from the encoding of the $symbol
2453 // operand to the global variable.
2454 //
2455 // For global address space:
2456 // s_getpc_b64 s[0:1]
2457 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2458 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2459 //
2460 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2461 // fixups or relocations are emitted to replace $symbol@*@lo and
2462 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2463 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2464 // operand to the global variable.
2465 //
2466 // What we want here is an offset from the value returned by s_getpc
2467 // (which is the address of the s_add_u32 instruction) to the global
2468 // variable, but since the encoding of $symbol starts 4 bytes after the start
2469 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2470 // small. This requires us to add 4 to the global variable offset in order to
2471 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2472 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2473 // instruction.
2474
2475 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2476
2477 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2478 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2479
2480 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2481 .addDef(PCReg);
2482
2483 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2484 if (GAFlags == SIInstrInfo::MO_NONE)
2485 MIB.addImm(0);
2486 else
2487 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2488
2489 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2490
2491 if (PtrTy.getSizeInBits() == 32)
2492 B.buildExtract(DstReg, PCReg, 0);
2493 return true;
2494 }
2495
2496bool AMDGPULegalizerInfo::legalizeGlobalValue(
2497 MachineInstr &MI, MachineRegisterInfo &MRI,
2498 MachineIRBuilder &B) const {
2499 Register DstReg = MI.getOperand(0).getReg();
2500 LLT Ty = MRI.getType(DstReg);
2501 unsigned AS = Ty.getAddressSpace();
2502
2503 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2504 MachineFunction &MF = B.getMF();
2505 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2506
2507 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2508 if (!MFI->isModuleEntryFunction() &&
2509 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2510 const Function &Fn = MF.getFunction();
2511 DiagnosticInfoUnsupported BadLDSDecl(
2512 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2513 DS_Warning);
2514 Fn.getContext().diagnose(BadLDSDecl);
2515
2516 // We currently don't have a way to correctly allocate LDS objects that
2517 // aren't directly associated with a kernel. We do force inlining of
2518 // functions that use local objects. However, if these dead functions are
2519 // not eliminated, we don't want a compile time error. Just emit a warning
2520 // and a trap, since there should be no callable path here.
2521 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2522 B.buildUndef(DstReg);
2523 MI.eraseFromParent();
2524 return true;
2525 }
2526
2527 // TODO: We could emit code to handle the initialization somewhere.
2528 // We ignore the initializer for now and legalize it to allow selection.
2529 // The initializer will anyway get errored out during assembly emission.
2530 const SITargetLowering *TLI = ST.getTargetLowering();
2531 if (!TLI->shouldUseLDSConstAddress(GV)) {
2532 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2533 return true; // Leave in place;
2534 }
2535
2536 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2537 Type *Ty = GV->getValueType();
2538 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2539 // zero-sized type in other languages to declare the dynamic shared
2540 // memory whose size is not known at compile time. They will be
2541 // allocated by the runtime and placed directly after the statically
2542 // allocated ones. They all share the same offset.
2543 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2544 // Adjust alignment for that dynamic shared memory array.
2545 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2546 LLT S32 = LLT::scalar(32);
2547 auto Sz =
2548 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2549 B.buildIntToPtr(DstReg, Sz);
2550 MI.eraseFromParent();
2551 return true;
2552 }
2553 }
2554
2555 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2556 *cast<GlobalVariable>(GV)));
2557 MI.eraseFromParent();
2558 return true;
2559 }
2560
2561 const SITargetLowering *TLI = ST.getTargetLowering();
2562
2563 if (TLI->shouldEmitFixup(GV)) {
2564 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2565 MI.eraseFromParent();
2566 return true;
2567 }
2568
2569 if (TLI->shouldEmitPCReloc(GV)) {
2570 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2571 MI.eraseFromParent();
2572 return true;
2573 }
2574
2575 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2576 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2577
2578 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2579 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2580 MachinePointerInfo::getGOT(MF),
2581 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2582 MachineMemOperand::MOInvariant,
2583 LoadTy, Align(8));
2584
2585 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2586
2587 if (Ty.getSizeInBits() == 32) {
2588 // Truncate if this is a 32-bit constant address.
2589 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2590 B.buildExtract(DstReg, Load, 0);
2591 } else
2592 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2593
2594 MI.eraseFromParent();
2595 return true;
2596}
2597
2598static LLT widenToNextPowerOf2(LLT Ty) {
2599 if (Ty.isVector())
2600 return Ty.changeElementCount(
2601 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2602 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2603}
2604
2605bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2606 MachineInstr &MI) const {
2607 MachineIRBuilder &B = Helper.MIRBuilder;
2608 MachineRegisterInfo &MRI = *B.getMRI();
2609 GISelChangeObserver &Observer = Helper.Observer;
2610
2611 Register PtrReg = MI.getOperand(1).getReg();
2612 LLT PtrTy = MRI.getType(PtrReg);
2613 unsigned AddrSpace = PtrTy.getAddressSpace();
2614
2615 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2616 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2617 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2618 Observer.changingInstr(MI);
2619 MI.getOperand(1).setReg(Cast.getReg(0));
2620 Observer.changedInstr(MI);
2621 return true;
2622 }
2623
2624 if (MI.getOpcode() != AMDGPU::G_LOAD)
2625 return false;
2626
2627 Register ValReg = MI.getOperand(0).getReg();
2628 LLT ValTy = MRI.getType(ValReg);
2629
2630 MachineMemOperand *MMO = *MI.memoperands_begin();
2631 const unsigned ValSize = ValTy.getSizeInBits();
2632 const LLT MemTy = MMO->getMemoryType();
2633 const Align MemAlign = MMO->getAlign();
2634 const unsigned MemSize = MemTy.getSizeInBits();
2635 const uint64_t AlignInBits = 8 * MemAlign.value();
2636
2637 // Widen non-power-of-2 loads to the alignment if needed
2638 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2639 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2640
2641 // This was already the correct extending load result type, so just adjust
2642 // the memory type.
2643 if (WideMemSize == ValSize) {
2644 MachineFunction &MF = B.getMF();
2645
2646 MachineMemOperand *WideMMO =
2647 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2648 Observer.changingInstr(MI);
2649 MI.setMemRefs(MF, {WideMMO});
2650 Observer.changedInstr(MI);
2651 return true;
2652 }
2653
2654 // Don't bother handling an edge case that should probably never be produced.
2655 if (ValSize > WideMemSize)
2656 return false;
2657
2658 LLT WideTy = widenToNextPowerOf2(ValTy);
2659
2660 Register WideLoad;
2661 if (!WideTy.isVector()) {
2662 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2663 B.buildTrunc(ValReg, WideLoad).getReg(0);
2664 } else {
2665 // Extract the subvector.
2666
2667 if (isRegisterType(ValTy)) {
2668 // If this is a case where G_EXTRACT is legal, use it.
2669 // (e.g. <3 x s32> -> <4 x s32>)
2670 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2671 B.buildExtract(ValReg, WideLoad, 0);
2672 } else {
2673 // For cases where the widened type isn't a nice register value, unmerge
2674 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2675 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2676 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2677 }
2678 }
2679
2680 MI.eraseFromParent();
2681 return true;
2682 }
2683
2684 return false;
2685}
2686
2687bool AMDGPULegalizerInfo::legalizeFMad(
2688 MachineInstr &MI, MachineRegisterInfo &MRI,
2689 MachineIRBuilder &B) const {
2690 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2691 assert(Ty.isScalar());
2692
2693 MachineFunction &MF = B.getMF();
2694 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2695
2696 // TODO: Always legal with future ftz flag.
2697 // FIXME: Do we need just output?
2698 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2699 return true;
2700 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2701 return true;
2702
2703 MachineIRBuilder HelperBuilder(MI);
2704 GISelObserverWrapper DummyObserver;
2705 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2706 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2707}
2708
2709bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2710 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2711 Register DstReg = MI.getOperand(0).getReg();
2712 Register PtrReg = MI.getOperand(1).getReg();
2713 Register CmpVal = MI.getOperand(2).getReg();
2714 Register NewVal = MI.getOperand(3).getReg();
2715
2716 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2717 "this should not have been custom lowered");
2718
2719 LLT ValTy = MRI.getType(CmpVal);
2720 LLT VecTy = LLT::fixed_vector(2, ValTy);
2721
2722 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2723
2724 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2725 .addDef(DstReg)
2726 .addUse(PtrReg)
2727 .addUse(PackedVal)
2728 .setMemRefs(MI.memoperands());
2729
2730 MI.eraseFromParent();
2731 return true;
2732}
2733
2734bool AMDGPULegalizerInfo::legalizeFlog(
2735 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2736 Register Dst = MI.getOperand(0).getReg();
2737 Register Src = MI.getOperand(1).getReg();
2738 LLT Ty = B.getMRI()->getType(Dst);
2739 unsigned Flags = MI.getFlags();
2740
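// log_b(x) = log2(x) * log_b(2); the callers pass log_b(2) as
// Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) = log10(2) for G_FLOG10).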
2741 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2742 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2743
2744 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2745 MI.eraseFromParent();
2746 return true;
2747}
2748
2749bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2750 MachineIRBuilder &B) const {
2751 Register Dst = MI.getOperand(0).getReg();
2752 Register Src = MI.getOperand(1).getReg();
2753 unsigned Flags = MI.getFlags();
2754 LLT Ty = B.getMRI()->getType(Dst);
2755
2756 auto K = B.buildFConstant(Ty, numbers::log2e);
2757 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2758 B.buildFExp2(Dst, Mul, Flags);
2759 MI.eraseFromParent();
2760 return true;
2761}
2762
2763bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2764 MachineIRBuilder &B) const {
2765 Register Dst = MI.getOperand(0).getReg();
2766 Register Src0 = MI.getOperand(1).getReg();
2767 Register Src1 = MI.getOperand(2).getReg();
2768 unsigned Flags = MI.getFlags();
2769 LLT Ty = B.getMRI()->getType(Dst);
2770 const LLT S16 = LLT::scalar(16);
2771 const LLT S32 = LLT::scalar(32);
2772
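// pow(x, y) is expanded as exp2(y * log2(x)), with the multiply done through
// the fmul_legacy intrinsic.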
2773 if (Ty == S32) {
2774 auto Log = B.buildFLog2(S32, Src0, Flags);
2775 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2776 .addUse(Log.getReg(0))
2777 .addUse(Src1)
2778 .setMIFlags(Flags);
2779 B.buildFExp2(Dst, Mul, Flags);
2780 } else if (Ty == S16) {
2781 // There's no f16 fmul_legacy, so we need to convert for it.
2782 auto Log = B.buildFLog2(S16, Src0, Flags);
2783 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2784 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2785 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2786 .addUse(Ext0.getReg(0))
2787 .addUse(Ext1.getReg(0))
2788 .setMIFlags(Flags);
2789
2790 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2791 } else
2792 return false;
2793
2794 MI.eraseFromParent();
2795 return true;
2796}
2797
2798// Find a source register, ignoring any possible source modifiers.
2799static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2800 Register ModSrc = OrigSrc;
2801 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2802 ModSrc = SrcFNeg->getOperand(1).getReg();
2803 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2804 ModSrc = SrcFAbs->getOperand(1).getReg();
2805 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2806 ModSrc = SrcFAbs->getOperand(1).getReg();
2807 return ModSrc;
2808}
2809
2810bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2811 MachineRegisterInfo &MRI,
2812 MachineIRBuilder &B) const {
2813
2814 const LLT S1 = LLT::scalar(1);
2815 const LLT S64 = LLT::scalar(64);
2816 Register Dst = MI.getOperand(0).getReg();
2817 Register OrigSrc = MI.getOperand(1).getReg();
2818 unsigned Flags = MI.getFlags();
2819 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2820 "this should not have been custom lowered");
2821
2822 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2823 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2824 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2825 // V_FRACT bug is:
2826 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2827 //
2828 // Convert floor(x) to (x - fract(x))
2829
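// For example, floor(-1.25) = -1.25 - fract(-1.25) = -1.25 - 0.75 = -2.0.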
2830 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2831 .addUse(OrigSrc)
2832 .setMIFlags(Flags);
2833
2834 // Give source modifier matching some assistance before obscuring a foldable
2835 // pattern.
2836
2837 // TODO: We can avoid the neg on the fract? The input sign to fract
2838 // shouldn't matter?
2839 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2840
2841 auto Const =
2842 B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
2843
2844 Register Min = MRI.createGenericVirtualRegister(S64);
2845
2846 // We don't need to concern ourselves with the snan handling difference, so
2847 // use the one which will directly select.
2848 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2849 if (MFI->getMode().IEEE)
2850 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2851 else
2852 B.buildFMinNum(Min, Fract, Const, Flags);
2853
2854 Register CorrectedFract = Min;
2855 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2856 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2857 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2858 }
2859
2860 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2861 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2862
2863 MI.eraseFromParent();
2864 return true;
2865}
2866
2867// Turn an illegal packed v2s16 build vector into bit operations.
2868// TODO: This should probably be a bitcast action in LegalizerHelper.
2869bool AMDGPULegalizerInfo::legalizeBuildVector(
2870 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2871 Register Dst = MI.getOperand(0).getReg();
2872 const LLT S32 = LLT::scalar(32);
2873 const LLT S16 = LLT::scalar(16);
2874 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2875
2876 Register Src0 = MI.getOperand(1).getReg();
2877 Register Src1 = MI.getOperand(2).getReg();
2878
2879 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2880 assert(MRI.getType(Src0) == S32);
2881 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
2882 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
2883 }
2884
2885 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
2886 B.buildBitcast(Dst, Merge);
2887
2888 MI.eraseFromParent();
2889 return true;
2890}
2891
2892// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2893//
2894// Source and accumulation registers must all be 32-bits.
2895//
2896// TODO: When the multiply is uniform, we should produce a code sequence
2897// that is better suited to instruction selection on the SALU. Instead of
2898// the outer loop going over parts of the result, the outer loop should go
2899// over parts of one of the factors. This should result in instruction
2900// selection that makes full use of S_ADDC_U32 instructions.
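// As a reference, the schoolbook recurrence this routine implements over
// 32-bit limbs (Src0Val/Src1Val/AccumVal are hypothetical plain-integer
// stand-ins for the virtual registers handled below):
//   for (unsigned i = 0; i < AccumVal.size(); ++i)
//     for (unsigned j = 0; j <= i; ++j)
//       if (j < Src0Val.size() && i - j < Src1Val.size())
//         add (uint64_t)Src0Val[j] * Src1Val[i - j] into AccumVal[i], AccumVal[i+1]
// with carries propagated upward; each 64-bit partial product is mapped onto
// MAD_64_32 and the carries are tracked explicitly.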
2901void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
2902 MutableArrayRef<Register> Accum,
2903 ArrayRef<Register> Src0,
2904 ArrayRef<Register> Src1,
2905 bool UsePartialMad64_32,
2906 bool SeparateOddAlignedProducts) const {
2907 // Use (possibly empty) vectors of S1 registers to represent the set of
2908 // carries from one pair of positions to the next.
2909 using Carry = SmallVector<Register, 2>;
2910
2911 MachineIRBuilder &B = Helper.MIRBuilder;
2912 GISelKnownBits &KB = *Helper.getKnownBits();
2913
2914 const LLT S1 = LLT::scalar(1);
2915 const LLT S32 = LLT::scalar(32);
2916 const LLT S64 = LLT::scalar(64);
2917
2918 Register Zero32;
2919 Register Zero64;
2920
2921 auto getZero32 = [&]() -> Register {
2922 if (!Zero32)
2923 Zero32 = B.buildConstant(S32, 0).getReg(0);
2924 return Zero32;
2925 };
2926 auto getZero64 = [&]() -> Register {
2927 if (!Zero64)
2928 Zero64 = B.buildConstant(S64, 0).getReg(0);
2929 return Zero64;
2930 };
2931
2932 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
2933 for (unsigned i = 0; i < Src0.size(); ++i) {
2934 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
2935 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
2936 }
2937
2938 // Merge the given carries into the 32-bit LocalAccum, which is modified
2939 // in-place.
2940 //
2941 // Returns the carry-out, which is a single S1 register or null.
2942 auto mergeCarry =
2943 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2944 if (CarryIn.empty())
2945 return Register();
2946
2947 bool HaveCarryOut = true;
2948 Register CarryAccum;
2949 if (CarryIn.size() == 1) {
2950 if (!LocalAccum) {
2951 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2952 return Register();
2953 }
2954
2955 CarryAccum = getZero32();
2956 } else {
2957 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2958 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2959 CarryAccum =
2960 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2961 .getReg(0);
2962 }
2963
2964 if (!LocalAccum) {
2965 LocalAccum = getZero32();
2966 HaveCarryOut = false;
2967 }
2968 }
2969
2970 auto Add =
2971 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2972 LocalAccum = Add.getReg(0);
2973 return HaveCarryOut ? Add.getReg(1) : Register();
2974 };
2975
2976 // Build a multiply-add chain to compute
2977 //
2978 // LocalAccum + (partial products at DstIndex)
2979 // + (opportunistic subset of CarryIn)
2980 //
2981 // LocalAccum is an array of one or two 32-bit registers that are updated
2982 // in-place. The incoming registers may be null.
2983 //
2984 // In some edge cases, carry-ins can be consumed "for free". In that case,
2985 // the consumed carry bits are removed from CarryIn in-place.
2986 auto buildMadChain =
2987 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2988 -> Carry {
2989 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2990 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2991
2992 Carry CarryOut;
2993 unsigned j0 = 0;
2994
2995 // Use plain 32-bit multiplication for the most significant part of the
2996 // result by default.
2997 if (LocalAccum.size() == 1 &&
2998 (!UsePartialMad64_32 || !CarryIn.empty())) {
2999 do {
3000 // Skip multiplication if one of the operands is 0
3001 unsigned j1 = DstIndex - j0;
3002 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3003 ++j0;
3004 continue;
3005 }
3006 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3007 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3008 LocalAccum[0] = Mul.getReg(0);
3009 } else {
3010 if (CarryIn.empty()) {
3011 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3012 } else {
3013 LocalAccum[0] =
3014 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3015 .getReg(0);
3016 CarryIn.pop_back();
3017 }
3018 }
3019 ++j0;
3020 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3021 }
3022
3023 // Build full 64-bit multiplies.
3024 if (j0 <= DstIndex) {
3025 bool HaveSmallAccum = false;
3026 Register Tmp;
3027
3028 if (LocalAccum[0]) {
3029 if (LocalAccum.size() == 1) {
3030 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3031 HaveSmallAccum = true;
3032 } else if (LocalAccum[1]) {
3033 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3034 HaveSmallAccum = false;
3035 } else {
3036 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3037 HaveSmallAccum = true;
3038 }
3039 } else {
3040 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3041 Tmp = getZero64();
3042 HaveSmallAccum = true;
3043 }
3044
3045 do {
3046 unsigned j1 = DstIndex - j0;
3047 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3048 ++j0;
3049 continue;
3050 }
3051 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3052 {Src0[j0], Src1[j1], Tmp});
3053 Tmp = Mad.getReg(0);
3054 if (!HaveSmallAccum)
3055 CarryOut.push_back(Mad.getReg(1));
3056 HaveSmallAccum = false;
3057
3058 ++j0;
3059 } while (j0 <= DstIndex);
3060
3061 auto Unmerge = B.buildUnmerge(S32, Tmp);
3062 LocalAccum[0] = Unmerge.getReg(0);
3063 if (LocalAccum.size() > 1)
3064 LocalAccum[1] = Unmerge.getReg(1);
3065 }
3066
3067 return CarryOut;
3068 };
3069
3070 // Outer multiply loop, iterating over destination parts from least
3071 // significant to most significant parts.
3072 //
3073 // The columns of the following diagram correspond to the destination parts
3074 // affected by one iteration of the outer loop (ignoring boundary
3075 // conditions).
3076 //
3077 //   Dest index relative to 2 * i:      1 0 -1
3078 //                                       ------
3079 //   Carries from previous iteration:      e o
3080 //   Even-aligned partial product sum:   E E .
3081 //   Odd-aligned partial product sum:      O O
3082 //
3083 // 'o' is OddCarry, 'e' is EvenCarry.
3084 // EE and OO are computed from partial products via buildMadChain and use
3085 // accumulation where possible and appropriate.
3086 //
3087 Register SeparateOddCarry;
3088 Carry EvenCarry;
3089 Carry OddCarry;
3090
3091 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3092 Carry OddCarryIn = std::move(OddCarry);
3093 Carry EvenCarryIn = std::move(EvenCarry);
3094 OddCarry.clear();
3095 EvenCarry.clear();
3096
3097 // Partial products at offset 2 * i.
3098 if (2 * i < Accum.size()) {
3099 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3100 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3101 }
3102
3103 // Partial products at offset 2 * i - 1.
3104 if (i > 0) {
3105 if (!SeparateOddAlignedProducts) {
3106 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3107 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3108 } else {
3109 bool IsHighest = 2 * i >= Accum.size();
3110 Register SeparateOddOut[2];
3111 auto LocalAccum = MutableArrayRef(SeparateOddOut)
3112 .take_front(IsHighest ? 1 : 2);
3113 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3114
3115 MachineInstr *Lo;
3116
3117 if (i == 1) {
3118 if (!IsHighest)
3119 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3120 else
3121 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3122 } else {
3123 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3124 SeparateOddCarry);
3125 }
3126 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3127
3128 if (!IsHighest) {
3129 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3130 Lo->getOperand(1).getReg());
3131 Accum[2 * i] = Hi.getReg(0);
3132 SeparateOddCarry = Hi.getReg(1);
3133 }
3134 }
3135 }
3136
3137 // Add in the carries from the previous iteration
3138 if (i > 0) {
3139 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3140 EvenCarryIn.push_back(CarryOut);
3141
3142 if (2 * i < Accum.size()) {
3143 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3144 OddCarry.push_back(CarryOut);
3145 }
3146 }
3147 }
3148}
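The carry bookkeeping above is easier to follow against a plain scalar model. The sketch below is an editorial illustration, not code from this file (mulLimbs and the column-wise formulation are chosen for exposition): it computes the same truncated product over little-endian 32-bit limbs that the MAD_64_32 chains produce, with the per-column carries that buildMultiply tracks in S1 registers handled as ordinary 64-bit integers.

// Editorial sketch: schoolbook multiply over 32-bit limbs, least significant
// limb first, truncated to the destination width like buildMultiply.
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> mulLimbs(const std::vector<uint32_t> &A,
                                      const std::vector<uint32_t> &B) {
  const size_t N = A.size();                  // assume B.size() == N
  std::vector<uint32_t> R(N, 0);
  uint64_t CarryLo = 0, CarryHi = 0;          // carry into the current column
  for (size_t D = 0; D < N; ++D) {            // destination limb (column)
    uint64_t SumLo = CarryLo, SumHi = CarryHi;
    for (size_t J0 = 0; J0 <= D; ++J0) {      // partial products landing at D
      uint64_t P = (uint64_t)A[J0] * B[D - J0];
      SumLo += (uint32_t)P;                   // low halves of the products
      SumHi += P >> 32;                       // high halves of the products
    }
    R[D] = (uint32_t)SumLo;                   // low 32 bits of this column
    // Everything above bit 31 of this column carries into the next one.
    CarryLo = (SumLo >> 32) + (uint32_t)SumHi;
    CarryHi = SumHi >> 32;
  }
  return R;                                   // product truncated to N limbs
}

Each column D gathers the partial products Src0[j0] * Src1[D - j0]; the even/odd split and SeparateOddAlignedProducts in the real code only change how those products are grouped into MAD chains, not the arithmetic.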
3149
3150// Custom narrowing of wide multiplies using wide multiply-add instructions.
3151//
3152// TODO: If the multiply is followed by an addition, we should attempt to
3153// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3154bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3155 MachineInstr &MI) const {
3156 assert(ST.hasMad64_32());
3157 assert(MI.getOpcode() == TargetOpcode::G_MUL);
3158
3159 MachineIRBuilder &B = Helper.MIRBuilder;
3160 MachineRegisterInfo &MRI = *B.getMRI();
3161
3162 Register DstReg = MI.getOperand(0).getReg();
3163 Register Src0 = MI.getOperand(1).getReg();
3164 Register Src1 = MI.getOperand(2).getReg();
3165
3166 LLT Ty = MRI.getType(DstReg);
3167 assert(Ty.isScalar());
3168
3169 unsigned Size = Ty.getSizeInBits();
3170 unsigned NumParts = Size / 32;
3171 assert((Size % 32) == 0);
3172 assert(NumParts >= 2);
3173
3174 // Whether to use MAD_64_32 for partial products whose high half is
3175 // discarded. This avoids some ADD instructions but risks false dependency
3176 // stalls on some subtargets in some cases.
3177 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3178
3179 // Whether to compute odd-aligned partial products separately. This is
3180 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3181 // in an even-aligned VGPR.
3182 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3183
3184 LLT S32 = LLT::scalar(32);
3185 SmallVector<Register, 2> Src0Parts, Src1Parts;
3186 for (unsigned i = 0; i < NumParts; ++i) {
3187 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3188 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3189 }
3190 B.buildUnmerge(Src0Parts, Src0);
3191 B.buildUnmerge(Src1Parts, Src1);
3192
3193 SmallVector<Register, 2> AccumRegs(NumParts);
3194 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3195 SeparateOddAlignedProducts);
3196
3197 B.buildMergeLikeInstr(DstReg, AccumRegs);
3198 MI.eraseFromParent();
3199 return true;
3200}
3201
3202// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3203// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3204// case with a single min instruction instead of a compare+select.
3205bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3206 MachineRegisterInfo &MRI,
3207 MachineIRBuilder &B) const {
3208 Register Dst = MI.getOperand(0).getReg();
3209 Register Src = MI.getOperand(1).getReg();
3210 LLT DstTy = MRI.getType(Dst);
3211 LLT SrcTy = MRI.getType(Src);
3212
3213 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3214 ? AMDGPU::G_AMDGPU_FFBH_U32
3215 : AMDGPU::G_AMDGPU_FFBL_B32;
3216 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3217 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3218
3219 MI.eraseFromParent();
3220 return true;
3221}
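The single-min fixup works because the hardware find-first-bit operations report "no bit found" as all ones. A small editorial model follows (assuming that 0xFFFFFFFF-on-zero convention for FFBH_U32; ffbhModel and legalizedCtlz are illustrative names, not part of this file).

// Editorial model of the zero-input fixup for the ctlz case.
#include <algorithm>
#include <cstdint>

static uint32_t ffbhModel(uint32_t X) {        // leading-zero count, -1u on 0
  for (uint32_t I = 0; I < 32; ++I)
    if (X & (0x80000000u >> I))
      return I;
  return 0xFFFFFFFFu;                          // assumed "no bit" result
}

static uint32_t legalizedCtlz(uint32_t X) {
  // G_CTLZ(x) == umin(FFBH(x), 32): for x == 0, umin(0xFFFFFFFF, 32) == 32.
  return std::min(ffbhModel(X), 32u);
}

The same reasoning applies to G_CTTZ with FFBL_B32 and the trailing-zero count.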
3222
3223// Check that this is a G_XOR x, -1
3224static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3225 if (MI.getOpcode() != TargetOpcode::G_XOR)
3226 return false;
3227 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3228 return ConstVal && *ConstVal == -1;
3229}
3230
3231// Return the use branch instruction, or null if the usage is invalid.
3232static MachineInstr *
3233verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3234 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3235 Register CondDef = MI.getOperand(0).getReg();
3236 if (!MRI.hasOneNonDBGUse(CondDef))
3237 return nullptr;
3238
3239 MachineBasicBlock *Parent = MI.getParent();
3240 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3241
3242 if (isNot(MRI, *UseMI)) {
3243 Register NegatedCond = UseMI->getOperand(0).getReg();
3244 if (!MRI.hasOneNonDBGUse(NegatedCond))
3245 return nullptr;
3246
3247 // We're deleting the def of this value, so we need to remove it.
3248 eraseInstr(*UseMI, MRI);
3249
3250 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3251 Negated = true;
3252 }
3253
3254 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3255 return nullptr;
3256
3257 // Make sure the cond br is followed by a G_BR, or is the last instruction.
3258 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3259 if (Next == Parent->end()) {
3260 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3261 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3262 return nullptr;
3263 UncondBrTarget = &*NextMBB;
3264 } else {
3265 if (Next->getOpcode() != AMDGPU::G_BR)
3266 return nullptr;
3267 Br = &*Next;
3268 UncondBrTarget = Br->getOperand(0).getMBB();
3269 }
3270
3271 return UseMI;
3272}
3273
3274bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3275 const ArgDescriptor *Arg,
3276 const TargetRegisterClass *ArgRC,
3277 LLT ArgTy) const {
3278 MCRegister SrcReg = Arg->getRegister();
3279 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
   [5] '?' condition is true
3280 assert(DstReg.isVirtual() && "Virtual register expected");
   [6] Assuming the condition is true
   [7] '?' condition is true
3281
3282 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3283 *ArgRC, B.getDebugLoc(), ArgTy);
3284 if (Arg->isMasked()) {
   [8] Taking true branch
3285 // TODO: Should we try to emit this once in the entry block?
3286 const LLT S32 = LLT::scalar(32);
3287 const unsigned Mask = Arg->getMask();
3288 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
   [9] Calling 'countr_zero<unsigned int>'
   [16] Returning from 'countr_zero<unsigned int>'
   [17] 'Shift' initialized to 32
3289
3290 Register AndMaskSrc = LiveIn;
3291
3292 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3293 // 0.
3294 if (Shift != 0) {
   [17.1] 'Shift' is not equal to 0
   [18] Taking true branch
3295 auto ShiftAmt = B.buildConstant(S32, Shift);
3296 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3297 }
3298
3299 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
   [19] The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
3300 } else {
3301 B.buildCopy(DstReg, LiveIn);
3302 }
3303
3304 return true;
3305}
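This is the path the analyzer reports: when the masked argument's mask is 0, countr_zero<unsigned>(0) returns 32, so the expression Mask >> Shift at line 3299 shifts a 32-bit value by its full width, which is undefined behaviour in C++. The sketch below reproduces the pattern in isolation and shows one defensive shape; it is editorial only, not the upstream fix, and maskedFieldBits is an illustrative name.

// Editorial sketch of the flagged pattern and one guarded rewrite.
// A count-trailing-zeros of 0 yields the full bit width (32 here), which
// makes the subsequent `Mask >> Shift` undefined.
#include <cstdint>

static uint32_t maskedFieldBits(uint32_t Mask) {
  if (Mask == 0)                          // degenerate mask: nothing to extract
    return 0;
  unsigned Shift = __builtin_ctz(Mask);   // defined for non-zero inputs
  return Mask >> Shift;                   // Shift < 32, shift is well defined
}

In the function above, the equivalent guard would be to establish (or assert) Mask != 0 before deriving Shift and performing the shift.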
3306
3307bool AMDGPULegalizerInfo::loadInputValue(
3308 Register DstReg, MachineIRBuilder &B,
3309 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3310 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3311 const ArgDescriptor *Arg;
3312 const TargetRegisterClass *ArgRC;
3313 LLT ArgTy;
3314 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3315
3316 if (!Arg) {
   [2] Assuming 'Arg' is non-null
3317 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3318 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3319 // case the pointer argument may be missing and we use null.
3320 B.buildConstant(DstReg, 0);
3321 return true;
3322 }
3323
3324 // It's undefined behavior if a function marked with the amdgpu-no-*
3325 // attributes uses the corresponding intrinsic.
3326 B.buildUndef(DstReg);
3327 return true;
3328 }
3329
3330 if (!Arg->isRegister() || !Arg->getRegister().isValid())
   [3] Taking false branch
3331 return false; // TODO: Handle these
3332 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
   [4] Calling 'AMDGPULegalizerInfo::loadInputValue'
3333}
3334
3335bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3336 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3337 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3338 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3339 return false;
3340
3341 MI.eraseFromParent();
3342 return true;
3343}
3344
3345static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3346 int64_t C) {
3347 B.buildConstant(MI.getOperand(0).getReg(), C);
3348 MI.eraseFromParent();
3349 return true;
3350}
3351
3352bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3353 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3354 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3355 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3356 if (MaxID == 0)
3357 return replaceWithConstant(B, MI, 0);
3358
3359 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3360 const ArgDescriptor *Arg;
3361 const TargetRegisterClass *ArgRC;
3362 LLT ArgTy;
3363 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3364
3365 Register DstReg = MI.getOperand(0).getReg();
3366 if (!Arg) {
3367 // It's undefined behavior if a function marked with the amdgpu-no-*
3368 // attributes uses the corresponding intrinsic.
3369 B.buildUndef(DstReg);
3370 MI.eraseFromParent();
3371 return true;
3372 }
3373
3374 if (Arg->isMasked()) {
3375 // Don't bother inserting AssertZext for packed IDs since we're emitting the
3376 // masking operations anyway.
3377 //
3378 // TODO: We could assert the top bit is 0 for the source copy.
3379 if (!loadInputValue(DstReg, B, ArgType))
3380 return false;
3381 } else {
3382 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3383 if (!loadInputValue(TmpReg, B, ArgType))
3384 return false;
3385 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
3386 }
3387
3388 MI.eraseFromParent();
3389 return true;
3390}
3391
3392Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3393 int64_t Offset) const {
3394 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3395 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3396
3397 // TODO: If we passed in the base kernel offset we could have a better
3398 // alignment than 4, but we don't really need it.
3399 if (!loadInputValue(KernArgReg, B,
   [1] Calling 'AMDGPULegalizerInfo::loadInputValue'
3400 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3401 llvm_unreachable("failed to find kernarg segment ptr")::llvm::llvm_unreachable_internal("failed to find kernarg segment ptr"
, "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3401)
;
3402
3403 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3404 // TODO: Should get nuw
3405 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3406}
3407
3408/// Legalize a value that's loaded from kernel arguments. This is only used by
3409/// legacy intrinsics.
3410bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3411 MachineIRBuilder &B,
3412 uint64_t Offset,
3413 Align Alignment) const {
3414 Register DstReg = MI.getOperand(0).getReg();
3415
3416 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3417 "unexpected kernarg parameter type");
3418
3419 Register Ptr = getKernargParameterPtr(B, Offset);
3420 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3421 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3422 MachineMemOperand::MODereferenceable |
3423 MachineMemOperand::MOInvariant);
3424 MI.eraseFromParent();
3425 return true;
3426}
3427
3428bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3429 MachineRegisterInfo &MRI,
3430 MachineIRBuilder &B) const {
3431 Register Dst = MI.getOperand(0).getReg();
3432 LLT DstTy = MRI.getType(Dst);
3433 LLT S16 = LLT::scalar(16);
3434 LLT S32 = LLT::scalar(32);
3435 LLT S64 = LLT::scalar(64);
3436
3437 if (DstTy == S16)
3438 return legalizeFDIV16(MI, MRI, B);
3439 if (DstTy == S32)
3440 return legalizeFDIV32(MI, MRI, B);
3441 if (DstTy == S64)
3442 return legalizeFDIV64(MI, MRI, B);
3443
3444 return false;
3445}
3446
3447void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3448 Register DstDivReg,
3449 Register DstRemReg,
3450 Register X,
3451 Register Y) const {
3452 const LLT S1 = LLT::scalar(1);
3453 const LLT S32 = LLT::scalar(32);
3454
3455 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3456 // algorithm used here.
3457
3458 // Initial estimate of inv(y).
3459 auto FloatY = B.buildUITOFP(S32, Y);
3460 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3461 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
3462 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3463 auto Z = B.buildFPTOUI(S32, ScaledY);
3464
3465 // One round of UNR.
3466 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3467 auto NegYZ = B.buildMul(S32, NegY, Z);
3468 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3469
3470 // Quotient/remainder estimate.
3471 auto Q = B.buildUMulH(S32, X, Z);
3472 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3473
3474 // First quotient/remainder refinement.
3475 auto One = B.buildConstant(S32, 1);
3476 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3477 if (DstDivReg)
3478 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3479 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3480
3481 // Second quotient/remainder refinement.
3482 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3483 if (DstDivReg)
3484 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3485
3486 if (DstRemReg)
3487 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3488}
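For reference, the expandDivRem32 scheme followed here is: build z ≈ 2^32 / y from a scaled single-precision reciprocal, refine it with one Newton-Raphson step, form a quotient/remainder estimate, and correct it twice. A host-side scalar model is sketched below; it is editorial, the hardware RCP_IFLAG is stood in for by an ordinary float division, and it has not been exhaustively validated here.

// Editorial scalar model of the UNR-based 32-bit unsigned divide/remainder.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t mulhi(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32);
}

static void udivrem32(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  assert(Y != 0);
  // Initial estimate of 2^32 / Y: scale a single-precision reciprocal by the
  // constant 0x4f7ffffe (just under 2^32) and truncate back to an integer.
  float Rcp = 1.0f / (float)Y;                 // stands in for RCP_IFLAG
  uint32_t ScaleBits = 0x4f7ffffeu;
  float Scale;
  std::memcpy(&Scale, &ScaleBits, sizeof(Scale));
  uint32_t Z = (uint32_t)(Rcp * Scale);

  // One round of Newton-Raphson: Z += mulhi(Z, Z * (0 - Y)).
  uint32_t NegY = 0u - Y;
  Z += mulhi(Z, NegY * Z);

  // Quotient/remainder estimate plus two correction steps.
  Q = mulhi(X, Z);
  R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }
  if (R >= Y) { ++Q; R -= Y; }
}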
3489
3490// Build integer reciprocal sequence around V_RCP_IFLAG_F32
3491//
3492// Return lo, hi of result
3493//
3494// %cvt.lo = G_UITOFP Val.lo
3495// %cvt.hi = G_UITOFP Val.hi
3496// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3497// %rcp = G_AMDGPU_RCP_IFLAG %mad
3498// %mul1 = G_FMUL %rcp, 0x5f7ffffc
3499// %mul2 = G_FMUL %mul1, 2**(-32)
3500// %trunc = G_INTRINSIC_TRUNC %mul2
3501// %mad2 = G_FMAD %trunc, -(2**32), %mul1
3502// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3503static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3504 Register Val) {
3505 const LLT S32 = LLT::scalar(32);
3506 auto Unmerge = B.buildUnmerge(S32, Val);
3507
3508 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3509 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3510
3511 auto Mad = B.buildFMAD(
3512 S32, CvtHi, // 2**32
3513 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
3514
3515 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3516 auto Mul1 = B.buildFMul(
3517 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
3518
3519 // 2**(-32)
3520 auto Mul2 = B.buildFMul(
3521 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
3522 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3523
3524 // -(2**32)
3525 auto Mad2 = B.buildFMAD(
3526 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
3527 Mul1);
3528
3529 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3530 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3531
3532 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3533}
3534
3535void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3536 Register DstDivReg,
3537 Register DstRemReg,
3538 Register Numer,
3539 Register Denom) const {
3540 const LLT S32 = LLT::scalar(32);
3541 const LLT S64 = LLT::scalar(64);
3542 const LLT S1 = LLT::scalar(1);
3543 Register RcpLo, RcpHi;
3544
3545 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3546
3547 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
3548
3549 auto Zero64 = B.buildConstant(S64, 0);
3550 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3551
3552 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3553 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3554
3555 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3556 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3557 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3558
3559 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3560 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3561 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
3562
3563 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3564 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3565 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3566 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3567 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3568
3569 auto Zero32 = B.buildConstant(S32, 0);
3570 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3571 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3572 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
3573
3574 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3575 Register NumerLo = UnmergeNumer.getReg(0);
3576 Register NumerHi = UnmergeNumer.getReg(1);
3577
3578 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3579 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3580 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3581 Register Mul3_Lo = UnmergeMul3.getReg(0);
3582 Register Mul3_Hi = UnmergeMul3.getReg(1);
3583 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3584 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3585 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3586 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
3587
3588 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3589 Register DenomLo = UnmergeDenom.getReg(0);
3590 Register DenomHi = UnmergeDenom.getReg(1);
3591
3592 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3593 auto C1 = B.buildSExt(S32, CmpHi);
3594
3595 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3596 auto C2 = B.buildSExt(S32, CmpLo);
3597
3598 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3599 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3600
3601 // TODO: Here and below portions of the code can be enclosed into if/endif.
3602 // Currently control flow is unconditional and we have 4 selects after
3603 // potential endif to substitute PHIs.
3604
3605 // if C3 != 0 ...
3606 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3607 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3608 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3609 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
3610
3611 auto One64 = B.buildConstant(S64, 1);
3612 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3613
3614 auto C4 =
3615 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3616 auto C5 =
3617 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3618 auto C6 = B.buildSelect(
3619 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3620
3621 // if (C6 != 0)
3622 auto Add4 = B.buildAdd(S64, Add3, One64);
3623 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3624
3625 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3626 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3627 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
3628
3629 // endif C6
3630 // endif C3
3631
3632 if (DstDivReg) {
3633 auto Sel1 = B.buildSelect(
3634 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3635 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3636 Sel1, MulHi3);
3637 }
3638
3639 if (DstRemReg) {
3640 auto Sel2 = B.buildSelect(
3641 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3642 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3643 Sel2, Sub1);
3644 }
3645}
3646
3647bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3648 MachineRegisterInfo &MRI,
3649 MachineIRBuilder &B) const {
3650 Register DstDivReg, DstRemReg;
3651 switch (MI.getOpcode()) {
3652 default:
3653 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 3653)
;
3654 case AMDGPU::G_UDIV: {
3655 DstDivReg = MI.getOperand(0).getReg();
3656 break;
3657 }
3658 case AMDGPU::G_UREM: {
3659 DstRemReg = MI.getOperand(0).getReg();
3660 break;
3661 }
3662 case AMDGPU::G_UDIVREM: {
3663 DstDivReg = MI.getOperand(0).getReg();
3664 DstRemReg = MI.getOperand(1).getReg();
3665 break;
3666 }
3667 }
3668
3669 const LLT S64 = LLT::scalar(64);
3670 const LLT S32 = LLT::scalar(32);
3671 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3672 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3673 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3674 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3675
3676 if (Ty == S32)
3677 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3678 else if (Ty == S64)
3679 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3680 else
3681 return false;
3682
3683 MI.eraseFromParent();
3684 return true;
3685}
3686
3687bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3688 MachineRegisterInfo &MRI,
3689 MachineIRBuilder &B) const {
3690 const LLT S64 = LLT::scalar(64);
3691 const LLT S32 = LLT::scalar(32);
3692
3693 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3694 if (Ty != S32 && Ty != S64)
3695 return false;
3696
3697 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3698 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3699 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3700
3701 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3702 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3703 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3704
3705 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3706 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3707
3708 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3709 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3710
3711 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3712 switch (MI.getOpcode()) {
3713 default:
3714 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 3714)
;
3715 case AMDGPU::G_SDIV: {
3716 DstDivReg = MI.getOperand(0).getReg();
3717 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3718 break;
3719 }
3720 case AMDGPU::G_SREM: {
3721 DstRemReg = MI.getOperand(0).getReg();
3722 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3723 break;
3724 }
3725 case AMDGPU::G_SDIVREM: {
3726 DstDivReg = MI.getOperand(0).getReg();
3727 DstRemReg = MI.getOperand(1).getReg();
3728 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3729 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3730 break;
3731 }
3732 }
3733
3734 if (Ty == S32)
3735 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3736 else
3737 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3738
3739 if (DstDivReg) {
3740 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3741 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3742 B.buildSub(DstDivReg, SignXor, Sign);
3743 }
3744
3745 if (DstRemReg) {
3746 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3747 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3748 B.buildSub(DstRemReg, SignXor, Sign);
3749 }
3750
3751 MI.eraseFromParent();
3752 return true;
3753}
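The sign handling above uses the standard branch-free identities: with s = x >> (bits - 1) equal to 0 or all ones, (x + s) ^ s gives |x| in two's complement, and (t ^ sign) - sign re-applies a sign on the way out; the quotient takes sign(LHS) ^ sign(RHS) and the remainder takes sign(LHS). An editorial 32-bit sketch follows (sdivrem32 is an illustrative name; an arithmetic right shift is assumed, matching the G_ASHR used above).

// Editorial sketch of the xor/add sign folding used by legalizeSignedDIV_REM.
#include <cstdint>

static void sdivrem32(int32_t LHS, int32_t RHS, int32_t &Div, int32_t &Rem) {
  // RHS is assumed non-zero, as in the original lowering.
  uint32_t LSign = (uint32_t)(LHS >> 31);        // 0 or 0xFFFFFFFF
  uint32_t RSign = (uint32_t)(RHS >> 31);

  uint32_t L = ((uint32_t)LHS + LSign) ^ LSign;  // |LHS|
  uint32_t R = ((uint32_t)RHS + RSign) ^ RSign;  // |RHS|

  uint32_t Q = L / R;                            // unsigned core (UDIV/UREM)
  uint32_t M = L % R;

  uint32_t QSign = LSign ^ RSign;                // quotient sign
  Div = (int32_t)((Q ^ QSign) - QSign);
  Rem = (int32_t)((M ^ LSign) - LSign);          // remainder follows LHS
}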
3754
3755bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3756 MachineRegisterInfo &MRI,
3757 MachineIRBuilder &B) const {
3758 Register Res = MI.getOperand(0).getReg();
3759 Register LHS = MI.getOperand(1).getReg();
3760 Register RHS = MI.getOperand(2).getReg();
3761 uint16_t Flags = MI.getFlags();
3762 LLT ResTy = MRI.getType(Res);
3763
3764 const MachineFunction &MF = B.getMF();
3765 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3766 MI.getFlag(MachineInstr::FmAfn);
3767
3768 if (!AllowInaccurateRcp)
3769 return false;
3770
3771 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3772 // 1 / x -> RCP(x)
3773 if (CLHS->isExactlyValue(1.0)) {
3774 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3775 .addUse(RHS)
3776 .setMIFlags(Flags);
3777
3778 MI.eraseFromParent();
3779 return true;
3780 }
3781
3782 // -1 / x -> RCP( FNEG(x) )
3783 if (CLHS->isExactlyValue(-1.0)) {
3784 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3785 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3786 .addUse(FNeg.getReg(0))
3787 .setMIFlags(Flags);
3788
3789 MI.eraseFromParent();
3790 return true;
3791 }
3792 }
3793
3794 // x / y -> x * (1.0 / y)
3795 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3796 .addUse(RHS)
3797 .setMIFlags(Flags);
3798 B.buildFMul(Res, LHS, RCP, Flags);
3799
3800 MI.eraseFromParent();
3801 return true;
3802}
3803
3804bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3805 MachineRegisterInfo &MRI,
3806 MachineIRBuilder &B) const {
3807 Register Res = MI.getOperand(0).getReg();
3808 Register X = MI.getOperand(1).getReg();
3809 Register Y = MI.getOperand(2).getReg();
3810 uint16_t Flags = MI.getFlags();
3811 LLT ResTy = MRI.getType(Res);
3812
3813 const MachineFunction &MF = B.getMF();
3814 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3815 MI.getFlag(MachineInstr::FmAfn);
3816
3817 if (!AllowInaccurateRcp)
3818 return false;
3819
3820 auto NegY = B.buildFNeg(ResTy, Y);
3821 auto One = B.buildFConstant(ResTy, 1.0);
3822
3823 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3824 .addUse(Y)
3825 .setMIFlags(Flags);
3826
3827 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3828 R = B.buildFMA(ResTy, Tmp0, R, R);
3829
3830 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3831 R = B.buildFMA(ResTy, Tmp1, R, R);
3832
3833 auto Ret = B.buildFMul(ResTy, X, R);
3834 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3835
3836 B.buildFMA(Res, Tmp2, R, Ret);
3837 MI.eraseFromParent();
3838 return true;
3839}
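The chain above is two Newton-Raphson refinements of the reciprocal estimate, r <- fma(fma(-y, r, 1), r, r), followed by one residual correction of the product x*r. Below is an editorial double-precision model, using std::fma in place of the G_FMA nodes and an ordinary division in place of amdgcn.rcp; fastFDiv64Model is an illustrative name.

// Editorial model of legalizeFastUnsafeFDIV64's refinement chain.
#include <cmath>

static double fastFDiv64Model(double X, double Y) {
  double NegY = -Y;
  double R = 1.0 / Y;                    // stands in for the rcp estimate
  double T0 = std::fma(NegY, R, 1.0);    // 1 - Y*R
  R = std::fma(T0, R, R);                // first refinement
  double T1 = std::fma(NegY, R, 1.0);
  R = std::fma(T1, R, R);                // second refinement
  double Ret = X * R;
  double T2 = std::fma(NegY, Ret, X);    // residual X - Y*Ret
  return std::fma(T2, R, Ret);           // final correction
}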
3840
3841bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3842 MachineRegisterInfo &MRI,
3843 MachineIRBuilder &B) const {
3844 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3845 return true;
3846
3847 Register Res = MI.getOperand(0).getReg();
3848 Register LHS = MI.getOperand(1).getReg();
3849 Register RHS = MI.getOperand(2).getReg();
3850
3851 uint16_t Flags = MI.getFlags();
3852
3853 LLT S16 = LLT::scalar(16);
3854 LLT S32 = LLT::scalar(32);
3855
3856 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3857 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3858
3859 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3860 .addUse(RHSExt.getReg(0))
3861 .setMIFlags(Flags);
3862
3863 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3864 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3865
3866 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3867 .addUse(RDst.getReg(0))
3868 .addUse(RHS)
3869 .addUse(LHS)
3870 .setMIFlags(Flags);
3871
3872 MI.eraseFromParent();
3873 return true;
3874}
3875
3876// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3877// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3878static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
3879 const GCNSubtarget &ST,
3880 SIModeRegisterDefaults Mode) {
3881 // Set SP denorm mode to this value.
3882 unsigned SPDenormMode =
3883 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3884
3885 if (ST.hasDenormModeInst()) {
3886 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3887 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3888
3889 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3890 B.buildInstr(AMDGPU::S_DENORM_MODE)
3891 .addImm(NewDenormModeValue);
3892
3893 } else {
3894 // Select FP32 bit field in mode register.
3895 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3896 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3897 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3898
3899 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3900 .addImm(SPDenormMode)
3901 .addImm(SPDenormModeBitField);
3902 }
3903}
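The S_DENORM_MODE immediate built above packs the FP32 denorm control into bits [1:0] and the FP64/FP16 control into bits [3:2], which is what SPDenormMode | (DPDenormModeDefault << 2) composes; on targets without S_DENORM_MODE, the same two FP32 bits are written through S_SETREG into the MODE register instead. A one-function editorial illustration of the packing (packDenormMode is an illustrative name):

// Editorial illustration: 4-bit denorm-mode immediate as composed above.
#include <cstdint>

static uint32_t packDenormMode(uint32_t SPDenormMode, uint32_t DPDenormMode) {
  // bits [1:0]: FP32 denorm mode, bits [3:2]: FP64/FP16 denorm mode
  return (SPDenormMode & 0x3) | ((DPDenormMode & 0x3) << 2);
}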
3904
3905bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3906 MachineRegisterInfo &MRI,
3907 MachineIRBuilder &B) const {
3908 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3909 return true;
3910
3911 Register Res = MI.getOperand(0).getReg();
3912 Register LHS = MI.getOperand(1).getReg();
3913 Register RHS = MI.getOperand(2).getReg();
3914 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3915 SIModeRegisterDefaults Mode = MFI->getMode();
3916
3917 uint16_t Flags = MI.getFlags();
3918
3919 LLT S32 = LLT::scalar(32);
3920 LLT S1 = LLT::scalar(1);
3921
3922 auto One = B.buildFConstant(S32, 1.0f);
3923
3924 auto DenominatorScaled =
3925 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3926 .addUse(LHS)
3927 .addUse(RHS)
3928 .addImm(0)
3929 .setMIFlags(Flags);
3930 auto NumeratorScaled =
3931 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3932 .addUse(LHS)
3933 .addUse(RHS)
3934 .addImm(1)
3935 .setMIFlags(Flags);
3936
3937 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3938 .addUse(DenominatorScaled.getReg(0))
3939 .setMIFlags(Flags);
3940 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3941
3942 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3943 // aren't modeled as reading it.
3944 if (!Mode.allFP32Denormals())
3945 toggleSPDenormMode(true, B, ST, Mode);
3946
3947 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3948 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3949 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3950 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3951 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3952 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3953
3954 if (!Mode.allFP32Denormals())
3955 toggleSPDenormMode(false, B, ST, Mode);
3956
3957 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3958 .addUse(Fma4.getReg(0))
3959 .addUse(Fma1.getReg(0))
3960 .addUse(Fma3.getReg(0))
3961 .addUse(NumeratorScaled.getReg(1))
3962 .setMIFlags(Flags);
3963
3964 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3965 .addUse(Fmas.getReg(0))
3966 .addUse(RHS)
3967 .addUse(LHS)
3968 .setMIFlags(Flags);
3969
3970 MI.eraseFromParent();
3971 return true;
3972}
3973
3974bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3975 MachineRegisterInfo &MRI,
3976 MachineIRBuilder &B) const {
3977 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3978 return true;
3979
3980 Register Res = MI.getOperand(0).getReg();
3981 Register LHS = MI.getOperand(1).getReg();
3982 Register RHS = MI.getOperand(2).getReg();
3983
3984 uint16_t Flags = MI.getFlags();
3985
3986 LLT S64 = LLT::scalar(64);
3987 LLT S1 = LLT::scalar(1);
3988
3989 auto One = B.buildFConstant(S64, 1.0);
3990
3991 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3992 .addUse(LHS)
3993 .addUse(RHS)
3994 .addImm(0)
3995 .setMIFlags(Flags);
3996
3997 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3998
3999 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
4000 .addUse(DivScale0.getReg(0))
4001 .setMIFlags(Flags);
4002
4003 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4004 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4005 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4006
4007 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4008 .addUse(LHS)
4009 .addUse(RHS)
4010 .addImm(1)
4011 .setMIFlags(Flags);
4012
4013 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4014 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4015 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4016
4017 Register Scale;
4018 if (!ST.hasUsableDivScaleConditionOutput()) {
4019 // Workaround a hardware bug on SI where the condition output from div_scale
4020 // is not usable.
4021
4022 LLT S32 = LLT::scalar(32);
4023
4024 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4025 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4026 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4027 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4028
4029 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4030 Scale1Unmerge.getReg(1));
4031 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4032 Scale0Unmerge.getReg(1));
4033 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4034 } else {
4035 Scale = DivScale1.getReg(1);
4036 }
4037
4038 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4039 .addUse(Fma4.getReg(0))
4040 .addUse(Fma3.getReg(0))
4041 .addUse(Mul.getReg(0))
4042 .addUse(Scale)
4043 .setMIFlags(Flags);
4044
4045 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4046 .addUse(Fmas.getReg(0))
4047 .addUse(RHS)
4048 .addUse(LHS)
4049 .setMIFlags(Flags);
4050
4051 MI.eraseFromParent();
4052 return true;
4053}
4054
4055bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4056 MachineRegisterInfo &MRI,
4057 MachineIRBuilder &B) const {
4058 Register Res = MI.getOperand(0).getReg();
4059 Register LHS = MI.getOperand(2).getReg();
4060 Register RHS = MI.getOperand(3).getReg();
4061 uint16_t Flags = MI.getFlags();
4062
4063 LLT S32 = LLT::scalar(32);
4064 LLT S1 = LLT::scalar(1);
4065
4066 auto Abs = B.buildFAbs(S32, RHS, Flags);
4067 const APFloat C0Val(1.0f);
4068
4069 auto C0 = B.buildConstant(S32, 0x6f800000);
4070 auto C1 = B.buildConstant(S32, 0x2f800000);
4071 auto C2 = B.buildConstant(S32, llvm::bit_cast<uint32_t>(1.0f));
4072
4073 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4074 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4075
4076 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4077
4078 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4079 .addUse(Mul0.getReg(0))
4080 .setMIFlags(Flags);
4081
4082 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4083
4084 B.buildFMul(Res, Sel, Mul1, Flags);
4085
4086 MI.eraseFromParent();
4087 return true;
4088}
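The magic constants above are IEEE-754 single bit patterns: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. When |rhs| is very large, the denominator is pre-scaled by 2^-32 before the reciprocal and the same factor is re-applied to the result, keeping the rcp input in range. An editorial scalar model follows (fdivFastModel is an illustrative name; 1.0f/x stands in for amdgcn.rcp):

// Editorial model of the prescaled fast fdiv: res = sel * (lhs * rcp(rhs*sel)).
#include <cmath>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static float fdivFastModel(float LHS, float RHS) {
  const float C0 = bitsToFloat(0x6f800000u);  // 2^96
  const float C1 = bitsToFloat(0x2f800000u);  // 2^-32
  float Sel = (std::fabs(RHS) > C0) ? C1 : 1.0f;
  float Rcp = 1.0f / (RHS * Sel);             // stands in for amdgcn.rcp
  return Sel * (LHS * Rcp);
}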
4089
4090// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4091// FIXME: Why do we handle this one but not other removed instructions?
4092//
4093// Reciprocal square root. The clamp prevents infinite results, clamping
4094// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4095// +-max_float.
4096bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4097 MachineRegisterInfo &MRI,
4098 MachineIRBuilder &B) const {
4099 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4100 return true;
4101
4102 Register Dst = MI.getOperand(0).getReg();
4103 Register Src = MI.getOperand(2).getReg();
4104 auto Flags = MI.getFlags();
4105
4106 LLT Ty = MRI.getType(Dst);
4107
4108 const fltSemantics *FltSemantics;
4109 if (Ty == LLT::scalar(32))
4110 FltSemantics = &APFloat::IEEEsingle();
4111 else if (Ty == LLT::scalar(64))
4112 FltSemantics = &APFloat::IEEEdouble();
4113 else
4114 return false;
4115
4116 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4117 .addUse(Src)
4118 .setMIFlags(Flags);
4119
4120 // We don't need to concern ourselves with the snan handling difference, since
4121 // the rsq output is already quieted (or not); use the form that selects directly.
4122 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4123 const bool UseIEEE = MFI->getMode().IEEE;
4124
4125 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4126 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4127 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4128
4129 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4130
4131 if (UseIEEE)
4132 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4133 else
4134 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4135 MI.eraseFromParent();
4136 return true;
4137}
4138
4139static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4140 switch (IID) {
4141 case Intrinsic::amdgcn_ds_fadd:
4142 return AMDGPU::G_ATOMICRMW_FADD;
4143 case Intrinsic::amdgcn_ds_fmin:
4144 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4145 case Intrinsic::amdgcn_ds_fmax:
4146 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4147 default:
4148 llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 4148)
;
4149 }
4150}
4151
4152bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4153 MachineInstr &MI,
4154 Intrinsic::ID IID) const {
4155 GISelChangeObserver &Observer = Helper.Observer;
4156 Observer.changingInstr(MI);
4157
4158 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4159
4160 // The remaining operands were used to set fields in the MemOperand on
4161 // construction.
4162 for (int I = 6; I > 3; --I)
4163 MI.removeOperand(I);
4164
4165 MI.removeOperand(1); // Remove the intrinsic ID.
4166 Observer.changedInstr(MI);
4167 return true;
4168}
4169
4170bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4171 MachineRegisterInfo &MRI,
4172 MachineIRBuilder &B) const {
4173 uint64_t Offset =
4174 ST.getTargetLowering()->getImplicitParameterOffset(
4175 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4176 LLT DstTy = MRI.getType(DstReg);
4177 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4178
4179 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4180 if (!loadInputValue(KernargPtrReg, B,
4181 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4182 return false;
4183
4184 // FIXME: This should be nuw
4185 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4186 return true;
4187}
4188
4189bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4190 MachineRegisterInfo &MRI,
4191 MachineIRBuilder &B) const {
4192 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4193 if (!MFI->isEntryFunction()) {
4194 return legalizePreloadedArgIntrin(MI, MRI, B,
4195 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4196 }
4197
4198 Register DstReg = MI.getOperand(0).getReg();
4199 if (!getImplicitArgPtr(DstReg, MRI, B))
4200 return false;
4201
4202 MI.eraseFromParent();
4203 return true;
4204}
4205
4206bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4207 MachineRegisterInfo &MRI,
4208 MachineIRBuilder &B) const {
4209 Function &F = B.getMF().getFunction();
4210 std::optional<uint32_t> KnownSize =
4211 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4212 if (KnownSize.has_value())
4213 B.buildConstant(DstReg, *KnownSize);
4214 return false;
4215}
4216
4217bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4218 MachineRegisterInfo &MRI,
4219 MachineIRBuilder &B) const {
4220
4221 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4222 if (!MFI->isEntryFunction()) {
4223 return legalizePreloadedArgIntrin(MI, MRI, B,
4224 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4225 }
4226
4227 Register DstReg = MI.getOperand(0).getReg();
4228 if (!getLDSKernelId(DstReg, MRI, B))
4229 return false;
4230
4231 MI.eraseFromParent();
4232 return true;
4233}
4234
4235bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4236 MachineRegisterInfo &MRI,
4237 MachineIRBuilder &B,
4238 unsigned AddrSpace) const {
4239 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4240 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4241 Register Hi32 = Unmerge.getReg(1);
4242
4243 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4244 MI.eraseFromParent();
4245 return true;
4246}
4247
4248// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4249// offset (the offset that is included in bounds checking and swizzling, to be
4250// split between the instruction's voffset and immoffset fields) and soffset
4251// (the offset that is excluded from bounds checking and swizzling, to go in
4252// the instruction's soffset field). This function takes the first kind of
4253// offset and figures out how to split it between voffset and immoffset.
4254std::pair<Register, unsigned>
4255AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4256 Register OrigOffset) const {
4257 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
4258 Register BaseReg;
4259 unsigned ImmOffset;
4260 const LLT S32 = LLT::scalar(32);
4261 MachineRegisterInfo &MRI = *B.getMRI();
4262
4263 std::tie(BaseReg, ImmOffset) =
4264 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4265
4266 // If BaseReg is a pointer, convert it to int.
4267 if (MRI.getType(BaseReg).isPointer())
4268 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4269
4270 // If the immediate value is too big for the immoffset field, put only bits
4271 // that would normally fit in the immoffset field. The remaining value that
4272 // is copied/added for the voffset field is a large power of 2, and it
4273 // stands more chance of being CSEd with the copy/add for another similar
4274 // load/store.
4275 // However, do not do that rounding down if that is a negative
4276 // number, as it appears to be illegal to have a negative offset in the
4277 // vgpr, even if adding the immediate offset makes it positive.
4278 unsigned Overflow = ImmOffset & ~MaxImm;
4279 ImmOffset -= Overflow;
4280 if ((int32_t)Overflow < 0) {
4281 Overflow += ImmOffset;
4282 ImmOffset = 0;
4283 }
4284
4285 if (Overflow != 0) {
4286 if (!BaseReg) {
4287 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4288 } else {
4289 auto OverflowVal = B.buildConstant(S32, Overflow);
4290 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4291 }
4292 }
4293
4294 if (!BaseReg)
4295 BaseReg = B.buildConstant(S32, 0).getReg(0);
4296
4297 return std::pair(BaseReg, ImmOffset);
4298}
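As a scalar walk-through of the split: the encodable low bits stay in the immediate field, the remaining high bits move to the voffset register, and a split whose register part would come out negative is instead pushed entirely into the register. Editorial sketch, assuming the common 12-bit MUBUF immediate so MaxImm == 4095 (splitOffset is an illustrative name):

// Editorial sketch of the imm/voffset split performed by splitBufferOffsets.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitOffset(uint32_t ImmOffset,
                                                 uint32_t MaxImm = 4095) {
  uint32_t Overflow = ImmOffset & ~MaxImm;   // part that cannot be encoded
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {               // keep the VGPR offset non-negative
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  // Overflow is added to (or becomes) the voffset register; ImmOffset goes
  // into the instruction's immediate offset field.
  return {Overflow, ImmOffset};
}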
4299
4300/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4301void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4302 Register VOffset, Register SOffset,
4303 unsigned ImmOffset, Register VIndex,
4304 MachineRegisterInfo &MRI) const {
4305 std::optional<ValueAndVReg> MaybeVOffsetVal =
4306 getIConstantVRegValWithLookThrough(VOffset, MRI);
4307 std::optional<ValueAndVReg> MaybeSOffsetVal =
4308 getIConstantVRegValWithLookThrough(SOffset, MRI);
4309 std::optional<ValueAndVReg> MaybeVIndexVal =
4310 getIConstantVRegValWithLookThrough(VIndex, MRI);
4311 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4312 // update the MMO with that offset. The stride is unknown so we can only do
4313 // this if VIndex is constant 0.
4314 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4315 MaybeVIndexVal->Value == 0) {
4316 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4317 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4318 MMO->setOffset(TotalOffset);
4319 } else {
4320 // We don't have a constant combined offset to use in the MMO. Give up.
4321 MMO->setValue((Value *)nullptr);
4322 }
4323}
4324
4325/// Handle register layout difference for f16 images for some subtargets.
4326Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4327 MachineRegisterInfo &MRI,
4328 Register Reg,
4329 bool ImageStore) const {
4330 const LLT S16 = LLT::scalar(16);
4331 const LLT S32 = LLT::scalar(32);
4332 LLT StoreVT = MRI.getType(Reg);
4333 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4334
4335 if (ST.hasUnpackedD16VMem()) {
4336 auto Unmerge = B.buildUnmerge(S16, Reg);
4337
4338 SmallVector<Register, 4> WideRegs;
4339 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4340 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4341
4342 int NumElts = StoreVT.getNumElements();
4343
4344 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4345 .getReg(0);
4346 }
4347
4348 if (ImageStore && ST.hasImageStoreD16Bug()) {
4349 if (StoreVT.getNumElements() == 2) {
4350 SmallVector<Register, 4> PackedRegs;
4351 Reg = B.buildBitcast(S32, Reg).getReg(0);
4352 PackedRegs.push_back(Reg);
4353 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4354 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4355 .getReg(0);
4356 }
4357
4358 if (StoreVT.getNumElements() == 3) {
4359 SmallVector<Register, 4> PackedRegs;
4360 auto Unmerge = B.buildUnmerge(S16, Reg);
4361 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4362 PackedRegs.push_back(Unmerge.getReg(I));
4363 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4364 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4365 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4366 }
4367
4368 if (StoreVT.getNumElements() == 4) {
4369 SmallVector<Register, 4> PackedRegs;
4370 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4371 auto Unmerge = B.buildUnmerge(S32, Reg);
4372 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4373 PackedRegs.push_back(Unmerge.getReg(I));
4374 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4375 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4376 .getReg(0);
4377 }
4378
4379 llvm_unreachable("invalid data type")::llvm::llvm_unreachable_internal("invalid data type", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 4379)
;
4380 }
4381
4382 if (StoreVT == LLT::fixed_vector(3, S16)) {
4383 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4384 .getReg(0);
4385 }
4386 return Reg;
4387}
4388
4389Register AMDGPULegalizerInfo::fixStoreSourceType(
4390 MachineIRBuilder &B, Register VData, bool IsFormat) const {
4391 MachineRegisterInfo *MRI = B.getMRI();
4392 LLT Ty = MRI->getType(VData);
4393
4394 const LLT S16 = LLT::scalar(16);
4395
4396 // Fixup illegal register types for i8 stores.
4397 if (Ty == LLT::scalar(8) || Ty == S16) {
4398 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4399 return AnyExt;
4400 }
4401
4402 if (Ty.isVector()) {
4403 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4404 if (IsFormat)
4405 return handleD16VData(B, *MRI, VData);
4406 }
4407 }
4408
4409 return VData;
4410}
4411
4412bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4413 MachineRegisterInfo &MRI,
4414 MachineIRBuilder &B,
4415 bool IsTyped,
4416 bool IsFormat) const {
4417 Register VData = MI.getOperand(1).getReg();
4418 LLT Ty = MRI.getType(VData);
4419 LLT EltTy = Ty.getScalarType();
4420 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4421 const LLT S32 = LLT::scalar(32);
4422
4423 VData = fixStoreSourceType(B, VData, IsFormat);
4424 Register RSrc = MI.getOperand(2).getReg();
4425
4426 MachineMemOperand *MMO = *MI.memoperands_begin();
4427 const int MemSize = MMO->getSize();
4428
4429 unsigned ImmOffset;
4430
4431 // The typed intrinsics add an immediate after the registers.
4432 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4433
4434 // The struct intrinsic variants add one additional operand over raw.
4435 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4436 Register VIndex;
4437 int OpOffset = 0;
4438 if (HasVIndex) {
4439 VIndex = MI.getOperand(3).getReg();
4440 OpOffset = 1;
4441 } else {
4442 VIndex = B.buildConstant(S32, 0).getReg(0);
4443 }
4444
4445 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4446 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4447
4448 unsigned Format = 0;
4449 if (IsTyped) {
4450 Format = MI.getOperand(5 + OpOffset).getImm();
4451 ++OpOffset;
4452 }
4453
4454 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4455
4456 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4457 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4458
4459 unsigned Opc;
4460 if (IsTyped) {
4461 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4462 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4463 } else if (IsFormat) {
4464 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4465 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4466 } else {
4467 switch (MemSize) {
4468 case 1:
4469 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4470 break;
4471 case 2:
4472 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4473 break;
4474 default:
4475 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4476 break;
4477 }
4478 }
4479
4480 auto MIB = B.buildInstr(Opc)
4481 .addUse(VData) // vdata
4482 .addUse(RSrc) // rsrc
4483 .addUse(VIndex) // vindex
4484 .addUse(VOffset) // voffset
4485 .addUse(SOffset) // soffset
4486 .addImm(ImmOffset); // offset(imm)
4487
4488 if (IsTyped)
4489 MIB.addImm(Format);
4490
4491 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4492 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4493 .addMemOperand(MMO);
4494
4495 MI.eraseFromParent();
4496 return true;
4497}
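
The MemSize switch above selects the narrow byte/short store pseudos only for 1- and 2-byte accesses and falls back to the dword form otherwise. Below is a minimal standalone sketch of that dispatch; the enum is a hypothetical stand-in for the G_AMDGPU_BUFFER_STORE_* opcodes, not the LLVM API.

#include <cassert>

enum class StoreOpc { Byte, Short, Dword }; // hypothetical stand-ins for the pseudos

static StoreOpc selectStoreOpc(int MemSizeInBytes) {
  switch (MemSizeInBytes) {
  case 1:  return StoreOpc::Byte;   // maps to the _BYTE pseudo
  case 2:  return StoreOpc::Short;  // maps to the _SHORT pseudo
  default: return StoreOpc::Dword;  // 4 bytes and wider use the plain store
  }
}

int main() {
  assert(selectStoreOpc(1) == StoreOpc::Byte);
  assert(selectStoreOpc(2) == StoreOpc::Short);
  assert(selectStoreOpc(16) == StoreOpc::Dword);
}
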
4498
4499static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
4500 Register VIndex, Register VOffset, Register SOffset,
4501 unsigned ImmOffset, unsigned Format,
4502 unsigned AuxiliaryData, MachineMemOperand *MMO,
4503 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
4504 auto MIB = B.buildInstr(Opc)
4505 .addDef(LoadDstReg) // vdata
4506 .addUse(RSrc) // rsrc
4507 .addUse(VIndex) // vindex
4508 .addUse(VOffset) // voffset
4509 .addUse(SOffset) // soffset
4510 .addImm(ImmOffset); // offset(imm)
4511
4512 if (IsTyped)
4513 MIB.addImm(Format);
4514
4515 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4516 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4517 .addMemOperand(MMO);
4518}
4519
4520bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4521 MachineRegisterInfo &MRI,
4522 MachineIRBuilder &B,
4523 bool IsFormat,
4524 bool IsTyped) const {
4525 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4526 MachineMemOperand *MMO = *MI.memoperands_begin();
4527 const LLT MemTy = MMO->getMemoryType();
4528 const LLT S32 = LLT::scalar(32);
4529
4530 Register Dst = MI.getOperand(0).getReg();
4531
4532 Register StatusDst;
4533 int OpOffset = 0;
4534 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
4535 bool IsTFE = MI.getNumExplicitDefs() == 2;
4536 if (IsTFE) {
4537 StatusDst = MI.getOperand(1).getReg();
4538 ++OpOffset;
4539 }
4540
4541 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
4542
4543 // The typed intrinsics add an immediate after the registers.
4544 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4545
4546 // The struct intrinsic variants add one additional operand over raw.
4547 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
4548 Register VIndex;
4549 if (HasVIndex) {
4550 VIndex = MI.getOperand(3 + OpOffset).getReg();
4551 ++OpOffset;
4552 } else {
4553 VIndex = B.buildConstant(S32, 0).getReg(0);
4554 }
4555
4556 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4557 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4558
4559 unsigned Format = 0;
4560 if (IsTyped) {
4561 Format = MI.getOperand(5 + OpOffset).getImm();
4562 ++OpOffset;
4563 }
4564
4565 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4566 unsigned ImmOffset;
4567
4568 LLT Ty = MRI.getType(Dst);
4569 LLT EltTy = Ty.getScalarType();
4570 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4571 const bool Unpacked = ST.hasUnpackedD16VMem();
4572
4573 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4574 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4575
4576 unsigned Opc;
4577
4578 // TODO: Support TFE for typed and narrow loads.
4579 if (IsTyped) {
4580 if (IsTFE)
4581 return false;
4582 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4583 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4584 } else if (IsFormat) {
4585 if (IsD16) {
4586 if (IsTFE)
4587 return false;
4588 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
4589 } else {
4590 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4591 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4592 }
4593 } else {
4594 if (IsTFE)
4595 return false;
4596 switch (MemTy.getSizeInBits()) {
4597 case 8:
4598 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4599 break;
4600 case 16:
4601 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4602 break;
4603 default:
4604 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4605 break;
4606 }
4607 }
4608
4609 if (IsTFE) {
4610 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
4611 unsigned NumLoadDWords = NumValueDWords + 1;
4612 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
4613 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
4614 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4615 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4616 if (NumValueDWords == 1) {
4617 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
4618 } else {
4619 SmallVector<Register, 5> LoadElts;
4620 for (unsigned I = 0; I != NumValueDWords; ++I)
4621 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
4622 LoadElts.push_back(StatusDst);
4623 B.buildUnmerge(LoadElts, LoadDstReg);
4624 LoadElts.truncate(NumValueDWords);
4625 B.buildMergeLikeInstr(Dst, LoadElts);
4626 }
4627 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
4628 (IsD16 && !Ty.isVector())) {
4629 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4630 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4631 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4632 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4633 B.buildTrunc(Dst, LoadDstReg);
4634 } else if (Unpacked && IsD16 && Ty.isVector()) {
4635 LLT UnpackedTy = Ty.changeElementSize(32);
4636 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4637 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4638 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4639 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4640 // FIXME: G_TRUNC should work, but legalization currently fails
4641 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4642 SmallVector<Register, 4> Repack;
4643 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4644 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4645 B.buildMergeLikeInstr(Dst, Repack);
4646 } else {
4647 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
4648 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4649 }
4650
4651 MI.eraseFromParent();
4652 return true;
4653}
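
The TFE path above sizes the temporary load as one dword per 32 bits of value data plus one extra dword for the status result. A small self-contained sketch of that arithmetic; divideCeil is restated locally rather than taken from llvm/Support/MathExtras.h so the snippet compiles on its own.

#include <cassert>

static unsigned divideCeilLocal(unsigned Numerator, unsigned Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

int main() {
  unsigned NumValueDWords = divideCeilLocal(96, 32); // e.g. a <3 x s32> result type
  unsigned NumLoadDWords = NumValueDWords + 1;       // plus the TFE status dword
  assert(NumValueDWords == 3);
  assert(NumLoadDWords == 4);
}
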
4654
4655bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4656 MachineIRBuilder &B,
4657 bool IsInc) const {
4658 unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
4659 AMDGPU::G_ATOMICRMW_UDEC_WRAP;
4660 B.buildInstr(Opc)
4661 .addDef(MI.getOperand(0).getReg())
4662 .addUse(MI.getOperand(2).getReg())
4663 .addUse(MI.getOperand(3).getReg())
4664 .cloneMemRefs(MI);
4665 MI.eraseFromParent();
4666 return true;
4667}
4668
4669static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4670 switch (IntrID) {
4671 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4672 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4673 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4674 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4675 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4676 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4677 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4678 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4679 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4680 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4681 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4682 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4683 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4684 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4685 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4686 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4687 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4688 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4689 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4690 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4691 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4692 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4693 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4694 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4695 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4696 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4697 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4698 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4699 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4700 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4701 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4702 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4703 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4704 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4705 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4706 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4707 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4708 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4709 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4710 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4711 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4712 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4713 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4714 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4715 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4716 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4717 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4718 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4719 default:
4720 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 4720)
;
4721 }
4722}
4723
4724bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4725 MachineIRBuilder &B,
4726 Intrinsic::ID IID) const {
4727 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4728 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4729 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4730
4731 Register Dst;
4732
4733 int OpOffset = 0;
4734 if (HasReturn) {
4735 // A few FP atomics do not support return values.
4736 Dst = MI.getOperand(0).getReg();
4737 } else {
4738 OpOffset = -1;
4739 }
4740
4741 Register VData = MI.getOperand(2 + OpOffset).getReg();
4742 Register CmpVal;
4743
4744 if (IsCmpSwap) {
4745 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4746 ++OpOffset;
4747 }
4748
4749 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4750 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4751
4752 // The struct intrinsic variants add one additional operand over raw.
4753 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4754 Register VIndex;
4755 if (HasVIndex) {
4756 VIndex = MI.getOperand(4 + OpOffset).getReg();
4757 ++OpOffset;
4758 } else {
4759 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4760 }
4761
4762 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4763 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4764 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4765
4766 MachineMemOperand *MMO = *MI.memoperands_begin();
4767
4768 unsigned ImmOffset;
4769 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4770 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4771
4772 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4773
4774 if (HasReturn)
4775 MIB.addDef(Dst);
4776
4777 MIB.addUse(VData); // vdata
4778
4779 if (IsCmpSwap)
4780 MIB.addReg(CmpVal);
4781
4782 MIB.addUse(RSrc) // rsrc
4783 .addUse(VIndex) // vindex
4784 .addUse(VOffset) // voffset
4785 .addUse(SOffset) // soffset
4786 .addImm(ImmOffset) // offset(imm)
4787 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4788 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4789 .addMemOperand(MMO);
4790
4791 MI.eraseFromParent();
4792 return true;
4793}
4794
4795/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4796/// vector with s16 typed elements.
4797static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4798 SmallVectorImpl<Register> &PackedAddrs,
4799 unsigned ArgOffset,
4800 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4801 bool IsA16, bool IsG16) {
4802 const LLT S16 = LLT::scalar(16);
4803 const LLT V2S16 = LLT::fixed_vector(2, 16);
4804 auto EndIdx = Intr->VAddrEnd;
4805
4806 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4807 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4808 if (!SrcOp.isReg())
4809 continue; // _L to _LZ may have eliminated this.
4810
4811 Register AddrReg = SrcOp.getReg();
4812
4813 if ((I < Intr->GradientStart) ||
4814 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4815 (I >= Intr->CoordStart && !IsA16)) {
4816 if ((I < Intr->GradientStart) && IsA16 &&
4817 (B.getMRI()->getType(AddrReg) == S16)) {
4818 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4819 // Special handling of bias when A16 is on. Bias is of type half but
4820 // occupies full 32-bit.
4821 PackedAddrs.push_back(
4822 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4823 .getReg(0));
4824 } else {
4825 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4826 "Bias needs to be converted to 16 bit in A16 mode");
4827 // Handle any gradient or coordinate operands that should not be packed
4828 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4829 PackedAddrs.push_back(AddrReg);
4830 }
4831 } else {
4832 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4833 // derivatives dx/dh and dx/dv are packed with undef.
4834 if (((I + 1) >= EndIdx) ||
4835 ((Intr->NumGradients / 2) % 2 == 1 &&
4836 (I == static_cast<unsigned>(Intr->GradientStart +
4837 (Intr->NumGradients / 2) - 1) ||
4838 I == static_cast<unsigned>(Intr->GradientStart +
4839 Intr->NumGradients - 1))) ||
4840 // Check for _L to _LZ optimization
4841 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4842 PackedAddrs.push_back(
4843 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4844 .getReg(0));
4845 } else {
4846 PackedAddrs.push_back(
4847 B.buildBuildVector(
4848 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4849 .getReg(0));
4850 ++I;
4851 }
4852 }
4853 }
4854}
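
The packing above pairs adjacent 16-bit address components into v2s16 registers and pads a trailing odd component with undef. The sketch below restates that pairing as plain bit math; placing element 0 in the low half and using zero for the undef padding are conventions of this sketch, not a claim about the generated MIR.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<std::uint32_t> packPairs(const std::vector<std::uint16_t> &Addrs) {
  std::vector<std::uint32_t> Packed;
  for (std::size_t I = 0; I < Addrs.size(); I += 2) {
    std::uint32_t Lo = Addrs[I];
    std::uint32_t Hi = (I + 1 < Addrs.size()) ? Addrs[I + 1] : 0; // pad the odd tail
    Packed.push_back(Lo | (Hi << 16));
  }
  return Packed;
}

int main() {
  auto Packed = packPairs({0x1111, 0x2222, 0x3333});
  assert(Packed.size() == 2);
  assert(Packed[0] == 0x22221111u);
  assert(Packed[1] == 0x00003333u); // odd trailing component, padded high half
}
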
4855
4856/// Convert from separate vaddr components to a single vector address register,
4857/// and replace the remaining operands with $noreg.
4858static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4859 int DimIdx, int NumVAddrs) {
4860 const LLT S32 = LLT::scalar(32);
4861 (void)S32;
4862 SmallVector<Register, 8> AddrRegs;
4863 for (int I = 0; I != NumVAddrs; ++I) {
4864 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4865 if (SrcOp.isReg()) {
4866 AddrRegs.push_back(SrcOp.getReg());
4867 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4868 }
4869 }
4870
4871 int NumAddrRegs = AddrRegs.size();
4872 if (NumAddrRegs != 1) {
4873 auto VAddr =
4874 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4875 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4876 }
4877
4878 for (int I = 1; I != NumVAddrs; ++I) {
4879 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4880 if (SrcOp.isReg())
4881 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4882 }
4883}
4884
4885/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4886///
4887/// Depending on the subtarget, load/store with 16-bit element data need to be
4888/// rewritten to use the low half of 32-bit registers, or directly use a packed
4889/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4890/// registers.
4891///
4892/// We don't want to directly select image instructions just yet, but also want
4893 /// to expose all register repacking to the legalizer/combiners. We also don't
4894/// want a selected instruction entering RegBankSelect. In order to avoid
4895/// defining a multitude of intermediate image instructions, directly hack on
4896/// the intrinsic's arguments. In cases like a16 addresses, this requires
4897/// padding now unnecessary arguments with $noreg.
4898bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4899 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4900 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4901
4902 const MachineFunction &MF = *MI.getMF();
4903 const unsigned NumDefs = MI.getNumExplicitDefs();
4904 const unsigned ArgOffset = NumDefs + 1;
4905 bool IsTFE = NumDefs == 2;
4906 // We are only processing the operands of d16 image operations on subtargets
4907 // that use the unpacked register layout, or need to repack the TFE result.
4908
4909 // TODO: Do we need to guard against already legalized intrinsics?
4910 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4911 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4912
4913 MachineRegisterInfo *MRI = B.getMRI();
4914 const LLT S32 = LLT::scalar(32);
4915 const LLT S16 = LLT::scalar(16);
4916 const LLT V2S16 = LLT::fixed_vector(2, 16);
4917
4918 unsigned DMask = 0;
4919 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4920 LLT Ty = MRI->getType(VData);
4921
4922 // Check for 16 bit addresses and pack if true.
4923 LLT GradTy =
4924 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4925 LLT AddrTy =
4926 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4927 const bool IsG16 =
4928 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
4929 const bool IsA16 = AddrTy == S16;
4930 const bool IsD16 = Ty.getScalarType() == S16;
4931
4932 int DMaskLanes = 0;
4933 if (!BaseOpcode->Atomic) {
4934 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4935 if (BaseOpcode->Gather4) {
4936 DMaskLanes = 4;
4937 } else if (DMask != 0) {
4938 DMaskLanes = llvm::popcount(DMask);
4939 } else if (!IsTFE && !BaseOpcode->Store) {
4940 // If dmask is 0, this is a no-op load. This can be eliminated.
4941 B.buildUndef(MI.getOperand(0));
4942 MI.eraseFromParent();
4943 return true;
4944 }
4945 }
4946
4947 Observer.changingInstr(MI);
4948 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4949
4950 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4951 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4952 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4953 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4954 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4955
4956 // Track that we legalized this
4957 MI.setDesc(B.getTII().get(NewOpcode));
4958
4959 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4960 // dmask to be at least 1, otherwise the instruction will fail.
4961 if (IsTFE && DMask == 0) {
4962 DMask = 0x1;
4963 DMaskLanes = 1;
4964 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4965 }
4966
4967 if (BaseOpcode->Atomic) {
4968 Register VData0 = MI.getOperand(2).getReg();
4969 LLT Ty = MRI->getType(VData0);
4970
4971 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4972 if (Ty.isVector())
4973 return false;
4974
4975 if (BaseOpcode->AtomicX2) {
4976 Register VData1 = MI.getOperand(3).getReg();
4977 // The two values are packed in one register.
4978 LLT PackedTy = LLT::fixed_vector(2, Ty);
4979 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4980 MI.getOperand(2).setReg(Concat.getReg(0));
4981 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4982 }
4983 }
4984
4985 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4986
4987 // Rewrite the addressing register layout before doing anything else.
4988 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4989 // 16 bit gradients are supported, but are tied to the A16 control
4990 // so both gradients and addresses must be 16 bit
4991 return false;
4992 }
4993
4994 if (IsA16 && !ST.hasA16()) {
4995 // A16 not supported
4996 return false;
4997 }
4998
4999 const unsigned NSAMaxSize = ST.getNSAMaxSize();
5000 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
5001
5002 if (IsA16 || IsG16) {
5003 if (Intr->NumVAddrs > 1) {
5004 SmallVector<Register, 4> PackedRegs;
5005
5006 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
5007 IsG16);
5008
5009 // See also below in the non-a16 branch
5010 const bool UseNSA = ST.hasNSAEncoding() &&
5011 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
5012 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
5013 const bool UsePartialNSA =
5014 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
5015
5016 if (UsePartialNSA) {
5017 // Pack registers that would go over NSAMaxSize into last VAddr register
5018 LLT PackedAddrTy =
5019 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
5020 auto Concat = B.buildConcatVectors(
5021 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
5022 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
5023 PackedRegs.resize(NSAMaxSize);
5024 } else if (!UseNSA && PackedRegs.size() > 1) {
5025 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
5026 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
5027 PackedRegs[0] = Concat.getReg(0);
5028 PackedRegs.resize(1);
5029 }
5030
5031 const unsigned NumPacked = PackedRegs.size();
5032 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
5033 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5034 if (!SrcOp.isReg()) {
5035 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
5036 continue;
5037 }
5038
5039 assert(SrcOp.getReg() != AMDGPU::NoRegister);
5040
5041 if (I - Intr->VAddrStart < NumPacked)
5042 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
5043 else
5044 SrcOp.setReg(AMDGPU::NoRegister);
5045 }
5046 }
5047 } else {
5048 // If the register allocator cannot place the address registers contiguously
5049 // without introducing moves, then using the non-sequential address encoding
5050 // is always preferable, since it saves VALU instructions and is usually a
5051 // wash in terms of code size or even better.
5052 //
5053 // However, we currently have no way of hinting to the register allocator
5054 // that MIMG addresses should be placed contiguously when it is possible to
5055 // do so, so force non-NSA for the common 2-address case as a heuristic.
5056 //
5057 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5058 // allocation when possible.
5059 //
5060 // Partial NSA is allowed on GFX11 where the final register is a contiguous
5061 // set of the remaining addresses.
5062 const bool UseNSA = ST.hasNSAEncoding() &&
5063 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5064 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
5065 const bool UsePartialNSA =
5066 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
5067
5068 if (UsePartialNSA) {
5069 convertImageAddrToPacked(B, MI,
5070 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
5071 Intr->NumVAddrs - NSAMaxSize + 1);
5072 } else if (!UseNSA && Intr->NumVAddrs > 1) {
5073 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5074 Intr->NumVAddrs);
5075 }
5076 }
5077
5078 int Flags = 0;
5079 if (IsA16)
5080 Flags |= 1;
5081 if (IsG16)
5082 Flags |= 2;
5083 MI.addOperand(MachineOperand::CreateImm(Flags));
5084
5085 if (BaseOpcode->Store) { // No TFE for stores?
5086 // TODO: Handle dmask trim
5087 if (!Ty.isVector() || !IsD16)
5088 return true;
5089
5090 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5091 if (RepackedReg != VData) {
5092 MI.getOperand(1).setReg(RepackedReg);
5093 }
5094
5095 return true;
5096 }
5097
5098 Register DstReg = MI.getOperand(0).getReg();
5099 const LLT EltTy = Ty.getScalarType();
5100 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5101
5102 // Confirm that the return type is large enough for the dmask specified
5103 if (NumElts < DMaskLanes)
5104 return false;
5105
5106 if (NumElts > 4 || DMaskLanes > 4)
5107 return false;
5108
5109 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5110 const LLT AdjustedTy =
5111 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5112
5113 // The raw dword aligned data component of the load. The only legal cases
5114 // where this matters should be when using the packed D16 format, for
5115 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
5116 LLT RoundedTy;
5117
5118 // S32 vector to cover all data, plus TFE result element.
5119 LLT TFETy;
5120
5121 // Register type to use for each loaded component. Will be S32 or V2S16.
5122 LLT RegTy;
5123
5124 if (IsD16 && ST.hasUnpackedD16VMem()) {
5125 RoundedTy =
5126 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5127 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5128 RegTy = S32;
5129 } else {
5130 unsigned EltSize = EltTy.getSizeInBits();
5131 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5132 unsigned RoundedSize = 32 * RoundedElts;
5133 RoundedTy = LLT::scalarOrVector(
5134 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5135 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5136 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5137 }
5138
5139 // The return type does not need adjustment.
5140 // TODO: Should we change s16 case to s32 or <2 x s16>?
5141 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5142 return true;
5143
5144 Register Dst1Reg;
5145
5146 // Insert after the instruction.
5147 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5148
5149 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5150 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5151 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5152 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5153
5154 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5155
5156 MI.getOperand(0).setReg(NewResultReg);
5157
5158 // In the IR, TFE is supposed to be used with a 2 element struct return
5159 // type. The instruction really returns these two values in one contiguous
5160 // register, with one additional dword beyond the loaded data. Rewrite the
5161 // return type to use a single register result.
5162
5163 if (IsTFE) {
5164 Dst1Reg = MI.getOperand(1).getReg();
5165 if (MRI->getType(Dst1Reg) != S32)
5166 return false;
5167
5168 // TODO: Make sure the TFE operand bit is set.
5169 MI.removeOperand(1);
5170
5171 // Handle the easy case that requires no repack instructions.
5172 if (Ty == S32) {
5173 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5174 return true;
5175 }
5176 }
5177
5178 // Now figure out how to copy the new result register back into the old
5179 // result.
5180 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5181
5182 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5183
5184 if (ResultNumRegs == 1) {
5185 assert(!IsTFE);
5186 ResultRegs[0] = NewResultReg;
5187 } else {
5188 // We have to repack into a new vector of some kind.
5189 for (int I = 0; I != NumDataRegs; ++I)
5190 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5191 B.buildUnmerge(ResultRegs, NewResultReg);
5192
5193 // Drop the final TFE element to get the data part. The TFE result is
5194 // directly written to the right place already.
5195 if (IsTFE)
5196 ResultRegs.resize(NumDataRegs);
5197 }
5198
5199 // For an s16 scalar result, we form an s32 result with a truncate regardless
5200 // of packed vs. unpacked.
5201 if (IsD16 && !Ty.isVector()) {
5202 B.buildTrunc(DstReg, ResultRegs[0]);
5203 return true;
5204 }
5205
5206 // Avoid a build/concat_vector of 1 entry.
5207 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5208 B.buildBitcast(DstReg, ResultRegs[0]);
5209 return true;
5210 }
5211
5212 assert(Ty.isVector());
5213
5214 if (IsD16) {
5215 // For packed D16 results with TFE enabled, all the data components are
5216 // S32. Cast back to the expected type.
5217 //
5218 // TODO: We don't really need to use load s32 elements. We would only need one
5219 // cast for the TFE result if a multiple of v2s16 was used.
5220 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5221 for (Register &Reg : ResultRegs)
5222 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5223 } else if (ST.hasUnpackedD16VMem()) {
5224 for (Register &Reg : ResultRegs)
5225 Reg = B.buildTrunc(S16, Reg).getReg(0);
5226 }
5227 }
5228
5229 auto padWithUndef = [&](LLT Ty, int NumElts) {
5230 if (NumElts == 0)
5231 return;
5232 Register Undef = B.buildUndef(Ty).getReg(0);
5233 for (int I = 0; I != NumElts; ++I)
5234 ResultRegs.push_back(Undef);
5235 };
5236
5237 // Pad out any elements eliminated due to the dmask.
5238 LLT ResTy = MRI->getType(ResultRegs[0]);
5239 if (!ResTy.isVector()) {
5240 padWithUndef(ResTy, NumElts - ResultRegs.size());
5241 B.buildBuildVector(DstReg, ResultRegs);
5242 return true;
5243 }
5244
5245 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5246 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5247
5248 // Deal with the one annoying legal case.
5249 const LLT V3S16 = LLT::fixed_vector(3, 16);
5250 if (Ty == V3S16) {
5251 if (IsTFE) {
5252 if (ResultRegs.size() == 1) {
5253 NewResultReg = ResultRegs[0];
5254 } else if (ResultRegs.size() == 2) {
5255 LLT V4S16 = LLT::fixed_vector(4, 16);
5256 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5257 } else {
5258 return false;
5259 }
5260 }
5261
5262 if (MRI->getType(DstReg).getNumElements() <
5263 MRI->getType(NewResultReg).getNumElements()) {
5264 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5265 } else {
5266 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5267 }
5268 return true;
5269 }
5270
5271 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5272 B.buildConcatVectors(DstReg, ResultRegs);
5273 return true;
5274}
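
Two small pieces of arithmetic drive the load shaping in legalizeImageIntrinsic above: the dmask popcount decides how many components are really returned, and the packed data is rounded up to whole dwords. A minimal sketch of both, with popcount restated locally in place of llvm::popcount so it stands alone.

#include <cassert>

static int popcountLocal(unsigned V) {
  int Count = 0;
  for (; V; V &= V - 1) // clear the lowest set bit each iteration
    ++Count;
  return Count;
}

int main() {
  unsigned DMask = 0xB;                         // 0b1011: three enabled components
  int DMaskLanes = popcountLocal(DMask);
  assert(DMaskLanes == 3);

  unsigned EltBits = 16;                        // packed d16 data
  unsigned AdjustedBits = DMaskLanes * EltBits; // 48 bits of real data
  unsigned RoundedElts = (AdjustedBits + 31) / 32;
  assert(RoundedElts == 2);                     // rounds up to two dwords
}
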
5275
5276bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5277 LegalizerHelper &Helper, MachineInstr &MI) const {
5278 MachineIRBuilder &B = Helper.MIRBuilder;
5279 GISelChangeObserver &Observer = Helper.Observer;
5280
5281 Register Dst = MI.getOperand(0).getReg();
5282 LLT Ty = B.getMRI()->getType(Dst);
5283 unsigned Size = Ty.getSizeInBits();
5284 MachineFunction &MF = B.getMF();
5285
5286 Observer.changingInstr(MI);
5287
5288 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5289 Ty = getBitcastRegisterType(Ty);
5290 Helper.bitcastDst(MI, Ty, 0);
5291 Dst = MI.getOperand(0).getReg();
5292 B.setInsertPt(B.getMBB(), MI);
5293 }
5294
5295 // FIXME: We don't really need this intermediate instruction. The intrinsic
5296 // should be fixed to have a memory operand. Since it's readnone, we're not
5297 // allowed to add one.
5298 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5299 MI.removeOperand(1); // Remove intrinsic ID
5300
5301 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5302 // TODO: Should this use datalayout alignment?
5303 const unsigned MemSize = (Size + 7) / 8;
5304 const Align MemAlign(4);
5305 MachineMemOperand *MMO = MF.getMachineMemOperand(
5306 MachinePointerInfo(),
5307 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5308 MachineMemOperand::MOInvariant,
5309 MemSize, MemAlign);
5310 MI.addMemOperand(MF, MMO);
5311
5312 // There are no 96-bit result scalar loads, but widening to 128-bit should
5313 // always be legal. We may need to restore this to a 96-bit result if it turns
5314 // out this needs to be converted to a vector load during RegBankSelect.
5315 if (!isPowerOf2_32(Size)) {
5316 if (Ty.isVector())
5317 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5318 else
5319 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5320 }
5321
5322 Observer.changedInstr(MI);
5323 return true;
5324}
5325
5326// TODO: Move to selection
5327bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5328 MachineRegisterInfo &MRI,
5329 MachineIRBuilder &B) const {
5330 if (!ST.isTrapHandlerEnabled() ||
5331 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5332 return legalizeTrapEndpgm(MI, MRI, B);
5333
5334 const Module *M = B.getMF().getFunction().getParent();
5335 unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
5336 if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
5337 return legalizeTrapHsaQueuePtr(MI, MRI, B);
5338
5339 return ST.supportsGetDoorbellID() ?
5340 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
5341}
5342
5343bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5344 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5345 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5346 MI.eraseFromParent();
5347 return true;
5348}
5349
5350bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5351 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5352 MachineFunction &MF = B.getMF();
5353 const LLT S64 = LLT::scalar(64);
5354
5355 Register SGPR01(AMDGPU::SGPR0_SGPR1);
5356 // For code object version 5, queue_ptr is passed through implicit kernarg.
5357 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
5358 AMDGPU::AMDHSA_COV5) {
5359 AMDGPUTargetLowering::ImplicitParameter Param =
5360 AMDGPUTargetLowering::QUEUE_PTR;
5361 uint64_t Offset =
5362 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5363
5364 Register KernargPtrReg = MRI.createGenericVirtualRegister(
5365 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5366
5367 if (!loadInputValue(KernargPtrReg, B,
5368 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5369 return false;
5370
5371 // TODO: can we be smarter about machine pointer info?
5372 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5373 MachineMemOperand *MMO = MF.getMachineMemOperand(
5374 PtrInfo,
5375 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5376 MachineMemOperand::MOInvariant,
5377 LLT::scalar(64), commonAlignment(Align(64), Offset));
5378
5379 // Pointer address
5380 Register LoadAddr = MRI.createGenericVirtualRegister(
5381 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5382 B.buildPtrAdd(LoadAddr, KernargPtrReg,
5383 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5384 // Load address
5385 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5386 B.buildCopy(SGPR01, Temp);
5387 B.buildInstr(AMDGPU::S_TRAP)
5388 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5389 .addReg(SGPR01, RegState::Implicit);
5390 MI.eraseFromParent();
5391 return true;
5392 }
5393
5394 // Pass queue pointer to trap handler as input, and insert trap instruction
5395 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5396 Register LiveIn =
5397 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5398 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5399 return false;
5400
5401 B.buildCopy(SGPR01, LiveIn);
5402 B.buildInstr(AMDGPU::S_TRAP)
5403 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5404 .addReg(SGPR01, RegState::Implicit);
5405
5406 MI.eraseFromParent();
5407 return true;
5408}
5409
5410bool AMDGPULegalizerInfo::legalizeTrapHsa(
5411 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5412 B.buildInstr(AMDGPU::S_TRAP)
5413 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5414 MI.eraseFromParent();
5415 return true;
5416}
5417
5418bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5419 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5420 // If this is a non-HSA path or the trap handler is disabled, report a
5421 // warning accordingly.
5422 if (!ST.isTrapHandlerEnabled() ||
5423 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5424 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5425 "debugtrap handler not supported",
5426 MI.getDebugLoc(), DS_Warning);
5427 LLVMContext &Ctx = B.getMF().getFunction().getContext();
5428 Ctx.diagnose(NoTrap);
5429 } else {
5430 // Insert debug-trap instruction
5431 B.buildInstr(AMDGPU::S_TRAP)
5432 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5433 }
5434
5435 MI.eraseFromParent();
5436 return true;
5437}
5438
5439bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5440 MachineIRBuilder &B) const {
5441 MachineRegisterInfo &MRI = *B.getMRI();
5442 const LLT S16 = LLT::scalar(16);
5443 const LLT S32 = LLT::scalar(32);
5444 const LLT V2S16 = LLT::fixed_vector(2, 16);
5445 const LLT V3S32 = LLT::fixed_vector(3, 32);
5446
5447 Register DstReg = MI.getOperand(0).getReg();
5448 Register NodePtr = MI.getOperand(2).getReg();
5449 Register RayExtent = MI.getOperand(3).getReg();
5450 Register RayOrigin = MI.getOperand(4).getReg();
5451 Register RayDir = MI.getOperand(5).getReg();
5452 Register RayInvDir = MI.getOperand(6).getReg();
5453 Register TDescr = MI.getOperand(7).getReg();
5454
5455 if (!ST.hasGFX10_AEncoding()) {
5456 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5457 "intrinsic not supported on subtarget",
5458 MI.getDebugLoc());
5459 B.getMF().getFunction().getContext().diagnose(BadIntrin);
5460 return false;
5461 }
5462
5463 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5464 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5465 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5466 const unsigned NumVDataDwords = 4;
5467 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5468 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5469 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5470 const unsigned BaseOpcodes[2][2] = {
5471 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5472 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5473 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5474 int Opcode;
5475 if (UseNSA) {
5476 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5477 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5478 : AMDGPU::MIMGEncGfx10NSA,
5479 NumVDataDwords, NumVAddrDwords);
5480 } else {
5481 Opcode = AMDGPU::getMIMGOpcode(
5482 BaseOpcodes[Is64][IsA16],
5483 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5484 NumVDataDwords, NumVAddrDwords);
5485 }
5486 assert(Opcode != -1);
5487
5488 SmallVector<Register, 12> Ops;
5489 if (UseNSA && IsGFX11Plus) {
5490 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5491 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5492 auto Merged = B.buildMergeLikeInstr(
5493 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5494 Ops.push_back(Merged.getReg(0));
5495 };
5496
5497 Ops.push_back(NodePtr);
5498 Ops.push_back(RayExtent);
5499 packLanes(RayOrigin);
5500
5501 if (IsA16) {
5502 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5503 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5504 auto MergedDir = B.buildMergeLikeInstr(
5505 V3S32,
5506 {B.buildBitcast(
5507 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
5508 UnmergeRayDir.getReg(0)}))
5509 .getReg(0),
5510 B.buildBitcast(
5511 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
5512 UnmergeRayDir.getReg(1)}))
5513 .getReg(0),
5514 B.buildBitcast(
5515 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
5516 UnmergeRayDir.getReg(2)}))
5517 .getReg(0)});
5518 Ops.push_back(MergedDir.getReg(0));
5519 } else {
5520 packLanes(RayDir);
5521 packLanes(RayInvDir);
5522 }
5523 } else {
5524 if (Is64) {
5525 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5526 Ops.push_back(Unmerge.getReg(0));
5527 Ops.push_back(Unmerge.getReg(1));
5528 } else {
5529 Ops.push_back(NodePtr);
5530 }
5531 Ops.push_back(RayExtent);
5532
5533 auto packLanes = [&Ops, &S32, &B](Register Src) {
5534 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5535 Ops.push_back(Unmerge.getReg(0));
5536 Ops.push_back(Unmerge.getReg(1));
5537 Ops.push_back(Unmerge.getReg(2));
5538 };
5539
5540 packLanes(RayOrigin);
5541 if (IsA16) {
5542 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5543 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5544 Register R1 = MRI.createGenericVirtualRegister(S32);
5545 Register R2 = MRI.createGenericVirtualRegister(S32);
5546 Register R3 = MRI.createGenericVirtualRegister(S32);
5547 B.buildMergeLikeInstr(R1,
5548 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5549 B.buildMergeLikeInstr(
5550 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5551 B.buildMergeLikeInstr(
5552 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5553 Ops.push_back(R1);
5554 Ops.push_back(R2);
5555 Ops.push_back(R3);
5556 } else {
5557 packLanes(RayDir);
5558 packLanes(RayInvDir);
5559 }
5560 }
5561
5562 if (!UseNSA) {
5563 // Build a single vector containing all the operands so far prepared.
5564 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5565 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
5566 Ops.clear();
5567 Ops.push_back(MergedOps);
5568 }
5569
5570 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5571 .addDef(DstReg)
5572 .addImm(Opcode);
5573
5574 for (Register R : Ops) {
5575 MIB.addUse(R);
5576 }
5577
5578 MIB.addUse(TDescr)
5579 .addImm(IsA16 ? 1 : 0)
5580 .cloneMemRefs(MI);
5581
5582 MI.eraseFromParent();
5583 return true;
5584}
5585
5586bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5587 MachineIRBuilder &B) const {
5588 unsigned Opc;
5589 int RoundMode = MI.getOperand(2).getImm();
5590
5591 if (RoundMode == (int)RoundingMode::TowardPositive)
5592 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5593 else if (RoundMode == (int)RoundingMode::TowardNegative)
5594 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5595 else
5596 return false;
5597
5598 B.buildInstr(Opc)
5599 .addDef(MI.getOperand(0).getReg())
5600 .addUse(MI.getOperand(1).getReg());
5601
5602 MI.eraseFromParent();
5603
5604 return true;
5605}
5606
5607bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5608 MachineInstr &MI) const {
5609 MachineIRBuilder &B = Helper.MIRBuilder;
5610 MachineRegisterInfo &MRI = *B.getMRI();
5611
5612 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
5613 auto IntrID = MI.getIntrinsicID();
5614 switch (IntrID) {
5615 case Intrinsic::amdgcn_if:
5616 case Intrinsic::amdgcn_else: {
5617 MachineInstr *Br = nullptr;
5618 MachineBasicBlock *UncondBrTarget = nullptr;
5619 bool Negated = false;
5620 if (MachineInstr *BrCond =
5621 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5622 const SIRegisterInfo *TRI
5623 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5624
5625 Register Def = MI.getOperand(1).getReg();
5626 Register Use = MI.getOperand(3).getReg();
5627
5628 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5629
5630 if (Negated)
5631 std::swap(CondBrTarget, UncondBrTarget);
5632
5633 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5634 if (IntrID == Intrinsic::amdgcn_if) {
5635 B.buildInstr(AMDGPU::SI_IF)
5636 .addDef(Def)
5637 .addUse(Use)
5638 .addMBB(UncondBrTarget);
5639 } else {
5640 B.buildInstr(AMDGPU::SI_ELSE)
5641 .addDef(Def)
5642 .addUse(Use)
5643 .addMBB(UncondBrTarget);
5644 }
5645
5646 if (Br) {
5647 Br->getOperand(0).setMBB(CondBrTarget);
5648 } else {
5649 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5650 // since we're swapping branch targets it needs to be reinserted.
5651 // FIXME: IRTranslator should probably not do this
5652 B.buildBr(*CondBrTarget);
5653 }
5654
5655 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5656 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5657 MI.eraseFromParent();
5658 BrCond->eraseFromParent();
5659 return true;
5660 }
5661
5662 return false;
5663 }
5664 case Intrinsic::amdgcn_loop: {
5665 MachineInstr *Br = nullptr;
5666 MachineBasicBlock *UncondBrTarget = nullptr;
5667 bool Negated = false;
5668 if (MachineInstr *BrCond =
5669 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5670 const SIRegisterInfo *TRI
5671 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5672
5673 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5674 Register Reg = MI.getOperand(2).getReg();
5675
5676 if (Negated)
5677 std::swap(CondBrTarget, UncondBrTarget);
5678
5679 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5680 B.buildInstr(AMDGPU::SI_LOOP)
5681 .addUse(Reg)
5682 .addMBB(UncondBrTarget);
5683
5684 if (Br)
5685 Br->getOperand(0).setMBB(CondBrTarget);
5686 else
5687 B.buildBr(*CondBrTarget);
5688
5689 MI.eraseFromParent();
5690 BrCond->eraseFromParent();
5691 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5692 return true;
5693 }
5694
5695 return false;
5696 }
5697 case Intrinsic::amdgcn_kernarg_segment_ptr:
5698 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5699 // This only makes sense to call in a kernel, so just lower to null.
5700 B.buildConstant(MI.getOperand(0).getReg(), 0);
5701 MI.eraseFromParent();
5702 return true;
5703 }
5704
5705 return legalizePreloadedArgIntrin(
5706 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5707 case Intrinsic::amdgcn_implicitarg_ptr:
5708 return legalizeImplicitArgPtr(MI, MRI, B);
5709 case Intrinsic::amdgcn_workitem_id_x:
5710 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5711 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5712 case Intrinsic::amdgcn_workitem_id_y:
5713 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5714 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5715 case Intrinsic::amdgcn_workitem_id_z:
5716 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5717 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5718 case Intrinsic::amdgcn_workgroup_id_x:
5719 return legalizePreloadedArgIntrin(MI, MRI, B,
5720 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5721 case Intrinsic::amdgcn_workgroup_id_y:
5722 return legalizePreloadedArgIntrin(MI, MRI, B,
5723 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5724 case Intrinsic::amdgcn_workgroup_id_z:
5725 return legalizePreloadedArgIntrin(MI, MRI, B,
5726 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5727 case Intrinsic::amdgcn_lds_kernel_id:
5728 return legalizePreloadedArgIntrin(MI, MRI, B,
5729 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5730 case Intrinsic::amdgcn_dispatch_ptr:
5731 return legalizePreloadedArgIntrin(MI, MRI, B,
5732 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5733 case Intrinsic::amdgcn_queue_ptr:
5734 return legalizePreloadedArgIntrin(MI, MRI, B,
5735 AMDGPUFunctionArgInfo::QUEUE_PTR);
5736 case Intrinsic::amdgcn_implicit_buffer_ptr:
5737 return legalizePreloadedArgIntrin(
5738 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5739 case Intrinsic::amdgcn_dispatch_id:
5740 return legalizePreloadedArgIntrin(MI, MRI, B,
5741 AMDGPUFunctionArgInfo::DISPATCH_ID);
5742 case Intrinsic::r600_read_ngroups_x:
5743 // TODO: Emit error for hsa
5744 return legalizeKernargMemParameter(MI, B,
5745 SI::KernelInputOffsets::NGROUPS_X);
5746 case Intrinsic::r600_read_ngroups_y:
5747 return legalizeKernargMemParameter(MI, B,
5748 SI::KernelInputOffsets::NGROUPS_Y);
5749 case Intrinsic::r600_read_ngroups_z:
5750 return legalizeKernargMemParameter(MI, B,
5751 SI::KernelInputOffsets::NGROUPS_Z);
5752 case Intrinsic::r600_read_local_size_x:
5753 // TODO: Could insert G_ASSERT_ZEXT from s16
5754 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5755 case Intrinsic::r600_read_local_size_y:
5756 // TODO: Could insert G_ASSERT_ZEXT from s16
5757 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5758 // TODO: Could insert G_ASSERT_ZEXT from s16
5759 case Intrinsic::r600_read_local_size_z:
5760 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5761 case Intrinsic::r600_read_global_size_x:
5762 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5763 case Intrinsic::r600_read_global_size_y:
5764 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5765 case Intrinsic::r600_read_global_size_z:
5766 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5767 case Intrinsic::amdgcn_fdiv_fast:
5768 return legalizeFDIVFastIntrin(MI, MRI, B);
5769 case Intrinsic::amdgcn_is_shared:
5770 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5771 case Intrinsic::amdgcn_is_private:
5772 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5773 case Intrinsic::amdgcn_wavefrontsize: {
5774 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5775 MI.eraseFromParent();
5776 return true;
5777 }
5778 case Intrinsic::amdgcn_s_buffer_load:
5779 return legalizeSBufferLoad(Helper, MI);
5780 case Intrinsic::amdgcn_raw_buffer_store:
5781 case Intrinsic::amdgcn_struct_buffer_store:
5782 return legalizeBufferStore(MI, MRI, B, false, false);
5783 case Intrinsic::amdgcn_raw_buffer_store_format:
5784 case Intrinsic::amdgcn_struct_buffer_store_format:
5785 return legalizeBufferStore(MI, MRI, B, false, true);
5786 case Intrinsic::amdgcn_raw_tbuffer_store:
5787 case Intrinsic::amdgcn_struct_tbuffer_store:
5788 return legalizeBufferStore(MI, MRI, B, true, true);
5789 case Intrinsic::amdgcn_raw_buffer_load:
5790 case Intrinsic::amdgcn_struct_buffer_load:
5791 return legalizeBufferLoad(MI, MRI, B, false, false);
5792 case Intrinsic::amdgcn_raw_buffer_load_format:
5793 case Intrinsic::amdgcn_struct_buffer_load_format:
5794 return legalizeBufferLoad(MI, MRI, B, true, false);
5795 case Intrinsic::amdgcn_raw_tbuffer_load:
5796 case Intrinsic::amdgcn_struct_tbuffer_load:
5797 return legalizeBufferLoad(MI, MRI, B, true, true);
5798 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5799 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5800 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5801 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5802 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5803 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5804 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5805 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5806 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5807 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5808 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5809 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5810 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5811 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5812 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5813 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5814 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5815 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5816 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5817 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5818 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5819 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5820 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5821 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5822 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5823 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5824 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5825 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5826 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5827 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5828 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5829 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5830 return legalizeBufferAtomic(MI, B, IntrID);
5831 case Intrinsic::amdgcn_atomic_inc:
5832 return legalizeAtomicIncDec(MI, B, true);
5833 case Intrinsic::amdgcn_atomic_dec:
5834 return legalizeAtomicIncDec(MI, B, false);
5835 case Intrinsic::trap:
5836 return legalizeTrapIntrinsic(MI, MRI, B);
5837 case Intrinsic::debugtrap:
5838 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5839 case Intrinsic::amdgcn_rsq_clamp:
5840 return legalizeRsqClampIntrinsic(MI, MRI, B);
5841 case Intrinsic::amdgcn_ds_fadd:
5842 case Intrinsic::amdgcn_ds_fmin:
5843 case Intrinsic::amdgcn_ds_fmax:
5844 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5845 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5846 return legalizeBVHIntrinsic(MI, B);
5847 default: {
5848 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5849 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5850 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5851 return true;
5852 }
5853 }
5854
5855 return true;
5856}

/build/source/llvm/include/llvm/ADT/bit.h

1//===-- llvm/ADT/bit.h - C++20 <bit> ----------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements the C++20 <bit> header.
11///
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_ADT_BIT_H
15#define LLVM_ADT_BIT_H
16
17#include "llvm/Support/Compiler.h"
18#include <cstdint>
19#include <limits>
20#include <type_traits>
21
22#if !__has_builtin(__builtin_bit_cast)
23#include <cstring>
24#endif
25
26#if defined(_MSC_VER) && !defined(_DEBUG)
27#include <cstdlib> // for _byteswap_{ushort,ulong,uint64}
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather than including intrin.h. It's very
32// expensive, and bit.h is popular via MathExtras.h.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44// This implementation of bit_cast is different from the C++20 one in two ways:
45// - It isn't constexpr because that requires compiler support.
46// - It requires trivially-constructible To, to avoid UB in the implementation.
47template <
48 typename To, typename From,
49 typename = std::enable_if_t<sizeof(To) == sizeof(From)>,
50 typename = std::enable_if_t<std::is_trivially_constructible<To>::value>,
51 typename = std::enable_if_t<std::is_trivially_copyable<To>::value>,
52 typename = std::enable_if_t<std::is_trivially_copyable<From>::value>>
53[[nodiscard]] inline To bit_cast(const From &from) noexcept {
54#if __has_builtin(__builtin_bit_cast)
55 return __builtin_bit_cast(To, from);
56#else
57 To to;
58 std::memcpy(&to, &from, sizeof(To));
59 return to;
60#endif
61}
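A minimal usage sketch for llvm::bit_cast; the floatBits helper below is hypothetical, not from the analyzed sources:

#include "llvm/ADT/bit.h"
#include <cstdint>

// Reinterpret the bits of a float as a 32-bit integer; the sizes match and
// both types are trivially copyable, so the enable_if constraints above hold.
uint32_t floatBits(float F) { return llvm::bit_cast<uint32_t>(F); }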
62
63/// Reverses the bytes in the given integer value V.
64template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
65[[nodiscard]] constexpr T byteswap(T V) noexcept {
66 if constexpr (sizeof(T) == 1) {
67 return V;
68 } else if constexpr (sizeof(T) == 2) {
69 uint16_t UV = V;
70#if defined(_MSC_VER) && !defined(_DEBUG)
71 // The DLL version of the runtime lacks these functions (bug!?), but in a
72 // release build they're replaced with BSWAP instructions anyway.
73 return _byteswap_ushort(UV);
74#else
75 uint16_t Hi = UV << 8;
76 uint16_t Lo = UV >> 8;
77 return Hi | Lo;
78#endif
79 } else if constexpr (sizeof(T) == 4) {
80 uint32_t UV = V;
81#if __has_builtin(__builtin_bswap32)
82 return __builtin_bswap32(UV);
83#elif defined(_MSC_VER) && !defined(_DEBUG)
84 return _byteswap_ulong(UV);
85#else
86 uint32_t Byte0 = UV & 0x000000FF;
87 uint32_t Byte1 = UV & 0x0000FF00;
88 uint32_t Byte2 = UV & 0x00FF0000;
89 uint32_t Byte3 = UV & 0xFF000000;
90 return (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24);
91#endif
92 } else if constexpr (sizeof(T) == 8) {
93 uint64_t UV = V;
94#if __has_builtin(__builtin_bswap64)
95 return __builtin_bswap64(UV);
96#elif defined(_MSC_VER) && !defined(_DEBUG)
97 return _byteswap_uint64(UV);
98#else
99 uint64_t Hi = llvm::byteswap<uint32_t>(UV);
100 uint32_t Lo = llvm::byteswap<uint32_t>(UV >> 32);
101 return (Hi << 32) | Lo;
102#endif
103 } else {
104 static_assert(!sizeof(T *), "Don't know how to handle the given type.");
105 return 0;
106 }
107}
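A short sketch of llvm::byteswap on a 32-bit value (the swapEndian helper is hypothetical):

#include "llvm/ADT/bit.h"
#include <cstdint>

// Reverses byte order: 0x11223344 becomes 0x44332211.
uint32_t swapEndian(uint32_t V) { return llvm::byteswap(V); }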
108
109template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
110[[nodiscard]] constexpr inline bool has_single_bit(T Value) noexcept {
111 return (Value != 0) && ((Value & (Value - 1)) == 0);
112}
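A sketch of llvm::has_single_bit as a power-of-two test (isPowerOfTwo is a hypothetical helper):

#include "llvm/ADT/bit.h"
#include <cstdint>

// True only for exact powers of two: true for 8, false for 12 and for 0.
bool isPowerOfTwo(uint64_t V) { return llvm::has_single_bit(V); }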
113
114namespace detail {
115template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
116 static unsigned count(T Val) {
117 if (!Val)
118 return std::numeric_limits<T>::digits;
119 if (Val & 0x1)
120 return 0;
121
122 // Bisection method.
123 unsigned ZeroBits = 0;
124 T Shift = std::numeric_limits<T>::digits >> 1;
125 T Mask = std::numeric_limits<T>::max() >> Shift;
126 while (Shift) {
127 if ((Val & Mask) == 0) {
128 Val >>= Shift;
129 ZeroBits |= Shift;
130 }
131 Shift >>= 1;
132 Mask >>= Shift;
133 }
134 return ZeroBits;
135 }
136};
137
138#if defined(__GNUC__) || defined(_MSC_VER)
139template <typename T> struct TrailingZerosCounter<T, 4> {
140 static unsigned count(T Val) {
141 if (Val == 0)
11. Assuming 'Val' is equal to 0
12. Taking true branch
142 return 32;
13. Returning the value 32
143
144#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
145 return __builtin_ctz(Val);
146#elif defined(_MSC_VER)
147 unsigned long Index;
148 _BitScanForward(&Index, Val);
149 return Index;
150#endif
151 }
152};
153
154#if !defined(_MSC_VER) || defined(_M_X64)
155template <typename T> struct TrailingZerosCounter<T, 8> {
156 static unsigned count(T Val) {
157 if (Val == 0)
158 return 64;
159
160#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
161 return __builtin_ctzll(Val);
162#elif defined(_MSC_VER)
163 unsigned long Index;
164 _BitScanForward64(&Index, Val);
165 return Index;
166#endif
167 }
168};
169#endif
170#endif
171} // namespace detail
172
173/// Count number of 0's from the least significant bit to the most
174/// stopping at the first 1.
175///
176/// Only unsigned integral types are allowed.
177///
178/// Returns std::numeric_limits<T>::digits on an input of 0.
179template <typename T> [[nodiscard]] int countr_zero(T Val) {
180 static_assert(std::is_unsigned_v<T>,
181 "Only unsigned integral types are allowed.");
182 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val);
10. Calling 'TrailingZerosCounter::count'
14. Returning from 'TrailingZerosCounter::count'
15. Returning the value 32
183}
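The path above ends with countr_zero returning 32 for a zero input; using that result directly as a shift amount on a 32-bit value is what makes the reported shift undefined. A hedged sketch of one way a caller can guard the zero case (shiftByTrailingZeros is hypothetical and not the upstream fix):

#include "llvm/ADT/bit.h"
#include <cstdint>

// countr_zero(0u) is 32, so 'V >> countr_zero(0u)' shifts a 32-bit value by
// its full width, which is undefined behavior. Handle the zero mask first.
uint32_t shiftByTrailingZeros(uint32_t Mask, uint32_t V) {
  if (Mask == 0)
    return 0;                          // avoid 'V >> 32'
  return V >> llvm::countr_zero(Mask); // shift amount is now in [0, 31]
}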
184
185namespace detail {
186template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
187 static unsigned count(T Val) {
188 if (!Val)
189 return std::numeric_limits<T>::digits;
190
191 // Bisection method.
192 unsigned ZeroBits = 0;
193 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
194 T Tmp = Val >> Shift;
195 if (Tmp)
196 Val = Tmp;
197 else
198 ZeroBits |= Shift;
199 }
200 return ZeroBits;
201 }
202};
203
204#if defined(__GNUC__) || defined(_MSC_VER)
205template <typename T> struct LeadingZerosCounter<T, 4> {
206 static unsigned count(T Val) {
207 if (Val == 0)
208 return 32;
209
210#if __has_builtin(__builtin_clz) || defined(__GNUC__)
211 return __builtin_clz(Val);
212#elif defined(_MSC_VER)
213 unsigned long Index;
214 _BitScanReverse(&Index, Val);
215 return Index ^ 31;
216#endif
217 }
218};
219
220#if !defined(_MSC_VER) || defined(_M_X64)
221template <typename T> struct LeadingZerosCounter<T, 8> {
222 static unsigned count(T Val) {
223 if (Val == 0)
224 return 64;
225
226#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
227 return __builtin_clzll(Val);
228#elif defined(_MSC_VER)
229 unsigned long Index;
230 _BitScanReverse64(&Index, Val);
231 return Index ^ 63;
232#endif
233 }
234};
235#endif
236#endif
237} // namespace detail
238
239/// Count number of 0's from the most significant bit to the least
240/// stopping at the first 1.
241///
242/// Only unsigned integral types are allowed.
243///
244/// Returns std::numeric_limits<T>::digits on an input of 0.
245template <typename T> [[nodiscard]] int countl_zero(T Val) {
246 static_assert(std::is_unsigned_v<T>,
247 "Only unsigned integral types are allowed.");
248 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val);
249}
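A sketch of llvm::countl_zero (leadingZeros is a hypothetical helper):

#include "llvm/ADT/bit.h"
#include <cstdint>

// countl_zero(0x04000000u) is 5; countl_zero(0u) is 32, so the same
// shift-by-width hazard applies if the result feeds a 32-bit shift.
int leadingZeros(uint32_t V) { return llvm::countl_zero(V); }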
250
251/// Count the number of ones from the most significant bit to the first
252/// zero bit.
253///
254/// Ex. countl_one(0xFF0FFF00) == 8.
255/// Only unsigned integral types are allowed.
256///
257/// Returns std::numeric_limits<T>::digits on an input of all ones.
258template <typename T> [[nodiscard]] int countl_one(T Value) {
259 static_assert(std::is_unsigned_v<T>,
260 "Only unsigned integral types are allowed.");
261 return llvm::countl_zero<T>(~Value);
262}
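A sketch of llvm::countl_one matching the example in its comment (leadingOnes is hypothetical):

#include "llvm/ADT/bit.h"
#include <cstdint>

// Counts the run of leading 1 bits: 8 for 0xFF0FFF00u, 32 for ~0u.
int leadingOnes(uint32_t V) { return llvm::countl_one(V); }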
263
264/// Count the number of ones from the least significant bit to the first
265/// zero bit.
266///
267/// Ex. countr_one(0x00FF00FF) == 8.
268/// Only unsigned integral types are allowed.
269///
270/// Returns std::numeric_limits<T>::digits on an input of all ones.
271template <typename T> [[nodiscard]] int countr_one(T Value) {
272 static_assert(std::is_unsigned_v<T>,
273 "Only unsigned integral types are allowed.");
274 return llvm::countr_zero<T>(~Value);
275}
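A sketch of llvm::countr_one matching the example in its comment (trailingOnes is hypothetical):

#include "llvm/ADT/bit.h"
#include <cstdint>

// Counts the run of trailing 1 bits: 8 for 0x00FF00FFu, 0 for any even value.
int trailingOnes(uint32_t V) { return llvm::countr_one(V); }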
276
277/// Returns the number of bits needed to represent Value if Value is nonzero.
278/// Returns 0 otherwise.
279///
280/// Ex. bit_width(5) == 3.
281template <typename T> [[nodiscard]] int bit_width(T Value) {
282 static_assert(std::is_unsigned_v<T>,
283 "Only unsigned integral types are allowed.");
284 return std::numeric_limits<T>::digits - llvm::countl_zero(Value);
285}
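A sketch using llvm::bit_width as an integer log2 (log2Floor is a hypothetical helper):

#include "llvm/ADT/bit.h"
#include <cstdint>

// bit_width(5u) == 3 (0b101). For nonzero V, bit_width(V) - 1 is the index
// of the highest set bit, i.e. floor(log2(V)).
int log2Floor(uint32_t V) { return llvm::bit_width(V) - 1; } // requires V != 0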
286
287/// Returns the largest integral power of two no greater than Value if Value is
288/// nonzero. Returns 0 otherwise.
289///
290/// Ex. bit_floor(5) == 4.
291template <typename T> [[nodiscard]] T bit_floor(T Value) {
292 static_assert(std::is_unsigned_v<T>,
293 "Only unsigned integral types are allowed.");
294 if (!Value)
295 return 0;
296 return T(1) << (llvm::bit_width(Value) - 1);
297}
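A sketch of llvm::bit_floor (floorPow2 is hypothetical):

#include "llvm/ADT/bit.h"
#include <cstdint>

// Rounds down to a power of two: 4 for 5, 8 for 8, 0 for 0.
uint32_t floorPow2(uint32_t V) { return llvm::bit_floor(V); }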
298
299/// Returns the smallest integral power of two no smaller than Value if Value is
300/// nonzero. Returns 1 otherwise.
301///
302/// Ex. bit_ceil(5) == 8.
303///
304/// The return value is undefined if the input is larger than the largest power
305/// of two representable in T.
306template <typename T> [[nodiscard]] T bit_ceil(T Value) {
307 static_assert(std::is_unsigned_v<T>,
308 "Only unsigned integral types are allowed.");
309 if (Value < 2)
310 return 1;
311 return T(1) << llvm::bit_width<T>(Value - 1u);
312}
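A sketch of llvm::bit_ceil (ceilPow2 is hypothetical):

#include "llvm/ADT/bit.h"
#include <cstdint>

// Rounds up to a power of two: 8 for 5, 1 for 0 and 1. The result is
// undefined if V exceeds the largest power of two representable in uint32_t.
uint32_t ceilPow2(uint32_t V) { return llvm::bit_ceil(V); }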
313
314namespace detail {
315