Bug Summary

File: build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 3293, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
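
A shift count that is greater than or equal to the bit width of the promoted left operand is undefined behaviour in C++, so shifting a 32-bit 'unsigned int' by 32 has no defined result. A minimal standalone sketch of one way to guard the count (the function and parameter names below are illustrative and are not taken from AMDGPULegalizerInfo.cpp):

#include <cstdint>
#include <limits>

// Hypothetical helper, not the code at line 3293: performs Value >> ShiftAmt,
// but defines the result as 0 once ShiftAmt reaches the operand width, which
// is exactly the case the analyzer flags as undefined.
static uint32_t lshrOrZero(uint32_t Value, unsigned ShiftAmt) {
  if (ShiftAmt >= std::numeric_limits<uint32_t>::digits) // digits == 32
    return 0;
  return Value >> ShiftAmt;
}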

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/tmp/scan-build-2022-10-03-140002-15933-1 -x c++ /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
30
31#define DEBUG_TYPE "amdgpu-legalinfo"
32
33using namespace llvm;
34using namespace LegalizeActions;
35using namespace LegalizeMutations;
36using namespace LegalityPredicates;
37using namespace MIPatternMatch;
38
39// Hack until load/store selection patterns support any tuple of legal types.
40static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use"
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47static constexpr unsigned MaxRegisterSize = 1024;
48
49// Round the number of elements to the next power of two elements
50static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54}
55
56// Round the number of bits to the next power of two bits
57static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61}
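
The two helpers above build a power of two as 1 << Log2_32_Ceil(...), which falls under the same rule as the right shift reported at line 3293: any shift count of 32 or more is undefined for a 32-bit operand. A standalone sketch, using hypothetical names rather than LLVM's own helpers, of a rounding that never shifts by the full operand width:

#include <cstdint>

// Hypothetical sketch, not LLVM's implementation: round Value up to the next
// power of two using a 64-bit accumulator, so the result can hold 2^32 and
// every shift is by 1 on a 64-bit operand, which is always well defined.
static uint64_t roundUpToPowerOfTwo(uint32_t Value) {
  uint64_t Pow2 = 1;
  while (Pow2 < Value)
    Pow2 <<= 1;
  return Pow2;
}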
62
63/// \returns true if this is an odd sized vector which should widen by adding an
64/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65/// excludes s1 vectors, which should always be scalarized.
66static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78}
79
80static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85}
86
87static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93}
94
95static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::make_pair(TypeIdx,
100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101 };
102}
103
104static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::make_pair(
112 TypeIdx,
113 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114 };
115}
116
117// Increase the number of vector elements to reach the next multiple of 32-bit
118// type.
119static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120 return [=](const LegalityQuery &Query) {
121 const LLT Ty = Query.Types[TypeIdx];
122
123 const LLT EltTy = Ty.getElementType();
124 const int Size = Ty.getSizeInBits();
125 const int EltSize = EltTy.getSizeInBits();
126 const int NextMul32 = (Size + 31) / 32;
127
128 assert(EltSize < 32);
129
130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132 };
133}
134
135static LLT getBitcastRegisterType(const LLT Ty) {
136 const unsigned Size = Ty.getSizeInBits();
137
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
142 }
143
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145}
146
147static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152}
153
154static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162}
163
164static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169}
170
171static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176}
177
178static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183}
184
185static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187}
188
189static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192}
193
194static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199}
200
201static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209}
210
211// Any combination of 32 or 64-bit elements up the maximum register size, and
212// multiples of v2s16.
213static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217}
218
219static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227}
228
229// If we have a truncating store or an extending load with a data size larger
230// than 32-bits, we need to reduce to a 32-bit type.
231static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237}
238
239// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240// handle some operations by just promoting the register during
241// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265}
266
267static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289#if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293#endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
301
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
319 }
320
321 assert(RegSize >= MemSize);
322
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
328 }
329
330 return true;
331}
332
333// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334// workaround this. Eventually it should ignore the type for loads and only care
335// about the size. Return true in cases where we will workaround this for now by
336// bitcasting.
337static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353}
354
355static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359}
360
361/// Return true if a load or store of the type should be lowered with a bitcast
362/// to a different type.
363static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377}
378
379/// Return true if we should legalize a load by widening an odd sized memory
380/// access up to the alignment. Note this case when the memory access itself
381/// changes, not the size of the result register.
382static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 uint64_t AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414}
415
416static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424}
425
426AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB})
534 .legalFor({S32, S16, V2S16})
535 .clampMaxNumElementsStrict(0, S16, 2)
536 .scalarize(0)
537 .minScalar(0, S16)
538 .widenScalarToNextMultipleOf(0, 32)
539 .maxScalar(0, S32);
540
541 getActionDefinitionsBuilder(G_MUL)
542 .legalFor({S32, S16, V2S16})
543 .clampMaxNumElementsStrict(0, S16, 2)
544 .scalarize(0)
545 .minScalar(0, S16)
546 .widenScalarToNextMultipleOf(0, 32)
547 .custom();
548 assert(ST.hasMad64_32());
549
550 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551 .legalFor({S32, S16, V2S16}) // Clamp modifier
552 .minScalarOrElt(0, S16)
553 .clampMaxNumElementsStrict(0, S16, 2)
554 .scalarize(0)
555 .widenScalarToNextPow2(0, 32)
556 .lower();
557 } else if (ST.has16BitInsts()) {
558 getActionDefinitionsBuilder({G_ADD, G_SUB})
559 .legalFor({S32, S16})
560 .minScalar(0, S16)
561 .widenScalarToNextMultipleOf(0, 32)
562 .maxScalar(0, S32)
563 .scalarize(0);
564
565 getActionDefinitionsBuilder(G_MUL)
566 .legalFor({S32, S16})
567 .scalarize(0)
568 .minScalar(0, S16)
569 .widenScalarToNextMultipleOf(0, 32)
570 .custom();
571 assert(ST.hasMad64_32());
572
573 // Technically the saturating operations require clamp bit support, but this
574 // was introduced at the same time as 16-bit operations.
575 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576 .legalFor({S32, S16}) // Clamp modifier
577 .minScalar(0, S16)
578 .scalarize(0)
579 .widenScalarToNextPow2(0, 16)
580 .lower();
581
582 // We're just lowering this, but it helps get a better result to try to
583 // coerce to the desired type first.
584 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585 .minScalar(0, S16)
586 .scalarize(0)
587 .lower();
588 } else {
589 getActionDefinitionsBuilder({G_ADD, G_SUB})
590 .legalFor({S32})
591 .widenScalarToNextMultipleOf(0, 32)
592 .clampScalar(0, S32, S32)
593 .scalarize(0);
594
595 auto &Mul = getActionDefinitionsBuilder(G_MUL)
596 .legalFor({S32})
597 .scalarize(0)
598 .minScalar(0, S32)
599 .widenScalarToNextMultipleOf(0, 32);
600
601 if (ST.hasMad64_32())
602 Mul.custom();
603 else
604 Mul.maxScalar(0, S32);
605
606 if (ST.hasIntClamp()) {
607 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608 .legalFor({S32}) // Clamp modifier.
609 .scalarize(0)
610 .minScalarOrElt(0, S32)
611 .lower();
612 } else {
613 // Clamp bit support was added in VI, along with 16-bit operations.
614 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615 .minScalar(0, S32)
616 .scalarize(0)
617 .lower();
618 }
619
620 // FIXME: DAG expansion gets better results. The widening uses the smaller
621 // range values and goes for the min/max lowering directly.
622 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623 .minScalar(0, S32)
624 .scalarize(0)
625 .lower();
626 }
627
628 getActionDefinitionsBuilder(
629 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630 .customFor({S32, S64})
631 .clampScalar(0, S32, S64)
632 .widenScalarToNextPow2(0, 32)
633 .scalarize(0);
634
635 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636 .legalFor({S32})
637 .maxScalar(0, S32);
638
639 if (ST.hasVOP3PInsts()) {
640 Mulh
641 .clampMaxNumElements(0, S8, 2)
642 .lowerFor({V2S8});
643 }
644
645 Mulh
646 .scalarize(0)
647 .lower();
648
649 // Report legal for any types we can handle anywhere. For the cases only legal
650 // on the SALU, RegBankSelect will be able to re-legalize.
651 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653 .clampScalar(0, S32, S64)
654 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
655 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
656 .widenScalarToNextPow2(0)
657 .scalarize(0);
658
659 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661 .legalFor({{S32, S1}, {S32, S32}})
662 .minScalar(0, S32)
663 .scalarize(0)
664 .lower();
665
666 getActionDefinitionsBuilder(G_BITCAST)
667 // Don't worry about the size constraint.
668 .legalIf(all(isRegisterType(0), isRegisterType(1)))
669 .lower();
670
671
672 getActionDefinitionsBuilder(G_CONSTANT)
673 .legalFor({S1, S32, S64, S16, GlobalPtr,
674 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675 .legalIf(isPointer(0))
676 .clampScalar(0, S32, S64)
677 .widenScalarToNextPow2(0);
678
679 getActionDefinitionsBuilder(G_FCONSTANT)
680 .legalFor({S32, S64, S16})
681 .clampScalar(0, S16, S64);
682
683 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684 .legalIf(isRegisterType(0))
685 // s1 and s16 are special cases because they have legal operations on
686 // them, but don't really occupy registers in the normal way.
687 .legalFor({S1, S16})
688 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689 .clampScalarOrElt(0, S32, MaxScalar)
690 .widenScalarToNextPow2(0, 32)
691 .clampMaxNumElements(0, S32, 16);
692
693 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694
695 // If the amount is divergent, we have to do a wave reduction to get the
696 // maximum value, so this is expanded during RegBankSelect.
697 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698 .legalFor({{PrivatePtr, S32}});
699
700 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701 .customIf(typeIsNot(0, PrivatePtr));
702
703 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704
705 auto &FPOpActions = getActionDefinitionsBuilder(
706 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707 .legalFor({S32, S64});
708 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709 .customFor({S32, S64});
710 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711 .customFor({S32, S64});
712
713 if (ST.has16BitInsts()) {
714 if (ST.hasVOP3PInsts())
715 FPOpActions.legalFor({S16, V2S16});
716 else
717 FPOpActions.legalFor({S16});
718
719 TrigActions.customFor({S16});
720 FDIVActions.customFor({S16});
721 }
722
723 auto &MinNumMaxNum = getActionDefinitionsBuilder({
724 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725
726 if (ST.hasVOP3PInsts()) {
727 MinNumMaxNum.customFor(FPTypesPK16)
728 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729 .clampMaxNumElements(0, S16, 2)
730 .clampScalar(0, S16, S64)
731 .scalarize(0);
732 } else if (ST.has16BitInsts()) {
733 MinNumMaxNum.customFor(FPTypes16)
734 .clampScalar(0, S16, S64)
735 .scalarize(0);
736 } else {
737 MinNumMaxNum.customFor(FPTypesBase)
738 .clampScalar(0, S32, S64)
739 .scalarize(0);
740 }
741
742 if (ST.hasVOP3PInsts())
743 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744
745 FPOpActions
746 .scalarize(0)
747 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748
749 TrigActions
750 .scalarize(0)
751 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752
753 FDIVActions
754 .scalarize(0)
755 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756
757 getActionDefinitionsBuilder({G_FNEG, G_FABS})
758 .legalFor(FPTypesPK16)
759 .clampMaxNumElementsStrict(0, S16, 2)
760 .scalarize(0)
761 .clampScalar(0, S16, S64);
762
763 if (ST.has16BitInsts()) {
764 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765 .legalFor({S32, S64, S16})
766 .scalarize(0)
767 .clampScalar(0, S16, S64);
768 } else {
769 getActionDefinitionsBuilder(G_FSQRT)
770 .legalFor({S32, S64})
771 .scalarize(0)
772 .clampScalar(0, S32, S64);
773
774 if (ST.hasFractBug()) {
775 getActionDefinitionsBuilder(G_FFLOOR)
776 .customFor({S64})
777 .legalFor({S32, S64})
778 .scalarize(0)
779 .clampScalar(0, S32, S64);
780 } else {
781 getActionDefinitionsBuilder(G_FFLOOR)
782 .legalFor({S32, S64})
783 .scalarize(0)
784 .clampScalar(0, S32, S64);
785 }
786 }
787
788 getActionDefinitionsBuilder(G_FPTRUNC)
789 .legalFor({{S32, S64}, {S16, S32}})
790 .scalarize(0)
791 .lower();
792
793 getActionDefinitionsBuilder(G_FPEXT)
794 .legalFor({{S64, S32}, {S32, S16}})
795 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796 .scalarize(0);
797
798 auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
799 if (ST.has16BitInsts()) {
800 FSubActions
801 // Use actual fsub instruction
802 .legalFor({S32, S16})
803 // Must use fadd + fneg
804 .lowerFor({S64, V2S16});
805 } else {
806 FSubActions
807 // Use actual fsub instruction
808 .legalFor({S32})
809 // Must use fadd + fneg
810 .lowerFor({S64, S16, V2S16});
811 }
812
813 FSubActions
814 .scalarize(0)
815 .clampScalar(0, S32, S64);
816
817 // Whether this is legal depends on the floating point mode for the function.
818 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
819 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
820 FMad.customFor({S32, S16});
821 else if (ST.hasMadMacF32Insts())
822 FMad.customFor({S32});
823 else if (ST.hasMadF16())
824 FMad.customFor({S16});
825 FMad.scalarize(0)
826 .lower();
827
828 auto &FRem = getActionDefinitionsBuilder(G_FREM);
829 if (ST.has16BitInsts()) {
830 FRem.customFor({S16, S32, S64});
831 } else {
832 FRem.minScalar(0, S32)
833 .customFor({S32, S64});
834 }
835 FRem.scalarize(0);
836
837 // TODO: Do we need to clamp maximum bitwidth?
838 getActionDefinitionsBuilder(G_TRUNC)
839 .legalIf(isScalar(0))
840 .legalFor({{V2S16, V2S32}})
841 .clampMaxNumElements(0, S16, 2)
842 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
843 // situations (like an invalid implicit use), we don't want to infinite loop
844 // in the legalizer.
845 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
846 .alwaysLegal();
847
848 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
849 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
850 {S32, S1}, {S64, S1}, {S16, S1}})
851 .scalarize(0)
852 .clampScalar(0, S32, S64)
853 .widenScalarToNextPow2(1, 32);
854
855 // TODO: Split s1->s64 during regbankselect for VALU.
856 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
857 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
858 .lowerIf(typeIs(1, S1))
859 .customFor({{S32, S64}, {S64, S64}});
860 if (ST.has16BitInsts())
861 IToFP.legalFor({{S16, S16}});
862 IToFP.clampScalar(1, S32, S64)
863 .minScalar(0, S32)
864 .scalarize(0)
865 .widenScalarToNextPow2(1);
866
867 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
868 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
869 .customFor({{S64, S32}, {S64, S64}})
870 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
871 if (ST.has16BitInsts())
872 FPToI.legalFor({{S16, S16}});
873 else
874 FPToI.minScalar(1, S32);
875
876 FPToI.minScalar(0, S32)
877 .widenScalarToNextPow2(0, 32)
878 .scalarize(0)
879 .lower();
880
881 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
882 .customFor({S16, S32})
883 .scalarize(0)
884 .lower();
885
886 // Lower roundeven into G_FRINT
887 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
888 .scalarize(0)
889 .lower();
890
891 if (ST.has16BitInsts()) {
892 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
893 .legalFor({S16, S32, S64})
894 .clampScalar(0, S16, S64)
895 .scalarize(0);
896 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
897 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
898 .legalFor({S32, S64})
899 .clampScalar(0, S32, S64)
900 .scalarize(0);
901 } else {
902 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
903 .legalFor({S32})
904 .customFor({S64})
905 .clampScalar(0, S32, S64)
906 .scalarize(0);
907 }
908
909 getActionDefinitionsBuilder(G_PTR_ADD)
910 .legalIf(all(isPointer(0), sameSize(0, 1)))
911 .scalarize(0)
912 .scalarSameSizeAs(1, 0);
913
914 getActionDefinitionsBuilder(G_PTRMASK)
915 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
916 .scalarSameSizeAs(1, 0)
917 .scalarize(0);
918
919 auto &CmpBuilder =
920 getActionDefinitionsBuilder(G_ICMP)
921 // The compare output type differs based on the register bank of the output,
922 // so make both s1 and s32 legal.
923 //
924 // Scalar compares producing output in scc will be promoted to s32, as that
925 // is the allocatable register type that will be needed for the copy from
926 // scc. This will be promoted during RegBankSelect, and we assume something
927 // before that won't try to use s32 result types.
928 //
929 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
930 // bank.
931 .legalForCartesianProduct(
932 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
933 .legalForCartesianProduct(
934 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
935 if (ST.has16BitInsts()) {
936 CmpBuilder.legalFor({{S1, S16}});
937 }
938
939 CmpBuilder
940 .widenScalarToNextPow2(1)
941 .clampScalar(1, S32, S64)
942 .scalarize(0)
943 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
944
945 getActionDefinitionsBuilder(G_FCMP)
946 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
947 .widenScalarToNextPow2(1)
948 .clampScalar(1, S32, S64)
949 .scalarize(0);
950
951 // FIXME: fpow has a selection pattern that should move to custom lowering.
952 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
953 if (ST.has16BitInsts())
954 Exp2Ops.legalFor({S32, S16});
955 else
956 Exp2Ops.legalFor({S32});
957 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
958 Exp2Ops.scalarize(0);
959
960 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
961 if (ST.has16BitInsts())
962 ExpOps.customFor({{S32}, {S16}});
963 else
964 ExpOps.customFor({S32});
965 ExpOps.clampScalar(0, MinScalarFPTy, S32)
966 .scalarize(0);
967
968 getActionDefinitionsBuilder(G_FPOWI)
969 .clampScalar(0, MinScalarFPTy, S32)
970 .lower();
971
972 // The 64-bit versions produce 32-bit results, but only on the SALU.
973 getActionDefinitionsBuilder(G_CTPOP)
974 .legalFor({{S32, S32}, {S32, S64}})
975 .clampScalar(0, S32, S32)
976 .widenScalarToNextPow2(1, 32)
977 .clampScalar(1, S32, S64)
978 .scalarize(0)
979 .widenScalarToNextPow2(0, 32);
980
981
982 // The hardware instructions return a different result on 0 than the generic
983 // instructions expect. The hardware produces -1, but these produce the
984 // bitwidth.
985 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
986 .scalarize(0)
987 .clampScalar(0, S32, S32)
988 .clampScalar(1, S32, S64)
989 .widenScalarToNextPow2(0, 32)
990 .widenScalarToNextPow2(1, 32)
991 .custom();
992
993 // The 64-bit versions produce 32-bit results, but only on the SALU.
994 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
995 .legalFor({{S32, S32}, {S32, S64}})
996 .clampScalar(0, S32, S32)
997 .clampScalar(1, S32, S64)
998 .scalarize(0)
999 .widenScalarToNextPow2(0, 32)
1000 .widenScalarToNextPow2(1, 32);
1001
1002 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1003 // RegBankSelect.
1004 getActionDefinitionsBuilder(G_BITREVERSE)
1005 .legalFor({S32, S64})
1006 .clampScalar(0, S32, S64)
1007 .scalarize(0)
1008 .widenScalarToNextPow2(0);
1009
1010 if (ST.has16BitInsts()) {
1011 getActionDefinitionsBuilder(G_BSWAP)
1012 .legalFor({S16, S32, V2S16})
1013 .clampMaxNumElementsStrict(0, S16, 2)
1014 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1015 // narrowScalar limitation.
1016 .widenScalarToNextPow2(0)
1017 .clampScalar(0, S16, S32)
1018 .scalarize(0);
1019
1020 if (ST.hasVOP3PInsts()) {
1021 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1022 .legalFor({S32, S16, V2S16})
1023 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1024 .clampMaxNumElements(0, S16, 2)
1025 .minScalar(0, S16)
1026 .widenScalarToNextPow2(0)
1027 .scalarize(0)
1028 .lower();
1029 } else {
1030 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1031 .legalFor({S32, S16})
1032 .widenScalarToNextPow2(0)
1033 .minScalar(0, S16)
1034 .scalarize(0)
1035 .lower();
1036 }
1037 } else {
1038 // TODO: Should have same legality without v_perm_b32
1039 getActionDefinitionsBuilder(G_BSWAP)
1040 .legalFor({S32})
1041 .lowerIf(scalarNarrowerThan(0, 32))
1042 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1043 // narrowScalar limitation.
1044 .widenScalarToNextPow2(0)
1045 .maxScalar(0, S32)
1046 .scalarize(0)
1047 .lower();
1048
1049 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1050 .legalFor({S32})
1051 .minScalar(0, S32)
1052 .widenScalarToNextPow2(0)
1053 .scalarize(0)
1054 .lower();
1055 }
1056
1057 getActionDefinitionsBuilder(G_INTTOPTR)
1058 // List the common cases
1059 .legalForCartesianProduct(AddrSpaces64, {S64})
1060 .legalForCartesianProduct(AddrSpaces32, {S32})
1061 .scalarize(0)
1062 // Accept any address space as long as the size matches
1063 .legalIf(sameSize(0, 1))
1064 .widenScalarIf(smallerThan(1, 0),
1065 [](const LegalityQuery &Query) {
1066 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1067 })
1068 .narrowScalarIf(largerThan(1, 0),
1069 [](const LegalityQuery &Query) {
1070 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1071 });
1072
1073 getActionDefinitionsBuilder(G_PTRTOINT)
1074 // List the common cases
1075 .legalForCartesianProduct(AddrSpaces64, {S64})
1076 .legalForCartesianProduct(AddrSpaces32, {S32})
1077 .scalarize(0)
1078 // Accept any address space as long as the size matches
1079 .legalIf(sameSize(0, 1))
1080 .widenScalarIf(smallerThan(0, 1),
1081 [](const LegalityQuery &Query) {
1082 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1083 })
1084 .narrowScalarIf(
1085 largerThan(0, 1),
1086 [](const LegalityQuery &Query) {
1087 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1088 });
1089
1090 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1091 .scalarize(0)
1092 .custom();
1093
1094 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1095 bool IsLoad) -> bool {
1096 const LLT DstTy = Query.Types[0];
1097
1098 // Split vector extloads.
1099 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1100
1101 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1102 return true;
1103
1104 const LLT PtrTy = Query.Types[1];
1105 unsigned AS = PtrTy.getAddressSpace();
1106 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1107 return true;
1108
1109 // Catch weird sized loads that don't evenly divide into the access sizes
1110 // TODO: May be able to widen depending on alignment etc.
1111 unsigned NumRegs = (MemSize + 31) / 32;
1112 if (NumRegs == 3) {
1113 if (!ST.hasDwordx3LoadStores())
1114 return true;
1115 } else {
1116 // If the alignment allows, these should have been widened.
1117 if (!isPowerOf2_32(NumRegs))
1118 return true;
1119 }
1120
1121 return false;
1122 };
1123
1124 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1125 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1126 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1127
1128 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1129 // LDS
1130 // TODO: Unsupported flat for SI.
1131
1132 for (unsigned Op : {G_LOAD, G_STORE}) {
1133 const bool IsStore = Op == G_STORE;
1134
1135 auto &Actions = getActionDefinitionsBuilder(Op);
1136 // Explicitly list some common cases.
1137 // TODO: Does this help compile time at all?
1138 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1139 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1140 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1141 {S64, GlobalPtr, S64, GlobalAlign32},
1142 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1143 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1144 {S32, GlobalPtr, S8, GlobalAlign8},
1145 {S32, GlobalPtr, S16, GlobalAlign16},
1146
1147 {S32, LocalPtr, S32, 32},
1148 {S64, LocalPtr, S64, 32},
1149 {V2S32, LocalPtr, V2S32, 32},
1150 {S32, LocalPtr, S8, 8},
1151 {S32, LocalPtr, S16, 16},
1152 {V2S16, LocalPtr, S32, 32},
1153
1154 {S32, PrivatePtr, S32, 32},
1155 {S32, PrivatePtr, S8, 8},
1156 {S32, PrivatePtr, S16, 16},
1157 {V2S16, PrivatePtr, S32, 32},
1158
1159 {S32, ConstantPtr, S32, GlobalAlign32},
1160 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1161 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1162 {S64, ConstantPtr, S64, GlobalAlign32},
1163 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1164 Actions.legalIf(
1165 [=](const LegalityQuery &Query) -> bool {
1166 return isLoadStoreLegal(ST, Query);
1167 });
1168
1169 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1170 // 64-bits.
1171 //
1172 // TODO: Should generalize bitcast action into coerce, which will also cover
1173 // inserting addrspacecasts.
1174 Actions.customIf(typeIs(1, Constant32Ptr));
1175
1176 // Turn any illegal element vectors into something easier to deal
1177 // with. These will ultimately produce 32-bit scalar shifts to extract the
1178 // parts anyway.
1179 //
1180 // For odd 16-bit element vectors, prefer to split those into pieces with
1181 // 16-bit vector parts.
1182 Actions.bitcastIf(
1183 [=](const LegalityQuery &Query) -> bool {
1184 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1185 Query.MMODescrs[0].MemoryTy);
1186 }, bitcastToRegisterType(0));
1187
1188 if (!IsStore) {
1189 // Widen suitably aligned loads by loading extra bytes. The standard
1190 // legalization actions can't properly express widening memory operands.
1191 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1192 return shouldWidenLoad(ST, Query, G_LOAD);
1193 });
1194 }
1195
1196 // FIXME: load/store narrowing should be moved to lower action
1197 Actions
1198 .narrowScalarIf(
1199 [=](const LegalityQuery &Query) -> bool {
1200 return !Query.Types[0].isVector() &&
1201 needToSplitMemOp(Query, Op == G_LOAD);
1202 },
1203 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1204 const LLT DstTy = Query.Types[0];
1205 const LLT PtrTy = Query.Types[1];
1206
1207 const unsigned DstSize = DstTy.getSizeInBits();
1208 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1209
1210 // Split extloads.
1211 if (DstSize > MemSize)
1212 return std::make_pair(0, LLT::scalar(MemSize));
1213
1214 unsigned MaxSize = maxSizeForAddrSpace(ST,
1215 PtrTy.getAddressSpace(),
1216 Op == G_LOAD);
1217 if (MemSize > MaxSize)
1218 return std::make_pair(0, LLT::scalar(MaxSize));
1219
1220 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1221 return std::make_pair(0, LLT::scalar(Align));
1222 })
1223 .fewerElementsIf(
1224 [=](const LegalityQuery &Query) -> bool {
1225 return Query.Types[0].isVector() &&
1226 needToSplitMemOp(Query, Op == G_LOAD);
1227 },
1228 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1229 const LLT DstTy = Query.Types[0];
1230 const LLT PtrTy = Query.Types[1];
1231
1232 LLT EltTy = DstTy.getElementType();
1233 unsigned MaxSize = maxSizeForAddrSpace(ST,
1234 PtrTy.getAddressSpace(),
1235 Op == G_LOAD);
1236
1237 // FIXME: Handle widened to power of 2 results better. This ends
1238 // up scalarizing.
1239 // FIXME: 3 element stores scalarized on SI
1240
1241 // Split if it's too large for the address space.
1242 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1243 if (MemSize > MaxSize) {
1244 unsigned NumElts = DstTy.getNumElements();
1245 unsigned EltSize = EltTy.getSizeInBits();
1246
1247 if (MaxSize % EltSize == 0) {
1248 return std::make_pair(
1249 0, LLT::scalarOrVector(
1250 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1251 }
1252
1253 unsigned NumPieces = MemSize / MaxSize;
1254
1255 // FIXME: Refine when odd breakdowns handled
1256 // The scalars will need to be re-legalized.
1257 if (NumPieces == 1 || NumPieces >= NumElts ||
1258 NumElts % NumPieces != 0)
1259 return std::make_pair(0, EltTy);
1260
1261 return std::make_pair(
1262 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1263 }
1264
1265 // FIXME: We could probably handle weird extending loads better.
1266 if (DstTy.getSizeInBits() > MemSize)
1267 return std::make_pair(0, EltTy);
1268
1269 unsigned EltSize = EltTy.getSizeInBits();
1270 unsigned DstSize = DstTy.getSizeInBits();
1271 if (!isPowerOf2_32(DstSize)) {
1272 // We're probably decomposing an odd sized store. Try to split
1273 // to the widest type. TODO: Account for alignment. As-is it
1274 // should be OK, since the new parts will be further legalized.
1275 unsigned FloorSize = PowerOf2Floor(DstSize);
1276 return std::make_pair(
1277 0, LLT::scalarOrVector(
1278 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1279 }
1280
1281 // May need relegalization for the scalars.
1282 return std::make_pair(0, EltTy);
1283 })
1284 .minScalar(0, S32)
1285 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1286 .widenScalarToNextPow2(0)
1287 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1288 .lower();
1289 }
1290
1291 // FIXME: Unaligned accesses not lowered.
1292 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1293 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1294 {S32, GlobalPtr, S16, 2 * 8},
1295 {S32, LocalPtr, S8, 8},
1296 {S32, LocalPtr, S16, 16},
1297 {S32, PrivatePtr, S8, 8},
1298 {S32, PrivatePtr, S16, 16},
1299 {S32, ConstantPtr, S8, 8},
1300 {S32, ConstantPtr, S16, 2 * 8}})
1301 .legalIf(
1302 [=](const LegalityQuery &Query) -> bool {
1303 return isLoadStoreLegal(ST, Query);
1304 });
1305
1306 if (ST.hasFlatAddressSpace()) {
1307 ExtLoads.legalForTypesWithMemDesc(
1308 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1309 }
1310
1311 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1312 // 64-bits.
1313 //
1314 // TODO: Should generalize bitcast action into coerce, which will also cover
1315 // inserting addrspacecasts.
1316 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1317
1318 ExtLoads.clampScalar(0, S32, S32)
1319 .widenScalarToNextPow2(0)
1320 .lower();
1321
1322 auto &Atomics = getActionDefinitionsBuilder(
1323 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1324 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1325 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1326 G_ATOMICRMW_UMIN})
1327 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1328 {S64, GlobalPtr}, {S64, LocalPtr},
1329 {S32, RegionPtr}, {S64, RegionPtr}});
1330 if (ST.hasFlatAddressSpace()) {
1331 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1332 }
1333
1334 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1335 if (ST.hasLDSFPAtomicAdd()) {
1336 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1337 if (ST.hasGFX90AInsts())
1338 Atomic.legalFor({{S64, LocalPtr}});
1339 if (ST.hasGFX940Insts())
1340 Atomic.legalFor({{V2S16, LocalPtr}});
1341 }
1342 if (ST.hasAtomicFaddInsts())
1343 Atomic.legalFor({{S32, GlobalPtr}});
1344 if (ST.hasFlatAtomicFaddF32Inst())
1345 Atomic.legalFor({{S32, FlatPtr}});
1346
1347 if (ST.hasGFX90AInsts()) {
1348 // These are legal with some caveats, and should have undergone expansion in
1349 // the IR in most situations
1350 // TODO: Move atomic expansion into legalizer
1351 Atomic.legalFor({
1352 {S32, GlobalPtr},
1353 {S64, GlobalPtr},
1354 {S64, FlatPtr}
1355 });
1356 }
1357
1358 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1359 // demarshalling
1360 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1361 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1362 {S32, FlatPtr}, {S64, FlatPtr}})
1363 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1364 {S32, RegionPtr}, {S64, RegionPtr}});
1365 // TODO: Pointer types, any 32-bit or 64-bit vector
1366
1367 // Condition should be s32 for scalar, s1 for vector.
1368 getActionDefinitionsBuilder(G_SELECT)
1369 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1370 LocalPtr, FlatPtr, PrivatePtr,
1371 LLT::fixed_vector(2, LocalPtr),
1372 LLT::fixed_vector(2, PrivatePtr)},
1373 {S1, S32})
1374 .clampScalar(0, S16, S64)
1375 .scalarize(1)
1376 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1377 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1378 .clampMaxNumElements(0, S32, 2)
1379 .clampMaxNumElements(0, LocalPtr, 2)
1380 .clampMaxNumElements(0, PrivatePtr, 2)
1381 .scalarize(0)
1382 .widenScalarToNextPow2(0)
1383 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1384
1385 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1386 // be more flexible with the shift amount type.
1387 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1388 .legalFor({{S32, S32}, {S64, S32}});
1389 if (ST.has16BitInsts()) {
1390 if (ST.hasVOP3PInsts()) {
1391 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1392 .clampMaxNumElements(0, S16, 2);
1393 } else
1394 Shifts.legalFor({{S16, S16}});
1395
1396 // TODO: Support 16-bit shift amounts for all types
1397 Shifts.widenScalarIf(
1398 [=](const LegalityQuery &Query) {
1399 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1400 // 32-bit amount.
1401 const LLT ValTy = Query.Types[0];
1402 const LLT AmountTy = Query.Types[1];
1403 return ValTy.getSizeInBits() <= 16 &&
1404 AmountTy.getSizeInBits() < 16;
1405 }, changeTo(1, S16));
1406 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1407 Shifts.clampScalar(1, S32, S32);
1408 Shifts.widenScalarToNextPow2(0, 16);
1409 Shifts.clampScalar(0, S16, S64);
1410
1411 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1412 .minScalar(0, S16)
1413 .scalarize(0)
1414 .lower();
1415 } else {
1416 // Make sure we legalize the shift amount type first, as the general
1417 // expansion for the shifted type will produce much worse code if it hasn't
1418 // been truncated already.
1419 Shifts.clampScalar(1, S32, S32);
1420 Shifts.widenScalarToNextPow2(0, 32);
1421 Shifts.clampScalar(0, S32, S64);
1422
1423 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1424 .minScalar(0, S32)
1425 .scalarize(0)
1426 .lower();
1427 }
1428 Shifts.scalarize(0);
1429
1430 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1431 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1432 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1433 unsigned IdxTypeIdx = 2;
1434
1435 getActionDefinitionsBuilder(Op)
1436 .customIf([=](const LegalityQuery &Query) {
1437 const LLT EltTy = Query.Types[EltTypeIdx];
1438 const LLT VecTy = Query.Types[VecTypeIdx];
1439 const LLT IdxTy = Query.Types[IdxTypeIdx];
1440 const unsigned EltSize = EltTy.getSizeInBits();
1441 return (EltSize == 32 || EltSize == 64) &&
1442 VecTy.getSizeInBits() % 32 == 0 &&
1443 VecTy.getSizeInBits() <= MaxRegisterSize &&
1444 IdxTy.getSizeInBits() == 32;
1445 })
1446 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1447 bitcastToVectorElement32(VecTypeIdx))
1448 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1449 .bitcastIf(
1450 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1451 [=](const LegalityQuery &Query) {
1452 // For > 64-bit element types, try to turn this into a 64-bit
1453 // element vector since we may be able to do better indexing
1454 // if this is scalar. If not, fall back to 32.
1455 const LLT EltTy = Query.Types[EltTypeIdx];
1456 const LLT VecTy = Query.Types[VecTypeIdx];
1457 const unsigned DstEltSize = EltTy.getSizeInBits();
1458 const unsigned VecSize = VecTy.getSizeInBits();
1459
1460 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1461 return std::make_pair(
1462 VecTypeIdx,
1463 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1464 })
1465 .clampScalar(EltTypeIdx, S32, S64)
1466 .clampScalar(VecTypeIdx, S32, S64)
1467 .clampScalar(IdxTypeIdx, S32, S32)
1468 .clampMaxNumElements(VecTypeIdx, S32, 32)
1469 // TODO: Clamp elements for 64-bit vectors?
1470 // It should only be necessary with variable indexes.
1471 // As a last resort, lower to the stack
1472 .lower();
1473 }
1474
1475 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1476 .unsupportedIf([=](const LegalityQuery &Query) {
1477 const LLT &EltTy = Query.Types[1].getElementType();
1478 return Query.Types[0] != EltTy;
1479 });
1480
1481 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1482 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1483 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1484
1485 // FIXME: Doesn't handle extract of illegal sizes.
1486 getActionDefinitionsBuilder(Op)
1487 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1488 .lowerIf([=](const LegalityQuery &Query) {
1489 // Sub-vector(or single element) insert and extract.
1490 // TODO: verify immediate offset here since lower only works with
1491 // whole elements.
1492 const LLT BigTy = Query.Types[BigTyIdx];
1493 return BigTy.isVector();
1494 })
1495 // FIXME: Multiples of 16 should not be legal.
1496 .legalIf([=](const LegalityQuery &Query) {
1497 const LLT BigTy = Query.Types[BigTyIdx];
1498 const LLT LitTy = Query.Types[LitTyIdx];
1499 return (BigTy.getSizeInBits() % 32 == 0) &&
1500 (LitTy.getSizeInBits() % 16 == 0);
1501 })
1502 .widenScalarIf(
1503 [=](const LegalityQuery &Query) {
1504 const LLT BigTy = Query.Types[BigTyIdx];
1505 return (BigTy.getScalarSizeInBits() < 16);
1506 },
1507 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1508 .widenScalarIf(
1509 [=](const LegalityQuery &Query) {
1510 const LLT LitTy = Query.Types[LitTyIdx];
1511 return (LitTy.getScalarSizeInBits() < 16);
1512 },
1513 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1514 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1515 .widenScalarToNextPow2(BigTyIdx, 32);
1516
1517 }
1518
1519 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1520 .legalForCartesianProduct(AllS32Vectors, {S32})
1521 .legalForCartesianProduct(AllS64Vectors, {S64})
1522 .clampNumElements(0, V16S32, V32S32)
1523 .clampNumElements(0, V2S64, V16S64)
1524 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1525
1526 if (ST.hasScalarPackInsts()) {
1527 BuildVector
1528 // FIXME: Should probably widen s1 vectors straight to s32
1529 .minScalarOrElt(0, S16)
1530 .minScalar(1, S16);
1531
1532 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1533 .legalFor({V2S16, S32})
1534 .lower();
1535 } else {
1536 BuildVector.customFor({V2S16, S16});
1537 BuildVector.minScalarOrElt(0, S32);
1538
1539 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1540 .customFor({V2S16, S32})
1541 .lower();
1542 }
1543
1544 BuildVector.legalIf(isRegisterType(0));
1545
1546 // FIXME: Clamp maximum size
1547 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1548 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1549 .clampMaxNumElements(0, S32, 32)
1550 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1551 .clampMaxNumElements(0, S16, 64);
1552
1553 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1554 // pre-legalize.
1555 if (ST.hasVOP3PInsts()) {
1556 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1557 .customFor({V2S16, V2S16})
1558 .lower();
1559 } else
1560 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1561
1562 // Merge/Unmerge
1563 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1564 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1565 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1566
1567 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1568 const LLT Ty = Query.Types[TypeIdx];
1569 if (Ty.isVector()) {
1570 const LLT &EltTy = Ty.getElementType();
1571 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1572 return true;
1573 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1574 return true;
1575 }
1576 return false;
1577 };
1578
1579 auto &Builder = getActionDefinitionsBuilder(Op)
1580 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1581 .lowerFor({{S16, V2S16}})
1582 .lowerIf([=](const LegalityQuery &Query) {
1583 const LLT BigTy = Query.Types[BigTyIdx];
1584 return BigTy.getSizeInBits() == 32;
1585 })
1586 // Try to widen to s16 first for small types.
1587 // TODO: Only do this on targets with legal s16 shifts
1588 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1589 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1590 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1591 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1592 elementTypeIs(1, S16)),
1593 changeTo(1, V2S16))
1594 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1595 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1596 // valid.
1597 .clampScalar(LitTyIdx, S32, S512)
1598 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1599 // Break up vectors with weird elements into scalars
1600 .fewerElementsIf(
1601 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1602 scalarize(0))
1603 .fewerElementsIf(
1604 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1605 scalarize(1))
1606 .clampScalar(BigTyIdx, S32, MaxScalar);
1607
1608 if (Op == G_MERGE_VALUES) {
1609 Builder.widenScalarIf(
1610 // TODO: Use 16-bit shifts if legal for 8-bit values?
1611 [=](const LegalityQuery &Query) {
1612 const LLT Ty = Query.Types[LitTyIdx];
1613 return Ty.getSizeInBits() < 32;
1614 },
1615 changeTo(LitTyIdx, S32));
1616 }
1617
1618 Builder.widenScalarIf(
1619 [=](const LegalityQuery &Query) {
1620 const LLT Ty = Query.Types[BigTyIdx];
1621 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1622 Ty.getSizeInBits() % 16 != 0;
1623 },
1624 [=](const LegalityQuery &Query) {
1625 // Pick the next power of 2, or a multiple of 64 over 128.
1626 // Whichever is smaller.
1627 const LLT &Ty = Query.Types[BigTyIdx];
1628 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1629 if (NewSizeInBits >= 256) {
1630 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1631 if (RoundedTo < NewSizeInBits)
1632 NewSizeInBits = RoundedTo;
1633 }
1634 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1635 })
1636 // Any vectors left are the wrong size. Scalarize them.
1637 .scalarize(0)
1638 .scalarize(1);
1639 }
1640
1641 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1642 // RegBankSelect.
1643 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1644 .legalFor({{S32}, {S64}});
1645
1646 if (ST.hasVOP3PInsts()) {
1647 SextInReg.lowerFor({{V2S16}})
1648 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1649 // get more vector shift opportunities, since we'll get those when
1650 // expanded.
1651 .clampMaxNumElementsStrict(0, S16, 2);
1652 } else if (ST.has16BitInsts()) {
1653 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1654 } else {
1655 // Prefer to promote to s32 before lowering if we don't have 16-bit
1656    // shifts. This avoids a lot of intermediate truncate and extend operations.
1657 SextInReg.lowerFor({{S32}, {S64}});
1658 }
1659
1660 SextInReg
1661 .scalarize(0)
1662 .clampScalar(0, S32, S64)
1663 .lower();
1664
1665 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1666 .scalarize(0)
1667 .lower();
1668
1669  // TODO: Only try to form v2s16 with legal packed instructions.
1670 getActionDefinitionsBuilder(G_FSHR)
1671 .legalFor({{S32, S32}})
1672 .lowerFor({{V2S16, V2S16}})
1673 .clampMaxNumElementsStrict(0, S16, 2)
1674 .scalarize(0)
1675 .lower();
1676
1677 if (ST.hasVOP3PInsts()) {
1678 getActionDefinitionsBuilder(G_FSHL)
1679 .lowerFor({{V2S16, V2S16}})
1680 .clampMaxNumElementsStrict(0, S16, 2)
1681 .scalarize(0)
1682 .lower();
1683 } else {
1684 getActionDefinitionsBuilder(G_FSHL)
1685 .scalarize(0)
1686 .lower();
1687 }
1688
1689 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1690 .legalFor({S64});
1691
1692 getActionDefinitionsBuilder(G_FENCE)
1693 .alwaysLegal();
1694
1695 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1696 .scalarize(0)
1697 .minScalar(0, S32)
1698 .lower();
1699
1700 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1701 .legalFor({{S32, S32}, {S64, S32}})
1702 .clampScalar(1, S32, S32)
1703 .clampScalar(0, S32, S64)
1704 .widenScalarToNextPow2(0)
1705 .scalarize(0);
1706
1707 getActionDefinitionsBuilder({
1708 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1709 G_FCOPYSIGN,
1710
1711 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1712 G_ATOMICRMW_NAND,
1713 G_ATOMICRMW_FSUB,
1714 G_READ_REGISTER,
1715 G_WRITE_REGISTER,
1716
1717 G_SADDO, G_SSUBO,
1718
1719 // TODO: Implement
1720 G_FMINIMUM, G_FMAXIMUM}).lower();
1721
1722 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1723 .lower();
1724
1725 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1726 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1727 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1728 .unsupported();
1729
1730 getLegacyLegalizerInfo().computeTables();
1731 verify(*ST.getInstrInfo());
1732}
1733
1734bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1735 MachineInstr &MI) const {
1736 MachineIRBuilder &B = Helper.MIRBuilder;
1737 MachineRegisterInfo &MRI = *B.getMRI();
1738
1739 switch (MI.getOpcode()) {
1740 case TargetOpcode::G_ADDRSPACE_CAST:
1741 return legalizeAddrSpaceCast(MI, MRI, B);
1742 case TargetOpcode::G_FRINT:
1743 return legalizeFrint(MI, MRI, B);
1744 case TargetOpcode::G_FCEIL:
1745 return legalizeFceil(MI, MRI, B);
1746 case TargetOpcode::G_FREM:
1747 return legalizeFrem(MI, MRI, B);
1748 case TargetOpcode::G_INTRINSIC_TRUNC:
1749 return legalizeIntrinsicTrunc(MI, MRI, B);
1750 case TargetOpcode::G_SITOFP:
1751 return legalizeITOFP(MI, MRI, B, true);
1752 case TargetOpcode::G_UITOFP:
1753 return legalizeITOFP(MI, MRI, B, false);
1754 case TargetOpcode::G_FPTOSI:
1755 return legalizeFPTOI(MI, MRI, B, true);
1756 case TargetOpcode::G_FPTOUI:
1757 return legalizeFPTOI(MI, MRI, B, false);
1758 case TargetOpcode::G_FMINNUM:
1759 case TargetOpcode::G_FMAXNUM:
1760 case TargetOpcode::G_FMINNUM_IEEE:
1761 case TargetOpcode::G_FMAXNUM_IEEE:
1762 return legalizeMinNumMaxNum(Helper, MI);
1763 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1764 return legalizeExtractVectorElt(MI, MRI, B);
1765 case TargetOpcode::G_INSERT_VECTOR_ELT:
1766 return legalizeInsertVectorElt(MI, MRI, B);
1767 case TargetOpcode::G_SHUFFLE_VECTOR:
1768 return legalizeShuffleVector(MI, MRI, B);
1769 case TargetOpcode::G_FSIN:
1770 case TargetOpcode::G_FCOS:
1771 return legalizeSinCos(MI, MRI, B);
1772 case TargetOpcode::G_GLOBAL_VALUE:
1773 return legalizeGlobalValue(MI, MRI, B);
1774 case TargetOpcode::G_LOAD:
1775 case TargetOpcode::G_SEXTLOAD:
1776 case TargetOpcode::G_ZEXTLOAD:
1777 return legalizeLoad(Helper, MI);
1778 case TargetOpcode::G_FMAD:
1779 return legalizeFMad(MI, MRI, B);
1780 case TargetOpcode::G_FDIV:
1781 return legalizeFDIV(MI, MRI, B);
1782 case TargetOpcode::G_UDIV:
1783 case TargetOpcode::G_UREM:
1784 case TargetOpcode::G_UDIVREM:
1785 return legalizeUnsignedDIV_REM(MI, MRI, B);
1786 case TargetOpcode::G_SDIV:
1787 case TargetOpcode::G_SREM:
1788 case TargetOpcode::G_SDIVREM:
1789 return legalizeSignedDIV_REM(MI, MRI, B);
1790 case TargetOpcode::G_ATOMIC_CMPXCHG:
1791 return legalizeAtomicCmpXChg(MI, MRI, B);
1792 case TargetOpcode::G_FLOG:
1793 return legalizeFlog(MI, B, numbers::ln2f);
1794 case TargetOpcode::G_FLOG10:
1795 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1796 case TargetOpcode::G_FEXP:
1797 return legalizeFExp(MI, B);
1798 case TargetOpcode::G_FPOW:
1799 return legalizeFPow(MI, B);
1800 case TargetOpcode::G_FFLOOR:
1801 return legalizeFFloor(MI, MRI, B);
1802 case TargetOpcode::G_BUILD_VECTOR:
1803 return legalizeBuildVector(MI, MRI, B);
1804 case TargetOpcode::G_MUL:
1805 return legalizeMul(Helper, MI);
1806 case TargetOpcode::G_CTLZ:
1807 case TargetOpcode::G_CTTZ:
1808 return legalizeCTLZ_CTTZ(MI, MRI, B);
1809 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1810 return legalizeFPTruncRound(MI, B);
1811 default:
1812 return false;
1813 }
1814
1815  llvm_unreachable("expected switch to return");
1816}
1817
1818Register AMDGPULegalizerInfo::getSegmentAperture(
1819 unsigned AS,
1820 MachineRegisterInfo &MRI,
1821 MachineIRBuilder &B) const {
1822 MachineFunction &MF = B.getMF();
1823 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1824 const LLT S32 = LLT::scalar(32);
1825
1826  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1827
1828 if (ST.hasApertureRegs()) {
1829 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1830 // getreg.
1831 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1832 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1833 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1834 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1835 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1836 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1837 unsigned Encoding =
1838 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1839 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1840 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1841
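    // The immediate packs the MEM_BASES hwreg ID together with the bit offset
    // and (width - 1) of the aperture field; after S_GETREG_B32 extracts that
    // field, the shift left by WidthM1 + 1 below places the aperture base in
    // the upper bits of the 32-bit result.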
1842 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1843
1844 B.buildInstr(AMDGPU::S_GETREG_B32)
1845 .addDef(GetReg)
1846 .addImm(Encoding);
1847 MRI.setType(GetReg, S32);
1848
1849 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1850 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1851 }
1852
1853 // TODO: can we be smarter about machine pointer info?
1854 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1855 Register LoadAddr = MRI.createGenericVirtualRegister(
1856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1857 // For code object version 5, private_base and shared_base are passed through
1858 // implicit kernargs.
1859 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1860 AMDGPUTargetLowering::ImplicitParameter Param =
1861 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1862 : AMDGPUTargetLowering::PRIVATE_BASE;
1863 uint64_t Offset =
1864 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1865
1866 Register KernargPtrReg = MRI.createGenericVirtualRegister(
1867 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1868
1869 if (!loadInputValue(KernargPtrReg, B,
1870 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1871 return Register();
1872
1873 MachineMemOperand *MMO = MF.getMachineMemOperand(
1874 PtrInfo,
1875 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1876 MachineMemOperand::MOInvariant,
1877 LLT::scalar(32), commonAlignment(Align(64), Offset));
1878
1879 // Pointer address
1880 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1881 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1882 // Load address
1883 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1884 }
1885
1886 Register QueuePtr = MRI.createGenericVirtualRegister(
1887 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1888
1889 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1890 return Register();
1891
1892 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1893 // private_segment_aperture_base_hi.
1894 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1895
1896 MachineMemOperand *MMO = MF.getMachineMemOperand(
1897 PtrInfo,
1898 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1899 MachineMemOperand::MOInvariant,
1900 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1901
1902 B.buildPtrAdd(LoadAddr, QueuePtr,
1903 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1904 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1905}
1906
1907/// Return true if the value is a known valid address, such that a null check is
1908/// not necessary.
1909static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1910 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1911 MachineInstr *Def = MRI.getVRegDef(Val);
1912 switch (Def->getOpcode()) {
1913 case AMDGPU::G_FRAME_INDEX:
1914 case AMDGPU::G_GLOBAL_VALUE:
1915 case AMDGPU::G_BLOCK_ADDR:
1916 return true;
1917 case AMDGPU::G_CONSTANT: {
1918 const ConstantInt *CI = Def->getOperand(1).getCImm();
1919 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1920 }
1921 default:
1922 return false;
1923 }
1924
1925 return false;
1926}
1927
1928bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1929 MachineInstr &MI, MachineRegisterInfo &MRI,
1930 MachineIRBuilder &B) const {
1931 MachineFunction &MF = B.getMF();
1932
1933 const LLT S32 = LLT::scalar(32);
1934 Register Dst = MI.getOperand(0).getReg();
1935 Register Src = MI.getOperand(1).getReg();
1936
1937 LLT DstTy = MRI.getType(Dst);
1938 LLT SrcTy = MRI.getType(Src);
1939 unsigned DestAS = DstTy.getAddressSpace();
1940 unsigned SrcAS = SrcTy.getAddressSpace();
1941
1942 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1943 // vector element.
1944  assert(!DstTy.isVector());
1945
1946 const AMDGPUTargetMachine &TM
1947 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1948
1949 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1950 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1951 return true;
1952 }
1953
1954 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1955 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1956 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1958 // Extract low 32-bits of the pointer.
1959 B.buildExtract(Dst, Src, 0);
1960 MI.eraseFromParent();
1961 return true;
1962 }
1963
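    // A null flat pointer must map to the destination segment's null value,
    // so compare the source against flat null and select the segment null
    // when it matches.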
1964 unsigned NullVal = TM.getNullPointerValue(DestAS);
1965
1966 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1967 auto FlatNull = B.buildConstant(SrcTy, 0);
1968
1969 // Extract low 32-bits of the pointer.
1970 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1971
1972 auto CmpRes =
1973 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1974 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1975
1976 MI.eraseFromParent();
1977 return true;
1978 }
1979
1980 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1981 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1982 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1983 if (!ST.hasFlatAddressSpace())
1984 return false;
1985
1986 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1987 if (!ApertureReg.isValid())
1988 return false;
1989
1990 // Coerce the type of the low half of the result so we can use merge_values.
1991 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1992
1993 // TODO: Should we allow mismatched types but matching sizes in merges to
1994 // avoid the ptrtoint?
1995 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1996
1997 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1998 B.buildCopy(Dst, BuildPtr);
1999 MI.eraseFromParent();
2000 return true;
2001 }
2002
2003 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2004 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2005
2006 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2007 SegmentNull.getReg(0));
2008
2009 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2010
2011 MI.eraseFromParent();
2012 return true;
2013 }
2014
2015 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2016 SrcTy.getSizeInBits() == 64) {
2017 // Truncate.
2018 B.buildExtract(Dst, Src, 0);
2019 MI.eraseFromParent();
2020 return true;
2021 }
2022
2023 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2024 DstTy.getSizeInBits() == 64) {
2025 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2026 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2027
2028 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
2029 // another. Merge operands are required to be the same type, but creating an
2030 // extra ptrtoint would be kind of pointless.
2031 auto HighAddr = B.buildConstant(
2032 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
2033 B.buildMerge(Dst, {Src, HighAddr});
2034 MI.eraseFromParent();
2035 return true;
2036 }
2037
2038 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2039 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2040
2041 LLVMContext &Ctx = MF.getFunction().getContext();
2042 Ctx.diagnose(InvalidAddrSpaceCast);
2043 B.buildUndef(Dst);
2044 MI.eraseFromParent();
2045 return true;
2046}
2047
2048bool AMDGPULegalizerInfo::legalizeFrint(
2049 MachineInstr &MI, MachineRegisterInfo &MRI,
2050 MachineIRBuilder &B) const {
2051 Register Src = MI.getOperand(1).getReg();
2052 LLT Ty = MRI.getType(Src);
2053  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2054
2055 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2056 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
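  // C1 is 2^52: adding and then subtracting copysign(2^52, Src) rounds Src to
  // the nearest integer, since doubles of that magnitude have no fractional
  // bits. C2 is the largest double below 2^52; inputs with |Src| > C2 are
  // already integral, so the final select passes them through unchanged.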
2057
2058 auto C1 = B.buildFConstant(Ty, C1Val);
2059 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2060
2061 // TODO: Should this propagate fast-math-flags?
2062 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2063 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2064
2065 auto C2 = B.buildFConstant(Ty, C2Val);
2066 auto Fabs = B.buildFAbs(Ty, Src);
2067
2068 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2069 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2070 MI.eraseFromParent();
2071 return true;
2072}
2073
2074bool AMDGPULegalizerInfo::legalizeFceil(
2075 MachineInstr &MI, MachineRegisterInfo &MRI,
2076 MachineIRBuilder &B) const {
2077
2078 const LLT S1 = LLT::scalar(1);
2079 const LLT S64 = LLT::scalar(64);
2080
2081 Register Src = MI.getOperand(1).getReg();
2082  assert(MRI.getType(Src) == S64);
2083
2084 // result = trunc(src)
2085 // if (src > 0.0 && src != result)
2086 // result += 1.0
2087
2088 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2089
2090 const auto Zero = B.buildFConstant(S64, 0.0);
2091 const auto One = B.buildFConstant(S64, 1.0);
2092 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2093 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2094 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2095 auto Add = B.buildSelect(S64, And, One, Zero);
2096
2097 // TODO: Should this propagate fast-math-flags?
2098 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2099 MI.eraseFromParent();
2100 return true;
2101}
2102
2103bool AMDGPULegalizerInfo::legalizeFrem(
2104 MachineInstr &MI, MachineRegisterInfo &MRI,
2105 MachineIRBuilder &B) const {
2106 Register DstReg = MI.getOperand(0).getReg();
2107 Register Src0Reg = MI.getOperand(1).getReg();
2108 Register Src1Reg = MI.getOperand(2).getReg();
2109 auto Flags = MI.getFlags();
2110 LLT Ty = MRI.getType(DstReg);
2111
2112 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2113 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2114 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2115 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2116 MI.eraseFromParent();
2117 return true;
2118}
2119
2120static MachineInstrBuilder extractF64Exponent(Register Hi,
2121 MachineIRBuilder &B) {
2122 const unsigned FractBits = 52;
2123 const unsigned ExpBits = 11;
2124 LLT S32 = LLT::scalar(32);
2125
2126 auto Const0 = B.buildConstant(S32, FractBits - 32);
2127 auto Const1 = B.buildConstant(S32, ExpBits);
2128
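  // amdgcn_ubfe extracts ExpBits (11) bits starting at bit FractBits - 32 (20)
  // of the high word, i.e. bits [62:52] of the original f64, and the final
  // subtract removes the IEEE-754 exponent bias of 1023.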
2129 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2130 .addUse(Hi)
2131 .addUse(Const0.getReg(0))
2132 .addUse(Const1.getReg(0));
2133
2134 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2135}
2136
2137bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2138 MachineInstr &MI, MachineRegisterInfo &MRI,
2139 MachineIRBuilder &B) const {
2140 const LLT S1 = LLT::scalar(1);
2141 const LLT S32 = LLT::scalar(32);
2142 const LLT S64 = LLT::scalar(64);
2143
2144 Register Src = MI.getOperand(1).getReg();
2145  assert(MRI.getType(Src) == S64);
2146
2147 // TODO: Should this use extract since the low half is unused?
2148 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2149 Register Hi = Unmerge.getReg(1);
2150
2151 // Extract the upper half, since this is where we will find the sign and
2152 // exponent.
2153 auto Exp = extractF64Exponent(Hi, B);
2154
2155 const unsigned FractBits = 52;
2156
2157 // Extract the sign bit.
2158  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2159 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2160
2161  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2162
2163 const auto Zero32 = B.buildConstant(S32, 0);
2164
2165 // Extend back to 64-bits.
2166 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2167
2168 auto Shr = B.buildAShr(S64, FractMask, Exp);
2169 auto Not = B.buildNot(S64, Shr);
2170 auto Tmp0 = B.buildAnd(S64, Src, Not);
2171 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2172
2173 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2174 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2175
2176 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2177 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2178 MI.eraseFromParent();
2179 return true;
2180}
2181
2182bool AMDGPULegalizerInfo::legalizeITOFP(
2183 MachineInstr &MI, MachineRegisterInfo &MRI,
2184 MachineIRBuilder &B, bool Signed) const {
2185
2186 Register Dst = MI.getOperand(0).getReg();
2187 Register Src = MI.getOperand(1).getReg();
2188
2189 const LLT S64 = LLT::scalar(64);
2190 const LLT S32 = LLT::scalar(32);
2191
2192  assert(MRI.getType(Src) == S64);
2193
2194 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2195 auto ThirtyTwo = B.buildConstant(S32, 32);
2196
2197 if (MRI.getType(Dst) == S64) {
2198 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2199 : B.buildUITOFP(S64, Unmerge.getReg(1));
2200
2201 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2202 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2203 .addUse(CvtHi.getReg(0))
2204 .addUse(ThirtyTwo.getReg(0));
2205
2206 // TODO: Should this propagate fast-math-flags?
2207 B.buildFAdd(Dst, LdExp, CvtLo);
2208 MI.eraseFromParent();
2209 return true;
2210 }
2211
2212  assert(MRI.getType(Dst) == S32);
2213
2214 auto One = B.buildConstant(S32, 1);
2215
2216 MachineInstrBuilder ShAmt;
2217 if (Signed) {
2218 auto ThirtyOne = B.buildConstant(S32, 31);
2219 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2220 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2221 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2222 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2223 /*HasSideEffects=*/false)
2224 .addUse(Unmerge.getReg(1));
2225 auto LS2 = B.buildSub(S32, LS, One);
2226 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2227 } else
2228 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
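  // Normalize: shift left by ShAmt so the significant bits land in the upper
  // 32 bits, fold any nonzero low bits into bit 0 as a sticky bit for correct
  // rounding, convert that 32-bit value, and rescale with ldexp by 32 - ShAmt.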
2229 auto Norm = B.buildShl(S64, Src, ShAmt);
2230 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2231 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2232 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2233 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2234 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2235 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2236 /*HasSideEffects=*/false)
2237 .addUse(FVal.getReg(0))
2238 .addUse(Scale.getReg(0));
2239 MI.eraseFromParent();
2240 return true;
2241}
2242
2243// TODO: Copied from DAG implementation. Verify logic and document how this
2244// actually works.
2245bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2246 MachineRegisterInfo &MRI,
2247 MachineIRBuilder &B,
2248 bool Signed) const {
2249
2250 Register Dst = MI.getOperand(0).getReg();
2251 Register Src = MI.getOperand(1).getReg();
2252
2253 const LLT S64 = LLT::scalar(64);
2254 const LLT S32 = LLT::scalar(32);
2255
2256 const LLT SrcLT = MRI.getType(Src);
2257  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2258
2259 unsigned Flags = MI.getFlags();
2260
2261 // The basic idea of converting a floating point number into a pair of 32-bit
2262 // integers is illustrated as follows:
2263 //
2264 // tf := trunc(val);
2265 // hif := floor(tf * 2^-32);
2266 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2267 // hi := fptoi(hif);
2268 // lo := fptoi(lof);
2269 //
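  // For example, val = 2^33 + 7 gives tf = 2^33 + 7, hif = 2, lof = 7, so the
  // final result is the pair {lo, hi} = {7, 2}, i.e. 0x0000000200000007.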
2270 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2271 MachineInstrBuilder Sign;
2272 if (Signed && SrcLT == S32) {
2273 // However, a 32-bit floating point number has only 23 bits mantissa and
2274 // it's not enough to hold all the significant bits of `lof` if val is
2275    // negative. To avoid the loss of precision, we need to take the absolute
2276 // value after truncating and flip the result back based on the original
2277 // signedness.
2278 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2279 Trunc = B.buildFAbs(S32, Trunc, Flags);
2280 }
2281 MachineInstrBuilder K0, K1;
2282 if (SrcLT == S64) {
2283    K0 = B.buildFConstant(S64,
2284                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2285    K1 = B.buildFConstant(S64,
2286                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2287  } else {
2288    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2289    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2290 }
2291
2292 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2293 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2294 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2295
2296 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2297 : B.buildFPTOUI(S32, FloorMul);
2298 auto Lo = B.buildFPTOUI(S32, Fma);
2299
2300 if (Signed && SrcLT == S32) {
2301 // Flip the result based on the signedness, which is either all 0s or 1s.
2302 Sign = B.buildMerge(S64, {Sign, Sign});
2303 // r := xor({lo, hi}, sign) - sign;
2304 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2305 } else
2306 B.buildMerge(Dst, {Lo, Hi});
2307 MI.eraseFromParent();
2308
2309 return true;
2310}
2311
2312bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2313 MachineInstr &MI) const {
2314 MachineFunction &MF = Helper.MIRBuilder.getMF();
2315 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2316
2317 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2318 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2319
2320 // With ieee_mode disabled, the instructions have the correct behavior
2321 // already for G_FMINNUM/G_FMAXNUM
2322 if (!MFI->getMode().IEEE)
2323 return !IsIEEEOp;
2324
2325 if (IsIEEEOp)
2326 return true;
2327
2328 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2329}
2330
2331bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2332 MachineInstr &MI, MachineRegisterInfo &MRI,
2333 MachineIRBuilder &B) const {
2334 // TODO: Should move some of this into LegalizerHelper.
2335
2336 // TODO: Promote dynamic indexing of s16 to s32
2337
2338 // FIXME: Artifact combiner probably should have replaced the truncated
2339 // constant before this, so we shouldn't need
2340 // getIConstantVRegValWithLookThrough.
2341 Optional<ValueAndVReg> MaybeIdxVal =
2342 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2343 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2344 return true;
2345 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2346
2347 Register Dst = MI.getOperand(0).getReg();
2348 Register Vec = MI.getOperand(1).getReg();
2349
2350 LLT VecTy = MRI.getType(Vec);
2351 LLT EltTy = VecTy.getElementType();
2352  assert(EltTy == MRI.getType(Dst));
2353
2354 if (IdxVal < VecTy.getNumElements()) {
2355 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2356 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2357 } else {
2358 B.buildUndef(Dst);
2359 }
2360
2361 MI.eraseFromParent();
2362 return true;
2363}
2364
2365bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2366 MachineInstr &MI, MachineRegisterInfo &MRI,
2367 MachineIRBuilder &B) const {
2368 // TODO: Should move some of this into LegalizerHelper.
2369
2370 // TODO: Promote dynamic indexing of s16 to s32
2371
2372 // FIXME: Artifact combiner probably should have replaced the truncated
2373 // constant before this, so we shouldn't need
2374 // getIConstantVRegValWithLookThrough.
2375 Optional<ValueAndVReg> MaybeIdxVal =
2376 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2377 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2378 return true;
2379
2380 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2381 Register Dst = MI.getOperand(0).getReg();
2382 Register Vec = MI.getOperand(1).getReg();
2383 Register Ins = MI.getOperand(2).getReg();
2384
2385 LLT VecTy = MRI.getType(Vec);
2386 LLT EltTy = VecTy.getElementType();
2387  assert(EltTy == MRI.getType(Ins));
2388 (void)Ins;
2389
2390 unsigned NumElts = VecTy.getNumElements();
2391 if (IdxVal < NumElts) {
2392 SmallVector<Register, 8> SrcRegs;
2393 for (unsigned i = 0; i < NumElts; ++i)
2394 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2395 B.buildUnmerge(SrcRegs, Vec);
2396
2397 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2398 B.buildMerge(Dst, SrcRegs);
2399 } else {
2400 B.buildUndef(Dst);
2401 }
2402
2403 MI.eraseFromParent();
2404 return true;
2405}
2406
2407bool AMDGPULegalizerInfo::legalizeShuffleVector(
2408 MachineInstr &MI, MachineRegisterInfo &MRI,
2409 MachineIRBuilder &B) const {
2410 const LLT V2S16 = LLT::fixed_vector(2, 16);
2411
2412 Register Dst = MI.getOperand(0).getReg();
2413 Register Src0 = MI.getOperand(1).getReg();
2414 LLT DstTy = MRI.getType(Dst);
2415 LLT SrcTy = MRI.getType(Src0);
2416
2417 if (SrcTy == V2S16 && DstTy == V2S16 &&
2418 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2419 return true;
2420
2421 MachineIRBuilder HelperBuilder(MI);
2422 GISelObserverWrapper DummyObserver;
2423 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2424 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2425}
2426
2427bool AMDGPULegalizerInfo::legalizeSinCos(
2428 MachineInstr &MI, MachineRegisterInfo &MRI,
2429 MachineIRBuilder &B) const {
2430
2431 Register DstReg = MI.getOperand(0).getReg();
2432 Register SrcReg = MI.getOperand(1).getReg();
2433 LLT Ty = MRI.getType(DstReg);
2434 unsigned Flags = MI.getFlags();
2435
2436 Register TrigVal;
2437 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2438 if (ST.hasTrigReducedRange()) {
2439 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2440 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2441 .addUse(MulVal.getReg(0))
2442 .setMIFlags(Flags).getReg(0);
2443 } else
2444 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2445
2446 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2447 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2448 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2449 .addUse(TrigVal)
2450 .setMIFlags(Flags);
2451 MI.eraseFromParent();
2452 return true;
2453}
2454
2455bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2456 MachineIRBuilder &B,
2457 const GlobalValue *GV,
2458 int64_t Offset,
2459 unsigned GAFlags) const {
2460  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2461 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2462 // to the following code sequence:
2463 //
2464 // For constant address space:
2465 // s_getpc_b64 s[0:1]
2466 // s_add_u32 s0, s0, $symbol
2467 // s_addc_u32 s1, s1, 0
2468 //
2469 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2470 // a fixup or relocation is emitted to replace $symbol with a literal
2471 // constant, which is a pc-relative offset from the encoding of the $symbol
2472 // operand to the global variable.
2473 //
2474 // For global address space:
2475 // s_getpc_b64 s[0:1]
2476 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2477 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2478 //
2479 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2480 // fixups or relocations are emitted to replace $symbol@*@lo and
2481 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2482 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2483 // operand to the global variable.
2484 //
2485 // What we want here is an offset from the value returned by s_getpc
2486 // (which is the address of the s_add_u32 instruction) to the global
2487 // variable, but since the encoding of $symbol starts 4 bytes after the start
2488 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2489 // small. This requires us to add 4 to the global variable offset in order to
2490 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2491 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2492 // instruction.
2493
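  // Concretely, the pseudo built below carries the symbol twice: as
  // $symbol + Offset + 4 for the s_add_u32 fixup and, for the GOT/REL32
  // cases, as $symbol + Offset + 12 for the s_addc_u32 fixup.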
2494 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2495
2496 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2497 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2498
2499 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2500 .addDef(PCReg);
2501
2502 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2503 if (GAFlags == SIInstrInfo::MO_NONE)
2504 MIB.addImm(0);
2505 else
2506 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2507
2508 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2509
2510 if (PtrTy.getSizeInBits() == 32)
2511 B.buildExtract(DstReg, PCReg, 0);
2512 return true;
2513 }
2514
2515bool AMDGPULegalizerInfo::legalizeGlobalValue(
2516 MachineInstr &MI, MachineRegisterInfo &MRI,
2517 MachineIRBuilder &B) const {
2518 Register DstReg = MI.getOperand(0).getReg();
2519 LLT Ty = MRI.getType(DstReg);
2520 unsigned AS = Ty.getAddressSpace();
2521
2522 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2523 MachineFunction &MF = B.getMF();
2524 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2525
2526 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2527 if (!MFI->isModuleEntryFunction() &&
2528 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2529 const Function &Fn = MF.getFunction();
2530 DiagnosticInfoUnsupported BadLDSDecl(
2531 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2532 DS_Warning);
2533 Fn.getContext().diagnose(BadLDSDecl);
2534
2535 // We currently don't have a way to correctly allocate LDS objects that
2536 // aren't directly associated with a kernel. We do force inlining of
2537 // functions that use local objects. However, if these dead functions are
2538 // not eliminated, we don't want a compile time error. Just emit a warning
2539 // and a trap, since there should be no callable path here.
2540 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2541 B.buildUndef(DstReg);
2542 MI.eraseFromParent();
2543 return true;
2544 }
2545
2546 // TODO: We could emit code to handle the initialization somewhere.
2547 // We ignore the initializer for now and legalize it to allow selection.
2548    // The initializer will be rejected during assembly emission anyway.
2549 const SITargetLowering *TLI = ST.getTargetLowering();
2550 if (!TLI->shouldUseLDSConstAddress(GV)) {
2551 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2552 return true; // Leave in place;
2553 }
2554
2555 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2556 Type *Ty = GV->getValueType();
2557 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2558 // zero-sized type in other languages to declare the dynamic shared
2559      // memory whose size is not known at compile time. These arrays will be
2560      // allocated by the runtime and placed directly after the statically
2561      // allocated ones. They all share the same offset.
2562 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2563 // Adjust alignment for that dynamic shared memory array.
2564 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2565 LLT S32 = LLT::scalar(32);
2566 auto Sz =
2567 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2568 B.buildIntToPtr(DstReg, Sz);
2569 MI.eraseFromParent();
2570 return true;
2571 }
2572 }
2573
2574 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2575 *cast<GlobalVariable>(GV)));
2576 MI.eraseFromParent();
2577 return true;
2578 }
2579
2580 const SITargetLowering *TLI = ST.getTargetLowering();
2581
2582 if (TLI->shouldEmitFixup(GV)) {
2583 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2584 MI.eraseFromParent();
2585 return true;
2586 }
2587
2588 if (TLI->shouldEmitPCReloc(GV)) {
2589 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2590 MI.eraseFromParent();
2591 return true;
2592 }
2593
2594 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2595 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2596
2597 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2598 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2599 MachinePointerInfo::getGOT(MF),
2600 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2601 MachineMemOperand::MOInvariant,
2602 LoadTy, Align(8));
2603
2604 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2605
2606 if (Ty.getSizeInBits() == 32) {
2607 // Truncate if this is a 32-bit constant address.
2608 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2609 B.buildExtract(DstReg, Load, 0);
2610 } else
2611 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2612
2613 MI.eraseFromParent();
2614 return true;
2615}
2616
2617static LLT widenToNextPowerOf2(LLT Ty) {
2618 if (Ty.isVector())
2619 return Ty.changeElementCount(
2620 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2621 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2622}
2623
2624bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2625 MachineInstr &MI) const {
2626 MachineIRBuilder &B = Helper.MIRBuilder;
2627 MachineRegisterInfo &MRI = *B.getMRI();
2628 GISelChangeObserver &Observer = Helper.Observer;
2629
2630 Register PtrReg = MI.getOperand(1).getReg();
2631 LLT PtrTy = MRI.getType(PtrReg);
2632 unsigned AddrSpace = PtrTy.getAddressSpace();
2633
2634 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2635 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2636 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2637 Observer.changingInstr(MI);
2638 MI.getOperand(1).setReg(Cast.getReg(0));
2639 Observer.changedInstr(MI);
2640 return true;
2641 }
2642
2643 if (MI.getOpcode() != AMDGPU::G_LOAD)
2644 return false;
2645
2646 Register ValReg = MI.getOperand(0).getReg();
2647 LLT ValTy = MRI.getType(ValReg);
2648
2649 MachineMemOperand *MMO = *MI.memoperands_begin();
2650 const unsigned ValSize = ValTy.getSizeInBits();
2651 const LLT MemTy = MMO->getMemoryType();
2652 const Align MemAlign = MMO->getAlign();
2653 const unsigned MemSize = MemTy.getSizeInBits();
2654 const uint64_t AlignInBits = 8 * MemAlign.value();
2655
2656 // Widen non-power-of-2 loads to the alignment if needed
2657 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2658 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2659
2660 // This was already the correct extending load result type, so just adjust
2661 // the memory type.
2662 if (WideMemSize == ValSize) {
2663 MachineFunction &MF = B.getMF();
2664
2665 MachineMemOperand *WideMMO =
2666 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2667 Observer.changingInstr(MI);
2668 MI.setMemRefs(MF, {WideMMO});
2669 Observer.changedInstr(MI);
2670 return true;
2671 }
2672
2673    // Don't bother handling an edge case that should probably never be produced.
2674 if (ValSize > WideMemSize)
2675 return false;
2676
2677 LLT WideTy = widenToNextPowerOf2(ValTy);
2678
2679 Register WideLoad;
2680 if (!WideTy.isVector()) {
2681 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2682 B.buildTrunc(ValReg, WideLoad).getReg(0);
2683 } else {
2684 // Extract the subvector.
2685
2686 if (isRegisterType(ValTy)) {
2687 // If this a case where G_EXTRACT is legal, use it.
2688 // (e.g. <3 x s32> -> <4 x s32>)
2689 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2690 B.buildExtract(ValReg, WideLoad, 0);
2691 } else {
2692 // For cases where the widened type isn't a nice register value, unmerge
2693 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2694 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2695 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2696 }
2697 }
2698
2699 MI.eraseFromParent();
2700 return true;
2701 }
2702
2703 return false;
2704}
2705
2706bool AMDGPULegalizerInfo::legalizeFMad(
2707 MachineInstr &MI, MachineRegisterInfo &MRI,
2708 MachineIRBuilder &B) const {
2709 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2710  assert(Ty.isScalar());
2711
2712 MachineFunction &MF = B.getMF();
2713 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2714
2715 // TODO: Always legal with future ftz flag.
2716 // FIXME: Do we need just output?
2717 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2718 return true;
2719 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2720 return true;
2721
2722 MachineIRBuilder HelperBuilder(MI);
2723 GISelObserverWrapper DummyObserver;
2724 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2725 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2726}
2727
2728bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2729 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2730 Register DstReg = MI.getOperand(0).getReg();
2731 Register PtrReg = MI.getOperand(1).getReg();
2732 Register CmpVal = MI.getOperand(2).getReg();
2733 Register NewVal = MI.getOperand(3).getReg();
2734
2735  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2736         "this should not have been custom lowered");
2737
2738 LLT ValTy = MRI.getType(CmpVal);
2739 LLT VecTy = LLT::fixed_vector(2, ValTy);
2740
2741 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2742
2743 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2744 .addDef(DstReg)
2745 .addUse(PtrReg)
2746 .addUse(PackedVal)
2747 .setMemRefs(MI.memoperands());
2748
2749 MI.eraseFromParent();
2750 return true;
2751}
2752
2753bool AMDGPULegalizerInfo::legalizeFlog(
2754 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2755 Register Dst = MI.getOperand(0).getReg();
2756 Register Src = MI.getOperand(1).getReg();
2757 LLT Ty = B.getMRI()->getType(Dst);
2758 unsigned Flags = MI.getFlags();
2759
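  // log_b(x) = log2(x) * Log2BaseInverted; callers pass ln(2) for G_FLOG and
  // ln(2)/ln(10) for G_FLOG10.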
2760 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2761 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2762
2763 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2764 MI.eraseFromParent();
2765 return true;
2766}
2767
2768bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2769 MachineIRBuilder &B) const {
2770 Register Dst = MI.getOperand(0).getReg();
2771 Register Src = MI.getOperand(1).getReg();
2772 unsigned Flags = MI.getFlags();
2773 LLT Ty = B.getMRI()->getType(Dst);
2774
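  // exp(x) = exp2(x * log2(e)).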
2775 auto K = B.buildFConstant(Ty, numbers::log2e);
2776 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2777 B.buildFExp2(Dst, Mul, Flags);
2778 MI.eraseFromParent();
2779 return true;
2780}
2781
2782bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2783 MachineIRBuilder &B) const {
2784 Register Dst = MI.getOperand(0).getReg();
2785 Register Src0 = MI.getOperand(1).getReg();
2786 Register Src1 = MI.getOperand(2).getReg();
2787 unsigned Flags = MI.getFlags();
2788 LLT Ty = B.getMRI()->getType(Dst);
2789 const LLT S16 = LLT::scalar(16);
2790 const LLT S32 = LLT::scalar(32);
2791
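  // pow(x, y) is expanded as exp2(log2(x) * y).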
2792 if (Ty == S32) {
2793 auto Log = B.buildFLog2(S32, Src0, Flags);
2794 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2795 .addUse(Log.getReg(0))
2796 .addUse(Src1)
2797 .setMIFlags(Flags);
2798 B.buildFExp2(Dst, Mul, Flags);
2799 } else if (Ty == S16) {
2800 // There's no f16 fmul_legacy, so we need to convert for it.
2801 auto Log = B.buildFLog2(S16, Src0, Flags);
2802 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2803 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2804 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2805 .addUse(Ext0.getReg(0))
2806 .addUse(Ext1.getReg(0))
2807 .setMIFlags(Flags);
2808
2809 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2810 } else
2811 return false;
2812
2813 MI.eraseFromParent();
2814 return true;
2815}
2816
2817// Find a source register, ignoring any possible source modifiers.
2818static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2819 Register ModSrc = OrigSrc;
2820 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2821 ModSrc = SrcFNeg->getOperand(1).getReg();
2822 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2823 ModSrc = SrcFAbs->getOperand(1).getReg();
2824 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2825 ModSrc = SrcFAbs->getOperand(1).getReg();
2826 return ModSrc;
2827}
2828
2829bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2830 MachineRegisterInfo &MRI,
2831 MachineIRBuilder &B) const {
2832
2833 const LLT S1 = LLT::scalar(1);
2834 const LLT S64 = LLT::scalar(64);
2835 Register Dst = MI.getOperand(0).getReg();
2836 Register OrigSrc = MI.getOperand(1).getReg();
2837 unsigned Flags = MI.getFlags();
2838  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2839         "this should not have been custom lowered");
2840
2841 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2842 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2843 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2844 // V_FRACT bug is:
2845 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2846 //
2847 // Convert floor(x) to (x - fract(x))
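  // For example, x = -1.25: fract(-1.25) = 0.75, and -1.25 - 0.75 = -2.0,
  // which is floor(-1.25).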
2848
2849 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2850 .addUse(OrigSrc)
2851 .setMIFlags(Flags);
2852
2853 // Give source modifier matching some assistance before obscuring a foldable
2854 // pattern.
2855
2856 // TODO: We can avoid the neg on the fract? The input sign to fract
2857 // shouldn't matter?
2858 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2859
2860 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2861
2862 Register Min = MRI.createGenericVirtualRegister(S64);
2863
2864 // We don't need to concern ourselves with the snan handling difference, so
2865 // use the one which will directly select.
2866 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2867 if (MFI->getMode().IEEE)
2868 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2869 else
2870 B.buildFMinNum(Min, Fract, Const, Flags);
2871
2872 Register CorrectedFract = Min;
2873 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2874 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2875 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2876 }
2877
2878 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2879 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2880
2881 MI.eraseFromParent();
2882 return true;
2883}
2884
2885// Turn an illegal packed v2s16 build vector into bit operations.
2886// TODO: This should probably be a bitcast action in LegalizerHelper.
2887bool AMDGPULegalizerInfo::legalizeBuildVector(
2888 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2889 Register Dst = MI.getOperand(0).getReg();
2890 const LLT S32 = LLT::scalar(32);
2891  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2892
2893 Register Src0 = MI.getOperand(1).getReg();
2894 Register Src1 = MI.getOperand(2).getReg();
2895  assert(MRI.getType(Src0) == LLT::scalar(16));
2896
2897 auto Merge = B.buildMerge(S32, {Src0, Src1});
2898 B.buildBitcast(Dst, Merge);
2899
2900 MI.eraseFromParent();
2901 return true;
2902}
2903
2904// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2905//
2906// Source and accumulation registers must all be 32-bits.
2907//
2908// TODO: When the multiply is uniform, we should produce a code sequence
2909// that is better suited to instruction selection on the SALU. Instead of
2910// the outer loop going over parts of the result, the outer loop should go
2911// over parts of one of the factors. This should result in instruction
2912// selection that makes full use of S_ADDC_U32 instructions.
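// The decomposition is the schoolbook method over 32-bit limbs: result limb i
// accumulates the partial products Src0[j] * Src1[i - j] for all valid j,
// plus the carries produced while forming limb i - 1.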
2913void AMDGPULegalizerInfo::buildMultiply(
2914 LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2915 ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2916 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2917 // Use (possibly empty) vectors of S1 registers to represent the set of
2918 // carries from one pair of positions to the next.
2919 using Carry = SmallVector<Register, 2>;
2920
2921 MachineIRBuilder &B = Helper.MIRBuilder;
2922
2923 const LLT S1 = LLT::scalar(1);
2924 const LLT S32 = LLT::scalar(32);
2925 const LLT S64 = LLT::scalar(64);
2926
2927 Register Zero32;
2928 Register Zero64;
2929
2930 auto getZero32 = [&]() -> Register {
2931 if (!Zero32)
2932 Zero32 = B.buildConstant(S32, 0).getReg(0);
2933 return Zero32;
2934 };
2935 auto getZero64 = [&]() -> Register {
2936 if (!Zero64)
2937 Zero64 = B.buildConstant(S64, 0).getReg(0);
2938 return Zero64;
2939 };
2940
2941 // Merge the given carries into the 32-bit LocalAccum, which is modified
2942 // in-place.
2943 //
2944 // Returns the carry-out, which is a single S1 register or null.
2945 auto mergeCarry =
2946 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2947 if (CarryIn.empty())
2948 return Register();
2949
2950 bool HaveCarryOut = true;
2951 Register CarryAccum;
2952 if (CarryIn.size() == 1) {
2953 if (!LocalAccum) {
2954 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2955 return Register();
2956 }
2957
2958 CarryAccum = getZero32();
2959 } else {
2960 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2961 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2962 CarryAccum =
2963 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2964 .getReg(0);
2965 }
2966
2967 if (!LocalAccum) {
2968 LocalAccum = getZero32();
2969 HaveCarryOut = false;
2970 }
2971 }
2972
2973 auto Add =
2974 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2975 LocalAccum = Add.getReg(0);
2976 return HaveCarryOut ? Add.getReg(1) : Register();
2977 };
2978
2979 // Build a multiply-add chain to compute
2980 //
2981 // LocalAccum + (partial products at DstIndex)
2982 // + (opportunistic subset of CarryIn)
2983 //
2984 // LocalAccum is an array of one or two 32-bit registers that are updated
2985 // in-place. The incoming registers may be null.
2986 //
2987 // In some edge cases, carry-ins can be consumed "for free". In that case,
2988 // the consumed carry bits are removed from CarryIn in-place.
2989 auto buildMadChain =
2990 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2991 -> Carry {
2992 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2993        (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2994
2995 Carry CarryOut;
2996 unsigned j0 = 0;
2997
2998 // Use plain 32-bit multiplication for the most significant part of the
2999 // result by default.
3000 if (LocalAccum.size() == 1 &&
3001 (!UsePartialMad64_32 || !CarryIn.empty())) {
3002 do {
3003 unsigned j1 = DstIndex - j0;
3004 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3005 if (!LocalAccum[0]) {
3006 LocalAccum[0] = Mul.getReg(0);
3007 } else {
3008 if (CarryIn.empty()) {
3009 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3010 } else {
3011 LocalAccum[0] =
3012 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3013 .getReg(0);
3014 CarryIn.pop_back();
3015 }
3016 }
3017 ++j0;
3018 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3019 }
3020
3021 // Build full 64-bit multiplies.
3022 if (j0 <= DstIndex) {
3023 bool HaveSmallAccum = false;
3024 Register Tmp;
3025
3026 if (LocalAccum[0]) {
3027 if (LocalAccum.size() == 1) {
3028 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3029 HaveSmallAccum = true;
3030 } else if (LocalAccum[1]) {
3031 Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3032 HaveSmallAccum = false;
3033 } else {
3034 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3035 HaveSmallAccum = true;
3036 }
3037 } else {
3038 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3039 Tmp = getZero64();
3040 HaveSmallAccum = true;
3041 }
3042
3043 do {
3044 unsigned j1 = DstIndex - j0;
3045 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3046 {Src0[j0], Src1[j1], Tmp});
3047 Tmp = Mad.getReg(0);
3048 if (!HaveSmallAccum)
3049 CarryOut.push_back(Mad.getReg(1));
3050 HaveSmallAccum = false;
3051 ++j0;
3052 } while (j0 <= DstIndex);
3053
3054 auto Unmerge = B.buildUnmerge(S32, Tmp);
3055 LocalAccum[0] = Unmerge.getReg(0);
3056 if (LocalAccum.size() > 1)
3057 LocalAccum[1] = Unmerge.getReg(1);
3058 }
3059
3060 return CarryOut;
3061 };
3062
3063 // Outer multiply loop, iterating over destination parts from least
3064 // significant to most significant parts.
3065 //
3066 // The columns of the following diagram correspond to the destination parts
3067 // affected by one iteration of the outer loop (ignoring boundary
3068 // conditions).
3069 //
3070 // Dest index relative to 2 * i:           1   0  -1
3071 //                                         ---------
3072 // Carries from previous iteration:            e   o
3073 // Even-aligned partial product sum:       E   E   .
3074 // Odd-aligned partial product sum:            O   O
3075 //
3076 // 'o' is OddCarry, 'e' is EvenCarry.
3077 // EE and OO are computed from partial products via buildMadChain and use
3078 // accumulation where possible and appropriate.
3079 //
3080 Register SeparateOddCarry;
3081 Carry EvenCarry;
3082 Carry OddCarry;
3083
3084 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3085 Carry OddCarryIn = std::move(OddCarry);
3086 Carry EvenCarryIn = std::move(EvenCarry);
3087 OddCarry.clear();
3088 EvenCarry.clear();
3089
3090 // Partial products at offset 2 * i.
3091 if (2 * i < Accum.size()) {
3092 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3093 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3094 }
3095
3096 // Partial products at offset 2 * i - 1.
3097 if (i > 0) {
3098 if (!SeparateOddAlignedProducts) {
3099 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3100 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3101 } else {
3102 bool IsHighest = 2 * i >= Accum.size();
3103 Register SeparateOddOut[2];
3104 auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3105 .take_front(IsHighest ? 1 : 2);
3106 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3107
3108 MachineInstr *Lo;
3109
3110 if (i == 1) {
3111 if (!IsHighest)
3112 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3113 else
3114 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3115 } else {
3116 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3117 SeparateOddCarry);
3118 }
3119 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3120
3121 if (!IsHighest) {
3122 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3123 Lo->getOperand(1).getReg());
3124 Accum[2 * i] = Hi.getReg(0);
3125 SeparateOddCarry = Hi.getReg(1);
3126 }
3127 }
3128 }
3129
3130 // Add in the carries from the previous iteration
3131 if (i > 0) {
3132 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3133 EvenCarryIn.push_back(CarryOut);
3134
3135 if (2 * i < Accum.size()) {
3136 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3137 OddCarry.push_back(CarryOut);
3138 }
3139 }
3140 }
3141}
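
As a sanity model for what the MAD_64_32 chains above compute, here is a plain C++ reference of a truncated big-integer multiply-add over 32-bit parts (row-wise rather than the column-wise order used by buildMultiply, and with the helper name invented for this sketch):

    #include <cstdint>

    // Accum += Src0 * Src1; all three are little-endian arrays of N 32-bit parts,
    // and the result is truncated to N parts (matching a same-width G_MUL, so
    // start Accum at zero for a plain multiply). Each inner-loop step plays the
    // role of one MAD_U64_U32: a 32x32->64 multiply plus a 64-bit accumulate
    // whose high half becomes the carry into the next column.
    void refMultiplyAdd(uint32_t *Accum, unsigned N,
                        const uint32_t *Src0, const uint32_t *Src1) {
      for (unsigned j0 = 0; j0 < N; ++j0) {
        uint64_t Carry = 0;
        for (unsigned j1 = 0; j0 + j1 < N; ++j1) {
          uint64_t Mad = uint64_t(Src0[j0]) * Src1[j1] + Accum[j0 + j1] + Carry;
          Accum[j0 + j1] = uint32_t(Mad);
          Carry = Mad >> 32;
        }
      }
    }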
3142
3143// Custom narrowing of wide multiplies using wide multiply-add instructions.
3144//
3145// TODO: If the multiply is followed by an addition, we should attempt to
3146// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3147bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3148 MachineInstr &MI) const {
3149 assert(ST.hasMad64_32());
3150 assert(MI.getOpcode() == TargetOpcode::G_MUL);
3151
3152 MachineIRBuilder &B = Helper.MIRBuilder;
3153 MachineRegisterInfo &MRI = *B.getMRI();
3154
3155 Register DstReg = MI.getOperand(0).getReg();
3156 Register Src0 = MI.getOperand(1).getReg();
3157 Register Src1 = MI.getOperand(2).getReg();
3158
3159 LLT Ty = MRI.getType(DstReg);
3160 assert(Ty.isScalar());
3161
3162 unsigned Size = Ty.getSizeInBits();
3163 unsigned NumParts = Size / 32;
3164 assert((Size % 32) == 0);
3165 assert(NumParts >= 2);
3166
3167 // Whether to use MAD_64_32 for partial products whose high half is
3168 // discarded. This avoids some ADD instructions but risks false dependency
3169 // stalls on some subtargets in some cases.
3170 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3171
3172 // Whether to compute odd-aligned partial products separately. This is
3173 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3174 // in an even-aligned VGPR.
3175 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3176
3177 LLT S32 = LLT::scalar(32);
3178 SmallVector<Register, 2> Src0Parts, Src1Parts;
3179 for (unsigned i = 0; i < NumParts; ++i) {
3180 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3181 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3182 }
3183 B.buildUnmerge(Src0Parts, Src0);
3184 B.buildUnmerge(Src1Parts, Src1);
3185
3186 SmallVector<Register, 2> AccumRegs(NumParts);
3187 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3188 SeparateOddAlignedProducts);
3189
3190 B.buildMerge(DstReg, AccumRegs);
3191 MI.eraseFromParent();
3192 return true;
3193
3194}
3195
3196// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3197// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3198// case with a single min instruction instead of a compare+select.
3199bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3200 MachineRegisterInfo &MRI,
3201 MachineIRBuilder &B) const {
3202 Register Dst = MI.getOperand(0).getReg();
3203 Register Src = MI.getOperand(1).getReg();
3204 LLT DstTy = MRI.getType(Dst);
3205 LLT SrcTy = MRI.getType(Src);
3206
3207 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3208 ? AMDGPU::G_AMDGPU_FFBH_U32
3209 : AMDGPU::G_AMDGPU_FFBL_B32;
3210 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3211 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3212
3213 MI.eraseFromParent();
3214 return true;
3215}
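
A scalar sketch of the CTLZ case (CTTZ is symmetric with FFBL). It relies on the assumption that the FFBH instruction returns -1 for a zero input, so a single unsigned min against the bit width yields the well-defined ctlz(0) result:

    #include <algorithm>
    #include <cstdint>

    // Models G_AMDGPU_FFBH_U32 followed by the G_UMIN emitted above (32-bit case).
    uint32_t ffbhModel(uint32_t X) { return X ? __builtin_clz(X) : 0xffffffffu; }
    uint32_t ctlzModel(uint32_t X) { return std::min(ffbhModel(X), 32u); }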
3216
3217// Check that this is a G_XOR x, -1
3218static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3219 if (MI.getOpcode() != TargetOpcode::G_XOR)
3220 return false;
3221 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3222 return ConstVal && *ConstVal == -1;
3223}
3224
3225 // Return the use branch instruction, or null if the usage is invalid.
3226static MachineInstr *
3227verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3228 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3229 Register CondDef = MI.getOperand(0).getReg();
3230 if (!MRI.hasOneNonDBGUse(CondDef))
3231 return nullptr;
3232
3233 MachineBasicBlock *Parent = MI.getParent();
3234 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3235
3236 if (isNot(MRI, *UseMI)) {
3237 Register NegatedCond = UseMI->getOperand(0).getReg();
3238 if (!MRI.hasOneNonDBGUse(NegatedCond))
3239 return nullptr;
3240
3241 // We're deleting the def of this value, so we need to remove it.
3242 eraseInstr(*UseMI, MRI);
3243
3244 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3245 Negated = true;
3246 }
3247
3248 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3249 return nullptr;
3250
3251 // Make sure the cond br is followed by a G_BR, or is the last instruction.
3252 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3253 if (Next == Parent->end()) {
3254 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3255 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3256 return nullptr;
3257 UncondBrTarget = &*NextMBB;
3258 } else {
3259 if (Next->getOpcode() != AMDGPU::G_BR)
3260 return nullptr;
3261 Br = &*Next;
3262 UncondBrTarget = Br->getOperand(0).getMBB();
3263 }
3264
3265 return UseMI;
3266}
3267
3268bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3269 const ArgDescriptor *Arg,
3270 const TargetRegisterClass *ArgRC,
3271 LLT ArgTy) const {
3272 MCRegister SrcReg = Arg->getRegister();
3273 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  [5] '?' condition is true
3274 assert(DstReg.isVirtual() && "Virtual register expected");
  [6] Assuming the condition is true
  [7] '?' condition is true
3275
3276 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3277 *ArgRC, B.getDebugLoc(), ArgTy);
3278 if (Arg->isMasked()) {
  [8] Taking true branch
3279 // TODO: Should we try to emit this once in the entry block?
3280 const LLT S32 = LLT::scalar(32);
3281 const unsigned Mask = Arg->getMask();
3282 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
  [9] Calling 'countTrailingZeros<unsigned int>'
  [16] Returning from 'countTrailingZeros<unsigned int>'
  [17] 'Shift' initialized to 32
3283
3284 Register AndMaskSrc = LiveIn;
3285
3286 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3287 // 0.
3288 if (Shift != 0) {
  [17.1] 'Shift' is not equal to 0
  [18] Taking true branch
3289 auto ShiftAmt = B.buildConstant(S32, Shift);
3290 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3291 }
3292
3293 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  [19] The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
3294 } else {
3295 B.buildCopy(DstReg, LiveIn);
3296 }
3297
3298 return true;
3299}
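
The warning above ([19]) boils down to the following, assuming the analyzer's premise that Arg->getMask() can be zero on this path (a minimal sketch of the issue and one possible guard, not the upstream fix):

    #include "llvm/Support/MathExtras.h"

    unsigned maskedHigh(unsigned Mask) {
      unsigned Shift = llvm::countTrailingZeros(Mask); // ZB_Width default: 32 when Mask == 0
      return Mask >> Shift;                            // UB when Shift == 32 (the report above)
    }

    // One way to keep the shift well-defined regardless of the mask value
    // (illustrative only):
    unsigned maskedHighGuarded(unsigned Mask) {
      unsigned Shift = llvm::countTrailingZeros(Mask);
      return Shift < 32 ? (Mask >> Shift) : 0;
    }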
3300
3301bool AMDGPULegalizerInfo::loadInputValue(
3302 Register DstReg, MachineIRBuilder &B,
3303 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3304 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3305 const ArgDescriptor *Arg;
3306 const TargetRegisterClass *ArgRC;
3307 LLT ArgTy;
3308 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3309
3310 if (!Arg) {
  [2] Assuming 'Arg' is non-null
3311 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3312 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3313 // case the pointer argument may be missing and we use null.
3314 B.buildConstant(DstReg, 0);
3315 return true;
3316 }
3317
3318 // It's undefined behavior if a function marked with the amdgpu-no-*
3319 // attributes uses the corresponding intrinsic.
3320 B.buildUndef(DstReg);
3321 return true;
3322 }
3323
3324 if (!Arg->isRegister() || !Arg->getRegister().isValid())
  [3] Taking false branch
3325 return false; // TODO: Handle these
3326 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
  [4] Calling 'AMDGPULegalizerInfo::loadInputValue'
3327}
3328
3329bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3330 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3331 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3332 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3333 return false;
3334
3335 MI.eraseFromParent();
3336 return true;
3337}
3338
3339static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3340 int64_t C) {
3341 B.buildConstant(MI.getOperand(0).getReg(), C);
3342 MI.eraseFromParent();
3343 return true;
3344}
3345
3346bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3347 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3348 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3349 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3350 if (MaxID == 0)
3351 return replaceWithConstant(B, MI, 0);
3352
3353 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3354 const ArgDescriptor *Arg;
3355 const TargetRegisterClass *ArgRC;
3356 LLT ArgTy;
3357 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3358
3359 Register DstReg = MI.getOperand(0).getReg();
3360 if (!Arg) {
3361 // It's undefined behavior if a function marked with the amdgpu-no-*
3362 // attributes uses the corresponding intrinsic.
3363 B.buildUndef(DstReg);
3364 MI.eraseFromParent();
3365 return true;
3366 }
3367
3368 if (Arg->isMasked()) {
3369 // Don't bother inserting AssertZext for packed IDs since we're emitting the
3370 // masking operations anyway.
3371 //
3372 // TODO: We could assert the top bit is 0 for the source copy.
3373 if (!loadInputValue(DstReg, B, ArgType))
3374 return false;
3375 } else {
3376 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3377 if (!loadInputValue(TmpReg, B, ArgType))
3378 return false;
3379 B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3380 }
3381
3382 MI.eraseFromParent();
3383 return true;
3384}
3385
3386Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3387 int64_t Offset) const {
3388 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3389 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3390
3391 // TODO: If we passed in the base kernel offset we could have a better
3392 // alignment than 4, but we don't really need it.
3393 if (!loadInputValue(KernArgReg, B,
  [1] Calling 'AMDGPULegalizerInfo::loadInputValue'
3394 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3395 llvm_unreachable("failed to find kernarg segment ptr")::llvm::llvm_unreachable_internal("failed to find kernarg segment ptr"
, "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3395)
;
3396
3397 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3398 // TODO: Should get nuw
3399 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3400}
3401
3402/// Legalize a value that's loaded from kernel arguments. This is only used by
3403/// legacy intrinsics.
3404bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3405 MachineIRBuilder &B,
3406 uint64_t Offset,
3407 Align Alignment) const {
3408 Register DstReg = MI.getOperand(0).getReg();
3409
3410 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3411        "unexpected kernarg parameter type");
3412
3413 Register Ptr = getKernargParameterPtr(B, Offset);
3414 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3415 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3416 MachineMemOperand::MODereferenceable |
3417 MachineMemOperand::MOInvariant);
3418 MI.eraseFromParent();
3419 return true;
3420}
3421
3422bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3423 MachineRegisterInfo &MRI,
3424 MachineIRBuilder &B) const {
3425 Register Dst = MI.getOperand(0).getReg();
3426 LLT DstTy = MRI.getType(Dst);
3427 LLT S16 = LLT::scalar(16);
3428 LLT S32 = LLT::scalar(32);
3429 LLT S64 = LLT::scalar(64);
3430
3431 if (DstTy == S16)
3432 return legalizeFDIV16(MI, MRI, B);
3433 if (DstTy == S32)
3434 return legalizeFDIV32(MI, MRI, B);
3435 if (DstTy == S64)
3436 return legalizeFDIV64(MI, MRI, B);
3437
3438 return false;
3439}
3440
3441void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3442 Register DstDivReg,
3443 Register DstRemReg,
3444 Register X,
3445 Register Y) const {
3446 const LLT S1 = LLT::scalar(1);
3447 const LLT S32 = LLT::scalar(32);
3448
3449 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3450 // algorithm used here.
3451
3452 // Initial estimate of inv(y).
3453 auto FloatY = B.buildUITOFP(S32, Y);
3454 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3455 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3456 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3457 auto Z = B.buildFPTOUI(S32, ScaledY);
3458
3459 // One round of UNR.
3460 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3461 auto NegYZ = B.buildMul(S32, NegY, Z);
3462 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3463
3464 // Quotient/remainder estimate.
3465 auto Q = B.buildUMulH(S32, X, Z);
3466 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3467
3468 // First quotient/remainder refinement.
3469 auto One = B.buildConstant(S32, 1);
3470 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3471 if (DstDivReg)
3472 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3473 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3474
3475 // Second quotient/remainder refinement.
3476 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3477 if (DstDivReg)
3478 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3479
3480 if (DstRemReg)
3481 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3482}
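
The two select rounds at the end correspond to this scalar refinement: the reciprocal-based estimate Q undershoots the true quotient by at most 2 (per the expandDivRem32 description referenced above), so two conditional corrections suffice (a sketch, not the emitted MIR):

    #include <cstdint>

    // Q is the reciprocal-based quotient estimate computed earlier in the routine.
    void refineQuotRem32(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
      R = X - Q * Y;                  // remainder estimate (wraps like the MIR G_SUB)
      if (R >= Y) { ++Q; R -= Y; }    // first quotient/remainder refinement
      if (R >= Y) { ++Q; R -= Y; }    // second quotient/remainder refinement
    }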
3483
3484// Build integer reciprocal sequence around V_RCP_IFLAG_F32
3485//
3486// Return lo, hi of result
3487//
3488// %cvt.lo = G_UITOFP Val.lo
3489// %cvt.hi = G_UITOFP Val.hi
3490// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3491// %rcp = G_AMDGPU_RCP_IFLAG %mad
3492// %mul1 = G_FMUL %rcp, 0x5f7ffffc
3493// %mul2 = G_FMUL %mul1, 2**(-32)
3494// %trunc = G_INTRINSIC_TRUNC %mul2
3495// %mad2 = G_FMAD %trunc, -(2**32), %mul1
3496// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3497static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3498 Register Val) {
3499 const LLT S32 = LLT::scalar(32);
3500 auto Unmerge = B.buildUnmerge(S32, Val);
3501
3502 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3503 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3504
3505 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3506 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3507
3508 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3509 auto Mul1 =
3510 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3511
3512 // 2**(-32)
3513 auto Mul2 =
3514 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3515 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3516
3517 // -(2**32)
3518 auto Mad2 = B.buildFMAD(S32, Trunc,
3519 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3520
3521 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3522 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3523
3524 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3525}
3526
3527void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3528 Register DstDivReg,
3529 Register DstRemReg,
3530 Register Numer,
3531 Register Denom) const {
3532 const LLT S32 = LLT::scalar(32);
3533 const LLT S64 = LLT::scalar(64);
3534 const LLT S1 = LLT::scalar(1);
3535 Register RcpLo, RcpHi;
3536
3537 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3538
3539 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3540
3541 auto Zero64 = B.buildConstant(S64, 0);
3542 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3543
3544 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3545 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3546
3547 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3548 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3549 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3550
3551 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3552 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3553 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3554
3555 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3556 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3557 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3558 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3559 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3560
3561 auto Zero32 = B.buildConstant(S32, 0);
3562 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3563 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3564 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3565
3566 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3567 Register NumerLo = UnmergeNumer.getReg(0);
3568 Register NumerHi = UnmergeNumer.getReg(1);
3569
3570 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3571 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3572 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3573 Register Mul3_Lo = UnmergeMul3.getReg(0);
3574 Register Mul3_Hi = UnmergeMul3.getReg(1);
3575 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3576 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3577 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3578 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3579
3580 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3581 Register DenomLo = UnmergeDenom.getReg(0);
3582 Register DenomHi = UnmergeDenom.getReg(1);
3583
3584 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3585 auto C1 = B.buildSExt(S32, CmpHi);
3586
3587 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3588 auto C2 = B.buildSExt(S32, CmpLo);
3589
3590 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3591 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3592
3593 // TODO: Here and below portions of the code can be enclosed into if/endif.
3594 // Currently control flow is unconditional and we have 4 selects after
3595 // potential endif to substitute PHIs.
3596
3597 // if C3 != 0 ...
3598 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3599 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3600 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3601 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3602
3603 auto One64 = B.buildConstant(S64, 1);
3604 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3605
3606 auto C4 =
3607 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3608 auto C5 =
3609 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3610 auto C6 = B.buildSelect(
3611 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3612
3613 // if (C6 != 0)
3614 auto Add4 = B.buildAdd(S64, Add3, One64);
3615 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3616
3617 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3618 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3619 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3620
3621 // endif C6
3622 // endif C3
3623
3624 if (DstDivReg) {
3625 auto Sel1 = B.buildSelect(
3626 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3627 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3628 Sel1, MulHi3);
3629 }
3630
3631 if (DstRemReg) {
3632 auto Sel2 = B.buildSelect(
3633 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3634 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3635 Sel2, Sub1);
3636 }
3637}
3638
3639bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3640 MachineRegisterInfo &MRI,
3641 MachineIRBuilder &B) const {
3642 Register DstDivReg, DstRemReg;
3643 switch (MI.getOpcode()) {
3644 default:
3645 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 3645)
;
3646 case AMDGPU::G_UDIV: {
3647 DstDivReg = MI.getOperand(0).getReg();
3648 break;
3649 }
3650 case AMDGPU::G_UREM: {
3651 DstRemReg = MI.getOperand(0).getReg();
3652 break;
3653 }
3654 case AMDGPU::G_UDIVREM: {
3655 DstDivReg = MI.getOperand(0).getReg();
3656 DstRemReg = MI.getOperand(1).getReg();
3657 break;
3658 }
3659 }
3660
3661 const LLT S64 = LLT::scalar(64);
3662 const LLT S32 = LLT::scalar(32);
3663 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3664 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3665 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3666 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3667
3668 if (Ty == S32)
3669 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3670 else if (Ty == S64)
3671 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3672 else
3673 return false;
3674
3675 MI.eraseFromParent();
3676 return true;
3677}
3678
3679bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3680 MachineRegisterInfo &MRI,
3681 MachineIRBuilder &B) const {
3682 const LLT S64 = LLT::scalar(64);
3683 const LLT S32 = LLT::scalar(32);
3684
3685 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3686 if (Ty != S32 && Ty != S64)
3687 return false;
3688
3689 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3690 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3691 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3692
3693 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3694 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3695 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3696
3697 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3698 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3699
3700 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3701 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3702
3703 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3704 switch (MI.getOpcode()) {
3705 default:
3706 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 3706)
;
3707 case AMDGPU::G_SDIV: {
3708 DstDivReg = MI.getOperand(0).getReg();
3709 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3710 break;
3711 }
3712 case AMDGPU::G_SREM: {
3713 DstRemReg = MI.getOperand(0).getReg();
3714 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3715 break;
3716 }
3717 case AMDGPU::G_SDIVREM: {
3718 DstDivReg = MI.getOperand(0).getReg();
3719 DstRemReg = MI.getOperand(1).getReg();
3720 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3721 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3722 break;
3723 }
3724 }
3725
3726 if (Ty == S32)
3727 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3728 else
3729 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3730
3731 if (DstDivReg) {
3732 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3733 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3734 B.buildSub(DstDivReg, SignXor, Sign);
3735 }
3736
3737 if (DstRemReg) {
3738 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3739 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3740 B.buildSub(DstRemReg, SignXor, Sign);
3741 }
3742
3743 MI.eraseFromParent();
3744 return true;
3745}
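
The branch-free sign handling above uses the standard two's-complement identities; here is a 32-bit scalar sketch (the 64-bit path only changes the shift amount to 63, and the shifts below stand in for G_ASHR):

    #include <cstdint>

    // s = x >> 31 is 0 for non-negative x and all-ones otherwise.
    // (x + s) ^ s  computes |x|;  (v ^ s) - s  re-applies a sign afterwards.
    void signedDivRem32(int32_t L, int32_t R, int32_t &Quot, int32_t &Rem) {
      uint32_t SL = uint32_t(L >> 31), SR = uint32_t(R >> 31);
      uint32_t AL = (uint32_t(L) + SL) ^ SL;       // |L|
      uint32_t AR = (uint32_t(R) + SR) ^ SR;       // |R|
      uint32_t Q = AL / AR, Rm = AL % AR;          // the unsigned DIV/REM expansion; assumes R != 0
      uint32_t QS = SL ^ SR;                       // quotient is negative iff signs differ
      Quot = int32_t((Q ^ QS) - QS);
      Rem  = int32_t((Rm ^ SL) - SL);              // remainder follows the LHS sign
    }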
3746
3747bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3748 MachineRegisterInfo &MRI,
3749 MachineIRBuilder &B) const {
3750 Register Res = MI.getOperand(0).getReg();
3751 Register LHS = MI.getOperand(1).getReg();
3752 Register RHS = MI.getOperand(2).getReg();
3753 uint16_t Flags = MI.getFlags();
3754 LLT ResTy = MRI.getType(Res);
3755
3756 const MachineFunction &MF = B.getMF();
3757 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3758 MI.getFlag(MachineInstr::FmAfn);
3759
3760 if (!AllowInaccurateRcp)
3761 return false;
3762
3763 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3764 // 1 / x -> RCP(x)
3765 if (CLHS->isExactlyValue(1.0)) {
3766 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3767 .addUse(RHS)
3768 .setMIFlags(Flags);
3769
3770 MI.eraseFromParent();
3771 return true;
3772 }
3773
3774 // -1 / x -> RCP( FNEG(x) )
3775 if (CLHS->isExactlyValue(-1.0)) {
3776 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3777 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3778 .addUse(FNeg.getReg(0))
3779 .setMIFlags(Flags);
3780
3781 MI.eraseFromParent();
3782 return true;
3783 }
3784 }
3785
3786 // x / y -> x * (1.0 / y)
3787 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3788 .addUse(RHS)
3789 .setMIFlags(Flags);
3790 B.buildFMul(Res, LHS, RCP, Flags);
3791
3792 MI.eraseFromParent();
3793 return true;
3794}
3795
3796bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3797 MachineRegisterInfo &MRI,
3798 MachineIRBuilder &B) const {
3799 Register Res = MI.getOperand(0).getReg();
3800 Register X = MI.getOperand(1).getReg();
3801 Register Y = MI.getOperand(2).getReg();
3802 uint16_t Flags = MI.getFlags();
3803 LLT ResTy = MRI.getType(Res);
3804
3805 const MachineFunction &MF = B.getMF();
3806 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3807 MI.getFlag(MachineInstr::FmAfn);
3808
3809 if (!AllowInaccurateRcp)
3810 return false;
3811
3812 auto NegY = B.buildFNeg(ResTy, Y);
3813 auto One = B.buildFConstant(ResTy, 1.0);
3814
3815 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3816 .addUse(Y)
3817 .setMIFlags(Flags);
3818
3819 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3820 R = B.buildFMA(ResTy, Tmp0, R, R);
3821
3822 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3823 R = B.buildFMA(ResTy, Tmp1, R, R);
3824
3825 auto Ret = B.buildFMul(ResTy, X, R);
3826 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3827
3828 B.buildFMA(Res, Tmp2, R, Ret);
3829 MI.eraseFromParent();
3830 return true;
3831}
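
Written as plain scalar math, the FMA chain above is two Newton-Raphson refinements of the reciprocal estimate followed by one residual correction of the final product (a sketch in double precision, with 1.0 / Y standing in for the amdgcn_rcp estimate):

    double fastFDiv64Model(double X, double Y) {
      double R = 1.0 / Y;             // initial reciprocal estimate (rcp)
      R = R + R * (1.0 - Y * R);      // Newton-Raphson step 1
      R = R + R * (1.0 - Y * R);      // Newton-Raphson step 2
      double Q = X * R;               // quotient estimate
      return Q + R * (X - Y * Q);     // correct by the scaled residual
    }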
3832
3833bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3834 MachineRegisterInfo &MRI,
3835 MachineIRBuilder &B) const {
3836 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3837 return true;
3838
3839 Register Res = MI.getOperand(0).getReg();
3840 Register LHS = MI.getOperand(1).getReg();
3841 Register RHS = MI.getOperand(2).getReg();
3842
3843 uint16_t Flags = MI.getFlags();
3844
3845 LLT S16 = LLT::scalar(16);
3846 LLT S32 = LLT::scalar(32);
3847
3848 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3849 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3850
3851 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3852 .addUse(RHSExt.getReg(0))
3853 .setMIFlags(Flags);
3854
3855 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3856 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3857
3858 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3859 .addUse(RDst.getReg(0))
3860 .addUse(RHS)
3861 .addUse(LHS)
3862 .setMIFlags(Flags);
3863
3864 MI.eraseFromParent();
3865 return true;
3866}
3867
3868// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3869// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3870static void toggleSPDenormMode(bool Enable,
3871 MachineIRBuilder &B,
3872 const GCNSubtarget &ST,
3873 AMDGPU::SIModeRegisterDefaults Mode) {
3874 // Set SP denorm mode to this value.
3875 unsigned SPDenormMode =
3876 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3877
3878 if (ST.hasDenormModeInst()) {
3879 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3880 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3881
3882 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3883 B.buildInstr(AMDGPU::S_DENORM_MODE)
3884 .addImm(NewDenormModeValue);
3885
3886 } else {
3887 // Select FP32 bit field in mode register.
3888 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3889 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3890 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3891
3892 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3893 .addImm(SPDenormMode)
3894 .addImm(SPDenormModeBitField);
3895 }
3896}
3897
3898bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3899 MachineRegisterInfo &MRI,
3900 MachineIRBuilder &B) const {
3901 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3902 return true;
3903
3904 Register Res = MI.getOperand(0).getReg();
3905 Register LHS = MI.getOperand(1).getReg();
3906 Register RHS = MI.getOperand(2).getReg();
3907 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3908 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3909
3910 uint16_t Flags = MI.getFlags();
3911
3912 LLT S32 = LLT::scalar(32);
3913 LLT S1 = LLT::scalar(1);
3914
3915 auto One = B.buildFConstant(S32, 1.0f);
3916
3917 auto DenominatorScaled =
3918 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3919 .addUse(LHS)
3920 .addUse(RHS)
3921 .addImm(0)
3922 .setMIFlags(Flags);
3923 auto NumeratorScaled =
3924 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3925 .addUse(LHS)
3926 .addUse(RHS)
3927 .addImm(1)
3928 .setMIFlags(Flags);
3929
3930 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3931 .addUse(DenominatorScaled.getReg(0))
3932 .setMIFlags(Flags);
3933 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3934
3935 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3936 // aren't modeled as reading it.
3937 if (!Mode.allFP32Denormals())
3938 toggleSPDenormMode(true, B, ST, Mode);
3939
3940 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3941 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3942 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3943 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3944 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3945 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3946
3947 if (!Mode.allFP32Denormals())
3948 toggleSPDenormMode(false, B, ST, Mode);
3949
3950 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3951 .addUse(Fma4.getReg(0))
3952 .addUse(Fma1.getReg(0))
3953 .addUse(Fma3.getReg(0))
3954 .addUse(NumeratorScaled.getReg(1))
3955 .setMIFlags(Flags);
3956
3957 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3958 .addUse(Fmas.getReg(0))
3959 .addUse(RHS)
3960 .addUse(LHS)
3961 .setMIFlags(Flags);
3962
3963 MI.eraseFromParent();
3964 return true;
3965}
3966
3967bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3968 MachineRegisterInfo &MRI,
3969 MachineIRBuilder &B) const {
3970 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3971 return true;
3972
3973 Register Res = MI.getOperand(0).getReg();
3974 Register LHS = MI.getOperand(1).getReg();
3975 Register RHS = MI.getOperand(2).getReg();
3976
3977 uint16_t Flags = MI.getFlags();
3978
3979 LLT S64 = LLT::scalar(64);
3980 LLT S1 = LLT::scalar(1);
3981
3982 auto One = B.buildFConstant(S64, 1.0);
3983
3984 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3985 .addUse(LHS)
3986 .addUse(RHS)
3987 .addImm(0)
3988 .setMIFlags(Flags);
3989
3990 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3991
3992 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3993 .addUse(DivScale0.getReg(0))
3994 .setMIFlags(Flags);
3995
3996 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3997 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3998 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3999
4000 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4001 .addUse(LHS)
4002 .addUse(RHS)
4003 .addImm(1)
4004 .setMIFlags(Flags);
4005
4006 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4007 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4008 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4009
4010 Register Scale;
4011 if (!ST.hasUsableDivScaleConditionOutput()) {
4012 // Workaround a hardware bug on SI where the condition output from div_scale
4013 // is not usable.
4014
4015 LLT S32 = LLT::scalar(32);
4016
4017 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4018 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4019 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4020 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4021
4022 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4023 Scale1Unmerge.getReg(1));
4024 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4025 Scale0Unmerge.getReg(1));
4026 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4027 } else {
4028 Scale = DivScale1.getReg(1);
4029 }
4030
4031 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4032 .addUse(Fma4.getReg(0))
4033 .addUse(Fma3.getReg(0))
4034 .addUse(Mul.getReg(0))
4035 .addUse(Scale)
4036 .setMIFlags(Flags);
4037
4038 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4039 .addUse(Fmas.getReg(0))
4040 .addUse(RHS)
4041 .addUse(LHS)
4042 .setMIFlags(Flags);
4043
4044 MI.eraseFromParent();
4045 return true;
4046}
4047
4048bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4049 MachineRegisterInfo &MRI,
4050 MachineIRBuilder &B) const {
4051 Register Res = MI.getOperand(0).getReg();
4052 Register LHS = MI.getOperand(2).getReg();
4053 Register RHS = MI.getOperand(3).getReg();
4054 uint16_t Flags = MI.getFlags();
4055
4056 LLT S32 = LLT::scalar(32);
4057 LLT S1 = LLT::scalar(1);
4058
4059 auto Abs = B.buildFAbs(S32, RHS, Flags);
4060 const APFloat C0Val(1.0f);
4061
4062 auto C0 = B.buildConstant(S32, 0x6f800000);
4063 auto C1 = B.buildConstant(S32, 0x2f800000);
4064 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4065
4066 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4067 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4068
4069 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4070
4071 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4072 .addUse(Mul0.getReg(0))
4073 .setMIFlags(Flags);
4074
4075 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4076
4077 B.buildFMul(Res, Sel, Mul1, Flags);
4078
4079 MI.eraseFromParent();
4080 return true;
4081}
4082
4083// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4084// FIXME: Why do we handle this one but not other removed instructions?
4085//
4086// Reciprocal square root. The clamp prevents infinite results, clamping
4087// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4088// +-max_float.
4089bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4090 MachineRegisterInfo &MRI,
4091 MachineIRBuilder &B) const {
4092 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4093 return true;
4094
4095 Register Dst = MI.getOperand(0).getReg();
4096 Register Src = MI.getOperand(2).getReg();
4097 auto Flags = MI.getFlags();
4098
4099 LLT Ty = MRI.getType(Dst);
4100
4101 const fltSemantics *FltSemantics;
4102 if (Ty == LLT::scalar(32))
4103 FltSemantics = &APFloat::IEEEsingle();
4104 else if (Ty == LLT::scalar(64))
4105 FltSemantics = &APFloat::IEEEdouble();
4106 else
4107 return false;
4108
4109 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4110 .addUse(Src)
4111 .setMIFlags(Flags);
4112
4113 // We don't need to concern ourselves with the snan handling difference, since
4114 // the rsq has already quieted it (or not); use the variant that will directly select.
4115 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4116 const bool UseIEEE = MFI->getMode().IEEE;
4117
4118 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4119 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4120 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4121
4122 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4123
4124 if (UseIEEE)
4125 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4126 else
4127 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4128 MI.eraseFromParent();
4129 return true;
4130}
4131
4132static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4133 switch (IID) {
4134 case Intrinsic::amdgcn_ds_fadd:
4135 return AMDGPU::G_ATOMICRMW_FADD;
4136 case Intrinsic::amdgcn_ds_fmin:
4137 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4138 case Intrinsic::amdgcn_ds_fmax:
4139 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4140 default:
4141 llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 4141)
;
4142 }
4143}
4144
4145bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4146 MachineInstr &MI,
4147 Intrinsic::ID IID) const {
4148 GISelChangeObserver &Observer = Helper.Observer;
4149 Observer.changingInstr(MI);
4150
4151 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4152
4153 // The remaining operands were used to set fields in the MemOperand on
4154 // construction.
4155 for (int I = 6; I > 3; --I)
4156 MI.removeOperand(I);
4157
4158 MI.removeOperand(1); // Remove the intrinsic ID.
4159 Observer.changedInstr(MI);
4160 return true;
4161}
4162
4163bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4164 MachineRegisterInfo &MRI,
4165 MachineIRBuilder &B) const {
4166 uint64_t Offset =
4167 ST.getTargetLowering()->getImplicitParameterOffset(
4168 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4169 LLT DstTy = MRI.getType(DstReg);
4170 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4171
4172 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4173 if (!loadInputValue(KernargPtrReg, B,
4174 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4175 return false;
4176
4177 // FIXME: This should be nuw
4178 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4179 return true;
4180}
4181
4182bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4183 MachineRegisterInfo &MRI,
4184 MachineIRBuilder &B) const {
4185 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4186 if (!MFI->isEntryFunction()) {
4187 return legalizePreloadedArgIntrin(MI, MRI, B,
4188 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4189 }
4190
4191 Register DstReg = MI.getOperand(0).getReg();
4192 if (!getImplicitArgPtr(DstReg, MRI, B))
4193 return false;
4194
4195 MI.eraseFromParent();
4196 return true;
4197}
4198
4199bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4200 MachineRegisterInfo &MRI,
4201 MachineIRBuilder &B) const {
4202 Function &F = B.getMF().getFunction();
4203 Optional<uint32_t> KnownSize =
4204 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4205 if (KnownSize.has_value())
4206 B.buildConstant(DstReg, KnownSize.value());
4207 return false;
4208}
4209
4210bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4211 MachineRegisterInfo &MRI,
4212 MachineIRBuilder &B) const {
4213
4214 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4215 if (!MFI->isEntryFunction()) {
4216 return legalizePreloadedArgIntrin(MI, MRI, B,
4217 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4218 }
4219
4220 Register DstReg = MI.getOperand(0).getReg();
4221 if (!getLDSKernelId(DstReg, MRI, B))
4222 return false;
4223
4224 MI.eraseFromParent();
4225 return true;
4226}
4227
4228bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4229 MachineRegisterInfo &MRI,
4230 MachineIRBuilder &B,
4231 unsigned AddrSpace) const {
4232 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4233 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4234 Register Hi32 = Unmerge.getReg(1);
4235
4236 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4237 MI.eraseFromParent();
4238 return true;
4239}
4240
4241// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4242// offset (the offset that is included in bounds checking and swizzling, to be
4243// split between the instruction's voffset and immoffset fields) and soffset
4244// (the offset that is excluded from bounds checking and swizzling, to go in
4245// the instruction's soffset field). This function takes the first kind of
4246// offset and figures out how to split it between voffset and immoffset.
4247std::pair<Register, unsigned>
4248AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4249 Register OrigOffset) const {
4250 const unsigned MaxImm = 4095;
4251 Register BaseReg;
4252 unsigned ImmOffset;
4253 const LLT S32 = LLT::scalar(32);
4254 MachineRegisterInfo &MRI = *B.getMRI();
4255
4256 std::tie(BaseReg, ImmOffset) =
4257 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4258
4259 // If BaseReg is a pointer, convert it to int.
4260 if (MRI.getType(BaseReg).isPointer())
4261 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4262
4263 // If the immediate value is too big for the immoffset field, put the value
4264 // and -4096 into the immoffset field so that the value that is copied/added
4265 // for the voffset field is a multiple of 4096, and it stands more chance
4266 // of being CSEd with the copy/add for another similar load/store.
4267 // However, do not do that rounding down to a multiple of 4096 if that is a
4268 // negative number, as it appears to be illegal to have a negative offset
4269 // in the vgpr, even if adding the immediate offset makes it positive.
4270 unsigned Overflow = ImmOffset & ~MaxImm;
4271 ImmOffset -= Overflow;
4272 if ((int32_t)Overflow < 0) {
4273 Overflow += ImmOffset;
4274 ImmOffset = 0;
4275 }
4276
4277 if (Overflow != 0) {
4278 if (!BaseReg) {
4279 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4280 } else {
4281 auto OverflowVal = B.buildConstant(S32, Overflow);
4282 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4283 }
4284 }
4285
4286 if (!BaseReg)
4287 BaseReg = B.buildConstant(S32, 0).getReg(0);
4288
4289 return std::make_pair(BaseReg, ImmOffset);
4290}
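
A worked example of the split described above (hypothetical values, plain integer arithmetic mirroring the code; the negative-overflow guard is omitted here):

    #include <utility>

    // Returns {overflow added into the voffset register, value left for immoffset}.
    std::pair<unsigned, unsigned> splitModel(unsigned ImmOffset) {
      const unsigned MaxImm = 4095;
      unsigned Overflow = ImmOffset & ~MaxImm;   // e.g. 9000 -> 8192, a multiple of 4096
      return {Overflow, ImmOffset - Overflow};   // e.g. 9000 -> {8192, 808}
    }

    // splitModel(9000) == {8192, 808} and splitModel(8200) == {8192, 8}: both
    // accesses share the same voffset add and differ only in immoffset, which is
    // the CSE opportunity the comment above describes.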
4291
4292/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4293void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4294 Register VOffset, Register SOffset,
4295 unsigned ImmOffset, Register VIndex,
4296 MachineRegisterInfo &MRI) const {
4297 Optional<ValueAndVReg> MaybeVOffsetVal =
4298 getIConstantVRegValWithLookThrough(VOffset, MRI);
4299 Optional<ValueAndVReg> MaybeSOffsetVal =
4300 getIConstantVRegValWithLookThrough(SOffset, MRI);
4301 Optional<ValueAndVReg> MaybeVIndexVal =
4302 getIConstantVRegValWithLookThrough(VIndex, MRI);
4303 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4304 // update the MMO with that offset. The stride is unknown so we can only do
4305 // this if VIndex is constant 0.
4306 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4307 MaybeVIndexVal->Value == 0) {
4308 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4309 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4310 MMO->setOffset(TotalOffset);
4311 } else {
4312 // We don't have a constant combined offset to use in the MMO. Give up.
4313 MMO->setValue((Value *)nullptr);
4314 }
4315}
4316
4317/// Handle register layout difference for f16 images for some subtargets.
4318Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4319 MachineRegisterInfo &MRI,
4320 Register Reg,
4321 bool ImageStore) const {
4322 const LLT S16 = LLT::scalar(16);
4323 const LLT S32 = LLT::scalar(32);
4324 LLT StoreVT = MRI.getType(Reg);
4325 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4326
4327 if (ST.hasUnpackedD16VMem()) {
4328 auto Unmerge = B.buildUnmerge(S16, Reg);
4329
4330 SmallVector<Register, 4> WideRegs;
4331 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4332 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4333
4334 int NumElts = StoreVT.getNumElements();
4335
4336 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4337 .getReg(0);
4338 }
4339
4340 if (ImageStore && ST.hasImageStoreD16Bug()) {
4341 if (StoreVT.getNumElements() == 2) {
4342 SmallVector<Register, 4> PackedRegs;
4343 Reg = B.buildBitcast(S32, Reg).getReg(0);
4344 PackedRegs.push_back(Reg);
4345 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4346 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4347 .getReg(0);
4348 }
4349
4350 if (StoreVT.getNumElements() == 3) {
4351 SmallVector<Register, 4> PackedRegs;
4352 auto Unmerge = B.buildUnmerge(S16, Reg);
4353 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4354 PackedRegs.push_back(Unmerge.getReg(I));
4355 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4356 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4357 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4358 }
4359
4360 if (StoreVT.getNumElements() == 4) {
4361 SmallVector<Register, 4> PackedRegs;
4362 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4363 auto Unmerge = B.buildUnmerge(S32, Reg);
4364 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4365 PackedRegs.push_back(Unmerge.getReg(I));
4366 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4367 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4368 .getReg(0);
4369 }
4370
4371 llvm_unreachable("invalid data type")::llvm::llvm_unreachable_internal("invalid data type", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 4371)
;
4372 }
4373
4374 if (StoreVT == LLT::fixed_vector(3, S16)) {
4375 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4376 .getReg(0);
4377 }
4378 return Reg;
4379}
4380
4381Register AMDGPULegalizerInfo::fixStoreSourceType(
4382 MachineIRBuilder &B, Register VData, bool IsFormat) const {
4383 MachineRegisterInfo *MRI = B.getMRI();
4384 LLT Ty = MRI->getType(VData);
4385
4386 const LLT S16 = LLT::scalar(16);
4387
4388 // Fixup illegal register types for i8 stores.
4389 if (Ty == LLT::scalar(8) || Ty == S16) {
4390 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4391 return AnyExt;
4392 }
4393
4394 if (Ty.isVector()) {
4395 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4396 if (IsFormat)
4397 return handleD16VData(B, *MRI, VData);
4398 }
4399 }
4400
4401 return VData;
4402}
4403
4404bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4405 MachineRegisterInfo &MRI,
4406 MachineIRBuilder &B,
4407 bool IsTyped,
4408 bool IsFormat) const {
4409 Register VData = MI.getOperand(1).getReg();
4410 LLT Ty = MRI.getType(VData);
4411 LLT EltTy = Ty.getScalarType();
4412 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4413 const LLT S32 = LLT::scalar(32);
4414
4415 VData = fixStoreSourceType(B, VData, IsFormat);
4416 Register RSrc = MI.getOperand(2).getReg();
4417
4418 MachineMemOperand *MMO = *MI.memoperands_begin();
4419 const int MemSize = MMO->getSize();
4420
4421 unsigned ImmOffset;
4422
4423 // The typed intrinsics add an immediate after the registers.
4424 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4425
4426 // The struct intrinsic variants add one additional operand over raw.
4427 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4428 Register VIndex;
4429 int OpOffset = 0;
4430 if (HasVIndex) {
4431 VIndex = MI.getOperand(3).getReg();
4432 OpOffset = 1;
4433 } else {
4434 VIndex = B.buildConstant(S32, 0).getReg(0);
4435 }
4436
4437 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4438 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4439
4440 unsigned Format = 0;
4441 if (IsTyped) {
4442 Format = MI.getOperand(5 + OpOffset).getImm();
4443 ++OpOffset;
4444 }
4445
4446 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4447
4448 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4449 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4450
4451 unsigned Opc;
4452 if (IsTyped) {
4453 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4454 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4455 } else if (IsFormat) {
4456 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4457 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4458 } else {
4459 switch (MemSize) {
4460 case 1:
4461 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4462 break;
4463 case 2:
4464 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4465 break;
4466 default:
4467 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4468 break;
4469 }
4470 }
4471
4472 auto MIB = B.buildInstr(Opc)
4473 .addUse(VData) // vdata
4474 .addUse(RSrc) // rsrc
4475 .addUse(VIndex) // vindex
4476 .addUse(VOffset) // voffset
4477 .addUse(SOffset) // soffset
4478 .addImm(ImmOffset); // offset(imm)
4479
4480 if (IsTyped)
4481 MIB.addImm(Format);
4482
4483 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4484 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4485 .addMemOperand(MMO);
4486
4487 MI.eraseFromParent();
4488 return true;
4489}
4490
4491bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4492 MachineRegisterInfo &MRI,
4493 MachineIRBuilder &B,
4494 bool IsFormat,
4495 bool IsTyped) const {
4496 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4497 MachineMemOperand *MMO = *MI.memoperands_begin();
4498 const LLT MemTy = MMO->getMemoryType();
4499 const LLT S32 = LLT::scalar(32);
4500
4501 Register Dst = MI.getOperand(0).getReg();
4502 Register RSrc = MI.getOperand(2).getReg();
4503
4504 // The typed intrinsics add an immediate after the registers.
4505 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4506
4507 // The struct intrinsic variants add one additional operand over raw.
4508 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4509 Register VIndex;
4510 int OpOffset = 0;
4511 if (HasVIndex) {
4512 VIndex = MI.getOperand(3).getReg();
4513 OpOffset = 1;
4514 } else {
4515 VIndex = B.buildConstant(S32, 0).getReg(0);
4516 }
4517
4518 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4519 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4520
4521 unsigned Format = 0;
4522 if (IsTyped) {
4523 Format = MI.getOperand(5 + OpOffset).getImm();
4524 ++OpOffset;
4525 }
4526
4527 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4528 unsigned ImmOffset;
4529
4530 LLT Ty = MRI.getType(Dst);
4531 LLT EltTy = Ty.getScalarType();
4532 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4533 const bool Unpacked = ST.hasUnpackedD16VMem();
4534
4535 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4536 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4537
4538 unsigned Opc;
4539
4540 if (IsTyped) {
4541 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4542 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4543 } else if (IsFormat) {
4544 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4545 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4546 } else {
4547 switch (MemTy.getSizeInBits()) {
4548 case 8:
4549 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4550 break;
4551 case 16:
4552 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4553 break;
4554 default:
4555 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4556 break;
4557 }
4558 }
4559
4560 Register LoadDstReg;
4561
4562 bool IsExtLoad =
4563 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4564 LLT UnpackedTy = Ty.changeElementSize(32);
4565
4566 if (IsExtLoad)
4567 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4568 else if (Unpacked && IsD16 && Ty.isVector())
4569 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4570 else
4571 LoadDstReg = Dst;
4572
4573 auto MIB = B.buildInstr(Opc)
4574 .addDef(LoadDstReg) // vdata
4575 .addUse(RSrc) // rsrc
4576 .addUse(VIndex) // vindex
4577 .addUse(VOffset) // voffset
4578 .addUse(SOffset) // soffset
4579 .addImm(ImmOffset); // offset(imm)
4580
4581 if (IsTyped)
4582 MIB.addImm(Format);
4583
4584 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4585 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4586 .addMemOperand(MMO);
4587
4588 if (LoadDstReg != Dst) {
4589 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4590
4591 // The result for an extending load was widened; truncate back to the original type.
4592 if (IsExtLoad)
4593 B.buildTrunc(Dst, LoadDstReg);
4594 else {
4595 // Repack to original 16-bit vector result
4596 // FIXME: G_TRUNC should work, but legalization currently fails
4597 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4598 SmallVector<Register, 4> Repack;
4599 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4600 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4601 B.buildMerge(Dst, Repack);
4602 }
4603 }
4604
4605 MI.eraseFromParent();
4606 return true;
4607}
4608
4609bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4610 MachineIRBuilder &B,
4611 bool IsInc) const {
4612 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4613 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4614 B.buildInstr(Opc)
4615 .addDef(MI.getOperand(0).getReg())
4616 .addUse(MI.getOperand(2).getReg())
4617 .addUse(MI.getOperand(3).getReg())
4618 .cloneMemRefs(MI);
4619 MI.eraseFromParent();
4620 return true;
4621}
4622
4623static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4624 switch (IntrID) {
4625 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4626 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4627 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4628 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4629 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4630 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4631 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4632 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4633 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4634 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4635 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4636 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4637 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4638 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4639 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4640 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4641 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4642 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4643 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4644 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4646 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4647 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4648 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4649 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4650 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4651 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4652 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4653 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4654 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4655 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4656 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4657 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4658 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4659 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4660 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4661 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4662 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4663 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4664 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4665 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4666 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4667 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4668 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4669 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4670 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4671 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4672 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4673 default:
4674 llvm_unreachable("unhandled atomic opcode");
4675 }
4676}
4677
4678bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4679 MachineIRBuilder &B,
4680 Intrinsic::ID IID) const {
4681 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4682 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4683 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4684
4685 Register Dst;
4686
4687 int OpOffset = 0;
4688 if (HasReturn) {
4689 // A few FP atomics do not support return values.
4690 Dst = MI.getOperand(0).getReg();
4691 } else {
4692 OpOffset = -1;
4693 }
4694
4695 Register VData = MI.getOperand(2 + OpOffset).getReg();
4696 Register CmpVal;
4697
4698 if (IsCmpSwap) {
4699 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4700 ++OpOffset;
4701 }
4702
4703 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4704 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4705
4706 // The struct intrinsic variants add one additional operand over raw.
4707 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4708 Register VIndex;
4709 if (HasVIndex) {
4710 VIndex = MI.getOperand(4 + OpOffset).getReg();
4711 ++OpOffset;
4712 } else {
4713 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4714 }
4715
4716 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4717 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4718 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4719
4720 MachineMemOperand *MMO = *MI.memoperands_begin();
4721
4722 unsigned ImmOffset;
4723 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4724 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4725
4726 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4727
4728 if (HasReturn)
4729 MIB.addDef(Dst);
4730
4731 MIB.addUse(VData); // vdata
4732
4733 if (IsCmpSwap)
4734 MIB.addReg(CmpVal);
4735
4736 MIB.addUse(RSrc) // rsrc
4737 .addUse(VIndex) // vindex
4738 .addUse(VOffset) // voffset
4739 .addUse(SOffset) // soffset
4740 .addImm(ImmOffset) // offset(imm)
4741 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4742 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4743 .addMemOperand(MMO);
4744
4745 MI.eraseFromParent();
4746 return true;
4747}
4748
4749/// Turn the s16 typed address registers of \p MI into dword sized <2 x s16>
4750/// vectors and append them to \p PackedAddrs.
4751static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4752 SmallVectorImpl<Register> &PackedAddrs,
4753 unsigned ArgOffset,
4754 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4755 bool IsA16, bool IsG16) {
4756 const LLT S16 = LLT::scalar(16);
4757 const LLT V2S16 = LLT::fixed_vector(2, 16);
4758 auto EndIdx = Intr->VAddrEnd;
4759
4760 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4761 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4762 if (!SrcOp.isReg())
4763 continue; // _L to _LZ may have eliminated this.
4764
4765 Register AddrReg = SrcOp.getReg();
4766
4767 if ((I < Intr->GradientStart) ||
4768 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4769 (I >= Intr->CoordStart && !IsA16)) {
4770 if ((I < Intr->GradientStart) && IsA16 &&
4771 (B.getMRI()->getType(AddrReg) == S16)) {
4772 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4773 // Special handling of bias when A16 is on. Bias is of type half but
4774 // occupies full 32-bit.
4775 PackedAddrs.push_back(
4776 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4777 .getReg(0));
4778 } else {
4779 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4780 "Bias needs to be converted to 16 bit in A16 mode");
4781 // Handle any gradient or coordinate operands that should not be packed
4782 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4783 PackedAddrs.push_back(AddrReg);
4784 }
4785 } else {
4786 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4787 // derivatives dx/dh and dx/dv are packed with undef.
4788 if (((I + 1) >= EndIdx) ||
4789 ((Intr->NumGradients / 2) % 2 == 1 &&
4790 (I == static_cast<unsigned>(Intr->GradientStart +
4791 (Intr->NumGradients / 2) - 1) ||
4792 I == static_cast<unsigned>(Intr->GradientStart +
4793 Intr->NumGradients - 1))) ||
4794 // Check for _L to _LZ optimization
4795 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4796 PackedAddrs.push_back(
4797 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4798 .getReg(0));
4799 } else {
4800 PackedAddrs.push_back(
4801 B.buildBuildVector(
4802 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4803 .getReg(0));
4804 ++I;
4805 }
4806 }
4807 }
4808}
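The loop above packs pairs of s16 address components into 32-bit dwords and pads an odd leftover lane with undef. Below is a minimal standalone sketch of that layout using plain integers instead of MachineIRBuilder registers; the function name and the zero padding value are illustrative assumptions, not part of the LLVM sources.

#include <cstdint>
#include <vector>

// Pack 16-bit lanes into 32-bit dwords, low lane in the low half.
// An odd trailing lane is paired with a padding value, mirroring the
// build_vector-with-undef case in packImage16bitOpsToDwords above.
std::vector<uint32_t> packS16Lanes(const std::vector<uint16_t> &Lanes) {
  std::vector<uint32_t> Dwords;
  for (std::size_t I = 0; I < Lanes.size(); I += 2) {
    uint32_t Lo = Lanes[I];
    uint32_t Hi = (I + 1 < Lanes.size()) ? Lanes[I + 1] : 0; // pad odd lane
    Dwords.push_back(Lo | (Hi << 16));
  }
  return Dwords;
}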
4809
4810/// Convert from separate vaddr components to a single vector address register,
4811/// and replace the remaining operands with $noreg.
4812static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4813 int DimIdx, int NumVAddrs) {
4814 const LLT S32 = LLT::scalar(32);
4815
4816 SmallVector<Register, 8> AddrRegs;
4817 for (int I = 0; I != NumVAddrs; ++I) {
4818 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4819 if (SrcOp.isReg()) {
4820 AddrRegs.push_back(SrcOp.getReg());
4821 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4822 }
4823 }
4824
4825 int NumAddrRegs = AddrRegs.size();
4826 if (NumAddrRegs != 1) {
4827 // Above 8 elements round up to next power of 2 (i.e. 16).
4828 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4829 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4830 auto Undef = B.buildUndef(S32);
4831 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4832 NumAddrRegs = RoundedNumRegs;
4833 }
4834
4835 auto VAddr =
4836 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4837 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4838 }
4839
4840 for (int I = 1; I != NumVAddrs; ++I) {
4841 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4842 if (SrcOp.isReg())
4843 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4844 }
4845}
4846
4847/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4848///
4849/// Depending on the subtarget, load/store with 16-bit element data need to be
4850/// rewritten to use the low half of 32-bit registers, or directly use a packed
4851/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4852/// registers.
4853///
4854/// We don't want to directly select image instructions just yet, but also want
4855 /// to expose all register repacking to the legalizer/combiners. We also don't
4856/// want a selected instruction entering RegBankSelect. In order to avoid
4857/// defining a multitude of intermediate image instructions, directly hack on
4858/// the intrinsic's arguments. In cases like a16 addresses, this requires
4859/// padding now unnecessary arguments with $noreg.
4860bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4861 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4862 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4863
4864 const MachineFunction &MF = *MI.getMF();
4865 const unsigned NumDefs = MI.getNumExplicitDefs();
4866 const unsigned ArgOffset = NumDefs + 1;
4867 bool IsTFE = NumDefs == 2;
4868 // We are only processing the operands of d16 image operations on subtargets
4869 // that use the unpacked register layout, or need to repack the TFE result.
4870
4871 // TODO: Do we need to guard against already legalized intrinsics?
4872 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4873 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4874
4875 MachineRegisterInfo *MRI = B.getMRI();
4876 const LLT S32 = LLT::scalar(32);
4877 const LLT S16 = LLT::scalar(16);
4878 const LLT V2S16 = LLT::fixed_vector(2, 16);
4879
4880 unsigned DMask = 0;
4881 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4882 LLT Ty = MRI->getType(VData);
4883
4884 // Check for 16 bit addresses and pack if true.
4885 LLT GradTy =
4886 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4887 LLT AddrTy =
4888 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4889 const bool IsG16 = GradTy == S16;
4890 const bool IsA16 = AddrTy == S16;
4891 const bool IsD16 = Ty.getScalarType() == S16;
4892
4893 int DMaskLanes = 0;
4894 if (!BaseOpcode->Atomic) {
4895 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4896 if (BaseOpcode->Gather4) {
4897 DMaskLanes = 4;
4898 } else if (DMask != 0) {
4899 DMaskLanes = countPopulation(DMask);
4900 } else if (!IsTFE && !BaseOpcode->Store) {
4901 // If dmask is 0, this is a no-op load. This can be eliminated.
4902 B.buildUndef(MI.getOperand(0));
4903 MI.eraseFromParent();
4904 return true;
4905 }
4906 }
4907
4908 Observer.changingInstr(MI);
4909 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4910
4911 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4912 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4913 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4914 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4915 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4916
4917 // Track that we legalized this
4918 MI.setDesc(B.getTII().get(NewOpcode));
4919
4920 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4921 // dmask to be at least 1, otherwise the instruction will fail.
4922 if (IsTFE && DMask == 0) {
4923 DMask = 0x1;
4924 DMaskLanes = 1;
4925 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4926 }
4927
4928 if (BaseOpcode->Atomic) {
4929 Register VData0 = MI.getOperand(2).getReg();
4930 LLT Ty = MRI->getType(VData0);
4931
4932 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4933 if (Ty.isVector())
4934 return false;
4935
4936 if (BaseOpcode->AtomicX2) {
4937 Register VData1 = MI.getOperand(3).getReg();
4938 // The two values are packed in one register.
4939 LLT PackedTy = LLT::fixed_vector(2, Ty);
4940 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4941 MI.getOperand(2).setReg(Concat.getReg(0));
4942 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4943 }
4944 }
4945
4946 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4947
4948 // Rewrite the addressing register layout before doing anything else.
4949 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4950 // 16 bit gradients are supported, but are tied to the A16 control
4951 // so both gradients and addresses must be 16 bit
4952 return false;
4953 }
4954
4955 if (IsA16 && !ST.hasA16()) {
4956 // A16 not supported
4957 return false;
4958 }
4959
4960 if (IsA16 || IsG16) {
4961 if (Intr->NumVAddrs > 1) {
4962 SmallVector<Register, 4> PackedRegs;
4963
4964 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4965 IsG16);
4966
4967 // See also below in the non-a16 branch
4968 const bool UseNSA = ST.hasNSAEncoding() &&
4969 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
4970 PackedRegs.size() <= ST.getNSAMaxSize();
4971
4972 if (!UseNSA && PackedRegs.size() > 1) {
4973 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4974 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4975 PackedRegs[0] = Concat.getReg(0);
4976 PackedRegs.resize(1);
4977 }
4978
4979 const unsigned NumPacked = PackedRegs.size();
4980 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4981 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4982 if (!SrcOp.isReg()) {
4983 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4984 continue;
4985 }
4986
4987 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4988
4989 if (I - Intr->VAddrStart < NumPacked)
4990 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4991 else
4992 SrcOp.setReg(AMDGPU::NoRegister);
4993 }
4994 }
4995 } else {
4996 // If the register allocator cannot place the address registers contiguously
4997 // without introducing moves, then using the non-sequential address encoding
4998 // is always preferable, since it saves VALU instructions and is usually a
4999 // wash in terms of code size or even better.
5000 //
5001 // However, we currently have no way of hinting to the register allocator
5002 // that MIMG addresses should be placed contiguously when it is possible to
5003 // do so, so force non-NSA for the common 2-address case as a heuristic.
5004 //
5005 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5006 // allocation when possible.
5007 //
5008 // TODO: we can actually allow partial NSA where the final register is a
5009 // contiguous set of the remaining addresses.
5010 // This could help where there are more addresses than supported.
5011 const bool UseNSA = ST.hasNSAEncoding() &&
5012 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5013 CorrectedNumVAddrs <= ST.getNSAMaxSize();
5014
5015 if (!UseNSA && Intr->NumVAddrs > 1)
5016 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5017 Intr->NumVAddrs);
5018 }
5019
5020 int Flags = 0;
5021 if (IsA16)
5022 Flags |= 1;
5023 if (IsG16)
5024 Flags |= 2;
5025 MI.addOperand(MachineOperand::CreateImm(Flags));
5026
5027 if (BaseOpcode->Store) { // No TFE for stores?
5028 // TODO: Handle dmask trim
5029 if (!Ty.isVector() || !IsD16)
5030 return true;
5031
5032 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5033 if (RepackedReg != VData) {
5034 MI.getOperand(1).setReg(RepackedReg);
5035 }
5036
5037 return true;
5038 }
5039
5040 Register DstReg = MI.getOperand(0).getReg();
5041 const LLT EltTy = Ty.getScalarType();
5042 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5043
5044 // Confirm that the return type is large enough for the dmask specified
5045 if (NumElts < DMaskLanes)
5046 return false;
5047
5048 if (NumElts > 4 || DMaskLanes > 4)
5049 return false;
5050
5051 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5052 const LLT AdjustedTy =
5053 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5054
5055 // The raw dword aligned data component of the load. The only legal cases
5056 // where this matters should be when using the packed D16 format, for
5057 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
5058 LLT RoundedTy;
5059
5060 // S32 vector to cover all data, plus the TFE result element.
5061 LLT TFETy;
5062
5063 // Register type to use for each loaded component. Will be S32 or V2S16.
5064 LLT RegTy;
5065
5066 if (IsD16 && ST.hasUnpackedD16VMem()) {
5067 RoundedTy =
5068 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5069 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5070 RegTy = S32;
5071 } else {
5072 unsigned EltSize = EltTy.getSizeInBits();
5073 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5074 unsigned RoundedSize = 32 * RoundedElts;
5075 RoundedTy = LLT::scalarOrVector(
5076 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5077 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5078 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5079 }
5080
5081 // The return type does not need adjustment.
5082 // TODO: Should we change s16 case to s32 or <2 x s16>?
5083 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5084 return true;
5085
5086 Register Dst1Reg;
5087
5088 // Insert after the instruction.
5089 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5090
5091 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5092 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5093 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5094 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5095
5096 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5097
5098 MI.getOperand(0).setReg(NewResultReg);
5099
5100 // In the IR, TFE is supposed to be used with a 2 element struct return
5101 // type. The instruction really returns these two values in one contiguous
5102 // register, with one additional dword beyond the loaded data. Rewrite the
5103 // return type to use a single register result.
5104
5105 if (IsTFE) {
5106 Dst1Reg = MI.getOperand(1).getReg();
5107 if (MRI->getType(Dst1Reg) != S32)
5108 return false;
5109
5110 // TODO: Make sure the TFE operand bit is set.
5111 MI.removeOperand(1);
5112
5113 // Handle the easy case that requires no repack instructions.
5114 if (Ty == S32) {
5115 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5116 return true;
5117 }
5118 }
5119
5120 // Now figure out how to copy the new result register back into the old
5121 // result.
5122 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5123
5124 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5125
5126 if (ResultNumRegs == 1) {
5127 assert(!IsTFE);
5128 ResultRegs[0] = NewResultReg;
5129 } else {
5130 // We have to repack into a new vector of some kind.
5131 for (int I = 0; I != NumDataRegs; ++I)
5132 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5133 B.buildUnmerge(ResultRegs, NewResultReg);
5134
5135 // Drop the final TFE element to get the data part. The TFE result is
5136 // directly written to the right place already.
5137 if (IsTFE)
5138 ResultRegs.resize(NumDataRegs);
5139 }
5140
5141 // For an s16 scalar result, we form an s32 result with a truncate regardless
5142 // of packed vs. unpacked.
5143 if (IsD16 && !Ty.isVector()) {
5144 B.buildTrunc(DstReg, ResultRegs[0]);
5145 return true;
5146 }
5147
5148 // Avoid a build/concat_vector of 1 entry.
5149 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5150 B.buildBitcast(DstReg, ResultRegs[0]);
5151 return true;
5152 }
5153
5154 assert(Ty.isVector());
5155
5156 if (IsD16) {
5157 // For packed D16 results with TFE enabled, all the data components are
5158 // S32. Cast back to the expected type.
5159 //
5160 // TODO: We don't really need to load s32 elements. We would only need one
5161 // cast for the TFE result if a multiple of v2s16 was used.
5162 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5163 for (Register &Reg : ResultRegs)
5164 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5165 } else if (ST.hasUnpackedD16VMem()) {
5166 for (Register &Reg : ResultRegs)
5167 Reg = B.buildTrunc(S16, Reg).getReg(0);
5168 }
5169 }
5170
5171 auto padWithUndef = [&](LLT Ty, int NumElts) {
5172 if (NumElts == 0)
5173 return;
5174 Register Undef = B.buildUndef(Ty).getReg(0);
5175 for (int I = 0; I != NumElts; ++I)
5176 ResultRegs.push_back(Undef);
5177 };
5178
5179 // Pad out any elements eliminated due to the dmask.
5180 LLT ResTy = MRI->getType(ResultRegs[0]);
5181 if (!ResTy.isVector()) {
5182 padWithUndef(ResTy, NumElts - ResultRegs.size());
5183 B.buildBuildVector(DstReg, ResultRegs);
5184 return true;
5185 }
5186
5187 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5188 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5189
5190 // Deal with the one annoying legal case.
5191 const LLT V3S16 = LLT::fixed_vector(3, 16);
5192 if (Ty == V3S16) {
5193 if (IsTFE) {
5194 if (ResultRegs.size() == 1) {
5195 NewResultReg = ResultRegs[0];
5196 } else if (ResultRegs.size() == 2) {
5197 LLT V4S16 = LLT::fixed_vector(4, 16);
5198 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5199 } else {
5200 return false;
5201 }
5202 }
5203
5204 if (MRI->getType(DstReg).getNumElements() <
5205 MRI->getType(NewResultReg).getNumElements()) {
5206 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5207 } else {
5208 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5209 }
5210 return true;
5211 }
5212
5213 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5214 B.buildConcatVectors(DstReg, ResultRegs);
5215 return true;
5216}
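A gather4 result always carries four lanes; otherwise the number of lanes comes from the population count of the dmask, and any elements the dmask drops are padded back with undef (padWithUndef above). The following is a small hedged sketch of that counting rule; the helper name is illustrative and does not exist in the LLVM sources.

#include <bitset>

// Lanes actually produced by an image load for a given dmask
// (cf. the DMaskLanes computation above).
int dmaskLanes(unsigned DMask, bool IsGather4) {
  if (IsGather4)
    return 4;                                   // gather4 always returns 4 lanes
  return static_cast<int>(std::bitset<32>(DMask).count()); // popcount of dmask
}

// Example: dmaskLanes(0b1011, false) == 3; a <4 x s32> destination would get
// its fourth element padded with undef after the repack.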
5217
5218bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5219 LegalizerHelper &Helper, MachineInstr &MI) const {
5220 MachineIRBuilder &B = Helper.MIRBuilder;
5221 GISelChangeObserver &Observer = Helper.Observer;
5222
5223 Register Dst = MI.getOperand(0).getReg();
5224 LLT Ty = B.getMRI()->getType(Dst);
5225 unsigned Size = Ty.getSizeInBits();
5226 MachineFunction &MF = B.getMF();
5227
5228 Observer.changingInstr(MI);
5229
5230 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5231 Ty = getBitcastRegisterType(Ty);
5232 Helper.bitcastDst(MI, Ty, 0);
5233 Dst = MI.getOperand(0).getReg();
5234 B.setInsertPt(B.getMBB(), MI);
5235 }
5236
5237 // FIXME: We don't really need this intermediate instruction. The intrinsic
5238 // should be fixed to have a memory operand. Since it's readnone, we're not
5239 // allowed to add one.
5240 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5241 MI.removeOperand(1); // Remove intrinsic ID
5242
5243 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5244 // TODO: Should this use datalayout alignment?
5245 const unsigned MemSize = (Size + 7) / 8;
5246 const Align MemAlign(4);
5247 MachineMemOperand *MMO = MF.getMachineMemOperand(
5248 MachinePointerInfo(),
5249 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5250 MachineMemOperand::MOInvariant,
5251 MemSize, MemAlign);
5252 MI.addMemOperand(MF, MMO);
5253
5254 // There are no 96-bit result scalar loads, but widening to 128-bit should
5255 // always be legal. We may need to restore this to a 96-bit result if it turns
5256 // out this needs to be converted to a vector load during RegBankSelect.
5257 if (!isPowerOf2_32(Size)) {
5258 if (Ty.isVector())
5259 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5260 else
5261 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5262 }
5263
5264 Observer.changedInstr(MI);
5265 return true;
5266}
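The widening step at the end rounds a non-power-of-two result width up (for example, a 96-bit <3 x s32> result becomes <4 x s32>), since 96-bit scalar results are not available. A tiny sketch of the same rounding rule, written out by hand with an illustrative helper name rather than the LLVM utility:

#include <cstdint>

// Round a bit width up to the next power of two, as done for s-buffer load
// results that are not already a power of two (e.g. 96 -> 128).
uint32_t roundUpToPow2(uint32_t Bits) {
  uint32_t P = 1;
  while (P < Bits)
    P <<= 1;
  return P;
}
// roundUpToPow2(96) == 128, roundUpToPow2(64) == 64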
5267
5268// TODO: Move to selection
5269bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5270 MachineRegisterInfo &MRI,
5271 MachineIRBuilder &B) const {
5272 if (!ST.isTrapHandlerEnabled() ||
5273 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5274 return legalizeTrapEndpgm(MI, MRI, B);
5275
5276 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5277 switch (*HsaAbiVer) {
5278 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5279 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5280 return legalizeTrapHsaQueuePtr(MI, MRI, B);
5281 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5282 case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5283 return ST.supportsGetDoorbellID() ?
5284 legalizeTrapHsa(MI, MRI, B) :
5285 legalizeTrapHsaQueuePtr(MI, MRI, B);
5286 }
5287 }
5288
5289 llvm_unreachable("Unknown trap handler");
5290}
5291
5292bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5293 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5294 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5295 MI.eraseFromParent();
5296 return true;
5297}
5298
5299bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5300 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5301 MachineFunction &MF = B.getMF();
5302 const LLT S64 = LLT::scalar(64);
5303
5304 Register SGPR01(AMDGPU::SGPR0_SGPR1);
5305 // For code object version 5, queue_ptr is passed through implicit kernarg.
5306 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5307 AMDGPUTargetLowering::ImplicitParameter Param =
5308 AMDGPUTargetLowering::QUEUE_PTR;
5309 uint64_t Offset =
5310 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5311
5312 Register KernargPtrReg = MRI.createGenericVirtualRegister(
5313 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5314
5315 if (!loadInputValue(KernargPtrReg, B,
5316 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5317 return false;
5318
5319 // TODO: can we be smarter about machine pointer info?
5320 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5321 MachineMemOperand *MMO = MF.getMachineMemOperand(
5322 PtrInfo,
5323 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5324 MachineMemOperand::MOInvariant,
5325 LLT::scalar(64), commonAlignment(Align(64), Offset));
5326
5327 // Pointer address
5328 Register LoadAddr = MRI.createGenericVirtualRegister(
5329 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5330 B.buildPtrAdd(LoadAddr, KernargPtrReg,
5331 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5332 // Load address
5333 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5334 B.buildCopy(SGPR01, Temp);
5335 B.buildInstr(AMDGPU::S_TRAP)
5336 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5337 .addReg(SGPR01, RegState::Implicit);
5338 MI.eraseFromParent();
5339 return true;
5340 }
5341
5342 // Pass queue pointer to trap handler as input, and insert trap instruction
5343 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5344 Register LiveIn =
5345 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5346 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5347 return false;
5348
5349 B.buildCopy(SGPR01, LiveIn);
5350 B.buildInstr(AMDGPU::S_TRAP)
5351 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5352 .addReg(SGPR01, RegState::Implicit);
5353
5354 MI.eraseFromParent();
5355 return true;
5356}
5357
5358bool AMDGPULegalizerInfo::legalizeTrapHsa(
5359 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5360 B.buildInstr(AMDGPU::S_TRAP)
5361 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5362 MI.eraseFromParent();
5363 return true;
5364}
5365
5366bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5367 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5368 // If this is a non-HSA path or the trap handler is disabled, report a
5369 // warning accordingly.
5370 if (!ST.isTrapHandlerEnabled() ||
5371 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5372 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5373 "debugtrap handler not supported",
5374 MI.getDebugLoc(), DS_Warning);
5375 LLVMContext &Ctx = B.getMF().getFunction().getContext();
5376 Ctx.diagnose(NoTrap);
5377 } else {
5378 // Insert debug-trap instruction
5379 B.buildInstr(AMDGPU::S_TRAP)
5380 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5381 }
5382
5383 MI.eraseFromParent();
5384 return true;
5385}
5386
5387bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5388 MachineIRBuilder &B) const {
5389 MachineRegisterInfo &MRI = *B.getMRI();
5390 const LLT S16 = LLT::scalar(16);
5391 const LLT S32 = LLT::scalar(32);
5392 const LLT V2S16 = LLT::fixed_vector(2, 16);
5393 const LLT V3S32 = LLT::fixed_vector(3, 32);
5394
5395 Register DstReg = MI.getOperand(0).getReg();
5396 Register NodePtr = MI.getOperand(2).getReg();
5397 Register RayExtent = MI.getOperand(3).getReg();
5398 Register RayOrigin = MI.getOperand(4).getReg();
5399 Register RayDir = MI.getOperand(5).getReg();
5400 Register RayInvDir = MI.getOperand(6).getReg();
5401 Register TDescr = MI.getOperand(7).getReg();
5402
5403 if (!ST.hasGFX10_AEncoding()) {
5404 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5405 "intrinsic not supported on subtarget",
5406 MI.getDebugLoc());
5407 B.getMF().getFunction().getContext().diagnose(BadIntrin);
5408 return false;
5409 }
5410
5411 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5412 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5413 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5414 const unsigned NumVDataDwords = 4;
5415 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5416 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5417 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5418 const unsigned BaseOpcodes[2][2] = {
5419 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5420 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5421 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5422 int Opcode;
5423 if (UseNSA) {
5424 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5425 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5426 : AMDGPU::MIMGEncGfx10NSA,
5427 NumVDataDwords, NumVAddrDwords);
5428 } else {
5429 Opcode = AMDGPU::getMIMGOpcode(
5430 BaseOpcodes[Is64][IsA16],
5431 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5432 NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
5433 }
5434 assert(Opcode != -1);
5435
5436 SmallVector<Register, 12> Ops;
5437 if (UseNSA && IsGFX11Plus) {
5438 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5439 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5440 auto Merged = B.buildMerge(
5441 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5442 Ops.push_back(Merged.getReg(0));
5443 };
5444
5445 Ops.push_back(NodePtr);
5446 Ops.push_back(RayExtent);
5447 packLanes(RayOrigin);
5448
5449 if (IsA16) {
5450 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5451 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5452 auto MergedDir = B.buildMerge(
5453 V3S32,
5454 {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
5455 UnmergeRayDir.getReg(0)}))
5456 .getReg(0),
5457 B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
5458 UnmergeRayDir.getReg(1)}))
5459 .getReg(0),
5460 B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
5461 UnmergeRayDir.getReg(2)}))
5462 .getReg(0)});
5463 Ops.push_back(MergedDir.getReg(0));
5464 } else {
5465 packLanes(RayDir);
5466 packLanes(RayInvDir);
5467 }
5468 } else {
5469 if (Is64) {
5470 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5471 Ops.push_back(Unmerge.getReg(0));
5472 Ops.push_back(Unmerge.getReg(1));
5473 } else {
5474 Ops.push_back(NodePtr);
5475 }
5476 Ops.push_back(RayExtent);
5477
5478 auto packLanes = [&Ops, &S32, &B](Register Src) {
5479 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5480 Ops.push_back(Unmerge.getReg(0));
5481 Ops.push_back(Unmerge.getReg(1));
5482 Ops.push_back(Unmerge.getReg(2));
5483 };
5484
5485 packLanes(RayOrigin);
5486 if (IsA16) {
5487 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5488 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5489 Register R1 = MRI.createGenericVirtualRegister(S32);
5490 Register R2 = MRI.createGenericVirtualRegister(S32);
5491 Register R3 = MRI.createGenericVirtualRegister(S32);
5492 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5493 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5494 B.buildMerge(R3,
5495 {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5496 Ops.push_back(R1);
5497 Ops.push_back(R2);
5498 Ops.push_back(R3);
5499 } else {
5500 packLanes(RayDir);
5501 packLanes(RayInvDir);
5502 }
5503 }
5504
5505 if (!UseNSA) {
5506 // Build a single vector containing all the operands so far prepared.
5507 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5508 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5509 Ops.clear();
5510 Ops.push_back(MergedOps);
5511 }
5512
5513 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5514 .addDef(DstReg)
5515 .addImm(Opcode);
5516
5517 for (Register R : Ops) {
5518 MIB.addUse(R);
5519 }
5520
5521 MIB.addUse(TDescr)
5522 .addImm(IsA16 ? 1 : 0)
5523 .cloneMemRefs(MI);
5524
5525 MI.eraseFromParent();
5526 return true;
5527}
5528
5529bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5530 MachineIRBuilder &B) const {
5531 unsigned Opc;
5532 int RoundMode = MI.getOperand(2).getImm();
5533
5534 if (RoundMode == (int)RoundingMode::TowardPositive)
5535 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5536 else if (RoundMode == (int)RoundingMode::TowardNegative)
5537 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5538 else
5539 return false;
5540
5541 B.buildInstr(Opc)
5542 .addDef(MI.getOperand(0).getReg())
5543 .addUse(MI.getOperand(1).getReg());
5544
5545 MI.eraseFromParent();
5546
5547 return true;
5548}
5549
5550bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5551 MachineInstr &MI) const {
5552 MachineIRBuilder &B = Helper.MIRBuilder;
5553 MachineRegisterInfo &MRI = *B.getMRI();
5554
5555 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
5556 auto IntrID = MI.getIntrinsicID();
5557 switch (IntrID) {
5558 case Intrinsic::amdgcn_if:
5559 case Intrinsic::amdgcn_else: {
5560 MachineInstr *Br = nullptr;
5561 MachineBasicBlock *UncondBrTarget = nullptr;
5562 bool Negated = false;
5563 if (MachineInstr *BrCond =
5564 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5565 const SIRegisterInfo *TRI
5566 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5567
5568 Register Def = MI.getOperand(1).getReg();
5569 Register Use = MI.getOperand(3).getReg();
5570
5571 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5572
5573 if (Negated)
5574 std::swap(CondBrTarget, UncondBrTarget);
5575
5576 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5577 if (IntrID == Intrinsic::amdgcn_if) {
5578 B.buildInstr(AMDGPU::SI_IF)
5579 .addDef(Def)
5580 .addUse(Use)
5581 .addMBB(UncondBrTarget);
5582 } else {
5583 B.buildInstr(AMDGPU::SI_ELSE)
5584 .addDef(Def)
5585 .addUse(Use)
5586 .addMBB(UncondBrTarget);
5587 }
5588
5589 if (Br) {
5590 Br->getOperand(0).setMBB(CondBrTarget);
5591 } else {
5592 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5593 // since we're swapping branch targets it needs to be reinserted.
5594 // FIXME: IRTranslator should probably not do this
5595 B.buildBr(*CondBrTarget);
5596 }
5597
5598 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5599 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5600 MI.eraseFromParent();
5601 BrCond->eraseFromParent();
5602 return true;
5603 }
5604
5605 return false;
5606 }
5607 case Intrinsic::amdgcn_loop: {
5608 MachineInstr *Br = nullptr;
5609 MachineBasicBlock *UncondBrTarget = nullptr;
5610 bool Negated = false;
5611 if (MachineInstr *BrCond =
5612 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5613 const SIRegisterInfo *TRI
5614 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5615
5616 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5617 Register Reg = MI.getOperand(2).getReg();
5618
5619 if (Negated)
5620 std::swap(CondBrTarget, UncondBrTarget);
5621
5622 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5623 B.buildInstr(AMDGPU::SI_LOOP)
5624 .addUse(Reg)
5625 .addMBB(UncondBrTarget);
5626
5627 if (Br)
5628 Br->getOperand(0).setMBB(CondBrTarget);
5629 else
5630 B.buildBr(*CondBrTarget);
5631
5632 MI.eraseFromParent();
5633 BrCond->eraseFromParent();
5634 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5635 return true;
5636 }
5637
5638 return false;
5639 }
5640 case Intrinsic::amdgcn_kernarg_segment_ptr:
5641 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5642 // This only makes sense to call in a kernel, so just lower to null.
5643 B.buildConstant(MI.getOperand(0).getReg(), 0);
5644 MI.eraseFromParent();
5645 return true;
5646 }
5647
5648 return legalizePreloadedArgIntrin(
5649 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5650 case Intrinsic::amdgcn_implicitarg_ptr:
5651 return legalizeImplicitArgPtr(MI, MRI, B);
5652 case Intrinsic::amdgcn_workitem_id_x:
5653 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5654 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5655 case Intrinsic::amdgcn_workitem_id_y:
5656 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5657 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5658 case Intrinsic::amdgcn_workitem_id_z:
5659 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5660 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5661 case Intrinsic::amdgcn_workgroup_id_x:
5662 return legalizePreloadedArgIntrin(MI, MRI, B,
5663 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5664 case Intrinsic::amdgcn_workgroup_id_y:
5665 return legalizePreloadedArgIntrin(MI, MRI, B,
5666 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5667 case Intrinsic::amdgcn_workgroup_id_z:
5668 return legalizePreloadedArgIntrin(MI, MRI, B,
5669 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5670 case Intrinsic::amdgcn_lds_kernel_id:
5671 return legalizePreloadedArgIntrin(MI, MRI, B,
5672 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5673 case Intrinsic::amdgcn_dispatch_ptr:
5674 return legalizePreloadedArgIntrin(MI, MRI, B,
5675 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5676 case Intrinsic::amdgcn_queue_ptr:
5677 return legalizePreloadedArgIntrin(MI, MRI, B,
5678 AMDGPUFunctionArgInfo::QUEUE_PTR);
5679 case Intrinsic::amdgcn_implicit_buffer_ptr:
5680 return legalizePreloadedArgIntrin(
5681 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5682 case Intrinsic::amdgcn_dispatch_id:
5683 return legalizePreloadedArgIntrin(MI, MRI, B,
5684 AMDGPUFunctionArgInfo::DISPATCH_ID);
5685 case Intrinsic::r600_read_ngroups_x:
5686 // TODO: Emit error for hsa
5687 return legalizeKernargMemParameter(MI, B,
5688 SI::KernelInputOffsets::NGROUPS_X);
5689 case Intrinsic::r600_read_ngroups_y:
5690 return legalizeKernargMemParameter(MI, B,
5691 SI::KernelInputOffsets::NGROUPS_Y);
5692 case Intrinsic::r600_read_ngroups_z:
5693 return legalizeKernargMemParameter(MI, B,
5694 SI::KernelInputOffsets::NGROUPS_Z);
5695 case Intrinsic::r600_read_local_size_x:
5696 // TODO: Could insert G_ASSERT_ZEXT from s16
5697 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5698 case Intrinsic::r600_read_local_size_y:
5699 // TODO: Could insert G_ASSERT_ZEXT from s16
5700 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5701 // TODO: Could insert G_ASSERT_ZEXT from s16
5702 case Intrinsic::r600_read_local_size_z:
5703 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5704 case Intrinsic::r600_read_global_size_x:
5705 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5706 case Intrinsic::r600_read_global_size_y:
5707 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5708 case Intrinsic::r600_read_global_size_z:
5709 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5710 case Intrinsic::amdgcn_fdiv_fast:
5711 return legalizeFDIVFastIntrin(MI, MRI, B);
5712 case Intrinsic::amdgcn_is_shared:
5713 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5714 case Intrinsic::amdgcn_is_private:
5715 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5716 case Intrinsic::amdgcn_wavefrontsize: {
5717 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5718 MI.eraseFromParent();
5719 return true;
5720 }
5721 case Intrinsic::amdgcn_s_buffer_load:
5722 return legalizeSBufferLoad(Helper, MI);
5723 case Intrinsic::amdgcn_raw_buffer_store:
5724 case Intrinsic::amdgcn_struct_buffer_store:
5725 return legalizeBufferStore(MI, MRI, B, false, false);
5726 case Intrinsic::amdgcn_raw_buffer_store_format:
5727 case Intrinsic::amdgcn_struct_buffer_store_format:
5728 return legalizeBufferStore(MI, MRI, B, false, true);
5729 case Intrinsic::amdgcn_raw_tbuffer_store:
5730 case Intrinsic::amdgcn_struct_tbuffer_store:
5731 return legalizeBufferStore(MI, MRI, B, true, true);
5732 case Intrinsic::amdgcn_raw_buffer_load:
5733 case Intrinsic::amdgcn_struct_buffer_load:
5734 return legalizeBufferLoad(MI, MRI, B, false, false);
5735 case Intrinsic::amdgcn_raw_buffer_load_format:
5736 case Intrinsic::amdgcn_struct_buffer_load_format:
5737 return legalizeBufferLoad(MI, MRI, B, true, false);
5738 case Intrinsic::amdgcn_raw_tbuffer_load:
5739 case Intrinsic::amdgcn_struct_tbuffer_load:
5740 return legalizeBufferLoad(MI, MRI, B, true, true);
5741 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5742 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5743 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5744 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5745 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5746 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5747 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5748 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5749 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5750 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5751 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5752 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5753 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5754 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5755 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5756 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5757 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5758 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5759 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5760 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5761 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5762 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5763 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5764 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5765 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5766 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5767 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5768 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5769 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5770 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5771 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5772 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5773 return legalizeBufferAtomic(MI, B, IntrID);
5774 case Intrinsic::amdgcn_atomic_inc:
5775 return legalizeAtomicIncDec(MI, B, true);
5776 case Intrinsic::amdgcn_atomic_dec:
5777 return legalizeAtomicIncDec(MI, B, false);
5778 case Intrinsic::trap:
5779 return legalizeTrapIntrinsic(MI, MRI, B);
5780 case Intrinsic::debugtrap:
5781 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5782 case Intrinsic::amdgcn_rsq_clamp:
5783 return legalizeRsqClampIntrinsic(MI, MRI, B);
5784 case Intrinsic::amdgcn_ds_fadd:
5785 case Intrinsic::amdgcn_ds_fmin:
5786 case Intrinsic::amdgcn_ds_fmax:
5787 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5788 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5789 return legalizeBVHIntrinsic(MI, B);
5790 default: {
5791 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5792 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5793 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5794 return true;
5795 }
5796 }
5797
5798 return true;
5799}

/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/ADT/bit.h"
17#include "llvm/Support/Compiler.h"
18#include <cassert>
19#include <climits>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef _MSC_VER
26// Declare these intrinsics manually rather than including intrin.h. It's very
27// expensive, and MathExtras.h is popular.
28// #include <intrin.h>
29extern "C" {
30unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
31unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
32unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
33unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
34}
35#endif
36
37namespace llvm {
38
39/// The behavior an operation has on an input of 0.
40enum ZeroBehavior {
41 /// The returned value is undefined.
42 ZB_Undefined,
43 /// The returned value is numeric_limits<T>::max()
44 ZB_Max,
45 /// The returned value is numeric_limits<T>::digits
46 ZB_Width
47};
48
49/// Mathematical constants.
50namespace numbers {
51// TODO: Track C++20 std::numbers.
52// TODO: Favor using the hexadecimal FP constants (requires C++17).
53constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
54 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
55 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
56 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
57 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
58 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
59 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
60 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
61 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
62 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
63 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
64 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
65 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
66 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
67 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
68constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
69 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
70 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
71 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
72 log2ef = 1.44269504F, // (0x1.715476P+0)
73 log10ef = .434294482F, // (0x1.bcb7b2P-2)
74 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
75 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
76 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
77 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
78 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
79 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
80 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
81 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
82 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
83} // namespace numbers
84
85namespace detail {
86template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
87 static unsigned count(T Val, ZeroBehavior) {
88 if (!Val)
89 return std::numeric_limits<T>::digits;
90 if (Val & 0x1)
91 return 0;
92
93 // Bisection method.
94 unsigned ZeroBits = 0;
95 T Shift = std::numeric_limits<T>::digits >> 1;
96 T Mask = std::numeric_limits<T>::max() >> Shift;
97 while (Shift) {
98 if ((Val & Mask) == 0) {
99 Val >>= Shift;
100 ZeroBits |= Shift;
101 }
102 Shift >>= 1;
103 Mask >>= Shift;
104 }
105 return ZeroBits;
106 }
107};
108
109#if defined(__GNUC__) || defined(_MSC_VER)
110template <typename T> struct TrailingZerosCounter<T, 4> {
111 static unsigned count(T Val, ZeroBehavior ZB) {
112 if (ZB != ZB_Undefined && Val == 0)
  10.1: 'ZB' is not equal to ZB_Undefined
  11: Assuming 'Val' is equal to 0
  12: Taking true branch
113 return 32;
  13: Returning the value 32
114
115#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
116 return __builtin_ctz(Val);
117#elif defined(_MSC_VER)
118 unsigned long Index;
119 _BitScanForward(&Index, Val);
120 return Index;
121#endif
122 }
123};
124
125#if !defined(_MSC_VER) || defined(_M_X64)
126template <typename T> struct TrailingZerosCounter<T, 8> {
127 static unsigned count(T Val, ZeroBehavior ZB) {
128 if (ZB != ZB_Undefined && Val == 0)
129 return 64;
130
131#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
132 return __builtin_ctzll(Val);
133#elif defined(_MSC_VER)
134 unsigned long Index;
135 _BitScanForward64(&Index, Val);
136 return Index;
137#endif
138 }
139};
140#endif
141#endif
142} // namespace detail
143
144/// Count number of 0's from the least significant bit to the most
145/// stopping at the first 1.
146///
147/// Only unsigned integral types are allowed.
148///
149/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
150/// valid arguments.
151template <typename T>
152unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
153 static_assert(std::is_unsigned_v<T>,
154 "Only unsigned integral types are allowed.");
155 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
10
Calling 'TrailingZerosCounter::count'
14
Returning from 'TrailingZerosCounter::count'
15
Returning the value 32
156}
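A minimal caller-side sketch follows (editorial, not taken from the analyzed source; the helper name stripTrailingZeros is hypothetical and the include is assumed). It illustrates the hazard visible on the path annotated above: under the default ZB_Width a zero input yields 32, and shifting a 32-bit value by 32 is undefined behavior, so the result must be guarded before it is reused as a shift amount.

  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: guard the zero case before using the count as a shift
  // amount, since shifting an unsigned 32-bit value by 32 is undefined.
  uint32_t stripTrailingZeros(uint32_t V) {
    unsigned TZ = llvm::countTrailingZeros(V); // 32 when V == 0 (ZB_Width)
    if (TZ == 32)
      return 0;                                // V had no set bits
    return V >> TZ;                            // TZ is now in [0, 31]
  }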
157
158namespace detail {
159template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
160 static unsigned count(T Val, ZeroBehavior) {
161 if (!Val)
162 return std::numeric_limits<T>::digits;
163
164 // Bisection method.
165 unsigned ZeroBits = 0;
166 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
167 T Tmp = Val >> Shift;
168 if (Tmp)
169 Val = Tmp;
170 else
171 ZeroBits |= Shift;
172 }
173 return ZeroBits;
174 }
175};
176
177#if defined(__GNUC__) || defined(_MSC_VER)
178template <typename T> struct LeadingZerosCounter<T, 4> {
179 static unsigned count(T Val, ZeroBehavior ZB) {
180 if (ZB != ZB_Undefined && Val == 0)
181 return 32;
182
183#if __has_builtin(__builtin_clz) || defined(__GNUC__)
184 return __builtin_clz(Val);
185#elif defined(_MSC_VER)
186 unsigned long Index;
187 _BitScanReverse(&Index, Val);
188 return Index ^ 31;
189#endif
190 }
191};
192
193#if !defined(_MSC_VER) || defined(_M_X64)
194template <typename T> struct LeadingZerosCounter<T, 8> {
195 static unsigned count(T Val, ZeroBehavior ZB) {
196 if (ZB != ZB_Undefined && Val == 0)
197 return 64;
198
199#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
200 return __builtin_clzll(Val);
201#elif defined(_MSC_VER)
202 unsigned long Index;
203 _BitScanReverse64(&Index, Val);
204 return Index ^ 63;
205#endif
206 }
207};
208#endif
209#endif
210} // namespace detail
211
212/// Count number of 0's from the most significant bit to the least
213/// stopping at the first 1.
214///
215/// Only unsigned integral types are allowed.
216///
217/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
218/// valid arguments.
219template <typename T>
220unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
221 static_assert(std::is_unsigned_v<T>,
222 "Only unsigned integral types are allowed.");
223 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
224}
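A small usage sketch (editorial; the name significantBits is hypothetical and the includes are assumed):

  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: number of significant bits in a non-zero 32-bit value,
  // e.g. significantBits(0xFFu) == 8 and significantBits(1u) == 1.
  unsigned significantBits(uint32_t V) {
    assert(V != 0 && "zero has no significant bits");
    return 32 - llvm::countLeadingZeros(V);
  }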
225
226/// Get the index of the first set bit starting from the least
227/// significant bit.
228///
229/// Only unsigned integral types are allowed.
230///
231/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
232/// valid arguments.
233template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
234 if (ZB == ZB_Max && Val == 0)
235 return std::numeric_limits<T>::max();
236
237 return countTrailingZeros(Val, ZB_Undefined);
238}
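For illustration, an editorial sketch of the two behaviors documented above (include assumed; not part of the analyzed source):

  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: index of the lowest set bit, or the ZB_Max sentinel.
  unsigned A = llvm::findFirstSet(0x18u); // bits 3 and 4 set -> returns 3
  unsigned B = llvm::findFirstSet(0u);    // numeric_limits<unsigned>::max()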
239
240/// Create a bitmask with the N right-most bits set to 1, and all other
241/// bits set to 0. Only unsigned types are allowed.
242template <typename T> T maskTrailingOnes(unsigned N) {
243 static_assert(std::is_unsigned<T>::value, "Invalid type!");
244 const unsigned Bits = CHAR_BIT * sizeof(T);
245 assert(N <= Bits && "Invalid bit index");
246 return N == 0 ? 0 : (T(-1) >> (Bits - N));
247}
248
249/// Create a bitmask with the N left-most bits set to 1, and all other
250/// bits set to 0. Only unsigned types are allowed.
251template <typename T> T maskLeadingOnes(unsigned N) {
252 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
253}
254
255/// Create a bitmask with the N right-most bits set to 0, and all other
256/// bits set to 1. Only unsigned types are allowed.
257template <typename T> T maskTrailingZeros(unsigned N) {
258 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N left-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskLeadingZeros(unsigned N) {
264 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
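The four mask helpers above are easiest to compare side by side; an editorial sketch for T = uint8_t and N = 3 (include assumed):

  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: the same N = 3 through all four helpers.
  uint8_t TO = llvm::maskTrailingOnes<uint8_t>(3);  // 0b00000111 = 0x07
  uint8_t LO = llvm::maskLeadingOnes<uint8_t>(3);   // 0b11100000 = 0xE0
  uint8_t TZ = llvm::maskTrailingZeros<uint8_t>(3); // 0b11111000 = 0xF8
  uint8_t LZ = llvm::maskLeadingZeros<uint8_t>(3);  // 0b00011111 = 0x1F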
266
267/// Get the index of the last set bit starting from the least
268/// significant bit.
269///
270/// Only unsigned integral types are allowed.
271///
272/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
273/// valid arguments.
274template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
275 if (ZB == ZB_Max && Val == 0)
276 return std::numeric_limits<T>::max();
277
278 // Use ^ instead of - because both gcc and llvm can remove the associated ^
279 // in the __builtin_clz intrinsic on x86.
280 return countLeadingZeros(Val, ZB_Undefined) ^
281 (std::numeric_limits<T>::digits - 1);
282}
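An editorial sketch of the XOR trick described in the comment above (include assumed):

  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: 0x50u has its highest set bit at index 6;
  // countLeadingZeros(0x50u) == 25 and 25 ^ 31 == 6.
  unsigned HighBit = llvm::findLastSet(0x50u); // 6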
283
284/// Macro compressed bit reversal table for 256 bits.
285///
286/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
287static const unsigned char BitReverseTable256[256] = {
288#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
289#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
290#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
291 R6(0), R6(2), R6(1), R6(3)
292#undef R2
293#undef R4
294#undef R6
295};
296
297/// Reverse the bits in \p Val.
298template <typename T> T reverseBits(T Val) {
299#if __has_builtin(__builtin_bitreverse8)
300 if constexpr (std::is_same_v<T, uint8_t>)
301 return __builtin_bitreverse8(Val);
302#endif
303#if __has_builtin(__builtin_bitreverse16)
304 if constexpr (std::is_same_v<T, uint16_t>)
305 return __builtin_bitreverse16(Val);
306#endif
307#if __has_builtin(__builtin_bitreverse32)
308 if constexpr (std::is_same_v<T, uint32_t>)
309 return __builtin_bitreverse32(Val);
310#endif
311#if __has_builtin(__builtin_bitreverse64)
312 if constexpr (std::is_same_v<T, uint64_t>)
313 return __builtin_bitreverse64(Val);
314#endif
315
316 unsigned char in[sizeof(Val)];
317 unsigned char out[sizeof(Val)];
318 std::memcpy(in, &Val, sizeof(Val));
319 for (unsigned i = 0; i < sizeof(Val); ++i)
320 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
321 std::memcpy(&Val, out, sizeof(Val));
322 return Val;
323}
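An editorial sketch of reverseBits on two widths (include assumed); the builtins and the table fallback above produce the same result:

  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: the bit pattern is mirrored within the type's width.
  uint8_t  R8  = llvm::reverseBits<uint8_t>(0x0D); // 0b00001101 -> 0xB0
  uint16_t R16 = llvm::reverseBits<uint16_t>(1);   // bit 0 -> bit 15: 0x8000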
324
325// NOTE: The following support functions use the _32/_64 extensions instead of
326// type overloading so that signed and unsigned integers can be used without
327// ambiguity.
328
329/// Return the high 32 bits of a 64 bit value.
330constexpr inline uint32_t Hi_32(uint64_t Value) {
331 return static_cast<uint32_t>(Value >> 32);
332}
333
334/// Return the low 32 bits of a 64 bit value.
335constexpr inline uint32_t Lo_32(uint64_t Value) {
336 return static_cast<uint32_t>(Value);
337}
338
339/// Make a 64-bit integer from a high / low pair of 32-bit integers.
340constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
341 return ((uint64_t)High << 32) | (uint64_t)Low;
342}
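An editorial round-trip sketch of the three helpers above (include assumed); all three are constexpr, so the result can be checked at compile time:

  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: splitting and re-assembling a 64-bit value round-trips.
  constexpr uint64_t V  = 0x0123456789ABCDEFULL;
  constexpr uint32_t Hi = llvm::Hi_32(V);        // 0x01234567
  constexpr uint32_t Lo = llvm::Lo_32(V);        // 0x89ABCDEF
  static_assert(llvm::Make_64(Hi, Lo) == V, "round-trip");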
343
344/// Checks if an integer fits into the given bit width.
345template <unsigned N> constexpr inline bool isInt(int64_t x) {
346 if constexpr (N == 8)
347 return static_cast<int8_t>(x) == x;
348 if constexpr (N == 16)
349 return static_cast<int16_t>(x) == x;
350 if constexpr (N == 32)
351 return static_cast<int32_t>(x) == x;
352 if constexpr (N < 64)
353 return -(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1));
354 (void)x; // MSVC v19.25 warns that x is unused.
355 return true;
356}
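isInt<N> is constexpr, so its behavior can be checked at compile time; an editorial sketch (include assumed):

  #include "llvm/Support/MathExtras.h"

  // Editorial sketch: N signed bits cover [-2^(N-1), 2^(N-1) - 1].
  static_assert(llvm::isInt<8>(127) && !llvm::isInt<8>(128), "8-bit range");
  static_assert(llvm::isInt<13>(-4096) && !llvm::isInt<13>(4096),
                "13-bit signed range is [-4096, 4095]");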
357
358/// Checks if a signed integer is an N bit number shifted left by S.
359template <unsigned N, unsigned S>
360constexpr inline bool isShiftedInt(int64_t x) {
361 static_assert(
362