Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2895, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
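
The flagged expression (source line 2895) is outside the excerpt shown below, but the class of bug is simple to state: shifting a 32-bit unsigned operand by a count of 32 or more is undefined behavior in C++, so the count must be guarded (or the operand widened) before the shift. A minimal standalone sketch, not taken from the file, with illustrative names:

  #include <cstdint>

  // Mirrors the analyzer's complaint: if Shift can reach 32, the right shift
  // below is undefined for a 32-bit unsigned operand.
  uint32_t unsafeShift(uint32_t Value, unsigned Shift) {
    return Value >> Shift; // undefined when Shift >= 32
  }

  // One guarded alternative: treat an over-wide count as shifting everything out.
  uint32_t guardedShift(uint32_t Value, unsigned Shift) {
    return Shift >= 32 ? 0u : (Value >> Shift);
  }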

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 
-o /tmp/scan-build-2022-01-13-020333-16386-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use "
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
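
As an aside on the two helpers above: getPow2ScalarType and getPow2VectorType round up with 1 << Log2_32_Ceil(...), which belongs to the same shift-count hazard family the analyzer flags (the flagged case is a right shift). The count stays below 32 for any realistic LLT size, so the following is only a minimal sketch, not part of the listed file, of an equivalent rounding computed in 64 bits where the shift can never reach the operand width:

  #include <cstdint>

  // Illustrative only: round Bits up to the next power of two without ever
  // shifting a 32-bit value by 32.
  uint64_t roundUpToPow2(uint32_t Bits) {
    uint64_t P = 1;
    while (P < Bits)
      P <<= 1;
    return P;
  }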
61
62/// \returns true if this is an odd sized vector which should widen by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx,
99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100 };
101}
102
103static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 unsigned Size = Ty.getSizeInBits();
108 unsigned Pieces = (Size + 63) / 64;
109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110 return std::make_pair(
111 TypeIdx,
112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113 };
114}
115
116// Increase the number of vector elements to reach the next multiple of 32-bit
117// type.
118static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132}
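
A worked instance of the rounding in moreEltsToNext32Bit, for the <3 x s16> case the isSmallOddVector comment mentions: Size = 48, NextMul32 = (48 + 31) / 32 = 2, NewNumElts = (32 * 2 + 16 - 1) / 16 = 4, so <3 x s16> is padded to <4 x s16>. A standalone check of that arithmetic (plain integers, no LLT; illustrative only, not part of the listed file):

  #include <cassert>

  // Pads the element count so the vector covers a whole number of 32-bit words,
  // mirroring the computation in moreEltsToNext32Bit.
  int paddedNumElts(int NumElts, int EltSize) {
    const int Size = NumElts * EltSize;
    const int NextMul32 = (Size + 31) / 32;
    return (32 * NextMul32 + EltSize - 1) / EltSize;
  }

  int main() {
    assert(paddedNumElts(3, 16) == 4); // <3 x s16> -> <4 x s16>
    assert(paddedNumElts(5, 8) == 8);  // <5 x s8>  -> <8 x s8>, two full words
    return 0;
  }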
133
134static LLT getBitcastRegisterType(const LLT Ty) {
135 const unsigned Size = Ty.getSizeInBits();
136
137 LLT CoercedTy;
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
142 }
143
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145}
146
147static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152}
153
154static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162}
163
164static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169}
170
171static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176}
177
178static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183}
184
185static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187}
188
189static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192}
193
194static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199}
200
201static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209}
210
211// Any combination of 32 or 64-bit elements up to the maximum register size, and
212// multiples of v2s16.
213static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217}
218
219static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227}
228
229// If we have a truncating store or an extending load with a data size larger
230// than 32-bits, we need to reduce to a 32-bit type.
231static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237}
238
239// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240// handle some operations by just promoting the register during
241// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265}
266
267static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289#if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293#endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
301
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
319 }
320
321 assert(RegSize >= MemSize);
322
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
328 }
329
330 return true;
331}
332
333// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334// workaround this. Eventually it should ignore the type for loads and only care
335// about the size. Return true in cases where we will workaround this for now by
336// bitcasting.
337static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353}
354
355static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359}
360
361/// Return true if a load or store of the type should be lowered with a bitcast
362/// to a different type.
363static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377}
378
379/// Return true if we should legalize a load by widening an odd sized memory
380/// access up to the alignment. Note this case when the memory access itself
381/// changes, not the size of the result register.
382static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 unsigned AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414}
415
416static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424}
425
426AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534 .legalFor({S32, S16, V2S16})
535 .minScalar(0, S16)
536 .clampMaxNumElementsStrict(0, S16, 2)
537 .widenScalarToNextMultipleOf(0, 32)
538 .maxScalar(0, S32)
539 .scalarize(0);
540
541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542 .legalFor({S32, S16, V2S16}) // Clamp modifier
543 .minScalarOrElt(0, S16)
544 .clampMaxNumElementsStrict(0, S16, 2)
545 .scalarize(0)
546 .widenScalarToNextPow2(0, 32)
547 .lower();
548 } else if (ST.has16BitInsts()) {
549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550 .legalFor({S32, S16})
551 .minScalar(0, S16)
552 .widenScalarToNextMultipleOf(0, 32)
553 .maxScalar(0, S32)
554 .scalarize(0);
555
556 // Technically the saturating operations require clamp bit support, but this
557 // was introduced at the same time as 16-bit operations.
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559 .legalFor({S32, S16}) // Clamp modifier
560 .minScalar(0, S16)
561 .scalarize(0)
562 .widenScalarToNextPow2(0, 16)
563 .lower();
564
565 // We're just lowering this, but it helps get a better result to try to
566 // coerce to the desired type first.
567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568 .minScalar(0, S16)
569 .scalarize(0)
570 .lower();
571 } else {
572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573 .legalFor({S32})
574 .widenScalarToNextMultipleOf(0, 32)
575 .clampScalar(0, S32, S32)
576 .scalarize(0);
577
578 if (ST.hasIntClamp()) {
579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580 .legalFor({S32}) // Clamp modifier.
581 .scalarize(0)
582 .minScalarOrElt(0, S32)
583 .lower();
584 } else {
585 // Clamp bit support was added in VI, along with 16-bit operations.
586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587 .minScalar(0, S32)
588 .scalarize(0)
589 .lower();
590 }
591
592 // FIXME: DAG expansion gets better results. The widening uses the smaller
593 // range values and goes for the min/max lowering directly.
594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595 .minScalar(0, S32)
596 .scalarize(0)
597 .lower();
598 }
599
600 getActionDefinitionsBuilder(
601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602 .customFor({S32, S64})
603 .clampScalar(0, S32, S64)
604 .widenScalarToNextPow2(0, 32)
605 .scalarize(0);
606
607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608 .legalFor({S32})
609 .maxScalar(0, S32);
610
611 if (ST.hasVOP3PInsts()) {
612 Mulh
613 .clampMaxNumElements(0, S8, 2)
614 .lowerFor({V2S8});
615 }
616
617 Mulh
618 .scalarize(0)
619 .lower();
620
621 // Report legal for any types we can handle anywhere. For the cases only legal
622 // on the SALU, RegBankSelect will be able to re-legalize.
623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625 .clampScalar(0, S32, S64)
626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628 .widenScalarToNextPow2(0)
629 .scalarize(0);
630
631 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633 .legalFor({{S32, S1}, {S32, S32}})
634 .minScalar(0, S32)
635 // TODO: .scalarize(0)
636 .lower();
637
638 getActionDefinitionsBuilder(G_BITCAST)
639 // Don't worry about the size constraint.
640 .legalIf(all(isRegisterType(0), isRegisterType(1)))
641 .lower();
642
643
644 getActionDefinitionsBuilder(G_CONSTANT)
645 .legalFor({S1, S32, S64, S16, GlobalPtr,
646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647 .legalIf(isPointer(0))
648 .clampScalar(0, S32, S64)
649 .widenScalarToNextPow2(0);
650
651 getActionDefinitionsBuilder(G_FCONSTANT)
652 .legalFor({S32, S64, S16})
653 .clampScalar(0, S16, S64);
654
655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656 .legalIf(isRegisterType(0))
657 // s1 and s16 are special cases because they have legal operations on
658 // them, but don't really occupy registers in the normal way.
659 .legalFor({S1, S16})
660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661 .clampScalarOrElt(0, S32, MaxScalar)
662 .widenScalarToNextPow2(0, 32)
663 .clampMaxNumElements(0, S32, 16);
664
665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666
667 // If the amount is divergent, we have to do a wave reduction to get the
668 // maximum value, so this is expanded during RegBankSelect.
669 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670 .legalFor({{PrivatePtr, S32}});
671
672 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673 .customIf(typeIsNot(0, PrivatePtr));
674
675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676
677 auto &FPOpActions = getActionDefinitionsBuilder(
678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679 .legalFor({S32, S64});
680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681 .customFor({S32, S64});
682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683 .customFor({S32, S64});
684
685 if (ST.has16BitInsts()) {
686 if (ST.hasVOP3PInsts())
687 FPOpActions.legalFor({S16, V2S16});
688 else
689 FPOpActions.legalFor({S16});
690
691 TrigActions.customFor({S16});
692 FDIVActions.customFor({S16});
693 }
694
695 auto &MinNumMaxNum = getActionDefinitionsBuilder({
696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697
698 if (ST.hasVOP3PInsts()) {
699 MinNumMaxNum.customFor(FPTypesPK16)
700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701 .clampMaxNumElements(0, S16, 2)
702 .clampScalar(0, S16, S64)
703 .scalarize(0);
704 } else if (ST.has16BitInsts()) {
705 MinNumMaxNum.customFor(FPTypes16)
706 .clampScalar(0, S16, S64)
707 .scalarize(0);
708 } else {
709 MinNumMaxNum.customFor(FPTypesBase)
710 .clampScalar(0, S32, S64)
711 .scalarize(0);
712 }
713
714 if (ST.hasVOP3PInsts())
715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716
717 FPOpActions
718 .scalarize(0)
719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720
721 TrigActions
722 .scalarize(0)
723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724
725 FDIVActions
726 .scalarize(0)
727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728
729 getActionDefinitionsBuilder({G_FNEG, G_FABS})
730 .legalFor(FPTypesPK16)
731 .clampMaxNumElementsStrict(0, S16, 2)
732 .scalarize(0)
733 .clampScalar(0, S16, S64);
734
735 if (ST.has16BitInsts()) {
736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737 .legalFor({S32, S64, S16})
738 .scalarize(0)
739 .clampScalar(0, S16, S64);
740 } else {
741 getActionDefinitionsBuilder(G_FSQRT)
742 .legalFor({S32, S64})
743 .scalarize(0)
744 .clampScalar(0, S32, S64);
745
746 if (ST.hasFractBug()) {
747 getActionDefinitionsBuilder(G_FFLOOR)
748 .customFor({S64})
749 .legalFor({S32, S64})
750 .scalarize(0)
751 .clampScalar(0, S32, S64);
752 } else {
753 getActionDefinitionsBuilder(G_FFLOOR)
754 .legalFor({S32, S64})
755 .scalarize(0)
756 .clampScalar(0, S32, S64);
757 }
758 }
759
760 getActionDefinitionsBuilder(G_FPTRUNC)
761 .legalFor({{S32, S64}, {S16, S32}})
762 .scalarize(0)
763 .lower();
764
765 getActionDefinitionsBuilder(G_FPEXT)
766 .legalFor({{S64, S32}, {S32, S16}})
767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768 .scalarize(0);
769
770 getActionDefinitionsBuilder(G_FSUB)
771 // Use actual fsub instruction
772 .legalFor({S32})
773 // Must use fadd + fneg
774 .lowerFor({S64, S16, V2S16})
775 .scalarize(0)
776 .clampScalar(0, S32, S64);
777
778 // Whether this is legal depends on the floating point mode for the function.
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781 FMad.customFor({S32, S16});
782 else if (ST.hasMadMacF32Insts())
783 FMad.customFor({S32});
784 else if (ST.hasMadF16())
785 FMad.customFor({S16});
786 FMad.scalarize(0)
787 .lower();
788
789 auto &FRem = getActionDefinitionsBuilder(G_FREM);
790 if (ST.has16BitInsts()) {
791 FRem.customFor({S16, S32, S64});
792 } else {
793 FRem.minScalar(0, S32)
794 .customFor({S32, S64});
795 }
796 FRem.scalarize(0);
797
798 // TODO: Do we need to clamp maximum bitwidth?
799 getActionDefinitionsBuilder(G_TRUNC)
800 .legalIf(isScalar(0))
801 .legalFor({{V2S16, V2S32}})
802 .clampMaxNumElements(0, S16, 2)
803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804 // situations (like an invalid implicit use), we don't want to infinite loop
805 // in the legalizer.
806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807 .alwaysLegal();
808
809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811 {S32, S1}, {S64, S1}, {S16, S1}})
812 .scalarize(0)
813 .clampScalar(0, S32, S64)
814 .widenScalarToNextPow2(1, 32);
815
816 // TODO: Split s1->s64 during regbankselect for VALU.
817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819 .lowerIf(typeIs(1, S1))
820 .customFor({{S32, S64}, {S64, S64}});
821 if (ST.has16BitInsts())
822 IToFP.legalFor({{S16, S16}});
823 IToFP.clampScalar(1, S32, S64)
824 .minScalar(0, S32)
825 .scalarize(0)
826 .widenScalarToNextPow2(1);
827
828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830 .customFor({{S64, S32}, {S64, S64}})
831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832 if (ST.has16BitInsts())
833 FPToI.legalFor({{S16, S16}});
834 else
835 FPToI.minScalar(1, S32);
836
837 FPToI.minScalar(0, S32)
838 .widenScalarToNextPow2(0, 32)
839 .scalarize(0)
840 .lower();
841
842 // Lower roundeven into G_FRINT
843 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844 .scalarize(0)
845 .lower();
846
847 if (ST.has16BitInsts()) {
848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849 .legalFor({S16, S32, S64})
850 .clampScalar(0, S16, S64)
851 .scalarize(0);
852 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854 .legalFor({S32, S64})
855 .clampScalar(0, S32, S64)
856 .scalarize(0);
857 } else {
858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859 .legalFor({S32})
860 .customFor({S64})
861 .clampScalar(0, S32, S64)
862 .scalarize(0);
863 }
864
865 getActionDefinitionsBuilder(G_PTR_ADD)
866 .legalIf(all(isPointer(0), sameSize(0, 1)))
867 .scalarize(0)
868 .scalarSameSizeAs(1, 0);
869
870 getActionDefinitionsBuilder(G_PTRMASK)
871 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872 .scalarSameSizeAs(1, 0)
873 .scalarize(0);
874
875 auto &CmpBuilder =
876 getActionDefinitionsBuilder(G_ICMP)
877 // The compare output type differs based on the register bank of the output,
878 // so make both s1 and s32 legal.
879 //
880 // Scalar compares producing output in scc will be promoted to s32, as that
881 // is the allocatable register type that will be needed for the copy from
882 // scc. This will be promoted during RegBankSelect, and we assume something
883 // before that won't try to use s32 result types.
884 //
885 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886 // bank.
887 .legalForCartesianProduct(
888 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889 .legalForCartesianProduct(
890 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891 if (ST.has16BitInsts()) {
892 CmpBuilder.legalFor({{S1, S16}});
893 }
894
895 CmpBuilder
896 .widenScalarToNextPow2(1)
897 .clampScalar(1, S32, S64)
898 .scalarize(0)
899 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900
901 getActionDefinitionsBuilder(G_FCMP)
902 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903 .widenScalarToNextPow2(1)
904 .clampScalar(1, S32, S64)
905 .scalarize(0);
906
907 // FIXME: fpow has a selection pattern that should move to custom lowering.
908 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909 if (ST.has16BitInsts())
910 Exp2Ops.legalFor({S32, S16});
911 else
912 Exp2Ops.legalFor({S32});
913 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914 Exp2Ops.scalarize(0);
915
916 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917 if (ST.has16BitInsts())
918 ExpOps.customFor({{S32}, {S16}});
919 else
920 ExpOps.customFor({S32});
921 ExpOps.clampScalar(0, MinScalarFPTy, S32)
922 .scalarize(0);
923
924 getActionDefinitionsBuilder(G_FPOWI)
925 .clampScalar(0, MinScalarFPTy, S32)
926 .lower();
927
928 // The 64-bit versions produce 32-bit results, but only on the SALU.
929 getActionDefinitionsBuilder(G_CTPOP)
930 .legalFor({{S32, S32}, {S32, S64}})
931 .clampScalar(0, S32, S32)
932 .clampScalar(1, S32, S64)
933 .scalarize(0)
934 .widenScalarToNextPow2(0, 32)
935 .widenScalarToNextPow2(1, 32);
936
937 // The hardware instructions return a different result on 0 than the generic
938 // instructions expect. The hardware produces -1, but these produce the
939 // bitwidth.
940 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941 .scalarize(0)
942 .clampScalar(0, S32, S32)
943 .clampScalar(1, S32, S64)
944 .widenScalarToNextPow2(0, 32)
945 .widenScalarToNextPow2(1, 32)
946 .custom();
947
948 // The 64-bit versions produce 32-bit results, but only on the SALU.
949 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950 .legalFor({{S32, S32}, {S32, S64}})
951 .clampScalar(0, S32, S32)
952 .clampScalar(1, S32, S64)
953 .scalarize(0)
954 .widenScalarToNextPow2(0, 32)
955 .widenScalarToNextPow2(1, 32);
956
957 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958 // RegBankSelect.
959 getActionDefinitionsBuilder(G_BITREVERSE)
960 .legalFor({S32, S64})
961 .clampScalar(0, S32, S64)
962 .scalarize(0)
963 .widenScalarToNextPow2(0);
964
965 if (ST.has16BitInsts()) {
966 getActionDefinitionsBuilder(G_BSWAP)
967 .legalFor({S16, S32, V2S16})
968 .clampMaxNumElementsStrict(0, S16, 2)
969 // FIXME: Fixing non-power-of-2 before clamp is workaround for
970 // narrowScalar limitation.
971 .widenScalarToNextPow2(0)
972 .clampScalar(0, S16, S32)
973 .scalarize(0);
974
975 if (ST.hasVOP3PInsts()) {
976 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977 .legalFor({S32, S16, V2S16})
978 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979 .clampMaxNumElements(0, S16, 2)
980 .minScalar(0, S16)
981 .widenScalarToNextPow2(0)
982 .scalarize(0)
983 .lower();
984 } else {
985 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986 .legalFor({S32, S16})
987 .widenScalarToNextPow2(0)
988 .minScalar(0, S16)
989 .scalarize(0)
990 .lower();
991 }
992 } else {
993 // TODO: Should have same legality without v_perm_b32
994 getActionDefinitionsBuilder(G_BSWAP)
995 .legalFor({S32})
996 .lowerIf(scalarNarrowerThan(0, 32))
997 // FIXME: Fixing non-power-of-2 before clamp is workaround for
998 // narrowScalar limitation.
999 .widenScalarToNextPow2(0)
1000 .maxScalar(0, S32)
1001 .scalarize(0)
1002 .lower();
1003
1004 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005 .legalFor({S32})
1006 .minScalar(0, S32)
1007 .widenScalarToNextPow2(0)
1008 .scalarize(0)
1009 .lower();
1010 }
1011
1012 getActionDefinitionsBuilder(G_INTTOPTR)
1013 // List the common cases
1014 .legalForCartesianProduct(AddrSpaces64, {S64})
1015 .legalForCartesianProduct(AddrSpaces32, {S32})
1016 .scalarize(0)
1017 // Accept any address space as long as the size matches
1018 .legalIf(sameSize(0, 1))
1019 .widenScalarIf(smallerThan(1, 0),
1020 [](const LegalityQuery &Query) {
1021 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022 })
1023 .narrowScalarIf(largerThan(1, 0),
1024 [](const LegalityQuery &Query) {
1025 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026 });
1027
1028 getActionDefinitionsBuilder(G_PTRTOINT)
1029 // List the common cases
1030 .legalForCartesianProduct(AddrSpaces64, {S64})
1031 .legalForCartesianProduct(AddrSpaces32, {S32})
1032 .scalarize(0)
1033 // Accept any address space as long as the size matches
1034 .legalIf(sameSize(0, 1))
1035 .widenScalarIf(smallerThan(0, 1),
1036 [](const LegalityQuery &Query) {
1037 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038 })
1039 .narrowScalarIf(
1040 largerThan(0, 1),
1041 [](const LegalityQuery &Query) {
1042 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043 });
1044
1045 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046 .scalarize(0)
1047 .custom();
1048
1049 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050 bool IsLoad) -> bool {
1051 const LLT DstTy = Query.Types[0];
1052
1053 // Split vector extloads.
1054 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055
1056 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057 return true;
1058
1059 const LLT PtrTy = Query.Types[1];
1060 unsigned AS = PtrTy.getAddressSpace();
1061 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062 return true;
1063
1064 // Catch weird sized loads that don't evenly divide into the access sizes
1065 // TODO: May be able to widen depending on alignment etc.
1066 unsigned NumRegs = (MemSize + 31) / 32;
1067 if (NumRegs == 3) {
1068 if (!ST.hasDwordx3LoadStores())
1069 return true;
1070 } else {
1071 // If the alignment allows, these should have been widened.
1072 if (!isPowerOf2_32(NumRegs))
1073 return true;
1074 }
1075
1076 return false;
1077 };
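
To make the dword-count part of needToSplitMemOp concrete (the address-space size limit checked above it is left out), here is a hedged standalone restatement with illustrative names, not part of the listed file:

  #include <cassert>

  // A memory access is split when it needs three dwords on a subtarget without
  // dwordx3 load/stores, or a non-power-of-two number of dwords.
  bool needsSplitForRegCount(unsigned MemSizeInBits, bool HasDwordx3) {
    const unsigned NumRegs = (MemSizeInBits + 31) / 32;
    if (NumRegs == 3)
      return !HasDwordx3;
    return (NumRegs & (NumRegs - 1)) != 0; // not a power of two
  }

  int main() {
    assert(needsSplitForRegCount(96, /*HasDwordx3=*/false));   // 3 dwords, no dwordx3
    assert(!needsSplitForRegCount(96, /*HasDwordx3=*/true));   // 3 dwords are fine here
    assert(needsSplitForRegCount(160, /*HasDwordx3=*/true));   // 5 dwords, not a power of two
    assert(!needsSplitForRegCount(128, /*HasDwordx3=*/false)); // 4 dwords
    return 0;
  }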
1078
1079 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082
1083 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084 // LDS
1085 // TODO: Unsupported flat for SI.
1086
1087 for (unsigned Op : {G_LOAD, G_STORE}) {
1088 const bool IsStore = Op == G_STORE;
1089
1090 auto &Actions = getActionDefinitionsBuilder(Op);
1091 // Explicitly list some common cases.
1092 // TODO: Does this help compile time at all?
1093 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096 {S64, GlobalPtr, S64, GlobalAlign32},
1097 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099 {S32, GlobalPtr, S8, GlobalAlign8},
1100 {S32, GlobalPtr, S16, GlobalAlign16},
1101
1102 {S32, LocalPtr, S32, 32},
1103 {S64, LocalPtr, S64, 32},
1104 {V2S32, LocalPtr, V2S32, 32},
1105 {S32, LocalPtr, S8, 8},
1106 {S32, LocalPtr, S16, 16},
1107 {V2S16, LocalPtr, S32, 32},
1108
1109 {S32, PrivatePtr, S32, 32},
1110 {S32, PrivatePtr, S8, 8},
1111 {S32, PrivatePtr, S16, 16},
1112 {V2S16, PrivatePtr, S32, 32},
1113
1114 {S32, ConstantPtr, S32, GlobalAlign32},
1115 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117 {S64, ConstantPtr, S64, GlobalAlign32},
1118 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1119 Actions.legalIf(
1120 [=](const LegalityQuery &Query) -> bool {
1121 return isLoadStoreLegal(ST, Query);
1122 });
1123
1124 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125 // 64-bits.
1126 //
1127 // TODO: Should generalize bitcast action into coerce, which will also cover
1128 // inserting addrspacecasts.
1129 Actions.customIf(typeIs(1, Constant32Ptr));
1130
1131 // Turn any illegal element vectors into something easier to deal
1132 // with. These will ultimately produce 32-bit scalar shifts to extract the
1133 // parts anyway.
1134 //
1135 // For odd 16-bit element vectors, prefer to split those into pieces with
1136 // 16-bit vector parts.
1137 Actions.bitcastIf(
1138 [=](const LegalityQuery &Query) -> bool {
1139 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140 Query.MMODescrs[0].MemoryTy);
1141 }, bitcastToRegisterType(0));
1142
1143 if (!IsStore) {
1144 // Widen suitably aligned loads by loading extra bytes. The standard
1145 // legalization actions can't properly express widening memory operands.
1146 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147 return shouldWidenLoad(ST, Query, G_LOAD);
1148 });
1149 }
1150
1151 // FIXME: load/store narrowing should be moved to lower action
1152 Actions
1153 .narrowScalarIf(
1154 [=](const LegalityQuery &Query) -> bool {
1155 return !Query.Types[0].isVector() &&
1156 needToSplitMemOp(Query, Op == G_LOAD);
1157 },
1158 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159 const LLT DstTy = Query.Types[0];
1160 const LLT PtrTy = Query.Types[1];
1161
1162 const unsigned DstSize = DstTy.getSizeInBits();
1163 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164
1165 // Split extloads.
1166 if (DstSize > MemSize)
1167 return std::make_pair(0, LLT::scalar(MemSize));
1168
1169 unsigned MaxSize = maxSizeForAddrSpace(ST,
1170 PtrTy.getAddressSpace(),
1171 Op == G_LOAD);
1172 if (MemSize > MaxSize)
1173 return std::make_pair(0, LLT::scalar(MaxSize));
1174
1175 unsigned Align = Query.MMODescrs[0].AlignInBits;
1176 return std::make_pair(0, LLT::scalar(Align));
1177 })
1178 .fewerElementsIf(
1179 [=](const LegalityQuery &Query) -> bool {
1180 return Query.Types[0].isVector() &&
1181 needToSplitMemOp(Query, Op == G_LOAD);
1182 },
1183 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184 const LLT DstTy = Query.Types[0];
1185 const LLT PtrTy = Query.Types[1];
1186
1187 LLT EltTy = DstTy.getElementType();
1188 unsigned MaxSize = maxSizeForAddrSpace(ST,
1189 PtrTy.getAddressSpace(),
1190 Op == G_LOAD);
1191
1192 // FIXME: Handle widened to power of 2 results better. This ends
1193 // up scalarizing.
1194 // FIXME: 3 element stores scalarized on SI
1195
1196 // Split if it's too large for the address space.
1197 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198 if (MemSize > MaxSize) {
1199 unsigned NumElts = DstTy.getNumElements();
1200 unsigned EltSize = EltTy.getSizeInBits();
1201
1202 if (MaxSize % EltSize == 0) {
1203 return std::make_pair(
1204 0, LLT::scalarOrVector(
1205 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206 }
1207
1208 unsigned NumPieces = MemSize / MaxSize;
1209
1210 // FIXME: Refine when odd breakdowns handled
1211 // The scalars will need to be re-legalized.
1212 if (NumPieces == 1 || NumPieces >= NumElts ||
1213 NumElts % NumPieces != 0)
1214 return std::make_pair(0, EltTy);
1215
1216 return std::make_pair(
1217 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218 }
1219
1220 // FIXME: We could probably handle weird extending loads better.
1221 if (DstTy.getSizeInBits() > MemSize)
1222 return std::make_pair(0, EltTy);
1223
1224 unsigned EltSize = EltTy.getSizeInBits();
1225 unsigned DstSize = DstTy.getSizeInBits();
1226 if (!isPowerOf2_32(DstSize)) {
1227 // We're probably decomposing an odd sized store. Try to split
1228 // to the widest type. TODO: Account for alignment. As-is it
1229 // should be OK, since the new parts will be further legalized.
1230 unsigned FloorSize = PowerOf2Floor(DstSize);
1231 return std::make_pair(
1232 0, LLT::scalarOrVector(
1233 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234 }
1235
1236 // May need relegalization for the scalars.
1237 return std::make_pair(0, EltTy);
1238 })
1239 .minScalar(0, S32)
1240 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241 .widenScalarToNextPow2(0)
1242 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243 .lower();
1244 }
1245
1246 // FIXME: Unaligned accesses not lowered.
1247 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249 {S32, GlobalPtr, S16, 2 * 8},
1250 {S32, LocalPtr, S8, 8},
1251 {S32, LocalPtr, S16, 16},
1252 {S32, PrivatePtr, S8, 8},
1253 {S32, PrivatePtr, S16, 16},
1254 {S32, ConstantPtr, S8, 8},
1255 {S32, ConstantPtr, S16, 2 * 8}})
1256 .legalIf(
1257 [=](const LegalityQuery &Query) -> bool {
1258 return isLoadStoreLegal(ST, Query);
1259 });
1260
1261 if (ST.hasFlatAddressSpace()) {
1262 ExtLoads.legalForTypesWithMemDesc(
1263 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264 }
1265
1266 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267 // 64-bits.
1268 //
1269 // TODO: Should generalize bitcast action into coerce, which will also cover
1270 // inserting addrspacecasts.
1271 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272
1273 ExtLoads.clampScalar(0, S32, S32)
1274 .widenScalarToNextPow2(0)
1275 .lower();
1276
1277 auto &Atomics = getActionDefinitionsBuilder(
1278 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281 G_ATOMICRMW_UMIN})
1282 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283 {S64, GlobalPtr}, {S64, LocalPtr},
1284 {S32, RegionPtr}, {S64, RegionPtr}});
1285 if (ST.hasFlatAddressSpace()) {
1286 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287 }
1288
1289 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290 if (ST.hasLDSFPAtomicAdd()) {
1291 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292 if (ST.hasGFX90AInsts())
1293 Atomic.legalFor({{S64, LocalPtr}});
1294 }
1295 if (ST.hasAtomicFaddInsts())
1296 Atomic.legalFor({{S32, GlobalPtr}});
1297
1298 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299 // demarshalling
1300 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302 {S32, FlatPtr}, {S64, FlatPtr}})
1303 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304 {S32, RegionPtr}, {S64, RegionPtr}});
1305 // TODO: Pointer types, any 32-bit or 64-bit vector
1306
1307 // Condition should be s32 for scalar, s1 for vector.
1308 getActionDefinitionsBuilder(G_SELECT)
1309 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310 LocalPtr, FlatPtr, PrivatePtr,
1311 LLT::fixed_vector(2, LocalPtr),
1312 LLT::fixed_vector(2, PrivatePtr)},
1313 {S1, S32})
1314 .clampScalar(0, S16, S64)
1315 .scalarize(1)
1316 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318 .clampMaxNumElements(0, S32, 2)
1319 .clampMaxNumElements(0, LocalPtr, 2)
1320 .clampMaxNumElements(0, PrivatePtr, 2)
1321 .scalarize(0)
1322 .widenScalarToNextPow2(0)
1323 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324
1325 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326 // be more flexible with the shift amount type.
1327 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328 .legalFor({{S32, S32}, {S64, S32}});
1329 if (ST.has16BitInsts()) {
1330 if (ST.hasVOP3PInsts()) {
1331 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332 .clampMaxNumElements(0, S16, 2);
1333 } else
1334 Shifts.legalFor({{S16, S16}});
1335
1336 // TODO: Support 16-bit shift amounts for all types
1337 Shifts.widenScalarIf(
1338 [=](const LegalityQuery &Query) {
1339 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340 // 32-bit amount.
1341 const LLT ValTy = Query.Types[0];
1342 const LLT AmountTy = Query.Types[1];
1343 return ValTy.getSizeInBits() <= 16 &&
1344 AmountTy.getSizeInBits() < 16;
1345 }, changeTo(1, S16));
1346 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347 Shifts.clampScalar(1, S32, S32);
1348 Shifts.clampScalar(0, S16, S64);
1349 Shifts.widenScalarToNextPow2(0, 16);
1350
1351 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352 .minScalar(0, S16)
1353 .scalarize(0)
1354 .lower();
1355 } else {
1356 // Make sure we legalize the shift amount type first, as the general
1357 // expansion for the shifted type will produce much worse code if it hasn't
1358 // been truncated already.
1359 Shifts.clampScalar(1, S32, S32);
1360 Shifts.clampScalar(0, S32, S64);
1361 Shifts.widenScalarToNextPow2(0, 32);
1362
1363 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364 .minScalar(0, S32)
1365 .scalarize(0)
1366 .lower();
1367 }
1368 Shifts.scalarize(0);
1369
1370 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373 unsigned IdxTypeIdx = 2;
1374
1375 getActionDefinitionsBuilder(Op)
1376 .customIf([=](const LegalityQuery &Query) {
1377 const LLT EltTy = Query.Types[EltTypeIdx];
1378 const LLT VecTy = Query.Types[VecTypeIdx];
1379 const LLT IdxTy = Query.Types[IdxTypeIdx];
1380 const unsigned EltSize = EltTy.getSizeInBits();
1381 return (EltSize == 32 || EltSize == 64) &&
1382 VecTy.getSizeInBits() % 32 == 0 &&
1383 VecTy.getSizeInBits() <= MaxRegisterSize &&
1384 IdxTy.getSizeInBits() == 32;
1385 })
1386 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1387 bitcastToVectorElement32(VecTypeIdx))
1388 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389 .bitcastIf(
1390 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391 [=](const LegalityQuery &Query) {
1392 // For > 64-bit element types, try to turn this into a 64-bit
1393 // element vector since we may be able to do better indexing
1394 // if this is scalar. If not, fall back to 32.
1395 const LLT EltTy = Query.Types[EltTypeIdx];
1396 const LLT VecTy = Query.Types[VecTypeIdx];
1397 const unsigned DstEltSize = EltTy.getSizeInBits();
1398 const unsigned VecSize = VecTy.getSizeInBits();
1399
1400 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401 return std::make_pair(
1402 VecTypeIdx,
1403 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404 })
1405 .clampScalar(EltTypeIdx, S32, S64)
1406 .clampScalar(VecTypeIdx, S32, S64)
1407 .clampScalar(IdxTypeIdx, S32, S32)
1408 .clampMaxNumElements(VecTypeIdx, S32, 32)
1409 // TODO: Clamp elements for 64-bit vectors?
1410 // It should only be necessary with variable indexes.
1411 // As a last resort, lower to the stack
1412 .lower();
1413 }
1414
1415 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416 .unsupportedIf([=](const LegalityQuery &Query) {
1417 const LLT &EltTy = Query.Types[1].getElementType();
1418 return Query.Types[0] != EltTy;
1419 });
1420
1421 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424
1425 // FIXME: Doesn't handle extract of illegal sizes.
1426 getActionDefinitionsBuilder(Op)
1427 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428 .lowerIf([=](const LegalityQuery &Query) {
1429 // Sub-vector(or single element) insert and extract.
1430 // TODO: verify immediate offset here since lower only works with
1431 // whole elements.
1432 const LLT BigTy = Query.Types[BigTyIdx];
1433 return BigTy.isVector();
1434 })
1435 // FIXME: Multiples of 16 should not be legal.
1436 .legalIf([=](const LegalityQuery &Query) {
1437 const LLT BigTy = Query.Types[BigTyIdx];
1438 const LLT LitTy = Query.Types[LitTyIdx];
1439 return (BigTy.getSizeInBits() % 32 == 0) &&
1440 (LitTy.getSizeInBits() % 16 == 0);
1441 })
1442 .widenScalarIf(
1443 [=](const LegalityQuery &Query) {
1444 const LLT BigTy = Query.Types[BigTyIdx];
1445 return (BigTy.getScalarSizeInBits() < 16);
1446 },
1447 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448 .widenScalarIf(
1449 [=](const LegalityQuery &Query) {
1450 const LLT LitTy = Query.Types[LitTyIdx];
1451 return (LitTy.getScalarSizeInBits() < 16);
1452 },
1453 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455 .widenScalarToNextPow2(BigTyIdx, 32);
1456
1457 }
1458
1459 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460 .legalForCartesianProduct(AllS32Vectors, {S32})
1461 .legalForCartesianProduct(AllS64Vectors, {S64})
1462 .clampNumElements(0, V16S32, V32S32)
1463 .clampNumElements(0, V2S64, V16S64)
1464 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465
1466 if (ST.hasScalarPackInsts()) {
1467 BuildVector
1468 // FIXME: Should probably widen s1 vectors straight to s32
1469 .minScalarOrElt(0, S16)
1470 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471 .minScalar(1, S32);
1472
1473 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474 .legalFor({V2S16, S32})
1475 .lower();
1476 BuildVector.minScalarOrElt(0, S32);
1477 } else {
1478 BuildVector.customFor({V2S16, S16});
1479 BuildVector.minScalarOrElt(0, S32);
1480
1481 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482 .customFor({V2S16, S32})
1483 .lower();
1484 }
1485
1486 BuildVector.legalIf(isRegisterType(0));
1487
1488 // FIXME: Clamp maximum size
1489 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491 .clampMaxNumElements(0, S32, 32)
1492 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493 .clampMaxNumElements(0, S16, 64);
1494
1495 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496 // pre-legalize.
1497 if (ST.hasVOP3PInsts()) {
1498 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499 .customFor({V2S16, V2S16})
1500 .lower();
1501 } else
1502 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503
1504 // Merge/Unmerge
1505 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508
1509 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510 const LLT Ty = Query.Types[TypeIdx];
1511 if (Ty.isVector()) {
1512 const LLT &EltTy = Ty.getElementType();
1513 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514 return true;
1515 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516 return true;
1517 }
1518 return false;
1519 };
1520
1521 auto &Builder = getActionDefinitionsBuilder(Op)
1522 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523 .lowerFor({{S16, V2S16}})
1524 .lowerIf([=](const LegalityQuery &Query) {
1525 const LLT BigTy = Query.Types[BigTyIdx];
1526 return BigTy.getSizeInBits() == 32;
1527 })
1528 // Try to widen to s16 first for small types.
1529 // TODO: Only do this on targets with legal s16 shifts
1530 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534 elementTypeIs(1, S16)),
1535 changeTo(1, V2S16))
1536 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1537 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1538 // valid.
1539 .clampScalar(LitTyIdx, S32, S512)
1540 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541 // Break up vectors with weird elements into scalars
1542 .fewerElementsIf(
1543 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544 scalarize(0))
1545 .fewerElementsIf(
1546 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547 scalarize(1))
1548 .clampScalar(BigTyIdx, S32, MaxScalar);
1549
1550 if (Op == G_MERGE_VALUES) {
1551 Builder.widenScalarIf(
1552 // TODO: Use 16-bit shifts if legal for 8-bit values?
1553 [=](const LegalityQuery &Query) {
1554 const LLT Ty = Query.Types[LitTyIdx];
1555 return Ty.getSizeInBits() < 32;
1556 },
1557 changeTo(LitTyIdx, S32));
1558 }
1559
1560 Builder.widenScalarIf(
1561 [=](const LegalityQuery &Query) {
1562 const LLT Ty = Query.Types[BigTyIdx];
1563 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564 Ty.getSizeInBits() % 16 != 0;
1565 },
1566 [=](const LegalityQuery &Query) {
1567 // Pick the next power of 2, or a multiple of 64 over 128,
1568 // whichever is smaller.
1569 const LLT &Ty = Query.Types[BigTyIdx];
1570 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571 if (NewSizeInBits >= 256) {
1572 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573 if (RoundedTo < NewSizeInBits)
1574 NewSizeInBits = RoundedTo;
1575 }
1576 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577 })
1578 // Any vectors left are the wrong size. Scalarize them.
1579 .scalarize(0)
1580 .scalarize(1);
1581 }
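
The widenScalarIf mutation just above applies when the big type's size is neither a power of 2 nor a multiple of 16; it widens to the next power of 2, or, once that power of 2 reaches 256 bits, to the next multiple of 64 if that is smaller. Two worked examples (illustration only):

  s65  -> next power of 2 = 128, 128 < 256                    -> widened to s128
  s300 -> next power of 2 = 512, alignTo<64>(301) = 320 < 512 -> widened to s320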
1582
1583 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584 // RegBankSelect.
1585 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586 .legalFor({{S32}, {S64}});
1587
1588 if (ST.hasVOP3PInsts()) {
1589 SextInReg.lowerFor({{V2S16}})
1590 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1591 // get more vector shift opportunities, since we'll get those when
1592 // expanded.
1593 .clampMaxNumElementsStrict(0, S16, 2);
1594 } else if (ST.has16BitInsts()) {
1595 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596 } else {
1597 // Prefer to promote to s32 before lowering if we don't have 16-bit
1598 // shifts. This avoids a lot of intermediate truncate and extend operations.
1599 SextInReg.lowerFor({{S32}, {S64}});
1600 }
1601
1602 SextInReg
1603 .scalarize(0)
1604 .clampScalar(0, S32, S64)
1605 .lower();
1606
1607 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608 .scalarize(0)
1609 .lower();
1610
1611 // TODO: Only try to form v2s16 with legal packed instructions.
1612 getActionDefinitionsBuilder(G_FSHR)
1613 .legalFor({{S32, S32}})
1614 .lowerFor({{V2S16, V2S16}})
1615 .clampMaxNumElementsStrict(0, S16, 2)
1616 .scalarize(0)
1617 .lower();
1618
1619 if (ST.hasVOP3PInsts()) {
1620 getActionDefinitionsBuilder(G_FSHL)
1621 .lowerFor({{V2S16, V2S16}})
1622 .clampMaxNumElementsStrict(0, S16, 2)
1623 .scalarize(0)
1624 .lower();
1625 } else {
1626 getActionDefinitionsBuilder(G_FSHL)
1627 .scalarize(0)
1628 .lower();
1629 }
1630
1631 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632 .legalFor({S64});
1633
1634 getActionDefinitionsBuilder(G_FENCE)
1635 .alwaysLegal();
1636
1637 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638 .scalarize(0)
1639 .minScalar(0, S32)
1640 .lower();
1641
1642 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643 .legalFor({{S32, S32}, {S64, S32}})
1644 .clampScalar(1, S32, S32)
1645 .clampScalar(0, S32, S64)
1646 .widenScalarToNextPow2(0)
1647 .scalarize(0);
1648
1649 getActionDefinitionsBuilder({
1650 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651 G_FCOPYSIGN,
1652
1653 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654 G_ATOMICRMW_NAND,
1655 G_ATOMICRMW_FSUB,
1656 G_READ_REGISTER,
1657 G_WRITE_REGISTER,
1658
1659 G_SADDO, G_SSUBO,
1660
1661 // TODO: Implement
1662 G_FMINIMUM, G_FMAXIMUM}).lower();
1663
1664 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665 .lower();
1666
1667 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670 .unsupported();
1671
1672 getLegacyLegalizerInfo().computeTables();
1673 verify(*ST.getInstrInfo());
1674}
1675
1676bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677 MachineInstr &MI) const {
1678 MachineIRBuilder &B = Helper.MIRBuilder;
1679 MachineRegisterInfo &MRI = *B.getMRI();
1680
1681 switch (MI.getOpcode()) {
1682 case TargetOpcode::G_ADDRSPACE_CAST:
1683 return legalizeAddrSpaceCast(MI, MRI, B);
1684 case TargetOpcode::G_FRINT:
1685 return legalizeFrint(MI, MRI, B);
1686 case TargetOpcode::G_FCEIL:
1687 return legalizeFceil(MI, MRI, B);
1688 case TargetOpcode::G_FREM:
1689 return legalizeFrem(MI, MRI, B);
1690 case TargetOpcode::G_INTRINSIC_TRUNC:
1691 return legalizeIntrinsicTrunc(MI, MRI, B);
1692 case TargetOpcode::G_SITOFP:
1693 return legalizeITOFP(MI, MRI, B, true);
1694 case TargetOpcode::G_UITOFP:
1695 return legalizeITOFP(MI, MRI, B, false);
1696 case TargetOpcode::G_FPTOSI:
1697 return legalizeFPTOI(MI, MRI, B, true);
1698 case TargetOpcode::G_FPTOUI:
1699 return legalizeFPTOI(MI, MRI, B, false);
1700 case TargetOpcode::G_FMINNUM:
1701 case TargetOpcode::G_FMAXNUM:
1702 case TargetOpcode::G_FMINNUM_IEEE:
1703 case TargetOpcode::G_FMAXNUM_IEEE:
1704 return legalizeMinNumMaxNum(Helper, MI);
1705 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706 return legalizeExtractVectorElt(MI, MRI, B);
1707 case TargetOpcode::G_INSERT_VECTOR_ELT:
1708 return legalizeInsertVectorElt(MI, MRI, B);
1709 case TargetOpcode::G_SHUFFLE_VECTOR:
1710 return legalizeShuffleVector(MI, MRI, B);
1711 case TargetOpcode::G_FSIN:
1712 case TargetOpcode::G_FCOS:
1713 return legalizeSinCos(MI, MRI, B);
1714 case TargetOpcode::G_GLOBAL_VALUE:
1715 return legalizeGlobalValue(MI, MRI, B);
1716 case TargetOpcode::G_LOAD:
1717 case TargetOpcode::G_SEXTLOAD:
1718 case TargetOpcode::G_ZEXTLOAD:
1719 return legalizeLoad(Helper, MI);
1720 case TargetOpcode::G_FMAD:
1721 return legalizeFMad(MI, MRI, B);
1722 case TargetOpcode::G_FDIV:
1723 return legalizeFDIV(MI, MRI, B);
1724 case TargetOpcode::G_UDIV:
1725 case TargetOpcode::G_UREM:
1726 case TargetOpcode::G_UDIVREM:
1727 return legalizeUnsignedDIV_REM(MI, MRI, B);
1728 case TargetOpcode::G_SDIV:
1729 case TargetOpcode::G_SREM:
1730 case TargetOpcode::G_SDIVREM:
1731 return legalizeSignedDIV_REM(MI, MRI, B);
1732 case TargetOpcode::G_ATOMIC_CMPXCHG:
1733 return legalizeAtomicCmpXChg(MI, MRI, B);
1734 case TargetOpcode::G_FLOG:
1735 return legalizeFlog(MI, B, numbers::ln2f);
1736 case TargetOpcode::G_FLOG10:
1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738 case TargetOpcode::G_FEXP:
1739 return legalizeFExp(MI, B);
1740 case TargetOpcode::G_FPOW:
1741 return legalizeFPow(MI, B);
1742 case TargetOpcode::G_FFLOOR:
1743 return legalizeFFloor(MI, MRI, B);
1744 case TargetOpcode::G_BUILD_VECTOR:
1745 return legalizeBuildVector(MI, MRI, B);
1746 case TargetOpcode::G_CTLZ:
1747 case TargetOpcode::G_CTTZ:
1748 return legalizeCTLZ_CTTZ(MI, MRI, B);
1749 default:
1750 return false;
1751 }
1752
1753 llvm_unreachable("expected switch to return");
1754}
1755
1756Register AMDGPULegalizerInfo::getSegmentAperture(
1757 unsigned AS,
1758 MachineRegisterInfo &MRI,
1759 MachineIRBuilder &B) const {
1760 MachineFunction &MF = B.getMF();
1761 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762 const LLT S32 = LLT::scalar(32);
1763
1764 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765
1766 if (ST.hasApertureRegs()) {
1767 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768 // getreg.
1769 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1775 unsigned Encoding =
1776 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779
1780 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781
1782 B.buildInstr(AMDGPU::S_GETREG_B32)
1783 .addDef(GetReg)
1784 .addImm(Encoding);
1785 MRI.setType(GetReg, S32);
1786
1787 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789 }
1790
1791 Register QueuePtr = MRI.createGenericVirtualRegister(
1792 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793
1794 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795 return Register();
1796
1797 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798 // private_segment_aperture_base_hi.
1799 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800
1801 // TODO: can we be smarter about machine pointer info?
1802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803 MachineMemOperand *MMO = MF.getMachineMemOperand(
1804 PtrInfo,
1805 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806 MachineMemOperand::MOInvariant,
1807 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808
1809 Register LoadAddr;
1810
1811 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813}
1814
1815/// Return true if the value is a known valid address, such that a null check is
1816/// not necessary.
1817static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1818 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1819 MachineInstr *Def = MRI.getVRegDef(Val);
1820 switch (Def->getOpcode()) {
1821 case AMDGPU::G_FRAME_INDEX:
1822 case AMDGPU::G_GLOBAL_VALUE:
1823 case AMDGPU::G_BLOCK_ADDR:
1824 return true;
1825 case AMDGPU::G_CONSTANT: {
1826 const ConstantInt *CI = Def->getOperand(1).getCImm();
1827 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1828 }
1829 default:
1830 return false;
1831 }
1832
1833 return false;
1834}
1835
1836bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1837 MachineInstr &MI, MachineRegisterInfo &MRI,
1838 MachineIRBuilder &B) const {
1839 MachineFunction &MF = B.getMF();
1840
1841 const LLT S32 = LLT::scalar(32);
1842 Register Dst = MI.getOperand(0).getReg();
1843 Register Src = MI.getOperand(1).getReg();
1844
1845 LLT DstTy = MRI.getType(Dst);
1846 LLT SrcTy = MRI.getType(Src);
1847 unsigned DestAS = DstTy.getAddressSpace();
1848 unsigned SrcAS = SrcTy.getAddressSpace();
1849
1850 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1851 // vector element.
1852 assert(!DstTy.isVector());
1853
1854 const AMDGPUTargetMachine &TM
1855 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1856
1857 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1858 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1859 return true;
1860 }
1861
1862 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863 // Truncate.
1864 B.buildExtract(Dst, Src, 0);
1865 MI.eraseFromParent();
1866 return true;
1867 }
1868
1869 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1870 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1871 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1872
1873 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1874 // another. Merge operands are required to be the same type, but creating an
1875 // extra ptrtoint would be kind of pointless.
1876 auto HighAddr = B.buildConstant(
1877 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1878 B.buildMerge(Dst, {Src, HighAddr});
1879 MI.eraseFromParent();
1880 return true;
1881 }
1882
1883 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1884 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||(static_cast <bool> (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS) ? void (0) : __assert_fail
("DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS"
, "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 1885, __extension__
__PRETTY_FUNCTION__))
1885 DestAS == AMDGPUAS::PRIVATE_ADDRESS)(static_cast <bool> (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS) ? void (0) : __assert_fail
("DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS"
, "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 1885, __extension__
__PRETTY_FUNCTION__))
;
1886
1887 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1888 // Extract low 32-bits of the pointer.
1889 B.buildExtract(Dst, Src, 0);
1890 MI.eraseFromParent();
1891 return true;
1892 }
1893
1894 unsigned NullVal = TM.getNullPointerValue(DestAS);
1895
1896 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1897 auto FlatNull = B.buildConstant(SrcTy, 0);
1898
1899 // Extract low 32-bits of the pointer.
1900 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1901
1902 auto CmpRes =
1903 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1904 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1905
1906 MI.eraseFromParent();
1907 return true;
1908 }
1909
1910 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1911 return false;
1912
1913 if (!ST.hasFlatAddressSpace())
1914 return false;
1915
1916 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1917 if (!ApertureReg.isValid())
1918 return false;
1919
1920 // Coerce the type of the low half of the result so we can use merge_values.
1921 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1922
1923 // TODO: Should we allow mismatched types but matching sizes in merges to
1924 // avoid the ptrtoint?
1925 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1926
1927 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1928 B.buildCopy(Dst, BuildPtr);
1929 MI.eraseFromParent();
1930 return true;
1931 }
1932
1933 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1934 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1935
1936 auto CmpRes =
1937 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1938
1939 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1940
1941 MI.eraseFromParent();
1942 return true;
1943}
1944
1945bool AMDGPULegalizerInfo::legalizeFrint(
1946 MachineInstr &MI, MachineRegisterInfo &MRI,
1947 MachineIRBuilder &B) const {
1948 Register Src = MI.getOperand(1).getReg();
1949 LLT Ty = MRI.getType(Src);
1950 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1951
1952 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1953 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1954
1955 auto C1 = B.buildFConstant(Ty, C1Val);
1956 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1957
1958 // TODO: Should this propagate fast-math-flags?
1959 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1960 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1961
1962 auto C2 = B.buildFConstant(Ty, C2Val);
1963 auto Fabs = B.buildFAbs(Ty, Src);
1964
1965 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1966 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1967 MI.eraseFromParent();
1968 return true;
1969}
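
A brief note on the constants used in legalizeFrint above: 0x1.0p+52 is 2^52, the smallest magnitude at which a double has no fraction bits, so adding copysign(2^52, src) and then subtracting it again leaves src rounded to the nearest integer; the final select keeps src itself whenever |src| > 0x1.fffffffffffffp+51, since such values are already integral. A worked example:

  src = 2.7:  2.7 + 2^52 rounds to 2^52 + 3;  (2^52 + 3) - 2^52 = 3.0 = rint(2.7)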
1970
1971bool AMDGPULegalizerInfo::legalizeFceil(
1972 MachineInstr &MI, MachineRegisterInfo &MRI,
1973 MachineIRBuilder &B) const {
1974
1975 const LLT S1 = LLT::scalar(1);
1976 const LLT S64 = LLT::scalar(64);
1977
1978 Register Src = MI.getOperand(1).getReg();
1979 assert(MRI.getType(Src) == S64);
1980
1981 // result = trunc(src)
1982 // if (src > 0.0 && src != result)
1983 // result += 1.0
1984
1985 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1986
1987 const auto Zero = B.buildFConstant(S64, 0.0);
1988 const auto One = B.buildFConstant(S64, 1.0);
1989 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1990 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1991 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1992 auto Add = B.buildSelect(S64, And, One, Zero);
1993
1994 // TODO: Should this propagate fast-math-flags?
1995 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1996 return true;
1997}
1998
1999bool AMDGPULegalizerInfo::legalizeFrem(
2000 MachineInstr &MI, MachineRegisterInfo &MRI,
2001 MachineIRBuilder &B) const {
2002 Register DstReg = MI.getOperand(0).getReg();
2003 Register Src0Reg = MI.getOperand(1).getReg();
2004 Register Src1Reg = MI.getOperand(2).getReg();
2005 auto Flags = MI.getFlags();
2006 LLT Ty = MRI.getType(DstReg);
2007
2008 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2009 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2010 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2011 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2012 MI.eraseFromParent();
2013 return true;
2014}
2015
2016static MachineInstrBuilder extractF64Exponent(Register Hi,
2017 MachineIRBuilder &B) {
2018 const unsigned FractBits = 52;
2019 const unsigned ExpBits = 11;
2020 LLT S32 = LLT::scalar(32);
2021
2022 auto Const0 = B.buildConstant(S32, FractBits - 32);
2023 auto Const1 = B.buildConstant(S32, ExpBits);
2024
2025 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2026 .addUse(Hi)
2027 .addUse(Const0.getReg(0))
2028 .addUse(Const1.getReg(0));
2029
2030 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2031}
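
A worked instance of extractF64Exponent above: the 11-bit exponent field of a double starts at bit 52, i.e. at bit 52 - 32 = 20 of the high dword, and 1023 is the IEEE-754 exponent bias. For 8.0 the high dword is 0x40200000; ubfe(0x40200000, 20, 11) = 0x402 = 1026, and 1026 - 1023 = 3, matching 8.0 = 1.0 * 2^3.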
2032
2033bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2034 MachineInstr &MI, MachineRegisterInfo &MRI,
2035 MachineIRBuilder &B) const {
2036 const LLT S1 = LLT::scalar(1);
2037 const LLT S32 = LLT::scalar(32);
2038 const LLT S64 = LLT::scalar(64);
2039
2040 Register Src = MI.getOperand(1).getReg();
2041 assert(MRI.getType(Src) == S64);
2042
2043 // TODO: Should this use extract since the low half is unused?
2044 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2045 Register Hi = Unmerge.getReg(1);
2046
2047 // Extract the upper half, since this is where we will find the sign and
2048 // exponent.
2049 auto Exp = extractF64Exponent(Hi, B);
2050
2051 const unsigned FractBits = 52;
2052
2053 // Extract the sign bit.
2054 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2055 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2056
2057 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2058
2059 const auto Zero32 = B.buildConstant(S32, 0);
2060
2061 // Extend back to 64-bits.
2062 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2063
2064 auto Shr = B.buildAShr(S64, FractMask, Exp);
2065 auto Not = B.buildNot(S64, Shr);
2066 auto Tmp0 = B.buildAnd(S64, Src, Not);
2067 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2068
2069 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2070 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2071
2072 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2073 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2074 MI.eraseFromParent();
2075 return true;
2076}
2077
2078bool AMDGPULegalizerInfo::legalizeITOFP(
2079 MachineInstr &MI, MachineRegisterInfo &MRI,
2080 MachineIRBuilder &B, bool Signed) const {
2081
2082 Register Dst = MI.getOperand(0).getReg();
2083 Register Src = MI.getOperand(1).getReg();
2084
2085 const LLT S64 = LLT::scalar(64);
2086 const LLT S32 = LLT::scalar(32);
2087
2088 assert(MRI.getType(Src) == S64);
2089
2090 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2091 auto ThirtyTwo = B.buildConstant(S32, 32);
2092
2093 if (MRI.getType(Dst) == S64) {
2094 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2095 : B.buildUITOFP(S64, Unmerge.getReg(1));
2096
2097 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2098 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2099 .addUse(CvtHi.getReg(0))
2100 .addUse(ThirtyTwo.getReg(0));
2101
2102 // TODO: Should this propagate fast-math-flags?
2103 B.buildFAdd(Dst, LdExp, CvtLo);
2104 MI.eraseFromParent();
2105 return true;
2106 }
2107
2108 assert(MRI.getType(Dst) == S32);
2109
2110 auto One = B.buildConstant(S32, 1);
2111
2112 MachineInstrBuilder ShAmt;
2113 if (Signed) {
2114 auto ThirtyOne = B.buildConstant(S32, 31);
2115 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2116 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2117 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2118 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2119 /*HasSideEffects=*/false)
2120 .addUse(Unmerge.getReg(1));
2121 auto LS2 = B.buildSub(S32, LS, One);
2122 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2123 } else
2124 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2125 auto Norm = B.buildShl(S64, Src, ShAmt);
2126 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2127 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2128 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2129 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2130 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2131 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2132 /*HasSideEffects=*/false)
2133 .addUse(FVal.getReg(0))
2134 .addUse(Scale.getReg(0));
2135 MI.eraseFromParent();
2136 return true;
2137}
2138
2139// TODO: Copied from DAG implementation. Verify logic and document how this
2140// actually works.
2141bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2142 MachineRegisterInfo &MRI,
2143 MachineIRBuilder &B,
2144 bool Signed) const {
2145
2146 Register Dst = MI.getOperand(0).getReg();
2147 Register Src = MI.getOperand(1).getReg();
2148
2149 const LLT S64 = LLT::scalar(64);
2150 const LLT S32 = LLT::scalar(32);
2151
2152 const LLT SrcLT = MRI.getType(Src);
2153 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2154
2155 unsigned Flags = MI.getFlags();
2156
2157 // The basic idea of converting a floating point number into a pair of 32-bit
2158 // integers is illustrated as follows:
2159 //
2160 // tf := trunc(val);
2161 // hif := floor(tf * 2^-32);
2162 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2163 // hi := fptoi(hif);
2164 // lo := fptoi(lof);
2165 //
2166 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2167 MachineInstrBuilder Sign;
2168 if (Signed && SrcLT == S32) {
2169 // However, a 32-bit floating point number has only a 23-bit mantissa,
2170 // which is not enough to hold all the significant bits of `lof` if val is
2171 // negative. To avoid the loss of precision, we need to take the absolute
2172 // value after truncating and flip the result back based on the original
2173 // signedness.
2174 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2175 Trunc = B.buildFAbs(S32, Trunc, Flags);
2176 }
2177 MachineInstrBuilder K0, K1;
2178 if (SrcLT == S64) {
2179 K0 = B.buildFConstant(S64,
2180 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2181 K1 = B.buildFConstant(S64,
2182 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2183 } else {
2184 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2185 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2186 }
2187
2188 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2189 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2190 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2191
2192 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2193 : B.buildFPTOUI(S32, FloorMul);
2194 auto Lo = B.buildFPTOUI(S32, Fma);
2195
2196 if (Signed && SrcLT == S32) {
2197 // Flip the result based on the signedness, which is either all 0s or 1s.
2198 Sign = B.buildMerge(S64, {Sign, Sign});
2199 // r := xor({lo, hi}, sign) - sign;
2200 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2201 } else
2202 B.buildMerge(Dst, {Lo, Hi});
2203 MI.eraseFromParent();
2204
2205 return true;
2206}
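
A worked instance of the split described in the comment inside legalizeFPTOI, for the unsigned case with val = 2^33 + 7 (arithmetic illustration only):

  tf  = trunc(val)        = 2^33 + 7
  hif = floor(tf * 2^-32) = 2
  lof = tf - hif * 2^32   = 7
  hi  = fptoui(hif) = 2,  lo = fptoui(lof) = 7,  merged {lo, hi} = 2 * 2^32 + 7 = val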
2207
2208bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2209 MachineInstr &MI) const {
2210 MachineFunction &MF = Helper.MIRBuilder.getMF();
2211 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2212
2213 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2214 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2215
2216 // With ieee_mode disabled, the instructions have the correct behavior
2217 // already for G_FMINNUM/G_FMAXNUM
2218 if (!MFI->getMode().IEEE)
2219 return !IsIEEEOp;
2220
2221 if (IsIEEEOp)
2222 return true;
2223
2224 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2225}
2226
2227bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2228 MachineInstr &MI, MachineRegisterInfo &MRI,
2229 MachineIRBuilder &B) const {
2230 // TODO: Should move some of this into LegalizerHelper.
2231
2232 // TODO: Promote dynamic indexing of s16 to s32
2233
2234 // FIXME: Artifact combiner probably should have replaced the truncated
2235 // constant before this, so we shouldn't need
2236 // getIConstantVRegValWithLookThrough.
2237 Optional<ValueAndVReg> MaybeIdxVal =
2238 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2239 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2240 return true;
2241 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2242
2243 Register Dst = MI.getOperand(0).getReg();
2244 Register Vec = MI.getOperand(1).getReg();
2245
2246 LLT VecTy = MRI.getType(Vec);
2247 LLT EltTy = VecTy.getElementType();
2248 assert(EltTy == MRI.getType(Dst));
2249
2250 if (IdxVal < VecTy.getNumElements()) {
2251 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2252 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2253 } else {
2254 B.buildUndef(Dst);
2255 }
2256
2257 MI.eraseFromParent();
2258 return true;
2259}
2260
2261bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2262 MachineInstr &MI, MachineRegisterInfo &MRI,
2263 MachineIRBuilder &B) const {
2264 // TODO: Should move some of this into LegalizerHelper.
2265
2266 // TODO: Promote dynamic indexing of s16 to s32
2267
2268 // FIXME: Artifact combiner probably should have replaced the truncated
2269 // constant before this, so we shouldn't need
2270 // getIConstantVRegValWithLookThrough.
2271 Optional<ValueAndVReg> MaybeIdxVal =
2272 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2273 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2274 return true;
2275
2276 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2277 Register Dst = MI.getOperand(0).getReg();
2278 Register Vec = MI.getOperand(1).getReg();
2279 Register Ins = MI.getOperand(2).getReg();
2280
2281 LLT VecTy = MRI.getType(Vec);
2282 LLT EltTy = VecTy.getElementType();
2283 assert(EltTy == MRI.getType(Ins));
2284 (void)Ins;
2285
2286 unsigned NumElts = VecTy.getNumElements();
2287 if (IdxVal < NumElts) {
2288 SmallVector<Register, 8> SrcRegs;
2289 for (unsigned i = 0; i < NumElts; ++i)
2290 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2291 B.buildUnmerge(SrcRegs, Vec);
2292
2293 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2294 B.buildMerge(Dst, SrcRegs);
2295 } else {
2296 B.buildUndef(Dst);
2297 }
2298
2299 MI.eraseFromParent();
2300 return true;
2301}
2302
2303bool AMDGPULegalizerInfo::legalizeShuffleVector(
2304 MachineInstr &MI, MachineRegisterInfo &MRI,
2305 MachineIRBuilder &B) const {
2306 const LLT V2S16 = LLT::fixed_vector(2, 16);
2307
2308 Register Dst = MI.getOperand(0).getReg();
2309 Register Src0 = MI.getOperand(1).getReg();
2310 LLT DstTy = MRI.getType(Dst);
2311 LLT SrcTy = MRI.getType(Src0);
2312
2313 if (SrcTy == V2S16 && DstTy == V2S16 &&
2314 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2315 return true;
2316
2317 MachineIRBuilder HelperBuilder(MI);
2318 GISelObserverWrapper DummyObserver;
2319 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2320 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2321}
2322
2323bool AMDGPULegalizerInfo::legalizeSinCos(
2324 MachineInstr &MI, MachineRegisterInfo &MRI,
2325 MachineIRBuilder &B) const {
2326
2327 Register DstReg = MI.getOperand(0).getReg();
2328 Register SrcReg = MI.getOperand(1).getReg();
2329 LLT Ty = MRI.getType(DstReg);
2330 unsigned Flags = MI.getFlags();
2331
2332 Register TrigVal;
2333 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2334 if (ST.hasTrigReducedRange()) {
2335 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2336 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2337 .addUse(MulVal.getReg(0))
2338 .setMIFlags(Flags).getReg(0);
2339 } else
2340 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2341
2342 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2343 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2344 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2345 .addUse(TrigVal)
2346 .setMIFlags(Flags);
2347 MI.eraseFromParent();
2348 return true;
2349}
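
On the scaling in legalizeSinCos above: multiplying by 0.5 * (1/pi) divides the radian input by 2*pi, turning it into fractions of a full turn; the working assumption is that the amdgcn_sin/amdgcn_cos intrinsics take their operand in those units, and that amdgcn_fract is applied first only on subtargets whose trig instructions require an already range-reduced input (ST.hasTrigReducedRange()).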
2350
2351bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2352 MachineIRBuilder &B,
2353 const GlobalValue *GV,
2354 int64_t Offset,
2355 unsigned GAFlags) const {
2356 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2357 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2358 // to the following code sequence:
2359 //
2360 // For constant address space:
2361 // s_getpc_b64 s[0:1]
2362 // s_add_u32 s0, s0, $symbol
2363 // s_addc_u32 s1, s1, 0
2364 //
2365 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2366 // a fixup or relocation is emitted to replace $symbol with a literal
2367 // constant, which is a pc-relative offset from the encoding of the $symbol
2368 // operand to the global variable.
2369 //
2370 // For global address space:
2371 // s_getpc_b64 s[0:1]
2372 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2373 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2374 //
2375 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2376 // fixups or relocations are emitted to replace $symbol@*@lo and
2377 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2378 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2379 // operand to the global variable.
2380 //
2381 // What we want here is an offset from the value returned by s_getpc
2382 // (which is the address of the s_add_u32 instruction) to the global
2383 // variable, but since the encoding of $symbol starts 4 bytes after the start
2384 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2385 // small. This requires us to add 4 to the global variable offset in order to
2386 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2387 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2388 // instruction.
2389
2390 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2391
2392 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2393 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2394
2395 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2396 .addDef(PCReg);
2397
2398 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2399 if (GAFlags == SIInstrInfo::MO_NONE)
2400 MIB.addImm(0);
2401 else
2402 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2403
2404 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2405
2406 if (PtrTy.getSizeInBits() == 32)
2407 B.buildExtract(DstReg, PCReg, 0);
2408 return true;
2409 }
2410
2411bool AMDGPULegalizerInfo::legalizeGlobalValue(
2412 MachineInstr &MI, MachineRegisterInfo &MRI,
2413 MachineIRBuilder &B) const {
2414 Register DstReg = MI.getOperand(0).getReg();
2415 LLT Ty = MRI.getType(DstReg);
2416 unsigned AS = Ty.getAddressSpace();
2417
2418 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2419 MachineFunction &MF = B.getMF();
2420 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2421
2422 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2423 if (!MFI->isModuleEntryFunction() &&
2424 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2425 const Function &Fn = MF.getFunction();
2426 DiagnosticInfoUnsupported BadLDSDecl(
2427 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2428 DS_Warning);
2429 Fn.getContext().diagnose(BadLDSDecl);
2430
2431 // We currently don't have a way to correctly allocate LDS objects that
2432 // aren't directly associated with a kernel. We do force inlining of
2433 // functions that use local objects. However, if these dead functions are
2434 // not eliminated, we don't want a compile time error. Just emit a warning
2435 // and a trap, since there should be no callable path here.
2436 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2437 B.buildUndef(DstReg);
2438 MI.eraseFromParent();
2439 return true;
2440 }
2441
2442 // TODO: We could emit code to handle the initialization somewhere.
2443 // We ignore the initializer for now and legalize it to allow selection.
2444 // The initializer will be rejected during assembly emission anyway.
2445 const SITargetLowering *TLI = ST.getTargetLowering();
2446 if (!TLI->shouldUseLDSConstAddress(GV)) {
2447 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2448 return true; // Leave in place;
2449 }
2450
2451 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2452 Type *Ty = GV->getValueType();
2453 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2454 // zero-sized type in other languages to declare dynamic shared memory
2455 // whose size is not known at compile time. Such variables are allocated
2456 // by the runtime and placed directly after the statically allocated
2457 // ones, and they all share the same offset.
2458 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2459 // Adjust alignment for that dynamic shared memory array.
2460 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2461 LLT S32 = LLT::scalar(32);
2462 auto Sz =
2463 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2464 B.buildIntToPtr(DstReg, Sz);
2465 MI.eraseFromParent();
2466 return true;
2467 }
2468 }
2469
2470 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2471 *cast<GlobalVariable>(GV)));
2472 MI.eraseFromParent();
2473 return true;
2474 }
2475
2476 const SITargetLowering *TLI = ST.getTargetLowering();
2477
2478 if (TLI->shouldEmitFixup(GV)) {
2479 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2480 MI.eraseFromParent();
2481 return true;
2482 }
2483
2484 if (TLI->shouldEmitPCReloc(GV)) {
2485 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2486 MI.eraseFromParent();
2487 return true;
2488 }
2489
2490 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2491 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2492
2493 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2494 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2495 MachinePointerInfo::getGOT(MF),
2496 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2497 MachineMemOperand::MOInvariant,
2498 LoadTy, Align(8));
2499
2500 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2501
2502 if (Ty.getSizeInBits() == 32) {
2503 // Truncate if this is a 32-bit constant address.
2504 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2505 B.buildExtract(DstReg, Load, 0);
2506 } else
2507 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2508
2509 MI.eraseFromParent();
2510 return true;
2511}
2512
2513static LLT widenToNextPowerOf2(LLT Ty) {
2514 if (Ty.isVector())
2515 return Ty.changeElementCount(
2516 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2517 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2518}
2519
2520bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2521 MachineInstr &MI) const {
2522 MachineIRBuilder &B = Helper.MIRBuilder;
2523 MachineRegisterInfo &MRI = *B.getMRI();
2524 GISelChangeObserver &Observer = Helper.Observer;
2525
2526 Register PtrReg = MI.getOperand(1).getReg();
2527 LLT PtrTy = MRI.getType(PtrReg);
2528 unsigned AddrSpace = PtrTy.getAddressSpace();
2529
2530 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2531 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2532 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2533 Observer.changingInstr(MI);
2534 MI.getOperand(1).setReg(Cast.getReg(0));
2535 Observer.changedInstr(MI);
2536 return true;
2537 }
2538
2539 if (MI.getOpcode() != AMDGPU::G_LOAD)
2540 return false;
2541
2542 Register ValReg = MI.getOperand(0).getReg();
2543 LLT ValTy = MRI.getType(ValReg);
2544
2545 MachineMemOperand *MMO = *MI.memoperands_begin();
2546 const unsigned ValSize = ValTy.getSizeInBits();
2547 const LLT MemTy = MMO->getMemoryType();
2548 const Align MemAlign = MMO->getAlign();
2549 const unsigned MemSize = MemTy.getSizeInBits();
2550 const unsigned AlignInBits = 8 * MemAlign.value();
2551
2552 // Widen non-power-of-2 loads to the alignment if needed
2553 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2554 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2555
2556 // This was already the correct extending load result type, so just adjust
2557 // the memory type.
2558 if (WideMemSize == ValSize) {
2559 MachineFunction &MF = B.getMF();
2560
2561 MachineMemOperand *WideMMO =
2562 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2563 Observer.changingInstr(MI);
2564 MI.setMemRefs(MF, {WideMMO});
2565 Observer.changedInstr(MI);
2566 return true;
2567 }
2568
2569 // Don't bother handling an edge case that should probably never be produced.
2570 if (ValSize > WideMemSize)
2571 return false;
2572
2573 LLT WideTy = widenToNextPowerOf2(ValTy);
2574
2575 Register WideLoad;
2576 if (!WideTy.isVector()) {
2577 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2578 B.buildTrunc(ValReg, WideLoad).getReg(0);
2579 } else {
2580 // Extract the subvector.
2581
2582 if (isRegisterType(ValTy)) {
2583 // If this a case where G_EXTRACT is legal, use it.
2584 // (e.g. <3 x s32> -> <4 x s32>)
2585 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2586 B.buildExtract(ValReg, WideLoad, 0);
2587 } else {
2588 // For cases where the widened type isn't a nice register value, unmerge
2589 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2590 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2591 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2592 }
2593 }
2594
2595 MI.eraseFromParent();
2596 return true;
2597 }
2598
2599 return false;
2600}
2601
2602bool AMDGPULegalizerInfo::legalizeFMad(
2603 MachineInstr &MI, MachineRegisterInfo &MRI,
2604 MachineIRBuilder &B) const {
2605 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2606 assert(Ty.isScalar());
2607
2608 MachineFunction &MF = B.getMF();
2609 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2610
2611 // TODO: Always legal with future ftz flag.
2612 // FIXME: Do we need just output?
2613 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2614 return true;
2615 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2616 return true;
2617
2618 MachineIRBuilder HelperBuilder(MI);
2619 GISelObserverWrapper DummyObserver;
2620 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2621 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2622}
2623
2624bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2625 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2626 Register DstReg = MI.getOperand(0).getReg();
2627 Register PtrReg = MI.getOperand(1).getReg();
2628 Register CmpVal = MI.getOperand(2).getReg();
2629 Register NewVal = MI.getOperand(3).getReg();
2630
2631 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2632 "this should not have been custom lowered");
2633
2634 LLT ValTy = MRI.getType(CmpVal);
2635 LLT VecTy = LLT::fixed_vector(2, ValTy);
2636
2637 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2638
2639 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2640 .addDef(DstReg)
2641 .addUse(PtrReg)
2642 .addUse(PackedVal)
2643 .setMemRefs(MI.memoperands());
2644
2645 MI.eraseFromParent();
2646 return true;
2647}
2648
2649bool AMDGPULegalizerInfo::legalizeFlog(
2650 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2651 Register Dst = MI.getOperand(0).getReg();
2652 Register Src = MI.getOperand(1).getReg();
2653 LLT Ty = B.getMRI()->getType(Dst);
2654 unsigned Flags = MI.getFlags();
2655
2656 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2657 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2658
2659 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2660 MI.eraseFromParent();
2661 return true;
2662}
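
legalizeFlog relies on the change-of-base identity log_b(x) = log2(x) * log_b(2): the callers in legalizeCustom pass numbers::ln2f (ln 2) for G_FLOG, giving ln(x) = log2(x) * ln(2), and numbers::ln2f / numbers::ln10f = log10(2) for G_FLOG10, giving log10(x) = log2(x) * log10(2).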
2663
2664bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2665 MachineIRBuilder &B) const {
2666 Register Dst = MI.getOperand(0).getReg();
2667 Register Src = MI.getOperand(1).getReg();
2668 unsigned Flags = MI.getFlags();
2669 LLT Ty = B.getMRI()->getType(Dst);
2670
2671 auto K = B.buildFConstant(Ty, numbers::log2e);
2672 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2673 B.buildFExp2(Dst, Mul, Flags);
2674 MI.eraseFromParent();
2675 return true;
2676}
2677
2678bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2679 MachineIRBuilder &B) const {
2680 Register Dst = MI.getOperand(0).getReg();
2681 Register Src0 = MI.getOperand(1).getReg();
2682 Register Src1 = MI.getOperand(2).getReg();
2683 unsigned Flags = MI.getFlags();
2684 LLT Ty = B.getMRI()->getType(Dst);
2685 const LLT S16 = LLT::scalar(16);
2686 const LLT S32 = LLT::scalar(32);
2687
2688 if (Ty == S32) {
2689 auto Log = B.buildFLog2(S32, Src0, Flags);
2690 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2691 .addUse(Log.getReg(0))
2692 .addUse(Src1)
2693 .setMIFlags(Flags);
2694 B.buildFExp2(Dst, Mul, Flags);
2695 } else if (Ty == S16) {
2696 // There's no f16 fmul_legacy, so we need to convert for it.
2697 auto Log = B.buildFLog2(S16, Src0, Flags);
2698 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2699 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2700 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2701 .addUse(Ext0.getReg(0))
2702 .addUse(Ext1.getReg(0))
2703 .setMIFlags(Flags);
2704
2705 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2706 } else
2707 return false;
2708
2709 MI.eraseFromParent();
2710 return true;
2711}
2712
2713// Find a source register, ignoring any possible source modifiers.
2714static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2715 Register ModSrc = OrigSrc;
2716 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2717 ModSrc = SrcFNeg->getOperand(1).getReg();
2718 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2719 ModSrc = SrcFAbs->getOperand(1).getReg();
2720 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2721 ModSrc = SrcFAbs->getOperand(1).getReg();
2722 return ModSrc;
2723}
2724
2725bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2726 MachineRegisterInfo &MRI,
2727 MachineIRBuilder &B) const {
2728
2729 const LLT S1 = LLT::scalar(1);
2730 const LLT S64 = LLT::scalar(64);
2731 Register Dst = MI.getOperand(0).getReg();
2732 Register OrigSrc = MI.getOperand(1).getReg();
2733 unsigned Flags = MI.getFlags();
2734 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2735 "this should not have been custom lowered");
2736
2737 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2738 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2739 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2740 // V_FRACT bug is:
2741 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2742 //
2743 // Convert floor(x) to (x - fract(x))
2744
2745 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2746 .addUse(OrigSrc)
2747 .setMIFlags(Flags);
2748
2749 // Give source modifier matching some assistance before obscuring a foldable
2750 // pattern.
2751
2752 // TODO: We can avoid the neg on the fract? The input sign to fract
2753 // shouldn't matter?
2754 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2755
2756 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2757
2758 Register Min = MRI.createGenericVirtualRegister(S64);
2759
2760 // We don't need to concern ourselves with the snan handling difference, so
2761 // use the one which will directly select.
2762 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2763 if (MFI->getMode().IEEE)
2764 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2765 else
2766 B.buildFMinNum(Min, Fract, Const, Flags);
2767
2768 Register CorrectedFract = Min;
2769 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2770 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2771 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2772 }
2773
2774 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2775 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2776
2777 MI.eraseFromParent();
2778 return true;
2779}
2780
2781// Turn an illegal packed v2s16 build vector into bit operations.
2782// TODO: This should probably be a bitcast action in LegalizerHelper.
2783bool AMDGPULegalizerInfo::legalizeBuildVector(
2784 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2785 Register Dst = MI.getOperand(0).getReg();
2786 const LLT S32 = LLT::scalar(32);
2787 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2788
2789 Register Src0 = MI.getOperand(1).getReg();
2790 Register Src1 = MI.getOperand(2).getReg();
2791 assert(MRI.getType(Src0) == LLT::scalar(16));
2792
2793 auto Merge = B.buildMerge(S32, {Src0, Src1});
2794 B.buildBitcast(Dst, Merge);
2795
2796 MI.eraseFromParent();
2797 return true;
2798}
2799
2800// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2801// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2802// case with a single min instruction instead of a compare+select.
2803bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2804 MachineRegisterInfo &MRI,
2805 MachineIRBuilder &B) const {
2806 Register Dst = MI.getOperand(0).getReg();
2807 Register Src = MI.getOperand(1).getReg();
2808 LLT DstTy = MRI.getType(Dst);
2809 LLT SrcTy = MRI.getType(Src);
2810
2811 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2812 ? AMDGPU::G_AMDGPU_FFBH_U32
2813 : AMDGPU::G_AMDGPU_FFBL_B32;
2814 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2815 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2816
2817 MI.eraseFromParent();
2818 return true;
2819}
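
The single min in legalizeCTLZ_CTTZ works on the assumption that the ffbh/ffbl instructions behind G_AMDGPU_FFBH_U32 and G_AMDGPU_FFBL_B32 return -1 (all ones) for a zero input: umin(0xffffffff, 32) = 32, which is exactly the result G_CTLZ/G_CTTZ must produce for a zero 32-bit source, while any non-zero input already yields a value below the bit width, so the min is a no-op there.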
2820
2821// Check that this is a G_XOR x, -1
2822static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2823 if (MI.getOpcode() != TargetOpcode::G_XOR)
2824 return false;
2825 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2826 return ConstVal && *ConstVal == -1;
2827}
2828
2829// Return the use branch instruction, otherwise null if the usage is invalid.
2830static MachineInstr *
2831verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2832 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2833 Register CondDef = MI.getOperand(0).getReg();
2834 if (!MRI.hasOneNonDBGUse(CondDef))
2835 return nullptr;
2836
2837 MachineBasicBlock *Parent = MI.getParent();
2838 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2839
2840 if (isNot(MRI, *UseMI)) {
2841 Register NegatedCond = UseMI->getOperand(0).getReg();
2842 if (!MRI.hasOneNonDBGUse(NegatedCond))
2843 return nullptr;
2844
2845 // We're deleting the def of this value, so we need to remove it.
2846 eraseInstr(*UseMI, MRI);
2847
2848 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2849 Negated = true;
2850 }
2851
2852 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2853 return nullptr;
2854
2855 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2856 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2857 if (Next == Parent->end()) {
2858 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2859 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2860 return nullptr;
2861 UncondBrTarget = &*NextMBB;
2862 } else {
2863 if (Next->getOpcode() != AMDGPU::G_BR)
2864 return nullptr;
2865 Br = &*Next;
2866 UncondBrTarget = Br->getOperand(0).getMBB();
2867 }
2868
2869 return UseMI;
2870}
2871
2872bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2873 const ArgDescriptor *Arg,
2874 const TargetRegisterClass *ArgRC,
2875 LLT ArgTy) const {
2876 MCRegister SrcReg = Arg->getRegister();
2877 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
7: '?' condition is true
2878 assert(DstReg.isVirtual() && "Virtual register expected");
8: '?' condition is true
2879
2880 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2881 *ArgRC, B.getDebugLoc(), ArgTy);
2882 if (Arg->isMasked()) {
9: Calling 'ArgDescriptor::isMasked'
12: Returning from 'ArgDescriptor::isMasked'
13: Taking true branch
2883 // TODO: Should we try to emit this once in the entry block?
2884 const LLT S32 = LLT::scalar(32);
2885 const unsigned Mask = Arg->getMask();
2886 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
14: Calling 'countTrailingZeros<unsigned int>'
21: Returning from 'countTrailingZeros<unsigned int>'
22: 'Shift' initialized to 32
2887
2888 Register AndMaskSrc = LiveIn;
2889
2890 if (Shift != 0) {
22.1: 'Shift' is not equal to 0
23: Taking true branch
2891 auto ShiftAmt = B.buildConstant(S32, Shift);
2892 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2893 }
2894
2895 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
24: The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
2896 } else {
2897 B.buildCopy(DstReg, LiveIn);
2898 }
2899
2900 return true;
2901}
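
The warning above is the report's subject: llvm::countTrailingZeros<unsigned>(Mask) returns the full bit width (32) when Mask is 0, so Mask >> Shift would shift a 32-bit value by 32, which is undefined behavior. Whether a masked ArgDescriptor can actually carry an all-zero mask is not established by this code alone; a minimal sketch of a guard, under that assumption, would be:

  const unsigned Mask = Arg->getMask();
  assert(Mask != 0 && "masked argument descriptor with an empty mask"); // hypothetical guard
  const unsigned Shift = countTrailingZeros<unsigned>(Mask);            // now guaranteed < 32
  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));  // shift is well defined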
2902
2903bool AMDGPULegalizerInfo::loadInputValue(
2904 Register DstReg, MachineIRBuilder &B,
2905 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2906 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2907 const ArgDescriptor *Arg;
2908 const TargetRegisterClass *ArgRC;
2909 LLT ArgTy;
2910 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2911
2912 if (!Arg) {
4: Assuming 'Arg' is non-null
2913 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2914 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2915 // case the pointer argument may be missing and we use null.
2916 B.buildConstant(DstReg, 0);
2917 return true;
2918 }
2919
2920 // It's undefined behavior if a function marked with the amdgpu-no-*
2921 // attributes uses the corresponding intrinsic.
2922 B.buildUndef(DstReg);
2923 return true;
2924 }
2925
2926 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5: Taking false branch
2927 return false; // TODO: Handle these
2928 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
6: Calling 'AMDGPULegalizerInfo::loadInputValue'
2929}
2930
2931bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2932 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2933 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2934 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3: Calling 'AMDGPULegalizerInfo::loadInputValue'
2935 return false;
2936
2937 MI.eraseFromParent();
2938 return true;
2939}
2940
2941bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2942 MachineRegisterInfo &MRI,
2943 MachineIRBuilder &B) const {
2944 Register Dst = MI.getOperand(0).getReg();
2945 LLT DstTy = MRI.getType(Dst);
2946 LLT S16 = LLT::scalar(16);
2947 LLT S32 = LLT::scalar(32);
2948 LLT S64 = LLT::scalar(64);
2949
2950 if (DstTy == S16)
2951 return legalizeFDIV16(MI, MRI, B);
2952 if (DstTy == S32)
2953 return legalizeFDIV32(MI, MRI, B);
2954 if (DstTy == S64)
2955 return legalizeFDIV64(MI, MRI, B);
2956
2957 return false;
2958}
2959
2960void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2961 Register DstDivReg,
2962 Register DstRemReg,
2963 Register X,
2964 Register Y) const {
2965 const LLT S1 = LLT::scalar(1);
2966 const LLT S32 = LLT::scalar(32);
2967
2968 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2969 // algorithm used here.
2970
2971 // Initial estimate of inv(y).
2972 auto FloatY = B.buildUITOFP(S32, Y);
2973 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2974 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2975 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2976 auto Z = B.buildFPTOUI(S32, ScaledY);
2977
2978 // One round of UNR.
2979 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2980 auto NegYZ = B.buildMul(S32, NegY, Z);
2981 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2982
2983 // Quotient/remainder estimate.
2984 auto Q = B.buildUMulH(S32, X, Z);
2985 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2986
2987 // First quotient/remainder refinement.
2988 auto One = B.buildConstant(S32, 1);
2989 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2990 if (DstDivReg)
2991 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2992 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2993
2994 // Second quotient/remainder refinement.
2995 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2996 if (DstDivReg)
2997 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2998
2999 if (DstRemReg)
3000 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3001}
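For reference, a scalar sketch of the refinement tail that the selects above implement, assuming (as the estimate construction is meant to guarantee) that Z approximates 2**32/Y closely enough that the quotient estimate is low by at most two; the helper name udiv32Refine is illustrative only:

#include <cstdint>

// Quotient/remainder estimate followed by the two conditional corrections.
void udiv32Refine(uint32_t X, uint32_t Y, uint32_t Z,
                  uint32_t &Quot, uint32_t &Rem) {
  uint32_t Q = (uint32_t)(((uint64_t)X * Z) >> 32); // umulh(X, Z)
  uint32_t R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; } // first refinement
  if (R >= Y) { ++Q; R -= Y; } // second refinement
  Quot = Q;
  Rem = R;
}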
3002
3003// Build integer reciprocal sequence around V_RCP_IFLAG_F32
3004//
3005// Return lo, hi of result
3006//
3007// %cvt.lo = G_UITOFP Val.lo
3008// %cvt.hi = G_UITOFP Val.hi
3009// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3010// %rcp = G_AMDGPU_RCP_IFLAG %mad
3011// %mul1 = G_FMUL %rcp, 0x5f7ffffc
3012// %mul2 = G_FMUL %mul1, 2**(-32)
3013// %trunc = G_INTRINSIC_TRUNC %mul2
3014// %mad2 = G_FMAD %trunc, -(2**32), %mul1
3015// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3016static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3017 Register Val) {
3018 const LLT S32 = LLT::scalar(32);
3019 auto Unmerge = B.buildUnmerge(S32, Val);
3020
3021 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3022 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3023
3024 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3025 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3026
3027 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3028 auto Mul1 =
3029 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3030
3031 // 2**(-32)
3032 auto Mul2 =
3033 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3034 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3035
3036 // -(2**32)
3037 auto Mad2 = B.buildFMAD(S32, Trunc,
3038 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3039
3040 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3041 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3042
3043 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3044}
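A rough host-side model of the commented sequence, with the bit patterns decoded (0x4f800000 is 2**32, 0x2f800000 is 2**(-32), 0xcf800000 is -(2**32)); this only illustrates the structure, not the accuracy of the hardware RCP, and plain 1.0f/x stands in for G_AMDGPU_RCP_IFLAG:

#include <cmath>
#include <cstdint>
#include <cstring>
#include <utility>

std::pair<uint32_t, uint32_t> approxReciprocalU64(uint64_t Val) {
  auto bitsToFloat = [](uint32_t Bits) {
    float F;
    std::memcpy(&F, &Bits, sizeof(F));
    return F;
  };
  float CvtLo = (float)(uint32_t)Val;
  float CvtHi = (float)(uint32_t)(Val >> 32);
  float Mad = CvtHi * bitsToFloat(0x4f800000u) + CvtLo; // 2**32
  float Rcp = 1.0f / Mad;                               // stands in for RCP_IFLAG
  float Mul1 = Rcp * bitsToFloat(0x5f7ffffcu);
  float Mul2 = Mul1 * bitsToFloat(0x2f800000u);         // 2**(-32)
  float Trunc = std::trunc(Mul2);
  float Mad2 = Trunc * bitsToFloat(0xcf800000u) + Mul1; // -(2**32)
  return {(uint32_t)Mad2, (uint32_t)Trunc};             // {lo, hi}
}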
3045
3046void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3047 Register DstDivReg,
3048 Register DstRemReg,
3049 Register Numer,
3050 Register Denom) const {
3051 const LLT S32 = LLT::scalar(32);
3052 const LLT S64 = LLT::scalar(64);
3053 const LLT S1 = LLT::scalar(1);
3054 Register RcpLo, RcpHi;
3055
3056 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3057
3058 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3059
3060 auto Zero64 = B.buildConstant(S64, 0);
3061 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3062
3063 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3064 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3065
3066 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3067 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3068 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3069
3070 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3071 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3072 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3073
3074 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3075 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3076 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3077 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3078 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3079
3080 auto Zero32 = B.buildConstant(S32, 0);
3081 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3082 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3083 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3084
3085 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3086 Register NumerLo = UnmergeNumer.getReg(0);
3087 Register NumerHi = UnmergeNumer.getReg(1);
3088
3089 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3090 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3091 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3092 Register Mul3_Lo = UnmergeMul3.getReg(0);
3093 Register Mul3_Hi = UnmergeMul3.getReg(1);
3094 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3095 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3096 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3097 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3098
3099 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3100 Register DenomLo = UnmergeDenom.getReg(0);
3101 Register DenomHi = UnmergeDenom.getReg(1);
3102
3103 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3104 auto C1 = B.buildSExt(S32, CmpHi);
3105
3106 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3107 auto C2 = B.buildSExt(S32, CmpLo);
3108
3109 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3110 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3111
3112 // TODO: Here and below portions of the code can be enclosed into if/endif.
3113 // Currently control flow is unconditional and we have 4 selects after
3114 // potential endif to substitute PHIs.
3115
3116 // if C3 != 0 ...
3117 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3118 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3119 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3120 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3121
3122 auto One64 = B.buildConstant(S64, 1);
3123 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3124
3125 auto C4 =
3126 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3127 auto C5 =
3128 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3129 auto C6 = B.buildSelect(
3130 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3131
3132 // if (C6 != 0)
3133 auto Add4 = B.buildAdd(S64, Add3, One64);
3134 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3135
3136 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3137 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3138 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3139
3140 // endif C6
3141 // endif C3
3142
3143 if (DstDivReg) {
3144 auto Sel1 = B.buildSelect(
3145 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3146 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3147 Sel1, MulHi3);
3148 }
3149
3150 if (DstRemReg) {
3151 auto Sel2 = B.buildSelect(
3152 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3153 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3154 Sel2, Sub1);
3155 }
3156}
3157
3158bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3159 MachineRegisterInfo &MRI,
3160 MachineIRBuilder &B) const {
3161 Register DstDivReg, DstRemReg;
3162 switch (MI.getOpcode()) {
3163 default:
3164 llvm_unreachable("Unexpected opcode!");
3165 case AMDGPU::G_UDIV: {
3166 DstDivReg = MI.getOperand(0).getReg();
3167 break;
3168 }
3169 case AMDGPU::G_UREM: {
3170 DstRemReg = MI.getOperand(0).getReg();
3171 break;
3172 }
3173 case AMDGPU::G_UDIVREM: {
3174 DstDivReg = MI.getOperand(0).getReg();
3175 DstRemReg = MI.getOperand(1).getReg();
3176 break;
3177 }
3178 }
3179
3180 const LLT S64 = LLT::scalar(64);
3181 const LLT S32 = LLT::scalar(32);
3182 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3183 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3184 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3185 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3186
3187 if (Ty == S32)
3188 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3189 else if (Ty == S64)
3190 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3191 else
3192 return false;
3193
3194 MI.eraseFromParent();
3195 return true;
3196}
3197
3198bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3199 MachineRegisterInfo &MRI,
3200 MachineIRBuilder &B) const {
3201 const LLT S64 = LLT::scalar(64);
3202 const LLT S32 = LLT::scalar(32);
3203
3204 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3205 if (Ty != S32 && Ty != S64)
3206 return false;
3207
3208 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3209 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3210 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3211
3212 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3213 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3214 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3215
3216 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3217 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3218
3219 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3220 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3221
3222 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3223 switch (MI.getOpcode()) {
3224 default:
3225 llvm_unreachable("Unexpected opcode!");
3226 case AMDGPU::G_SDIV: {
3227 DstDivReg = MI.getOperand(0).getReg();
3228 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3229 break;
3230 }
3231 case AMDGPU::G_SREM: {
3232 DstRemReg = MI.getOperand(0).getReg();
3233 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3234 break;
3235 }
3236 case AMDGPU::G_SDIVREM: {
3237 DstDivReg = MI.getOperand(0).getReg();
3238 DstRemReg = MI.getOperand(1).getReg();
3239 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3240 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3241 break;
3242 }
3243 }
3244
3245 if (Ty == S32)
3246 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3247 else
3248 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3249
3250 if (DstDivReg) {
3251 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3252 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3253 B.buildSub(DstDivReg, SignXor, Sign);
3254 }
3255
3256 if (DstRemReg) {
3257 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3258 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3259 B.buildSub(DstRemReg, SignXor, Sign);
3260 }
3261
3262 MI.eraseFromParent();
3263 return true;
3264}
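The ashr/add/xor dance above is the usual branch-free absolute-value-and-restore-sign pattern; a scalar sketch of the identities it relies on (function names are illustrative):

#include <cstdint>

// With S = X >> 31 (arithmetic shift: 0 for non-negative X, -1 for negative),
// (X + S) ^ S yields |X|, and (R ^ Sign) - Sign re-applies a sign afterwards.
int32_t absViaSignMask(int32_t X) {
  int32_t S = X >> 31;
  return (X + S) ^ S; // note: INT32_MIN maps to itself, as with most abs tricks
}

int32_t applySign(int32_t R, int32_t Sign) { // Sign is 0 or -1
  return (R ^ Sign) - Sign;
}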
3265
3266bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3267 MachineRegisterInfo &MRI,
3268 MachineIRBuilder &B) const {
3269 Register Res = MI.getOperand(0).getReg();
3270 Register LHS = MI.getOperand(1).getReg();
3271 Register RHS = MI.getOperand(2).getReg();
3272 uint16_t Flags = MI.getFlags();
3273 LLT ResTy = MRI.getType(Res);
3274
3275 const MachineFunction &MF = B.getMF();
3276 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3277 MI.getFlag(MachineInstr::FmAfn);
3278
3279 if (!AllowInaccurateRcp)
3280 return false;
3281
3282 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3283 // 1 / x -> RCP(x)
3284 if (CLHS->isExactlyValue(1.0)) {
3285 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3286 .addUse(RHS)
3287 .setMIFlags(Flags);
3288
3289 MI.eraseFromParent();
3290 return true;
3291 }
3292
3293 // -1 / x -> RCP( FNEG(x) )
3294 if (CLHS->isExactlyValue(-1.0)) {
3295 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3296 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3297 .addUse(FNeg.getReg(0))
3298 .setMIFlags(Flags);
3299
3300 MI.eraseFromParent();
3301 return true;
3302 }
3303 }
3304
3305 // x / y -> x * (1.0 / y)
3306 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3307 .addUse(RHS)
3308 .setMIFlags(Flags);
3309 B.buildFMul(Res, LHS, RCP, Flags);
3310
3311 MI.eraseFromParent();
3312 return true;
3313}
3314
3315bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3316 MachineRegisterInfo &MRI,
3317 MachineIRBuilder &B) const {
3318 Register Res = MI.getOperand(0).getReg();
3319 Register X = MI.getOperand(1).getReg();
3320 Register Y = MI.getOperand(2).getReg();
3321 uint16_t Flags = MI.getFlags();
3322 LLT ResTy = MRI.getType(Res);
3323
3324 const MachineFunction &MF = B.getMF();
3325 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3326 MI.getFlag(MachineInstr::FmAfn);
3327
3328 if (!AllowInaccurateRcp)
3329 return false;
3330
3331 auto NegY = B.buildFNeg(ResTy, Y);
3332 auto One = B.buildFConstant(ResTy, 1.0);
3333
3334 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3335 .addUse(Y)
3336 .setMIFlags(Flags);
3337
3338 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3339 R = B.buildFMA(ResTy, Tmp0, R, R);
3340
3341 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3342 R = B.buildFMA(ResTy, Tmp1, R, R);
3343
3344 auto Ret = B.buildFMul(ResTy, X, R);
3345 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3346
3347 B.buildFMA(Res, Tmp2, R, Ret);
3348 MI.eraseFromParent();
3349 return true;
3350}
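A scalar model of the refinement above, assuming some approximate reciprocal is supplied in place of amdgcn.rcp (the ApproxRcp parameter is an assumption of this sketch): two Newton-Raphson steps on r ~= 1/y via fma, then a residual correction of x*r.

#include <cmath>

double fastFDiv64Model(double X, double Y, double ApproxRcp) {
  double R = ApproxRcp;
  R = std::fma(std::fma(-Y, R, 1.0), R, R);      // Tmp0, first NR step
  R = std::fma(std::fma(-Y, R, 1.0), R, R);      // Tmp1, second NR step
  double Ret = X * R;
  return std::fma(std::fma(-Y, Ret, X), R, Ret); // Tmp2, residual correction
}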
3351
3352bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3353 MachineRegisterInfo &MRI,
3354 MachineIRBuilder &B) const {
3355 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3356 return true;
3357
3358 Register Res = MI.getOperand(0).getReg();
3359 Register LHS = MI.getOperand(1).getReg();
3360 Register RHS = MI.getOperand(2).getReg();
3361
3362 uint16_t Flags = MI.getFlags();
3363
3364 LLT S16 = LLT::scalar(16);
3365 LLT S32 = LLT::scalar(32);
3366
3367 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3368 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3369
3370 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3371 .addUse(RHSExt.getReg(0))
3372 .setMIFlags(Flags);
3373
3374 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3375 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3376
3377 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3378 .addUse(RDst.getReg(0))
3379 .addUse(RHS)
3380 .addUse(LHS)
3381 .setMIFlags(Flags);
3382
3383 MI.eraseFromParent();
3384 return true;
3385}
3386
3387// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3388// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3389static void toggleSPDenormMode(bool Enable,
3390 MachineIRBuilder &B,
3391 const GCNSubtarget &ST,
3392 AMDGPU::SIModeRegisterDefaults Mode) {
3393 // Set SP denorm mode to this value.
3394 unsigned SPDenormMode =
3395 Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();
3396
3397 if (ST.hasDenormModeInst()) {
3398 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3399 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3400
3401 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3402 B.buildInstr(AMDGPU::S_DENORM_MODE)
3403 .addImm(NewDenormModeValue);
3404
3405 } else {
3406 // Select FP32 bit field in mode register.
3407 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3408 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3409 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3410
3411 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3412 .addImm(SPDenormMode)
3413 .addImm(SPDenormModeBitField);
3414 }
3415}
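For the S_DENORM_MODE path above, the immediate simply packs the two 2-bit fields, with FP32 in the low bits and the FP64/FP16 default shifted up by two; a one-line sketch of that packing:

// NewDenormModeValue as built above: SP field in bits [1:0], DP/FP16 default
// in bits [3:2].
unsigned packDenormMode(unsigned SPDenormMode, unsigned DPDenormModeDefault) {
  return (SPDenormMode & 0x3) | ((DPDenormModeDefault & 0x3) << 2);
}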
3416
3417bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3418 MachineRegisterInfo &MRI,
3419 MachineIRBuilder &B) const {
3420 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3421 return true;
3422
3423 Register Res = MI.getOperand(0).getReg();
3424 Register LHS = MI.getOperand(1).getReg();
3425 Register RHS = MI.getOperand(2).getReg();
3426 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3427 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3428
3429 uint16_t Flags = MI.getFlags();
3430
3431 LLT S32 = LLT::scalar(32);
3432 LLT S1 = LLT::scalar(1);
3433
3434 auto One = B.buildFConstant(S32, 1.0f);
3435
3436 auto DenominatorScaled =
3437 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3438 .addUse(LHS)
3439 .addUse(RHS)
3440 .addImm(0)
3441 .setMIFlags(Flags);
3442 auto NumeratorScaled =
3443 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3444 .addUse(LHS)
3445 .addUse(RHS)
3446 .addImm(1)
3447 .setMIFlags(Flags);
3448
3449 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3450 .addUse(DenominatorScaled.getReg(0))
3451 .setMIFlags(Flags);
3452 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3453
3454 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3455 // aren't modeled as reading it.
3456 if (!Mode.allFP32Denormals())
3457 toggleSPDenormMode(true, B, ST, Mode);
3458
3459 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3460 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3461 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3462 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3463 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3464 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3465
3466 if (!Mode.allFP32Denormals())
3467 toggleSPDenormMode(false, B, ST, Mode);
3468
3469 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3470 .addUse(Fma4.getReg(0))
3471 .addUse(Fma1.getReg(0))
3472 .addUse(Fma3.getReg(0))
3473 .addUse(NumeratorScaled.getReg(1))
3474 .setMIFlags(Flags);
3475
3476 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3477 .addUse(Fmas.getReg(0))
3478 .addUse(RHS)
3479 .addUse(LHS)
3480 .setMIFlags(Flags);
3481
3482 MI.eraseFromParent();
3483 return true;
3484}
3485
3486bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3487 MachineRegisterInfo &MRI,
3488 MachineIRBuilder &B) const {
3489 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3490 return true;
3491
3492 Register Res = MI.getOperand(0).getReg();
3493 Register LHS = MI.getOperand(1).getReg();
3494 Register RHS = MI.getOperand(2).getReg();
3495
3496 uint16_t Flags = MI.getFlags();
3497
3498 LLT S64 = LLT::scalar(64);
3499 LLT S1 = LLT::scalar(1);
3500
3501 auto One = B.buildFConstant(S64, 1.0);
3502
3503 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3504 .addUse(LHS)
3505 .addUse(RHS)
3506 .addImm(0)
3507 .setMIFlags(Flags);
3508
3509 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3510
3511 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3512 .addUse(DivScale0.getReg(0))
3513 .setMIFlags(Flags);
3514
3515 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3516 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3517 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3518
3519 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3520 .addUse(LHS)
3521 .addUse(RHS)
3522 .addImm(1)
3523 .setMIFlags(Flags);
3524
3525 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3526 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3527 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3528
3529 Register Scale;
3530 if (!ST.hasUsableDivScaleConditionOutput()) {
3531 // Workaround a hardware bug on SI where the condition output from div_scale
3532 // is not usable.
3533
3534 LLT S32 = LLT::scalar(32);
3535
3536 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3537 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3538 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3539 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3540
3541 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3542 Scale1Unmerge.getReg(1));
3543 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3544 Scale0Unmerge.getReg(1));
3545 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3546 } else {
3547 Scale = DivScale1.getReg(1);
3548 }
3549
3550 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3551 .addUse(Fma4.getReg(0))
3552 .addUse(Fma3.getReg(0))
3553 .addUse(Mul.getReg(0))
3554 .addUse(Scale)
3555 .setMIFlags(Flags);
3556
3557 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3558 .addUse(Fmas.getReg(0))
3559 .addUse(RHS)
3560 .addUse(LHS)
3561 .setMIFlags(Flags);
3562
3563 MI.eraseFromParent();
3564 return true;
3565}
3566
3567bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3568 MachineRegisterInfo &MRI,
3569 MachineIRBuilder &B) const {
3570 Register Res = MI.getOperand(0).getReg();
3571 Register LHS = MI.getOperand(2).getReg();
3572 Register RHS = MI.getOperand(3).getReg();
3573 uint16_t Flags = MI.getFlags();
3574
3575 LLT S32 = LLT::scalar(32);
3576 LLT S1 = LLT::scalar(1);
3577
3578 auto Abs = B.buildFAbs(S32, RHS, Flags);
3579 const APFloat C0Val(1.0f);
3580
3581 auto C0 = B.buildConstant(S32, 0x6f800000);
3582 auto C1 = B.buildConstant(S32, 0x2f800000);
3583 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3584
3585 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3586 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3587
3588 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3589
3590 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3591 .addUse(Mul0.getReg(0))
3592 .setMIFlags(Flags);
3593
3594 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3595
3596 B.buildFMul(Res, Sel, Mul1, Flags);
3597
3598 MI.eraseFromParent();
3599 return true;
3600}
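The constants above implement a range-scaling trick: 0x6f800000 is 2**96 and 0x2f800000 is 2**(-32), so very large denominators are pre-scaled before taking the reciprocal and the scale is multiplied back in afterwards. A hedged scalar model, with an ordinary division standing in for amdgcn.rcp:

#include <cmath>

float fdivFastModel(float LHS, float RHS) {
  const float C0 = std::ldexp(1.0f, 96);  // 0x6f800000, 2**96
  const float C1 = std::ldexp(1.0f, -32); // 0x2f800000, 2**(-32)
  float Sel = (std::fabs(RHS) > C0) ? C1 : 1.0f;
  float Rcp = 1.0f / (RHS * Sel);         // stands in for amdgcn.rcp
  return Sel * (LHS * Rcp);               // Sel cancels, leaving LHS / RHS
}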
3601
3602// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3603// FIXME: Why do we handle this one but not other removed instructions?
3604//
3605// Reciprocal square root. The clamp prevents infinite results, clamping
3606// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3607// +-max_float.
3608bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3609 MachineRegisterInfo &MRI,
3610 MachineIRBuilder &B) const {
3611 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3612 return true;
3613
3614 Register Dst = MI.getOperand(0).getReg();
3615 Register Src = MI.getOperand(2).getReg();
3616 auto Flags = MI.getFlags();
3617
3618 LLT Ty = MRI.getType(Dst);
3619
3620 const fltSemantics *FltSemantics;
3621 if (Ty == LLT::scalar(32))
3622 FltSemantics = &APFloat::IEEEsingle();
3623 else if (Ty == LLT::scalar(64))
3624 FltSemantics = &APFloat::IEEEdouble();
3625 else
3626 return false;
3627
3628 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3629 .addUse(Src)
3630 .setMIFlags(Flags);
3631
3632 // We don't need to concern ourselves with the snan handling difference,
3633 // since rsq quiets (or not) either way; use the variant that will directly select.
3634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3635 const bool UseIEEE = MFI->getMode().IEEE;
3636
3637 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3638 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3639 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3640
3641 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3642
3643 if (UseIEEE)
3644 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3645 else
3646 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3647 MI.eraseFromParent();
3648 return true;
3649}
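A scalar model of the clamp being emitted above (the IEEE vs. non-IEEE min/max distinction is dropped here; this only illustrates clamping 1/sqrt to +/-max_float):

#include <cmath>
#include <limits>

float rsqClampModel(float X) {
  float R = 1.0f / std::sqrt(X);                        // rsq
  R = std::fmin(R, std::numeric_limits<float>::max());  // clamp +inf to +max_float
  R = std::fmax(R, -std::numeric_limits<float>::max()); // clamp -inf to -max_float
  return R;
}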
3650
3651static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3652 switch (IID) {
3653 case Intrinsic::amdgcn_ds_fadd:
3654 return AMDGPU::G_ATOMICRMW_FADD;
3655 case Intrinsic::amdgcn_ds_fmin:
3656 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3657 case Intrinsic::amdgcn_ds_fmax:
3658 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3659 default:
3660 llvm_unreachable("not a DS FP intrinsic");
3661 }
3662}
3663
3664bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3665 MachineInstr &MI,
3666 Intrinsic::ID IID) const {
3667 GISelChangeObserver &Observer = Helper.Observer;
3668 Observer.changingInstr(MI);
3669
3670 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3671
3672 // The remaining operands were used to set fields in the MemOperand on
3673 // construction.
3674 for (int I = 6; I > 3; --I)
3675 MI.RemoveOperand(I);
3676
3677 MI.RemoveOperand(1); // Remove the intrinsic ID.
3678 Observer.changedInstr(MI);
3679 return true;
3680}
3681
3682bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3683 MachineRegisterInfo &MRI,
3684 MachineIRBuilder &B) const {
3685 uint64_t Offset =
3686 ST.getTargetLowering()->getImplicitParameterOffset(
3687 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3688 LLT DstTy = MRI.getType(DstReg);
3689 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3690
3691 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3692 if (!loadInputValue(KernargPtrReg, B,
3693 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3694 return false;
3695
3696 // FIXME: This should be nuw
3697 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3698 return true;
3699}
3700
3701bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3702 MachineRegisterInfo &MRI,
3703 MachineIRBuilder &B) const {
3704 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3705 if (!MFI->isEntryFunction()) {
3706 return legalizePreloadedArgIntrin(MI, MRI, B,
3707 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3708 }
3709
3710 Register DstReg = MI.getOperand(0).getReg();
3711 if (!getImplicitArgPtr(DstReg, MRI, B))
3712 return false;
3713
3714 MI.eraseFromParent();
3715 return true;
3716}
3717
3718bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3719 MachineRegisterInfo &MRI,
3720 MachineIRBuilder &B,
3721 unsigned AddrSpace) const {
3722 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3723 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3724 Register Hi32 = Unmerge.getReg(1);
3725
3726 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3727 MI.eraseFromParent();
3728 return true;
3729}
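The comparison above encodes a simple rule: a flat pointer belongs to the queried segment exactly when its high 32 bits equal that segment's aperture base. As a sketch:

#include <cstdint>

bool isInSegment(uint64_t FlatPtr, uint32_t ApertureHi) {
  return (uint32_t)(FlatPtr >> 32) == ApertureHi;
}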
3730
3731// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3732// offset (the offset that is included in bounds checking and swizzling, to be
3733// split between the instruction's voffset and immoffset fields) and soffset
3734// (the offset that is excluded from bounds checking and swizzling, to go in
3735// the instruction's soffset field). This function takes the first kind of
3736// offset and figures out how to split it between voffset and immoffset.
3737std::pair<Register, unsigned>
3738AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3739 Register OrigOffset) const {
3740 const unsigned MaxImm = 4095;
3741 Register BaseReg;
3742 unsigned ImmOffset;
3743 const LLT S32 = LLT::scalar(32);
3744 MachineRegisterInfo &MRI = *B.getMRI();
3745
3746 std::tie(BaseReg, ImmOffset) =
3747 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3748
3749 // If BaseReg is a pointer, convert it to int.
3750 if (MRI.getType(BaseReg).isPointer())
3751 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3752
3753 // If the immediate value is too big for the immoffset field, put the value
3754 // and -4096 into the immoffset field so that the value that is copied/added
3755 // for the voffset field is a multiple of 4096, and it stands more chance
3756 // of being CSEd with the copy/add for another similar load/store.
3757 // However, do not do that rounding down to a multiple of 4096 if that is a
3758 // negative number, as it appears to be illegal to have a negative offset
3759 // in the vgpr, even if adding the immediate offset makes it positive.
3760 unsigned Overflow = ImmOffset & ~MaxImm;
3761 ImmOffset -= Overflow;
3762 if ((int32_t)Overflow < 0) {
3763 Overflow += ImmOffset;
3764 ImmOffset = 0;
3765 }
3766
3767 if (Overflow != 0) {
3768 if (!BaseReg) {
3769 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3770 } else {
3771 auto OverflowVal = B.buildConstant(S32, Overflow);
3772 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3773 }
3774 }
3775
3776 if (!BaseReg)
3777 BaseReg = B.buildConstant(S32, 0).getReg(0);
3778
3779 return std::make_pair(BaseReg, ImmOffset);
3780}
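The constant part of the split can be modelled on plain integers; this sketch covers only the already-constant portion handled above (the register add is omitted), with the same refusal to leave a negative amount on the voffset side:

#include <cstdint>
#include <utility>

// Returns {amount to add to voffset, 12-bit immoffset}.
std::pair<uint32_t, uint32_t> splitConstOffset(uint32_t Offset) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = Offset;
  uint32_t Overflow = ImmOffset & ~MaxImm; // multiple-of-4096 part for voffset
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {             // don't leave a negative voffset
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}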
3781
3782/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3783void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3784 Register VOffset, Register SOffset,
3785 unsigned ImmOffset, Register VIndex,
3786 MachineRegisterInfo &MRI) const {
3787 Optional<ValueAndVReg> MaybeVOffsetVal =
3788 getIConstantVRegValWithLookThrough(VOffset, MRI);
3789 Optional<ValueAndVReg> MaybeSOffsetVal =
3790 getIConstantVRegValWithLookThrough(SOffset, MRI);
3791 Optional<ValueAndVReg> MaybeVIndexVal =
3792 getIConstantVRegValWithLookThrough(VIndex, MRI);
3793 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3794 // update the MMO with that offset. The stride is unknown so we can only do
3795 // this if VIndex is constant 0.
3796 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3797 MaybeVIndexVal->Value == 0) {
3798 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3799 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3800 MMO->setOffset(TotalOffset);
3801 } else {
3802 // We don't have a constant combined offset to use in the MMO. Give up.
3803 MMO->setValue((Value *)nullptr);
3804 }
3805}
3806
3807/// Handle register layout difference for f16 images for some subtargets.
3808Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3809 MachineRegisterInfo &MRI,
3810 Register Reg,
3811 bool ImageStore) const {
3812 const LLT S16 = LLT::scalar(16);
3813 const LLT S32 = LLT::scalar(32);
3814 LLT StoreVT = MRI.getType(Reg);
3815 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3816
3817 if (ST.hasUnpackedD16VMem()) {
3818 auto Unmerge = B.buildUnmerge(S16, Reg);
3819
3820 SmallVector<Register, 4> WideRegs;
3821 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3822 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3823
3824 int NumElts = StoreVT.getNumElements();
3825
3826 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3827 .getReg(0);
3828 }
3829
3830 if (ImageStore && ST.hasImageStoreD16Bug()) {
3831 if (StoreVT.getNumElements() == 2) {
3832 SmallVector<Register, 4> PackedRegs;
3833 Reg = B.buildBitcast(S32, Reg).getReg(0);
3834 PackedRegs.push_back(Reg);
3835 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3836 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3837 .getReg(0);
3838 }
3839
3840 if (StoreVT.getNumElements() == 3) {
3841 SmallVector<Register, 4> PackedRegs;
3842 auto Unmerge = B.buildUnmerge(S16, Reg);
3843 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3844 PackedRegs.push_back(Unmerge.getReg(I));
3845 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3846 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3847 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3848 }
3849
3850 if (StoreVT.getNumElements() == 4) {
3851 SmallVector<Register, 4> PackedRegs;
3852 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3853 auto Unmerge = B.buildUnmerge(S32, Reg);
3854 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3855 PackedRegs.push_back(Unmerge.getReg(I));
3856 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3857 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3858 .getReg(0);
3859 }
3860
3861 llvm_unreachable("invalid data type");
3862 }
3863
3864 if (StoreVT == LLT::fixed_vector(3, S16)) {
3865 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3866 .getReg(0);
3867 }
3868 return Reg;
3869}
3870
3871Register AMDGPULegalizerInfo::fixStoreSourceType(
3872 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3873 MachineRegisterInfo *MRI = B.getMRI();
3874 LLT Ty = MRI->getType(VData);
3875
3876 const LLT S16 = LLT::scalar(16);
3877
3878 // Fixup illegal register types for i8 stores.
3879 if (Ty == LLT::scalar(8) || Ty == S16) {
3880 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3881 return AnyExt;
3882 }
3883
3884 if (Ty.isVector()) {
3885 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3886 if (IsFormat)
3887 return handleD16VData(B, *MRI, VData);
3888 }
3889 }
3890
3891 return VData;
3892}
3893
3894bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3895 MachineRegisterInfo &MRI,
3896 MachineIRBuilder &B,
3897 bool IsTyped,
3898 bool IsFormat) const {
3899 Register VData = MI.getOperand(1).getReg();
3900 LLT Ty = MRI.getType(VData);
3901 LLT EltTy = Ty.getScalarType();
3902 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3903 const LLT S32 = LLT::scalar(32);
3904
3905 VData = fixStoreSourceType(B, VData, IsFormat);
3906 Register RSrc = MI.getOperand(2).getReg();
3907
3908 MachineMemOperand *MMO = *MI.memoperands_begin();
3909 const int MemSize = MMO->getSize();
3910
3911 unsigned ImmOffset;
3912
3913 // The typed intrinsics add an immediate after the registers.
3914 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3915
3916 // The struct intrinsic variants add one additional operand over raw.
3917 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3918 Register VIndex;
3919 int OpOffset = 0;
3920 if (HasVIndex) {
3921 VIndex = MI.getOperand(3).getReg();
3922 OpOffset = 1;
3923 } else {
3924 VIndex = B.buildConstant(S32, 0).getReg(0);
3925 }
3926
3927 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3928 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3929
3930 unsigned Format = 0;
3931 if (IsTyped) {
3932 Format = MI.getOperand(5 + OpOffset).getImm();
3933 ++OpOffset;
3934 }
3935
3936 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3937
3938 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3939 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3940
3941 unsigned Opc;
3942 if (IsTyped) {
3943 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3944 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3945 } else if (IsFormat) {
3946 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3947 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3948 } else {
3949 switch (MemSize) {
3950 case 1:
3951 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3952 break;
3953 case 2:
3954 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3955 break;
3956 default:
3957 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3958 break;
3959 }
3960 }
3961
3962 auto MIB = B.buildInstr(Opc)
3963 .addUse(VData) // vdata
3964 .addUse(RSrc) // rsrc
3965 .addUse(VIndex) // vindex
3966 .addUse(VOffset) // voffset
3967 .addUse(SOffset) // soffset
3968 .addImm(ImmOffset); // offset(imm)
3969
3970 if (IsTyped)
3971 MIB.addImm(Format);
3972
3973 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3974 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3975 .addMemOperand(MMO);
3976
3977 MI.eraseFromParent();
3978 return true;
3979}
3980
3981bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3982 MachineRegisterInfo &MRI,
3983 MachineIRBuilder &B,
3984 bool IsFormat,
3985 bool IsTyped) const {
3986 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3987 MachineMemOperand *MMO = *MI.memoperands_begin();
3988 const LLT MemTy = MMO->getMemoryType();
3989 const LLT S32 = LLT::scalar(32);
3990
3991 Register Dst = MI.getOperand(0).getReg();
3992 Register RSrc = MI.getOperand(2).getReg();
3993
3994 // The typed intrinsics add an immediate after the registers.
3995 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3996
3997 // The struct intrinsic variants add one additional operand over raw.
3998 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3999 Register VIndex;
4000 int OpOffset = 0;
4001 if (HasVIndex) {
4002 VIndex = MI.getOperand(3).getReg();
4003 OpOffset = 1;
4004 } else {
4005 VIndex = B.buildConstant(S32, 0).getReg(0);
4006 }
4007
4008 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4009 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4010
4011 unsigned Format = 0;
4012 if (IsTyped) {
4013 Format = MI.getOperand(5 + OpOffset).getImm();
4014 ++OpOffset;
4015 }
4016
4017 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4018 unsigned ImmOffset;
4019
4020 LLT Ty = MRI.getType(Dst);
4021 LLT EltTy = Ty.getScalarType();
4022 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4023 const bool Unpacked = ST.hasUnpackedD16VMem();
4024
4025 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4026 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4027
4028 unsigned Opc;
4029
4030 if (IsTyped) {
4031 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4032 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4033 } else if (IsFormat) {
4034 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4035 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4036 } else {
4037 switch (MemTy.getSizeInBits()) {
4038 case 8:
4039 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4040 break;
4041 case 16:
4042 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4043 break;
4044 default:
4045 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4046 break;
4047 }
4048 }
4049
4050 Register LoadDstReg;
4051
4052 bool IsExtLoad =
4053 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4054 LLT UnpackedTy = Ty.changeElementSize(32);
4055
4056 if (IsExtLoad)
4057 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4058 else if (Unpacked && IsD16 && Ty.isVector())
4059 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4060 else
4061 LoadDstReg = Dst;
4062
4063 auto MIB = B.buildInstr(Opc)
4064 .addDef(LoadDstReg) // vdata
4065 .addUse(RSrc) // rsrc
4066 .addUse(VIndex) // vindex
4067 .addUse(VOffset) // voffset
4068 .addUse(SOffset) // soffset
4069 .addImm(ImmOffset); // offset(imm)
4070
4071 if (IsTyped)
4072 MIB.addImm(Format);
4073
4074 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4075 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4076 .addMemOperand(MMO);
4077
4078 if (LoadDstReg != Dst) {
4079 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4080
4081 // The result of an extending load was widened; truncate it back down.
4082 if (IsExtLoad)
4083 B.buildTrunc(Dst, LoadDstReg);
4084 else {
4085 // Repack to original 16-bit vector result
4086 // FIXME: G_TRUNC should work, but legalization currently fails
4087 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4088 SmallVector<Register, 4> Repack;
4089 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4090 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4091 B.buildMerge(Dst, Repack);
4092 }
4093 }
4094
4095 MI.eraseFromParent();
4096 return true;
4097}
4098
4099bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4100 MachineIRBuilder &B,
4101 bool IsInc) const {
4102 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4103 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4104 B.buildInstr(Opc)
4105 .addDef(MI.getOperand(0).getReg())
4106 .addUse(MI.getOperand(2).getReg())
4107 .addUse(MI.getOperand(3).getReg())
4108 .cloneMemRefs(MI);
4109 MI.eraseFromParent();
4110 return true;
4111}
4112
4113static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4114 switch (IntrID) {
4115 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4116 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4117 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4118 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4119 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4120 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4121 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4122 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4123 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4124 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4125 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4126 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4127 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4128 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4130 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4131 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4132 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4133 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4134 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4135 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4136 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4137 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4138 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4139 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4140 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4141 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4142 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4143 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4145 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4146 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4147 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4148 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4149 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4150 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4151 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4152 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4153 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4154 case Intrinsic::amdgcn_buffer_atomic_fadd:
4155 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4156 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4158 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4159 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4160 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4161 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4162 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4163 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4164 default:
4165 llvm_unreachable("unhandled atomic opcode");
4166 }
4167}
4168
4169bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4170 MachineIRBuilder &B,
4171 Intrinsic::ID IID) const {
4172 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4173 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4174 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4175
4176 Register Dst;
4177
4178 int OpOffset = 0;
4179 if (HasReturn) {
4180 // A few FP atomics do not support return values.
4181 Dst = MI.getOperand(0).getReg();
4182 } else {
4183 OpOffset = -1;
4184 }
4185
4186 Register VData = MI.getOperand(2 + OpOffset).getReg();
4187 Register CmpVal;
4188
4189 if (IsCmpSwap) {
4190 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4191 ++OpOffset;
4192 }
4193
4194 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4195 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4196
4197 // The struct intrinsic variants add one additional operand over raw.
4198 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4199 Register VIndex;
4200 if (HasVIndex) {
4201 VIndex = MI.getOperand(4 + OpOffset).getReg();
4202 ++OpOffset;
4203 } else {
4204 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4205 }
4206
4207 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4208 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4209 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4210
4211 MachineMemOperand *MMO = *MI.memoperands_begin();
4212
4213 unsigned ImmOffset;
4214 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4215 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4216
4217 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4218
4219 if (HasReturn)
4220 MIB.addDef(Dst);
4221
4222 MIB.addUse(VData); // vdata
4223
4224 if (IsCmpSwap)
4225 MIB.addReg(CmpVal);
4226
4227 MIB.addUse(RSrc) // rsrc
4228 .addUse(VIndex) // vindex
4229 .addUse(VOffset) // voffset
4230 .addUse(SOffset) // soffset
4231 .addImm(ImmOffset) // offset(imm)
4232 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4233 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4234 .addMemOperand(MMO);
4235
4236 MI.eraseFromParent();
4237 return true;
4238}
4239
4240/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4241/// vector with s16 typed elements.
4242static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4243 SmallVectorImpl<Register> &PackedAddrs,
4244 unsigned ArgOffset,
4245 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4246 bool IsA16, bool IsG16) {
4247 const LLT S16 = LLT::scalar(16);
4248 const LLT V2S16 = LLT::fixed_vector(2, 16);
4249 auto EndIdx = Intr->VAddrEnd;
4250
4251 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4252 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4253 if (!SrcOp.isReg())
4254 continue; // _L to _LZ may have eliminated this.
4255
4256 Register AddrReg = SrcOp.getReg();
4257
4258 if ((I < Intr->GradientStart) ||
4259 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4260 (I >= Intr->CoordStart && !IsA16)) {
4261 // Handle any gradient or coordinate operands that should not be packed
4262 if ((I < Intr->GradientStart) && IsA16 &&
4263 (B.getMRI()->getType(AddrReg) == S16)) {
4264 // Special handling of bias when A16 is on. Bias is of type half but
4265 // occupies full 32-bit.
4266 PackedAddrs.push_back(
4267 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4268 .getReg(0));
4269 } else {
4270 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4271 PackedAddrs.push_back(AddrReg);
4272 }
4273 } else {
4274 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4275 // derivatives dx/dh and dx/dv are packed with undef.
4276 if (((I + 1) >= EndIdx) ||
4277 ((Intr->NumGradients / 2) % 2 == 1 &&
4278 (I == static_cast<unsigned>(Intr->GradientStart +
4279 (Intr->NumGradients / 2) - 1) ||
4280 I == static_cast<unsigned>(Intr->GradientStart +
4281 Intr->NumGradients - 1))) ||
4282 // Check for _L to _LZ optimization
4283 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4284 PackedAddrs.push_back(
4285 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4286 .getReg(0));
4287 } else {
4288 PackedAddrs.push_back(
4289 B.buildBuildVector(
4290 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4291 .getReg(0));
4292 ++I;
4293 }
4294 }
4295 }
4296}
4297
4298/// Convert from separate vaddr components to a single vector address register,
4299/// and replace the remaining operands with $noreg.
4300static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4301 int DimIdx, int NumVAddrs) {
4302 const LLT S32 = LLT::scalar(32);
4303
4304 SmallVector<Register, 8> AddrRegs;
4305 for (int I = 0; I != NumVAddrs; ++I) {
4306 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4307 if (SrcOp.isReg()) {
4308 AddrRegs.push_back(SrcOp.getReg());
4309 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4310 }
4311 }
4312
4313 int NumAddrRegs = AddrRegs.size();
4314 if (NumAddrRegs != 1) {
4315 // Above 8 elements round up to next power of 2 (i.e. 16).
4316 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4317 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4318 auto Undef = B.buildUndef(S32);
4319 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4320 NumAddrRegs = RoundedNumRegs;
4321 }
4322
4323 auto VAddr =
4324 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4325 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4326 }
4327
4328 for (int I = 1; I != NumVAddrs; ++I) {
4329 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4330 if (SrcOp.isReg())
4331 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4332 }
4333}
4334
4335/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4336///
4337/// Depending on the subtarget, load/store with 16-bit element data need to be
4338/// rewritten to use the low half of 32-bit registers, or directly use a packed
4339/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4340/// registers.
4341///
4342/// We don't want to directly select image instructions just yet, but also want
4343 /// to expose all register repacking to the legalizer/combiners. We also don't
4344 /// want a selected instruction entering RegBankSelect. In order to avoid
4345/// defining a multitude of intermediate image instructions, directly hack on
4346/// the intrinsic's arguments. In cases like a16 addresses, this requires
4347/// padding now unnecessary arguments with $noreg.
4348bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4349 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4350 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4351
4352 const unsigned NumDefs = MI.getNumExplicitDefs();
4353 const unsigned ArgOffset = NumDefs + 1;
4354 bool IsTFE = NumDefs == 2;
4355 // We are only processing the operands of d16 image operations on subtargets
4356 // that use the unpacked register layout, or need to repack the TFE result.
4357
4358 // TODO: Do we need to guard against already legalized intrinsics?
4359 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4360 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4361
4362 MachineRegisterInfo *MRI = B.getMRI();
4363 const LLT S32 = LLT::scalar(32);
4364 const LLT S16 = LLT::scalar(16);
4365 const LLT V2S16 = LLT::fixed_vector(2, 16);
4366
4367 unsigned DMask = 0;
4368 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4369 LLT Ty = MRI->getType(VData);
4370
4371 // Check for 16 bit addresses and pack if true.
4372 LLT GradTy =
4373 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4374 LLT AddrTy =
4375 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4376 const bool IsG16 = GradTy == S16;
4377 const bool IsA16 = AddrTy == S16;
4378 const bool IsD16 = Ty.getScalarType() == S16;
4379
4380 int DMaskLanes = 0;
4381 if (!BaseOpcode->Atomic) {
4382 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4383 if (BaseOpcode->Gather4) {
4384 DMaskLanes = 4;
4385 } else if (DMask != 0) {
4386 DMaskLanes = countPopulation(DMask);
4387 } else if (!IsTFE && !BaseOpcode->Store) {
4388 // If dmask is 0, this is a no-op load. This can be eliminated.
4389 B.buildUndef(MI.getOperand(0));
4390 MI.eraseFromParent();
4391 return true;
4392 }
4393 }
4394
4395 Observer.changingInstr(MI);
4396 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4397
4398 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4399 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4400 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4401 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4402 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4403
4404 // Track that we legalized this
4405 MI.setDesc(B.getTII().get(NewOpcode));
4406
4407 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4408 // dmask to be at least 1, otherwise the instruction will fail.
4409 if (IsTFE && DMask == 0) {
4410 DMask = 0x1;
4411 DMaskLanes = 1;
4412 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4413 }
4414
4415 if (BaseOpcode->Atomic) {
4416 Register VData0 = MI.getOperand(2).getReg();
4417 LLT Ty = MRI->getType(VData0);
4418
4419 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4420 if (Ty.isVector())
4421 return false;
4422
4423 if (BaseOpcode->AtomicX2) {
4424 Register VData1 = MI.getOperand(3).getReg();
4425 // The two values are packed in one register.
4426 LLT PackedTy = LLT::fixed_vector(2, Ty);
4427 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4428 MI.getOperand(2).setReg(Concat.getReg(0));
4429 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4430 }
4431 }
4432
4433 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4434
4435 // Optimize _L to _LZ when _L is zero
4436 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4437 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4438 const ConstantFP *ConstantLod;
4439
4440 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4441 m_GFCst(ConstantLod))) {
4442 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4443 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4444 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4445 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4446 Intr->Dim);
4447
4448 // The starting indexes should remain in the same place.
4449 --CorrectedNumVAddrs;
4450
4451 MI.getOperand(NumDefs).setIntrinsicID(
4452 static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4453 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4454 Intr = NewImageDimIntr;
4455 }
4456 }
4457 }
4458
4459 // Optimize _mip away, when 'lod' is zero
4460 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4461 int64_t ConstantLod;
4462 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4463 m_ICst(ConstantLod))) {
4464 if (ConstantLod == 0) {
4465 // TODO: Change the intrinsic opcode and remove the operand instead of
4466 // replacing it with 0, as is done for the _L to _LZ handling above.
4467 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4468 --CorrectedNumVAddrs;
4469 }
4470 }
4471 }
4472
4473 // Rewrite the addressing register layout before doing anything else.
4474 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4475 // 16 bit gradients are supported, but are tied to the A16 control
4476 // so both gradients and addresses must be 16 bit
4477 return false;
4478 }
4479
4480 if (IsA16 && !ST.hasA16()) {
4481 // A16 not supported
4482 return false;
4483 }
4484
4485 if (IsA16 || IsG16) {
4486 if (Intr->NumVAddrs > 1) {
4487 SmallVector<Register, 4> PackedRegs;
4488
4489 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4490 IsG16);
4491
4492 // See also below in the non-a16 branch
4493 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4494 PackedRegs.size() <= ST.getNSAMaxSize();
4495
4496 if (!UseNSA && PackedRegs.size() > 1) {
4497 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4498 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4499 PackedRegs[0] = Concat.getReg(0);
4500 PackedRegs.resize(1);
4501 }
4502
4503 const unsigned NumPacked = PackedRegs.size();
4504 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4505 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4506 if (!SrcOp.isReg()) {
4507 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4508 continue;
4509 }
4510
4511 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4512
4513 if (I - Intr->VAddrStart < NumPacked)
4514 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4515 else
4516 SrcOp.setReg(AMDGPU::NoRegister);
4517 }
4518 }
4519 } else {
4520 // If the register allocator cannot place the address registers contiguously
4521 // without introducing moves, then using the non-sequential address encoding
4522 // is always preferable, since it saves VALU instructions and is usually a
4523 // wash in terms of code size or even better.
4524 //
4525 // However, we currently have no way of hinting to the register allocator
4526 // that MIMG addresses should be placed contiguously when it is possible to
4527 // do so, so force non-NSA for the common 2-address case as a heuristic.
4528 //
4529 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4530 // allocation when possible.
4531 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4532 CorrectedNumVAddrs <= ST.getNSAMaxSize();
4533
4534 if (!UseNSA && Intr->NumVAddrs > 1)
4535 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4536 Intr->NumVAddrs);
4537 }
4538
4539 int Flags = 0;
4540 if (IsA16)
4541 Flags |= 1;
4542 if (IsG16)
4543 Flags |= 2;
4544 MI.addOperand(MachineOperand::CreateImm(Flags));
4545
4546 if (BaseOpcode->Store) { // No TFE for stores?
4547 // TODO: Handle dmask trim
4548 if (!Ty.isVector() || !IsD16)
4549 return true;
4550
4551 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4552 if (RepackedReg != VData) {
4553 MI.getOperand(1).setReg(RepackedReg);
4554 }
4555
4556 return true;
4557 }
4558
4559 Register DstReg = MI.getOperand(0).getReg();
4560 const LLT EltTy = Ty.getScalarType();
4561 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4562
4563 // Confirm that the return type is large enough for the dmask specified
4564 if (NumElts < DMaskLanes)
4565 return false;
4566
4567 if (NumElts > 4 || DMaskLanes > 4)
4568 return false;
4569
4570 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4571 const LLT AdjustedTy =
4572 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4573
4574 // The raw dword aligned data component of the load. The only legal cases
4575 // where this matters should be when using the packed D16 format, for
4576 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4577 LLT RoundedTy;
4578
4579 // S32 vector to cover all data, plus TFE result element.
4580 LLT TFETy;
4581
4582 // Register type to use for each loaded component. Will be S32 or V2S16.
4583 LLT RegTy;
4584
4585 if (IsD16 && ST.hasUnpackedD16VMem()) {
4586 RoundedTy =
4587 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4588 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4589 RegTy = S32;
4590 } else {
4591 unsigned EltSize = EltTy.getSizeInBits();
4592 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4593 unsigned RoundedSize = 32 * RoundedElts;
4594 RoundedTy = LLT::scalarOrVector(
4595 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4596 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4597 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4598 }
4599
4600 // The return type does not need adjustment.
4601 // TODO: Should we change s16 case to s32 or <2 x s16>?
4602 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4603 return true;
4604
4605 Register Dst1Reg;
4606
4607 // Insert after the instruction.
4608 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4609
4610 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4611 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4612 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4613 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4614
4615 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4616
4617 MI.getOperand(0).setReg(NewResultReg);
4618
4619 // In the IR, TFE is supposed to be used with a 2 element struct return
4620 // type. The instruction really returns these two values in one contiguous
4621 // register, with one additional dword beyond the loaded data. Rewrite the
4622 // return type to use a single register result.
4623
4624 if (IsTFE) {
4625 Dst1Reg = MI.getOperand(1).getReg();
4626 if (MRI->getType(Dst1Reg) != S32)
4627 return false;
4628
4629 // TODO: Make sure the TFE operand bit is set.
4630 MI.RemoveOperand(1);
4631
4632 // Handle the easy case that requires no repack instructions.
4633 if (Ty == S32) {
4634 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4635 return true;
4636 }
4637 }
4638
4639 // Now figure out how to copy the new result register back into the old
4640 // result.
4641 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4642
4643 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4644
4645 if (ResultNumRegs == 1) {
4646 assert(!IsTFE);
4647 ResultRegs[0] = NewResultReg;
4648 } else {
4649 // We have to repack into a new vector of some kind.
4650 for (int I = 0; I != NumDataRegs; ++I)
4651 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4652 B.buildUnmerge(ResultRegs, NewResultReg);
4653
4654 // Drop the final TFE element to get the data part. The TFE result is
4655 // directly written to the right place already.
4656 if (IsTFE)
4657 ResultRegs.resize(NumDataRegs);
4658 }
4659
4660 // For an s16 scalar result, we form an s32 result with a truncate regardless
4661 // of packed vs. unpacked.
4662 if (IsD16 && !Ty.isVector()) {
4663 B.buildTrunc(DstReg, ResultRegs[0]);
4664 return true;
4665 }
4666
4667 // Avoid a build/concat_vector of 1 entry.
4668 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4669 B.buildBitcast(DstReg, ResultRegs[0]);
4670 return true;
4671 }
4672
4673 assert(Ty.isVector());
4674
4675 if (IsD16) {
4676 // For packed D16 results with TFE enabled, all the data components are
4677 // S32. Cast back to the expected type.
4678 //
4679 // TODO: We don't really need to load s32 elements. We would only need one
4680 // cast for the TFE result if a multiple of v2s16 was used.
4681 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4682 for (Register &Reg : ResultRegs)
4683 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4684 } else if (ST.hasUnpackedD16VMem()) {
4685 for (Register &Reg : ResultRegs)
4686 Reg = B.buildTrunc(S16, Reg).getReg(0);
4687 }
4688 }
4689
4690 auto padWithUndef = [&](LLT Ty, int NumElts) {
4691 if (NumElts == 0)
4692 return;
4693 Register Undef = B.buildUndef(Ty).getReg(0);
4694 for (int I = 0; I != NumElts; ++I)
4695 ResultRegs.push_back(Undef);
4696 };
4697
4698 // Pad out any elements eliminated due to the dmask.
4699 LLT ResTy = MRI->getType(ResultRegs[0]);
4700 if (!ResTy.isVector()) {
4701 padWithUndef(ResTy, NumElts - ResultRegs.size());
4702 B.buildBuildVector(DstReg, ResultRegs);
4703 return true;
4704 }
4705
4706 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4707 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4708
4709 // Deal with the one annoying legal case.
4710 const LLT V3S16 = LLT::fixed_vector(3, 16);
4711 if (Ty == V3S16) {
4712 if (IsTFE) {
4713 if (ResultRegs.size() == 1) {
4714 NewResultReg = ResultRegs[0];
4715 } else if (ResultRegs.size() == 2) {
4716 LLT V4S16 = LLT::fixed_vector(4, 16);
4717 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4718 } else {
4719 return false;
4720 }
4721 }
4722
4723 if (MRI->getType(DstReg).getNumElements() <
4724 MRI->getType(NewResultReg).getNumElements()) {
4725 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4726 } else {
4727 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4728 }
4729 return true;
4730 }
4731
4732 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4733 B.buildConcatVectors(DstReg, ResultRegs);
4734 return true;
4735}
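
Aside, not taken from the listed file: a minimal arithmetic sketch of the dword rounding the load path above performs when TFE is enabled. The loaded data is rounded up to whole 32-bit registers and one extra dword carries the TFE status; the struct and function names here are hypothetical.

#include <cassert>

struct TFELayout {
  unsigned DataDwords;  // dwords covering the loaded data
  unsigned TotalDwords; // data dwords plus the trailing TFE status dword
};

static TFELayout tfeLayoutForLoad(unsigned DataSizeInBits) {
  TFELayout Layout;
  Layout.DataDwords = (DataSizeInBits + 31) / 32; // round up to 32-bit units
  Layout.TotalDwords = Layout.DataDwords + 1;     // one extra element for TFE
  return Layout;
}

void tfeLayoutExamples() {
  // <3 x s16> carries 48 bits of data: 2 data dwords, 3 dwords with TFE.
  assert(tfeLayoutForLoad(48).TotalDwords == 3);
  // s32: 1 data dword, 2 dwords with TFE (the easy unmerge case above).
  assert(tfeLayoutForLoad(32).TotalDwords == 2);
}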
4736
4737bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4738 LegalizerHelper &Helper, MachineInstr &MI) const {
4739 MachineIRBuilder &B = Helper.MIRBuilder;
4740 GISelChangeObserver &Observer = Helper.Observer;
4741
4742 Register Dst = MI.getOperand(0).getReg();
4743 LLT Ty = B.getMRI()->getType(Dst);
4744 unsigned Size = Ty.getSizeInBits();
4745 MachineFunction &MF = B.getMF();
4746
4747 Observer.changingInstr(MI);
4748
4749 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4750 Ty = getBitcastRegisterType(Ty);
4751 Helper.bitcastDst(MI, Ty, 0);
4752 Dst = MI.getOperand(0).getReg();
4753 B.setInsertPt(B.getMBB(), MI);
4754 }
4755
4756 // FIXME: We don't really need this intermediate instruction. The intrinsic
4757 // should be fixed to have a memory operand. Since it's readnone, we're not
4758 // allowed to add one.
4759 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4760 MI.RemoveOperand(1); // Remove intrinsic ID
4761
4762 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4763 // TODO: Should this use datalayout alignment?
4764 const unsigned MemSize = (Size + 7) / 8;
4765 const Align MemAlign(4);
4766 MachineMemOperand *MMO = MF.getMachineMemOperand(
4767 MachinePointerInfo(),
4768 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4769 MachineMemOperand::MOInvariant,
4770 MemSize, MemAlign);
4771 MI.addMemOperand(MF, MMO);
4772
4773 // There are no 96-bit result scalar loads, but widening to 128-bit should
4774 // always be legal. We may need to restore this to a 96-bit result if it turns
4775 // out this needs to be converted to a vector load during RegBankSelect.
4776 if (!isPowerOf2_32(Size)) {
4777 if (Ty.isVector())
4778 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4779 else
4780 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4781 }
4782
4783 Observer.changedInstr(MI);
4784 return true;
4785}
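
As a side note, not part of the listed file: a small sketch of the size bookkeeping in legalizeSBufferLoad for the scalar-result case. The memory operand covers the result size rounded up to bytes, and a non-power-of-two result is widened (for example 96-bit to 128-bit, since there are no 96-bit scalar result loads). The helper names are hypothetical; llvm::isPowerOf2_32 and llvm::PowerOf2Ceil are the MathExtras helpers quoted later in this report.

#include <cassert>
#include "llvm/Support/MathExtras.h"

static unsigned memOperandBytes(unsigned SizeInBits) {
  return (SizeInBits + 7) / 8; // result size rounded up to whole bytes
}

static unsigned widenedResultBits(unsigned SizeInBits) {
  return llvm::isPowerOf2_32(SizeInBits)
             ? SizeInBits
             : unsigned(llvm::PowerOf2Ceil(SizeInBits));
}

void sBufferLoadSizeExamples() {
  assert(memOperandBytes(96) == 12);
  assert(widenedResultBits(96) == 128); // widened: no 96-bit scalar loads
  assert(widenedResultBits(64) == 64);  // already a power of two
}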
4786
4787// TODO: Move to selection
4788bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4789 MachineRegisterInfo &MRI,
4790 MachineIRBuilder &B) const {
4791 if (!ST.isTrapHandlerEnabled() ||
4792 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4793 return legalizeTrapEndpgm(MI, MRI, B);
4794
4795 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4796 switch (*HsaAbiVer) {
4797 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4798 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4799 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4800 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4801 return ST.supportsGetDoorbellID() ?
4802 legalizeTrapHsa(MI, MRI, B) :
4803 legalizeTrapHsaQueuePtr(MI, MRI, B);
4804 }
4805 }
4806
4807 llvm_unreachable("Unknown trap handler");
4808}
4809
4810bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4811 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4812 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4813 MI.eraseFromParent();
4814 return true;
4815}
4816
4817bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4818 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4819 // Pass queue pointer to trap handler as input, and insert trap instruction
4820 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4821 Register LiveIn =
4822 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4823 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4824 return false;
4825
4826 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4827 B.buildCopy(SGPR01, LiveIn);
4828 B.buildInstr(AMDGPU::S_TRAP)
4829 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4830 .addReg(SGPR01, RegState::Implicit);
4831
4832 MI.eraseFromParent();
4833 return true;
4834}
4835
4836bool AMDGPULegalizerInfo::legalizeTrapHsa(
4837 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4838 B.buildInstr(AMDGPU::S_TRAP)
4839 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4840 MI.eraseFromParent();
4841 return true;
4842}
4843
4844bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4845 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4846 // Is non-HSA path or trap-handler disabled? Then, report a warning
4847 // accordingly
4848 if (!ST.isTrapHandlerEnabled() ||
4849 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4850 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4851 "debugtrap handler not supported",
4852 MI.getDebugLoc(), DS_Warning);
4853 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4854 Ctx.diagnose(NoTrap);
4855 } else {
4856 // Insert debug-trap instruction
4857 B.buildInstr(AMDGPU::S_TRAP)
4858 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4859 }
4860
4861 MI.eraseFromParent();
4862 return true;
4863}
4864
4865bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4866 MachineIRBuilder &B) const {
4867 MachineRegisterInfo &MRI = *B.getMRI();
4868 const LLT S16 = LLT::scalar(16);
4869 const LLT S32 = LLT::scalar(32);
4870
4871 Register DstReg = MI.getOperand(0).getReg();
4872 Register NodePtr = MI.getOperand(2).getReg();
4873 Register RayExtent = MI.getOperand(3).getReg();
4874 Register RayOrigin = MI.getOperand(4).getReg();
4875 Register RayDir = MI.getOperand(5).getReg();
4876 Register RayInvDir = MI.getOperand(6).getReg();
4877 Register TDescr = MI.getOperand(7).getReg();
4878
4879 if (!ST.hasGFX10_AEncoding()) {
4880 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4881 "intrinsic not supported on subtarget",
4882 MI.getDebugLoc());
4883 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4884 return false;
4885 }
4886
4887 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4888 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4889 const unsigned NumVDataDwords = 4;
4890 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4891 const bool UseNSA =
4892 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4893 const unsigned BaseOpcodes[2][2] = {
4894 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4895 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4896 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4897 int Opcode;
4898 if (UseNSA) {
4899 Opcode =
4900 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4901 NumVDataDwords, NumVAddrDwords);
4902 } else {
4903 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4904 AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4905 PowerOf2Ceil(NumVAddrDwords));
4906 }
4907 assert(Opcode != -1);
4908
4909 SmallVector<Register, 12> Ops;
4910 if (Is64) {
4911 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4912 Ops.push_back(Unmerge.getReg(0));
4913 Ops.push_back(Unmerge.getReg(1));
4914 } else {
4915 Ops.push_back(NodePtr);
4916 }
4917 Ops.push_back(RayExtent);
4918
4919 auto packLanes = [&Ops, &S32, &B](Register Src) {
4920 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4921 Ops.push_back(Unmerge.getReg(0));
4922 Ops.push_back(Unmerge.getReg(1));
4923 Ops.push_back(Unmerge.getReg(2));
4924 };
4925
4926 packLanes(RayOrigin);
4927 if (IsA16) {
4928 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4929 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4930 Register R1 = MRI.createGenericVirtualRegister(S32);
4931 Register R2 = MRI.createGenericVirtualRegister(S32);
4932 Register R3 = MRI.createGenericVirtualRegister(S32);
4933 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4934 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4935 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4936 Ops.push_back(R1);
4937 Ops.push_back(R2);
4938 Ops.push_back(R3);
4939 } else {
4940 packLanes(RayDir);
4941 packLanes(RayInvDir);
4942 }
4943
4944 if (!UseNSA) {
4945 // Build a single vector containing all the operands so far prepared.
4946 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4947 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4948 Ops.clear();
4949 Ops.push_back(MergedOps);
4950 }
4951
4952 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4953 .addDef(DstReg)
4954 .addImm(Opcode);
4955
4956 for (Register R : Ops) {
4957 MIB.addUse(R);
4958 }
4959
4960 MIB.addUse(TDescr)
4961 .addImm(IsA16 ? 1 : 0)
4962 .cloneMemRefs(MI);
4963
4964 MI.eraseFromParent();
4965 return true;
4966}
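
Aside, not from the listed file: a host-side sketch of the a16 packing above, where three ray-direction and three inverse-direction half values end up in three 32-bit registers as {dir0, dir1}, {dir2, inv0}, {inv1, inv2}. Placing the first merge source in the low 16 bits is an assumption here, and the function names are hypothetical.

#include <array>
#include <cstdint>

static uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16); // first value in bits [15:0]
}

std::array<uint32_t, 3> packA16RayDirs(const std::array<uint16_t, 3> &Dir,
                                       const std::array<uint16_t, 3> &InvDir) {
  return {packHalves(Dir[0], Dir[1]), packHalves(Dir[2], InvDir[0]),
          packHalves(InvDir[1], InvDir[2])};
}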
4967
4968bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4969 MachineInstr &MI) const {
4970 MachineIRBuilder &B = Helper.MIRBuilder;
4971 MachineRegisterInfo &MRI = *B.getMRI();
4972
4973 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4974 auto IntrID = MI.getIntrinsicID();
4975 switch (IntrID) {
1: Control jumps to 'case amdgcn_dispatch_id:' at line 5097
4976 case Intrinsic::amdgcn_if:
4977 case Intrinsic::amdgcn_else: {
4978 MachineInstr *Br = nullptr;
4979 MachineBasicBlock *UncondBrTarget = nullptr;
4980 bool Negated = false;
4981 if (MachineInstr *BrCond =
4982 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4983 const SIRegisterInfo *TRI
4984 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4985
4986 Register Def = MI.getOperand(1).getReg();
4987 Register Use = MI.getOperand(3).getReg();
4988
4989 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4990
4991 if (Negated)
4992 std::swap(CondBrTarget, UncondBrTarget);
4993
4994 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4995 if (IntrID == Intrinsic::amdgcn_if) {
4996 B.buildInstr(AMDGPU::SI_IF)
4997 .addDef(Def)
4998 .addUse(Use)
4999 .addMBB(UncondBrTarget);
5000 } else {
5001 B.buildInstr(AMDGPU::SI_ELSE)
5002 .addDef(Def)
5003 .addUse(Use)
5004 .addMBB(UncondBrTarget);
5005 }
5006
5007 if (Br) {
5008 Br->getOperand(0).setMBB(CondBrTarget);
5009 } else {
5010 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5011 // since we're swapping branch targets it needs to be reinserted.
5012 // FIXME: IRTranslator should probably not do this
5013 B.buildBr(*CondBrTarget);
5014 }
5015
5016 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5017 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5018 MI.eraseFromParent();
5019 BrCond->eraseFromParent();
5020 return true;
5021 }
5022
5023 return false;
5024 }
5025 case Intrinsic::amdgcn_loop: {
5026 MachineInstr *Br = nullptr;
5027 MachineBasicBlock *UncondBrTarget = nullptr;
5028 bool Negated = false;
5029 if (MachineInstr *BrCond =
5030 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5031 const SIRegisterInfo *TRI
5032 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5033
5034 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5035 Register Reg = MI.getOperand(2).getReg();
5036
5037 if (Negated)
5038 std::swap(CondBrTarget, UncondBrTarget);
5039
5040 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5041 B.buildInstr(AMDGPU::SI_LOOP)
5042 .addUse(Reg)
5043 .addMBB(UncondBrTarget);
5044
5045 if (Br)
5046 Br->getOperand(0).setMBB(CondBrTarget);
5047 else
5048 B.buildBr(*CondBrTarget);
5049
5050 MI.eraseFromParent();
5051 BrCond->eraseFromParent();
5052 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5053 return true;
5054 }
5055
5056 return false;
5057 }
5058 case Intrinsic::amdgcn_kernarg_segment_ptr:
5059 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5060 // This only makes sense to call in a kernel, so just lower to null.
5061 B.buildConstant(MI.getOperand(0).getReg(), 0);
5062 MI.eraseFromParent();
5063 return true;
5064 }
5065
5066 return legalizePreloadedArgIntrin(
5067 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5068 case Intrinsic::amdgcn_implicitarg_ptr:
5069 return legalizeImplicitArgPtr(MI, MRI, B);
5070 case Intrinsic::amdgcn_workitem_id_x:
5071 return legalizePreloadedArgIntrin(MI, MRI, B,
5072 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5073 case Intrinsic::amdgcn_workitem_id_y:
5074 return legalizePreloadedArgIntrin(MI, MRI, B,
5075 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5076 case Intrinsic::amdgcn_workitem_id_z:
5077 return legalizePreloadedArgIntrin(MI, MRI, B,
5078 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5079 case Intrinsic::amdgcn_workgroup_id_x:
5080 return legalizePreloadedArgIntrin(MI, MRI, B,
5081 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5082 case Intrinsic::amdgcn_workgroup_id_y:
5083 return legalizePreloadedArgIntrin(MI, MRI, B,
5084 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5085 case Intrinsic::amdgcn_workgroup_id_z:
5086 return legalizePreloadedArgIntrin(MI, MRI, B,
5087 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5088 case Intrinsic::amdgcn_dispatch_ptr:
5089 return legalizePreloadedArgIntrin(MI, MRI, B,
5090 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5091 case Intrinsic::amdgcn_queue_ptr:
5092 return legalizePreloadedArgIntrin(MI, MRI, B,
5093 AMDGPUFunctionArgInfo::QUEUE_PTR);
5094 case Intrinsic::amdgcn_implicit_buffer_ptr:
5095 return legalizePreloadedArgIntrin(
5096 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5097 case Intrinsic::amdgcn_dispatch_id:
5098 return legalizePreloadedArgIntrin(MI, MRI, B,
2: Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
5099 AMDGPUFunctionArgInfo::DISPATCH_ID);
5100 case Intrinsic::amdgcn_fdiv_fast:
5101 return legalizeFDIVFastIntrin(MI, MRI, B);
5102 case Intrinsic::amdgcn_is_shared:
5103 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5104 case Intrinsic::amdgcn_is_private:
5105 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5106 case Intrinsic::amdgcn_wavefrontsize: {
5107 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5108 MI.eraseFromParent();
5109 return true;
5110 }
5111 case Intrinsic::amdgcn_s_buffer_load:
5112 return legalizeSBufferLoad(Helper, MI);
5113 case Intrinsic::amdgcn_raw_buffer_store:
5114 case Intrinsic::amdgcn_struct_buffer_store:
5115 return legalizeBufferStore(MI, MRI, B, false, false);
5116 case Intrinsic::amdgcn_raw_buffer_store_format:
5117 case Intrinsic::amdgcn_struct_buffer_store_format:
5118 return legalizeBufferStore(MI, MRI, B, false, true);
5119 case Intrinsic::amdgcn_raw_tbuffer_store:
5120 case Intrinsic::amdgcn_struct_tbuffer_store:
5121 return legalizeBufferStore(MI, MRI, B, true, true);
5122 case Intrinsic::amdgcn_raw_buffer_load:
5123 case Intrinsic::amdgcn_struct_buffer_load:
5124 return legalizeBufferLoad(MI, MRI, B, false, false);
5125 case Intrinsic::amdgcn_raw_buffer_load_format:
5126 case Intrinsic::amdgcn_struct_buffer_load_format:
5127 return legalizeBufferLoad(MI, MRI, B, true, false);
5128 case Intrinsic::amdgcn_raw_tbuffer_load:
5129 case Intrinsic::amdgcn_struct_tbuffer_load:
5130 return legalizeBufferLoad(MI, MRI, B, true, true);
5131 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5132 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5133 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5134 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5135 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5136 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5137 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5138 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5139 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5140 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5141 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5142 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5143 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5144 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5145 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5146 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5147 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5148 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5149 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5150 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5151 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5152 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5153 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5154 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5155 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5156 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5157 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5158 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5159 case Intrinsic::amdgcn_buffer_atomic_fadd:
5160 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5161 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5162 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5163 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5164 return legalizeBufferAtomic(MI, B, IntrID);
5165 case Intrinsic::amdgcn_atomic_inc:
5166 return legalizeAtomicIncDec(MI, B, true);
5167 case Intrinsic::amdgcn_atomic_dec:
5168 return legalizeAtomicIncDec(MI, B, false);
5169 case Intrinsic::trap:
5170 return legalizeTrapIntrinsic(MI, MRI, B);
5171 case Intrinsic::debugtrap:
5172 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5173 case Intrinsic::amdgcn_rsq_clamp:
5174 return legalizeRsqClampIntrinsic(MI, MRI, B);
5175 case Intrinsic::amdgcn_ds_fadd:
5176 case Intrinsic::amdgcn_ds_fmin:
5177 case Intrinsic::amdgcn_ds_fmax:
5178 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5179 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5180 return legalizeBVHIntrinsic(MI, B);
5181 default: {
5182 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5183 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5184 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5185 return true;
5186 }
5187 }
5188
5189 return true;
5190}

/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
10: Assuming the condition is true
11: Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
147 // into v0, 10 bits per dimension if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/build/llvm-toolchain-snapshot-14~++20220112102225+310e9636caeb/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
15.1: 'ZB' is not equal to ZB_Undefined
16: Assuming 'Val' is equal to 0
17: Taking true branch
117 return 32;
18: Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
15: Calling 'TrailingZerosCounter::count'
19: Returning from 'TrailingZerosCounter::count'
20: Returning the value 32
161}
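
The steps above show the 32-bit specialization returning the full width, 32, for a zero input under ZB_Width. A standalone reminder sketch, with hypothetical helper names, of why such a result must be guarded before it feeds a shift: shifting a 32-bit value by 32 or more is undefined behavior in C++.

#include <cstdint>

// Mirrors the ZB_Width behavior above: a zero input yields the type width.
static unsigned ctz32(uint32_t Val) {
  if (Val == 0)
    return 32;
  return __builtin_ctz(Val); // only defined for non-zero inputs
}

uint32_t extractField(uint32_t Packed, uint32_t Mask) {
  unsigned Shift = ctz32(Mask); // 32 when Mask == 0
  if (Shift >= 32)              // guard: shifting a 32-bit value by 32 is UB
    return 0;
  return (Packed & Mask) >> Shift;
}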
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
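
A usage sketch, not part of the listed header, for maskTrailingOnes and the maskLeadingOnes helper defined just below, with assumed result values. The explicit N == 0 case matters because 'T(-1) >> Bits' alone would shift by the full width.

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

void maskExamples() {
  // N right-most bits set.
  assert(llvm::maskTrailingOnes<uint32_t>(8) == 0x000000FFu);
  // N == 0 is special-cased to avoid a shift by the full width.
  assert(llvm::maskTrailingOnes<uint32_t>(0) == 0u);
  // N left-most bits set, built as the complement of a trailing mask.
  assert(llvm::maskLeadingOnes<uint32_t>(4) == 0xF0000000u);
}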
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
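
A worked check, not part of the listed header, of the XOR trick in findLastSet, with assumed values.

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

void findLastSetExample() {
  // countLeadingZeros(0x10u) == 27 and 27 ^ 31 == 4, the index of the
  // highest set bit, with no explicit subtraction.
  assert(llvm::countLeadingZeros(uint32_t(0x10)) == 27u);
  assert(llvm::findLastSet(uint32_t(0x10)) == 4u);
}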
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
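
Small value checks, not part of the listed header, for reverseBits, with assumed results.

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

void reverseBitsExamples() {
  assert(llvm::reverseBits<uint8_t>(0x01) == 0x80);
  // Bytes swap end-to-end and each byte is reversed through the table.
  assert(llvm::reverseBits<uint16_t>(0x0001) == 0x8000);
}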
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if a unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
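
Quick value checks, not part of the listed header, for maxUIntN, minIntN and maxIntN, with assumed results. The N == 64 case works because UINT64_MAX >> 0 needs no '1 << 64'.

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

void widthLimitExamples() {
  assert(llvm::maxUIntN(8) == 255u);
  assert(llvm::maxUIntN(64) == UINT64_MAX); // UINT64_MAX >> 0, no 1 << 64
  assert(llvm::minIntN(8) == -128);
  assert(llvm::maxIntN(8) == 127);
}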
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if an signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
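// Usage sketch (illustrative): Euclid's algorithm on unsigned values.
//   llvm::greatestCommonDivisor<uint32_t>(48u, 18u) == 6u
//   llvm::GreatestCommonDivisor64(0, 42)            == 42   // gcd(0, n) == n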
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
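// Usage sketch (illustrative; assumes IEEE-754 float/double on the host):
//   llvm::FloatToBits(1.0f)        == 0x3F800000u
//   llvm::BitsToFloat(0x3F800000u) == 1.0f
//   llvm::DoubleToBits(1.0)        == 0x3FF0000000000000ull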
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
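// Usage sketch (illustrative): the largest power of two dividing both inputs.
//   static_assert(llvm::MinAlign(8, 12) == 4, "");
//   static_assert(llvm::MinAlign(16, 0) == 16, "");  // a zero operand contributes nothing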
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
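// Usage sketch (illustrative) for the power-of-two helpers:
//   llvm::NextPowerOf2(16)  == 32   // strictly greater than the input
//   llvm::NextPowerOf2(0)   == 1
//   llvm::PowerOf2Floor(17) == 16   llvm::PowerOf2Floor(0) == 0
//   llvm::PowerOf2Ceil(17)  == 32   llvm::PowerOf2Ceil(1)  == 1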
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.");
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
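// Usage sketch (illustrative):
//   llvm::divideCeil(7, 2)    == 4    llvm::divideCeil(6, 3)    == 2
//   llvm::divideNearest(7, 2) == 4    llvm::divideNearest(5, 3) == 2
// Ties in divideNearest round up, since Denominator / 2 is added before the
// truncating division.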
750
751/// Returns the largest uint64_t that is less than or equal to \p Value and
752/// is \p Skew mod \p Align. \p Align must be non-zero.
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
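// Usage sketch (illustrative), mirroring the alignTo examples above:
//   llvm::alignDown(17, 8)    == 16
//   llvm::alignDown(17, 8, 1) == 17   // 8 * 2 + 1
//   llvm::alignDown(5, 8)     == 0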
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
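// Usage sketch (illustrative; results assume the usual two's complement
// representation):
//   llvm::SignExtend32<8>(0xFFu)    == -1
//   llvm::SignExtend32(0x80u, 8)    == -128
//   llvm::SignExtend64<16>(0x7FFFu) == 32767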
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
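// Usage sketch (illustrative):
//   bool Ovf;
//   uint8_t R = llvm::SaturatingAdd<uint8_t>(200, 100, &Ovf); // R == 255, Ovf == true
//   uint8_t S = llvm::SaturatingAdd<uint8_t>(100, 100);       // S == 200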
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
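// Usage sketch (illustrative):
//   llvm::SaturatingMultiply<uint16_t>(300, 300)     == 65535  // clamped
//   llvm::SaturatingMultiply<uint32_t>(6, 7)         == 42
//   llvm::SaturatingMultiplyAdd<uint8_t>(10, 20, 60) == 255    // 260 clamps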
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
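// Usage sketch (illustrative): the return value is nonzero exactly when the
// two's complement result wrapped.
//   int32_t R;
//   bool Wrapped = llvm::AddOverflow(INT32_MAX, 1, R); // Wrapped == true, R == INT32_MIN
//   Wrapped = llvm::AddOverflow(40, 2, R);             // Wrapped == false, R == 42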
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned subtraction.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
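// Usage sketch (illustrative):
//   int32_t R;
//   bool Ovf = llvm::MulOverflow<int32_t>(65536, 65536, R); // Ovf == true
//   Ovf = llvm::MulOverflow<int32_t>(-4, 5, R);             // Ovf == false, R == -20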
959
960} // End llvm namespace
961
962#endif