Bug Summary

File: build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2961, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
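
A minimal illustration of the flagged hazard (a sketch only, not the code at line 2961, which lies beyond the excerpt below): in C++, shifting a 32-bit unsigned int by 32 or more is undefined behavior, so any computed shift count must be shown to be smaller than the type width before it is used.

 #include <cassert>
 #include <cstdint>

 // Hypothetical guard pattern: the assert keeps the shift count strictly below
 // the width of uint32_t, so the shift below stays well-defined.
 uint32_t shiftedBit(unsigned Count) {
   assert(Count < 32 && "shift count must be below the width of uint32_t");
   return uint32_t(1) << Count;
 }

For example, shiftedBit(31) yields 0x80000000, while a count of 32 trips the assert instead of silently producing an undefined result.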

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 
-o /tmp/scan-build-2022-04-20-140412-16051-1 -x c++ /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
30
31#define DEBUG_TYPE "amdgpu-legalinfo"
32
33using namespace llvm;
34using namespace LegalizeActions;
35using namespace LegalizeMutations;
36using namespace LegalityPredicates;
37using namespace MIPatternMatch;
38
39// Hack until load/store selection patterns support any tuple of legal types.
40static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use"
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47static constexpr unsigned MaxRegisterSize = 1024;
48
49// Round the number of elements to the next power of two elements
50static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54}
55
56// Round the number of bits to the next power of two bits
57static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61}
62
63/// \returns true if this is an odd sized vector which should widen by adding an
64/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65/// excludes s1 vectors, which should always be scalarized.
66static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78}
79
80static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85}
86
87static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93}
94
95static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::make_pair(TypeIdx,
100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101 };
102}
103
104static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::make_pair(
112 TypeIdx,
113 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114 };
115}
116
117// Increase the number of vector elements to reach the next multiple of 32-bit
118// type.
119static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120 return [=](const LegalityQuery &Query) {
121 const LLT Ty = Query.Types[TypeIdx];
122
123 const LLT EltTy = Ty.getElementType();
124 const int Size = Ty.getSizeInBits();
125 const int EltSize = EltTy.getSizeInBits();
126 const int NextMul32 = (Size + 31) / 32;
127
128 assert(EltSize < 32);
129
130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132 };
133}
134
135static LLT getBitcastRegisterType(const LLT Ty) {
136 const unsigned Size = Ty.getSizeInBits();
137
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
142 }
143
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145}
146
147static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152}
153
154static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162}
163
164static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169}
170
171static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176}
177
178static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183}
184
185static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187}
188
189static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192}
193
194static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199}
200
201static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209}
210
211// Any combination of 32 or 64-bit elements up to the maximum register size, and
212// multiples of v2s16.
213static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217}
218
219static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227}
228
229// If we have a truncating store or an extending load with a data size larger
230// than 32-bits, we need to reduce to a 32-bit type.
231static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237}
238
239// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240// handle some operations by just promoting the register during
241// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265}
266
267static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289#if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293#endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
301
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
319 }
320
321 assert(RegSize >= MemSize);
322
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
328 }
329
330 return true;
331}
332
333// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334// workaround this. Eventually it should ignore the type for loads and only care
335// about the size. Return true in cases where we will workaround this for now by
336// bitcasting.
337static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353}
354
355static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359}
360
361/// Return true if a load or store of the type should be lowered with a bitcast
362/// to a different type.
363static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377}
378
379/// Return true if we should legalize a load by widening an odd sized memory
380/// access up to the alignment. Note this case when the memory access itself
381/// changes, not the size of the result register.
382static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 uint64_t AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414}
415
416static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424}
425
426AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534 .legalFor({S32, S16, V2S16})
535 .minScalar(0, S16)
536 .clampMaxNumElementsStrict(0, S16, 2)
537 .widenScalarToNextMultipleOf(0, 32)
538 .maxScalar(0, S32)
539 .scalarize(0);
540
541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542 .legalFor({S32, S16, V2S16}) // Clamp modifier
543 .minScalarOrElt(0, S16)
544 .clampMaxNumElementsStrict(0, S16, 2)
545 .scalarize(0)
546 .widenScalarToNextPow2(0, 32)
547 .lower();
548 } else if (ST.has16BitInsts()) {
549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550 .legalFor({S32, S16})
551 .minScalar(0, S16)
552 .widenScalarToNextMultipleOf(0, 32)
553 .maxScalar(0, S32)
554 .scalarize(0);
555
556 // Technically the saturating operations require clamp bit support, but this
557 // was introduced at the same time as 16-bit operations.
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559 .legalFor({S32, S16}) // Clamp modifier
560 .minScalar(0, S16)
561 .scalarize(0)
562 .widenScalarToNextPow2(0, 16)
563 .lower();
564
565 // We're just lowering this, but it helps get a better result to try to
566 // coerce to the desired type first.
567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568 .minScalar(0, S16)
569 .scalarize(0)
570 .lower();
571 } else {
572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573 .legalFor({S32})
574 .widenScalarToNextMultipleOf(0, 32)
575 .clampScalar(0, S32, S32)
576 .scalarize(0);
577
578 if (ST.hasIntClamp()) {
579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580 .legalFor({S32}) // Clamp modifier.
581 .scalarize(0)
582 .minScalarOrElt(0, S32)
583 .lower();
584 } else {
585 // Clamp bit support was added in VI, along with 16-bit operations.
586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587 .minScalar(0, S32)
588 .scalarize(0)
589 .lower();
590 }
591
592 // FIXME: DAG expansion gets better results. The widening uses the smaller
593 // range values and goes for the min/max lowering directly.
594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595 .minScalar(0, S32)
596 .scalarize(0)
597 .lower();
598 }
599
600 getActionDefinitionsBuilder(
601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602 .customFor({S32, S64})
603 .clampScalar(0, S32, S64)
604 .widenScalarToNextPow2(0, 32)
605 .scalarize(0);
606
607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608 .legalFor({S32})
609 .maxScalar(0, S32);
610
611 if (ST.hasVOP3PInsts()) {
612 Mulh
613 .clampMaxNumElements(0, S8, 2)
614 .lowerFor({V2S8});
615 }
616
617 Mulh
618 .scalarize(0)
619 .lower();
620
621 // Report legal for any types we can handle anywhere. For the cases only legal
622 // on the SALU, RegBankSelect will be able to re-legalize.
623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625 .clampScalar(0, S32, S64)
626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628 .widenScalarToNextPow2(0)
629 .scalarize(0);
630
631 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633 .legalFor({{S32, S1}, {S32, S32}})
634 .minScalar(0, S32)
635 .scalarize(0)
636 .lower();
637
638 getActionDefinitionsBuilder(G_BITCAST)
639 // Don't worry about the size constraint.
640 .legalIf(all(isRegisterType(0), isRegisterType(1)))
641 .lower();
642
643
644 getActionDefinitionsBuilder(G_CONSTANT)
645 .legalFor({S1, S32, S64, S16, GlobalPtr,
646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647 .legalIf(isPointer(0))
648 .clampScalar(0, S32, S64)
649 .widenScalarToNextPow2(0);
650
651 getActionDefinitionsBuilder(G_FCONSTANT)
652 .legalFor({S32, S64, S16})
653 .clampScalar(0, S16, S64);
654
655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656 .legalIf(isRegisterType(0))
657 // s1 and s16 are special cases because they have legal operations on
658 // them, but don't really occupy registers in the normal way.
659 .legalFor({S1, S16})
660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661 .clampScalarOrElt(0, S32, MaxScalar)
662 .widenScalarToNextPow2(0, 32)
663 .clampMaxNumElements(0, S32, 16);
664
665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666
667 // If the amount is divergent, we have to do a wave reduction to get the
668 // maximum value, so this is expanded during RegBankSelect.
669 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670 .legalFor({{PrivatePtr, S32}});
671
672 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673 .customIf(typeIsNot(0, PrivatePtr));
674
675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676
677 auto &FPOpActions = getActionDefinitionsBuilder(
678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679 .legalFor({S32, S64});
680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681 .customFor({S32, S64});
682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683 .customFor({S32, S64});
684
685 if (ST.has16BitInsts()) {
686 if (ST.hasVOP3PInsts())
687 FPOpActions.legalFor({S16, V2S16});
688 else
689 FPOpActions.legalFor({S16});
690
691 TrigActions.customFor({S16});
692 FDIVActions.customFor({S16});
693 }
694
695 auto &MinNumMaxNum = getActionDefinitionsBuilder({
696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697
698 if (ST.hasVOP3PInsts()) {
699 MinNumMaxNum.customFor(FPTypesPK16)
700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701 .clampMaxNumElements(0, S16, 2)
702 .clampScalar(0, S16, S64)
703 .scalarize(0);
704 } else if (ST.has16BitInsts()) {
705 MinNumMaxNum.customFor(FPTypes16)
706 .clampScalar(0, S16, S64)
707 .scalarize(0);
708 } else {
709 MinNumMaxNum.customFor(FPTypesBase)
710 .clampScalar(0, S32, S64)
711 .scalarize(0);
712 }
713
714 if (ST.hasVOP3PInsts())
715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716
717 FPOpActions
718 .scalarize(0)
719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720
721 TrigActions
722 .scalarize(0)
723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724
725 FDIVActions
726 .scalarize(0)
727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728
729 getActionDefinitionsBuilder({G_FNEG, G_FABS})
730 .legalFor(FPTypesPK16)
731 .clampMaxNumElementsStrict(0, S16, 2)
732 .scalarize(0)
733 .clampScalar(0, S16, S64);
734
735 if (ST.has16BitInsts()) {
736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737 .legalFor({S32, S64, S16})
738 .scalarize(0)
739 .clampScalar(0, S16, S64);
740 } else {
741 getActionDefinitionsBuilder(G_FSQRT)
742 .legalFor({S32, S64})
743 .scalarize(0)
744 .clampScalar(0, S32, S64);
745
746 if (ST.hasFractBug()) {
747 getActionDefinitionsBuilder(G_FFLOOR)
748 .customFor({S64})
749 .legalFor({S32, S64})
750 .scalarize(0)
751 .clampScalar(0, S32, S64);
752 } else {
753 getActionDefinitionsBuilder(G_FFLOOR)
754 .legalFor({S32, S64})
755 .scalarize(0)
756 .clampScalar(0, S32, S64);
757 }
758 }
759
760 getActionDefinitionsBuilder(G_FPTRUNC)
761 .legalFor({{S32, S64}, {S16, S32}})
762 .scalarize(0)
763 .lower();
764
765 getActionDefinitionsBuilder(G_FPEXT)
766 .legalFor({{S64, S32}, {S32, S16}})
767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768 .scalarize(0);
769
770 getActionDefinitionsBuilder(G_FSUB)
771 // Use actual fsub instruction
772 .legalFor({S32})
773 // Must use fadd + fneg
774 .lowerFor({S64, S16, V2S16})
775 .scalarize(0)
776 .clampScalar(0, S32, S64);
777
778 // Whether this is legal depends on the floating point mode for the function.
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781 FMad.customFor({S32, S16});
782 else if (ST.hasMadMacF32Insts())
783 FMad.customFor({S32});
784 else if (ST.hasMadF16())
785 FMad.customFor({S16});
786 FMad.scalarize(0)
787 .lower();
788
789 auto &FRem = getActionDefinitionsBuilder(G_FREM);
790 if (ST.has16BitInsts()) {
791 FRem.customFor({S16, S32, S64});
792 } else {
793 FRem.minScalar(0, S32)
794 .customFor({S32, S64});
795 }
796 FRem.scalarize(0);
797
798 // TODO: Do we need to clamp maximum bitwidth?
799 getActionDefinitionsBuilder(G_TRUNC)
800 .legalIf(isScalar(0))
801 .legalFor({{V2S16, V2S32}})
802 .clampMaxNumElements(0, S16, 2)
803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804 // situations (like an invalid implicit use), we don't want to infinite loop
805 // in the legalizer.
806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807 .alwaysLegal();
808
809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811 {S32, S1}, {S64, S1}, {S16, S1}})
812 .scalarize(0)
813 .clampScalar(0, S32, S64)
814 .widenScalarToNextPow2(1, 32);
815
816 // TODO: Split s1->s64 during regbankselect for VALU.
817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819 .lowerIf(typeIs(1, S1))
820 .customFor({{S32, S64}, {S64, S64}});
821 if (ST.has16BitInsts())
822 IToFP.legalFor({{S16, S16}});
823 IToFP.clampScalar(1, S32, S64)
824 .minScalar(0, S32)
825 .scalarize(0)
826 .widenScalarToNextPow2(1);
827
828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830 .customFor({{S64, S32}, {S64, S64}})
831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832 if (ST.has16BitInsts())
833 FPToI.legalFor({{S16, S16}});
834 else
835 FPToI.minScalar(1, S32);
836
837 FPToI.minScalar(0, S32)
838 .widenScalarToNextPow2(0, 32)
839 .scalarize(0)
840 .lower();
841
842 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
843 .customFor({S16, S32})
844 .scalarize(0)
845 .lower();
846
847 // Lower roundeven into G_FRINT
848 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
849 .scalarize(0)
850 .lower();
851
852 if (ST.has16BitInsts()) {
853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854 .legalFor({S16, S32, S64})
855 .clampScalar(0, S16, S64)
856 .scalarize(0);
857 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859 .legalFor({S32, S64})
860 .clampScalar(0, S32, S64)
861 .scalarize(0);
862 } else {
863 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
864 .legalFor({S32})
865 .customFor({S64})
866 .clampScalar(0, S32, S64)
867 .scalarize(0);
868 }
869
870 getActionDefinitionsBuilder(G_PTR_ADD)
871 .legalIf(all(isPointer(0), sameSize(0, 1)))
872 .scalarize(0)
873 .scalarSameSizeAs(1, 0);
874
875 getActionDefinitionsBuilder(G_PTRMASK)
876 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
877 .scalarSameSizeAs(1, 0)
878 .scalarize(0);
879
880 auto &CmpBuilder =
881 getActionDefinitionsBuilder(G_ICMP)
882 // The compare output type differs based on the register bank of the output,
883 // so make both s1 and s32 legal.
884 //
885 // Scalar compares producing output in scc will be promoted to s32, as that
886 // is the allocatable register type that will be needed for the copy from
887 // scc. This will be promoted during RegBankSelect, and we assume something
888 // before that won't try to use s32 result types.
889 //
890 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
891 // bank.
892 .legalForCartesianProduct(
893 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
894 .legalForCartesianProduct(
895 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
896 if (ST.has16BitInsts()) {
897 CmpBuilder.legalFor({{S1, S16}});
898 }
899
900 CmpBuilder
901 .widenScalarToNextPow2(1)
902 .clampScalar(1, S32, S64)
903 .scalarize(0)
904 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
905
906 getActionDefinitionsBuilder(G_FCMP)
907 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
908 .widenScalarToNextPow2(1)
909 .clampScalar(1, S32, S64)
910 .scalarize(0);
911
912 // FIXME: fpow has a selection pattern that should move to custom lowering.
913 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
914 if (ST.has16BitInsts())
915 Exp2Ops.legalFor({S32, S16});
916 else
917 Exp2Ops.legalFor({S32});
918 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
919 Exp2Ops.scalarize(0);
920
921 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
922 if (ST.has16BitInsts())
923 ExpOps.customFor({{S32}, {S16}});
924 else
925 ExpOps.customFor({S32});
926 ExpOps.clampScalar(0, MinScalarFPTy, S32)
927 .scalarize(0);
928
929 getActionDefinitionsBuilder(G_FPOWI)
930 .clampScalar(0, MinScalarFPTy, S32)
931 .lower();
932
933 // The 64-bit versions produce 32-bit results, but only on the SALU.
934 getActionDefinitionsBuilder(G_CTPOP)
935 .legalFor({{S32, S32}, {S32, S64}})
936 .clampScalar(0, S32, S32)
937 .widenScalarToNextPow2(1, 32)
938 .clampScalar(1, S32, S64)
939 .scalarize(0)
940 .widenScalarToNextPow2(0, 32);
941
942
943 // The hardware instructions return a different result on 0 than the generic
944 // instructions expect. The hardware produces -1, but these produce the
945 // bitwidth.
946 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
947 .scalarize(0)
948 .clampScalar(0, S32, S32)
949 .clampScalar(1, S32, S64)
950 .widenScalarToNextPow2(0, 32)
951 .widenScalarToNextPow2(1, 32)
952 .custom();
953
954 // The 64-bit versions produce 32-bit results, but only on the SALU.
955 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
956 .legalFor({{S32, S32}, {S32, S64}})
957 .clampScalar(0, S32, S32)
958 .clampScalar(1, S32, S64)
959 .scalarize(0)
960 .widenScalarToNextPow2(0, 32)
961 .widenScalarToNextPow2(1, 32);
962
963 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
964 // RegBankSelect.
965 getActionDefinitionsBuilder(G_BITREVERSE)
966 .legalFor({S32, S64})
967 .clampScalar(0, S32, S64)
968 .scalarize(0)
969 .widenScalarToNextPow2(0);
970
971 if (ST.has16BitInsts()) {
972 getActionDefinitionsBuilder(G_BSWAP)
973 .legalFor({S16, S32, V2S16})
974 .clampMaxNumElementsStrict(0, S16, 2)
975 // FIXME: Fixing non-power-of-2 before clamp is workaround for
976 // narrowScalar limitation.
977 .widenScalarToNextPow2(0)
978 .clampScalar(0, S16, S32)
979 .scalarize(0);
980
981 if (ST.hasVOP3PInsts()) {
982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983 .legalFor({S32, S16, V2S16})
984 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
985 .clampMaxNumElements(0, S16, 2)
986 .minScalar(0, S16)
987 .widenScalarToNextPow2(0)
988 .scalarize(0)
989 .lower();
990 } else {
991 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
992 .legalFor({S32, S16})
993 .widenScalarToNextPow2(0)
994 .minScalar(0, S16)
995 .scalarize(0)
996 .lower();
997 }
998 } else {
999 // TODO: Should have same legality without v_perm_b32
1000 getActionDefinitionsBuilder(G_BSWAP)
1001 .legalFor({S32})
1002 .lowerIf(scalarNarrowerThan(0, 32))
1003 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1004 // narrowScalar limitation.
1005 .widenScalarToNextPow2(0)
1006 .maxScalar(0, S32)
1007 .scalarize(0)
1008 .lower();
1009
1010 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1011 .legalFor({S32})
1012 .minScalar(0, S32)
1013 .widenScalarToNextPow2(0)
1014 .scalarize(0)
1015 .lower();
1016 }
1017
1018 getActionDefinitionsBuilder(G_INTTOPTR)
1019 // List the common cases
1020 .legalForCartesianProduct(AddrSpaces64, {S64})
1021 .legalForCartesianProduct(AddrSpaces32, {S32})
1022 .scalarize(0)
1023 // Accept any address space as long as the size matches
1024 .legalIf(sameSize(0, 1))
1025 .widenScalarIf(smallerThan(1, 0),
1026 [](const LegalityQuery &Query) {
1027 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1028 })
1029 .narrowScalarIf(largerThan(1, 0),
1030 [](const LegalityQuery &Query) {
1031 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1032 });
1033
1034 getActionDefinitionsBuilder(G_PTRTOINT)
1035 // List the common cases
1036 .legalForCartesianProduct(AddrSpaces64, {S64})
1037 .legalForCartesianProduct(AddrSpaces32, {S32})
1038 .scalarize(0)
1039 // Accept any address space as long as the size matches
1040 .legalIf(sameSize(0, 1))
1041 .widenScalarIf(smallerThan(0, 1),
1042 [](const LegalityQuery &Query) {
1043 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044 })
1045 .narrowScalarIf(
1046 largerThan(0, 1),
1047 [](const LegalityQuery &Query) {
1048 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1049 });
1050
1051 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1052 .scalarize(0)
1053 .custom();
1054
1055 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1056 bool IsLoad) -> bool {
1057 const LLT DstTy = Query.Types[0];
1058
1059 // Split vector extloads.
1060 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1061
1062 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1063 return true;
1064
1065 const LLT PtrTy = Query.Types[1];
1066 unsigned AS = PtrTy.getAddressSpace();
1067 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1068 return true;
1069
1070 // Catch weird sized loads that don't evenly divide into the access sizes
1071 // TODO: May be able to widen depending on alignment etc.
1072 unsigned NumRegs = (MemSize + 31) / 32;
1073 if (NumRegs == 3) {
1074 if (!ST.hasDwordx3LoadStores())
1075 return true;
1076 } else {
1077 // If the alignment allows, these should have been widened.
1078 if (!isPowerOf2_32(NumRegs))
1079 return true;
1080 }
1081
1082 return false;
1083 };
1084
1085 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1086 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1087 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1088
1089 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1090 // LDS
1091 // TODO: Unsupported flat for SI.
1092
1093 for (unsigned Op : {G_LOAD, G_STORE}) {
1094 const bool IsStore = Op == G_STORE;
1095
1096 auto &Actions = getActionDefinitionsBuilder(Op);
1097 // Explicitly list some common cases.
1098 // TODO: Does this help compile time at all?
1099 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1100 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1101 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1102 {S64, GlobalPtr, S64, GlobalAlign32},
1103 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1104 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1105 {S32, GlobalPtr, S8, GlobalAlign8},
1106 {S32, GlobalPtr, S16, GlobalAlign16},
1107
1108 {S32, LocalPtr, S32, 32},
1109 {S64, LocalPtr, S64, 32},
1110 {V2S32, LocalPtr, V2S32, 32},
1111 {S32, LocalPtr, S8, 8},
1112 {S32, LocalPtr, S16, 16},
1113 {V2S16, LocalPtr, S32, 32},
1114
1115 {S32, PrivatePtr, S32, 32},
1116 {S32, PrivatePtr, S8, 8},
1117 {S32, PrivatePtr, S16, 16},
1118 {V2S16, PrivatePtr, S32, 32},
1119
1120 {S32, ConstantPtr, S32, GlobalAlign32},
1121 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1122 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1123 {S64, ConstantPtr, S64, GlobalAlign32},
1124 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1125 Actions.legalIf(
1126 [=](const LegalityQuery &Query) -> bool {
1127 return isLoadStoreLegal(ST, Query);
1128 });
1129
1130 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1131 // 64-bits.
1132 //
1133 // TODO: Should generalize bitcast action into coerce, which will also cover
1134 // inserting addrspacecasts.
1135 Actions.customIf(typeIs(1, Constant32Ptr));
1136
1137 // Turn any illegal element vectors into something easier to deal
1138 // with. These will ultimately produce 32-bit scalar shifts to extract the
1139 // parts anyway.
1140 //
1141 // For odd 16-bit element vectors, prefer to split those into pieces with
1142 // 16-bit vector parts.
1143 Actions.bitcastIf(
1144 [=](const LegalityQuery &Query) -> bool {
1145 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1146 Query.MMODescrs[0].MemoryTy);
1147 }, bitcastToRegisterType(0));
1148
1149 if (!IsStore) {
1150 // Widen suitably aligned loads by loading extra bytes. The standard
1151 // legalization actions can't properly express widening memory operands.
1152 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1153 return shouldWidenLoad(ST, Query, G_LOAD);
1154 });
1155 }
1156
1157 // FIXME: load/store narrowing should be moved to lower action
1158 Actions
1159 .narrowScalarIf(
1160 [=](const LegalityQuery &Query) -> bool {
1161 return !Query.Types[0].isVector() &&
1162 needToSplitMemOp(Query, Op == G_LOAD);
1163 },
1164 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1165 const LLT DstTy = Query.Types[0];
1166 const LLT PtrTy = Query.Types[1];
1167
1168 const unsigned DstSize = DstTy.getSizeInBits();
1169 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1170
1171 // Split extloads.
1172 if (DstSize > MemSize)
1173 return std::make_pair(0, LLT::scalar(MemSize));
1174
1175 unsigned MaxSize = maxSizeForAddrSpace(ST,
1176 PtrTy.getAddressSpace(),
1177 Op == G_LOAD);
1178 if (MemSize > MaxSize)
1179 return std::make_pair(0, LLT::scalar(MaxSize));
1180
1181 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1182 return std::make_pair(0, LLT::scalar(Align));
1183 })
1184 .fewerElementsIf(
1185 [=](const LegalityQuery &Query) -> bool {
1186 return Query.Types[0].isVector() &&
1187 needToSplitMemOp(Query, Op == G_LOAD);
1188 },
1189 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190 const LLT DstTy = Query.Types[0];
1191 const LLT PtrTy = Query.Types[1];
1192
1193 LLT EltTy = DstTy.getElementType();
1194 unsigned MaxSize = maxSizeForAddrSpace(ST,
1195 PtrTy.getAddressSpace(),
1196 Op == G_LOAD);
1197
1198 // FIXME: Handle widened to power of 2 results better. This ends
1199 // up scalarizing.
1200 // FIXME: 3 element stores scalarized on SI
1201
1202 // Split if it's too large for the address space.
1203 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1204 if (MemSize > MaxSize) {
1205 unsigned NumElts = DstTy.getNumElements();
1206 unsigned EltSize = EltTy.getSizeInBits();
1207
1208 if (MaxSize % EltSize == 0) {
1209 return std::make_pair(
1210 0, LLT::scalarOrVector(
1211 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1212 }
1213
1214 unsigned NumPieces = MemSize / MaxSize;
1215
1216 // FIXME: Refine when odd breakdowns handled
1217 // The scalars will need to be re-legalized.
1218 if (NumPieces == 1 || NumPieces >= NumElts ||
1219 NumElts % NumPieces != 0)
1220 return std::make_pair(0, EltTy);
1221
1222 return std::make_pair(
1223 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1224 }
1225
1226 // FIXME: We could probably handle weird extending loads better.
1227 if (DstTy.getSizeInBits() > MemSize)
1228 return std::make_pair(0, EltTy);
1229
1230 unsigned EltSize = EltTy.getSizeInBits();
1231 unsigned DstSize = DstTy.getSizeInBits();
1232 if (!isPowerOf2_32(DstSize)) {
1233 // We're probably decomposing an odd sized store. Try to split
1234 // to the widest type. TODO: Account for alignment. As-is it
1235 // should be OK, since the new parts will be further legalized.
1236 unsigned FloorSize = PowerOf2Floor(DstSize);
1237 return std::make_pair(
1238 0, LLT::scalarOrVector(
1239 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1240 }
1241
1242 // May need relegalization for the scalars.
1243 return std::make_pair(0, EltTy);
1244 })
1245 .minScalar(0, S32)
1246 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1247 .widenScalarToNextPow2(0)
1248 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1249 .lower();
1250 }
1251
1252 // FIXME: Unaligned accesses not lowered.
1253 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1254 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1255 {S32, GlobalPtr, S16, 2 * 8},
1256 {S32, LocalPtr, S8, 8},
1257 {S32, LocalPtr, S16, 16},
1258 {S32, PrivatePtr, S8, 8},
1259 {S32, PrivatePtr, S16, 16},
1260 {S32, ConstantPtr, S8, 8},
1261 {S32, ConstantPtr, S16, 2 * 8}})
1262 .legalIf(
1263 [=](const LegalityQuery &Query) -> bool {
1264 return isLoadStoreLegal(ST, Query);
1265 });
1266
1267 if (ST.hasFlatAddressSpace()) {
1268 ExtLoads.legalForTypesWithMemDesc(
1269 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1270 }
1271
1272 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1273 // 64-bits.
1274 //
1275 // TODO: Should generalize bitcast action into coerce, which will also cover
1276 // inserting addrspacecasts.
1277 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1278
1279 ExtLoads.clampScalar(0, S32, S32)
1280 .widenScalarToNextPow2(0)
1281 .lower();
1282
1283 auto &Atomics = getActionDefinitionsBuilder(
1284 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287 G_ATOMICRMW_UMIN})
1288 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289 {S64, GlobalPtr}, {S64, LocalPtr},
1290 {S32, RegionPtr}, {S64, RegionPtr}});
1291 if (ST.hasFlatAddressSpace()) {
1292 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293 }
1294
1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296 if (ST.hasLDSFPAtomicAdd()) {
1297 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298 if (ST.hasGFX90AInsts())
1299 Atomic.legalFor({{S64, LocalPtr}});
1300 if (ST.hasGFX940Insts())
1301 Atomic.legalFor({{V2S16, LocalPtr}});
1302 }
1303 if (ST.hasAtomicFaddInsts())
1304 Atomic.legalFor({{S32, GlobalPtr}});
1305
1306 if (ST.hasGFX90AInsts()) {
1307 // These are legal with some caveats, and should have undergone expansion in
1308 // the IR in most situations
1309 // TODO: Move atomic expansion into legalizer
1310 // TODO: Also supports <2 x f16>
1311 Atomic.legalFor({
1312 {S32, GlobalPtr},
1313 {S64, GlobalPtr},
1314 {S64, FlatPtr}
1315 });
1316 }
1317
1318 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1319 // demarshalling
1320 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1321 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1322 {S32, FlatPtr}, {S64, FlatPtr}})
1323 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1324 {S32, RegionPtr}, {S64, RegionPtr}});
1325 // TODO: Pointer types, any 32-bit or 64-bit vector
1326
1327 // Condition should be s32 for scalar, s1 for vector.
1328 getActionDefinitionsBuilder(G_SELECT)
1329 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1330 LocalPtr, FlatPtr, PrivatePtr,
1331 LLT::fixed_vector(2, LocalPtr),
1332 LLT::fixed_vector(2, PrivatePtr)},
1333 {S1, S32})
1334 .clampScalar(0, S16, S64)
1335 .scalarize(1)
1336 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1337 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1338 .clampMaxNumElements(0, S32, 2)
1339 .clampMaxNumElements(0, LocalPtr, 2)
1340 .clampMaxNumElements(0, PrivatePtr, 2)
1341 .scalarize(0)
1342 .widenScalarToNextPow2(0)
1343 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1344
1345 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1346 // be more flexible with the shift amount type.
1347 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1348 .legalFor({{S32, S32}, {S64, S32}});
1349 if (ST.has16BitInsts()) {
1350 if (ST.hasVOP3PInsts()) {
1351 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1352 .clampMaxNumElements(0, S16, 2);
1353 } else
1354 Shifts.legalFor({{S16, S16}});
1355
1356 // TODO: Support 16-bit shift amounts for all types
1357 Shifts.widenScalarIf(
1358 [=](const LegalityQuery &Query) {
1359 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1360 // 32-bit amount.
1361 const LLT ValTy = Query.Types[0];
1362 const LLT AmountTy = Query.Types[1];
1363 return ValTy.getSizeInBits() <= 16 &&
1364 AmountTy.getSizeInBits() < 16;
1365 }, changeTo(1, S16));
1366 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1367 Shifts.clampScalar(1, S32, S32);
1368 Shifts.widenScalarToNextPow2(0, 16);
1369 Shifts.clampScalar(0, S16, S64);
1370
1371 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1372 .minScalar(0, S16)
1373 .scalarize(0)
1374 .lower();
1375 } else {
1376 // Make sure we legalize the shift amount type first, as the general
1377 // expansion for the shifted type will produce much worse code if it hasn't
1378 // been truncated already.
1379 Shifts.clampScalar(1, S32, S32);
1380 Shifts.widenScalarToNextPow2(0, 32);
1381 Shifts.clampScalar(0, S32, S64);
1382
1383 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384 .minScalar(0, S32)
1385 .scalarize(0)
1386 .lower();
1387 }
1388 Shifts.scalarize(0);
1389
1390 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1391 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1392 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1393 unsigned IdxTypeIdx = 2;
1394
1395 getActionDefinitionsBuilder(Op)
1396 .customIf([=](const LegalityQuery &Query) {
1397 const LLT EltTy = Query.Types[EltTypeIdx];
1398 const LLT VecTy = Query.Types[VecTypeIdx];
1399 const LLT IdxTy = Query.Types[IdxTypeIdx];
1400 const unsigned EltSize = EltTy.getSizeInBits();
1401 return (EltSize == 32 || EltSize == 64) &&
1402 VecTy.getSizeInBits() % 32 == 0 &&
1403 VecTy.getSizeInBits() <= MaxRegisterSize &&
1404 IdxTy.getSizeInBits() == 32;
1405 })
1406 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1407 bitcastToVectorElement32(VecTypeIdx))
1408 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1409 .bitcastIf(
1410 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1411 [=](const LegalityQuery &Query) {
1412 // For > 64-bit element types, try to turn this into a 64-bit
1413 // element vector since we may be able to do better indexing
1414 // if this is scalar. If not, fall back to 32.
1415 const LLT EltTy = Query.Types[EltTypeIdx];
1416 const LLT VecTy = Query.Types[VecTypeIdx];
1417 const unsigned DstEltSize = EltTy.getSizeInBits();
1418 const unsigned VecSize = VecTy.getSizeInBits();
1419
1420 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1421 return std::make_pair(
1422 VecTypeIdx,
1423 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1424 })
1425 .clampScalar(EltTypeIdx, S32, S64)
1426 .clampScalar(VecTypeIdx, S32, S64)
1427 .clampScalar(IdxTypeIdx, S32, S32)
1428 .clampMaxNumElements(VecTypeIdx, S32, 32)
1429 // TODO: Clamp elements for 64-bit vectors?
1430 // It should only be necessary with variable indexes.
1431 // As a last resort, lower to the stack
1432 .lower();
1433 }
1434
1435 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436 .unsupportedIf([=](const LegalityQuery &Query) {
1437 const LLT &EltTy = Query.Types[1].getElementType();
1438 return Query.Types[0] != EltTy;
1439 });
1440
1441 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444
1445 // FIXME: Doesn't handle extract of illegal sizes.
1446 getActionDefinitionsBuilder(Op)
1447 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448 .lowerIf([=](const LegalityQuery &Query) {
1449 // Sub-vector(or single element) insert and extract.
1450 // TODO: verify immediate offset here since lower only works with
1451 // whole elements.
1452 const LLT BigTy = Query.Types[BigTyIdx];
1453 return BigTy.isVector();
1454 })
1455 // FIXME: Multiples of 16 should not be legal.
1456 .legalIf([=](const LegalityQuery &Query) {
1457 const LLT BigTy = Query.Types[BigTyIdx];
1458 const LLT LitTy = Query.Types[LitTyIdx];
1459 return (BigTy.getSizeInBits() % 32 == 0) &&
1460 (LitTy.getSizeInBits() % 16 == 0);
1461 })
1462 .widenScalarIf(
1463 [=](const LegalityQuery &Query) {
1464 const LLT BigTy = Query.Types[BigTyIdx];
1465 return (BigTy.getScalarSizeInBits() < 16);
1466 },
1467 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1468 .widenScalarIf(
1469 [=](const LegalityQuery &Query) {
1470 const LLT LitTy = Query.Types[LitTyIdx];
1471 return (LitTy.getScalarSizeInBits() < 16);
1472 },
1473 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1474 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1475 .widenScalarToNextPow2(BigTyIdx, 32);
1476
1477 }
1478
1479 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1480 .legalForCartesianProduct(AllS32Vectors, {S32})
1481 .legalForCartesianProduct(AllS64Vectors, {S64})
1482 .clampNumElements(0, V16S32, V32S32)
1483 .clampNumElements(0, V2S64, V16S64)
1484 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1485
1486 if (ST.hasScalarPackInsts()) {
1487 BuildVector
1488 // FIXME: Should probably widen s1 vectors straight to s32
1489 .minScalarOrElt(0, S16)
1490 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1491 .minScalar(1, S32);
1492
1493 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1494 .legalFor({V2S16, S32})
1495 .lower();
1496 BuildVector.minScalarOrElt(0, S32);
1497 } else {
1498 BuildVector.customFor({V2S16, S16});
1499 BuildVector.minScalarOrElt(0, S32);
1500
1501 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1502 .customFor({V2S16, S32})
1503 .lower();
1504 }
1505
1506 BuildVector.legalIf(isRegisterType(0));
1507
1508 // FIXME: Clamp maximum size
1509 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1510 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1511 .clampMaxNumElements(0, S32, 32)
1512 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1513 .clampMaxNumElements(0, S16, 64);
1514
1515 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1516 // pre-legalize.
1517 if (ST.hasVOP3PInsts()) {
1518 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1519 .customFor({V2S16, V2S16})
1520 .lower();
1521 } else
1522 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1523
1524 // Merge/Unmerge
1525 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1526 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1527 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1528
1529 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1530 const LLT Ty = Query.Types[TypeIdx];
1531 if (Ty.isVector()) {
1532 const LLT &EltTy = Ty.getElementType();
1533 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1534 return true;
1535 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1536 return true;
1537 }
1538 return false;
1539 };
1540
1541 auto &Builder = getActionDefinitionsBuilder(Op)
1542 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1543 .lowerFor({{S16, V2S16}})
1544 .lowerIf([=](const LegalityQuery &Query) {
1545 const LLT BigTy = Query.Types[BigTyIdx];
1546 return BigTy.getSizeInBits() == 32;
1547 })
1548 // Try to widen to s16 first for small types.
1549 // TODO: Only do this on targets with legal s16 shifts
1550 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1551 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1552 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1553 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1554 elementTypeIs(1, S16)),
1555 changeTo(1, V2S16))
1556 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1557 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1558 // valid.
1559 .clampScalar(LitTyIdx, S32, S512)
1560 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1561 // Break up vectors with weird elements into scalars
1562 .fewerElementsIf(
1563 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1564 scalarize(0))
1565 .fewerElementsIf(
1566 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1567 scalarize(1))
1568 .clampScalar(BigTyIdx, S32, MaxScalar);
1569
1570 if (Op == G_MERGE_VALUES) {
1571 Builder.widenScalarIf(
1572 // TODO: Use 16-bit shifts if legal for 8-bit values?
1573 [=](const LegalityQuery &Query) {
1574 const LLT Ty = Query.Types[LitTyIdx];
1575 return Ty.getSizeInBits() < 32;
1576 },
1577 changeTo(LitTyIdx, S32));
1578 }
1579
1580 Builder.widenScalarIf(
1581 [=](const LegalityQuery &Query) {
1582 const LLT Ty = Query.Types[BigTyIdx];
1583 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1584 Ty.getSizeInBits() % 16 != 0;
1585 },
1586 [=](const LegalityQuery &Query) {
1587 // Pick the next power of 2, or a multiple of 64 over 128.
1588 // Whichever is smaller.
1589 const LLT &Ty = Query.Types[BigTyIdx];
1590 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1591 if (NewSizeInBits >= 256) {
1592 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1593 if (RoundedTo < NewSizeInBits)
1594 NewSizeInBits = RoundedTo;
1595 }
1596 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1597 })
1598 // Any vectors left are the wrong size. Scalarize them.
1599 .scalarize(0)
1600 .scalarize(1);
1601 }
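As a rough illustration of the BigTyIdx widening mutation in the chain above (a sketch only, reusing LLVM's Log2_32_Ceil and alignTo helpers; the function name is made up), a 65-bit big type rounds up to 128 bits, while a 300-bit one rounds to 320 bits rather than 512, since once the power of two reaches 256 the nearest multiple of 64 is preferred when it is smaller:

#include "llvm/Support/MathExtras.h"

// Illustrative model of the widenScalarIf mutation above; it only applies when
// the size is neither a power of two nor a multiple of 16.
static unsigned roundBigTypeSize(unsigned SizeInBits) {
  unsigned NewSize = 1u << llvm::Log2_32_Ceil(SizeInBits + 1);
  if (NewSize >= 256) {
    unsigned RoundedTo = (unsigned)llvm::alignTo<64>(SizeInBits + 1);
    if (RoundedTo < NewSize)
      NewSize = RoundedTo;
  }
  return NewSize; // roundBigTypeSize(65) == 128, roundBigTypeSize(300) == 320
}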
1602
1603 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1604 // RegBankSelect.
1605 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1606 .legalFor({{S32}, {S64}});
1607
1608 if (ST.hasVOP3PInsts()) {
1609 SextInReg.lowerFor({{V2S16}})
1610 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1611 // get more vector shift opportunities, since we'll get those when
1612 // expanded.
1613 .clampMaxNumElementsStrict(0, S16, 2);
1614 } else if (ST.has16BitInsts()) {
1615 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1616 } else {
1617 // Prefer to promote to s32 before lowering if we don't have 16-bit
1618 // shifts. This avoids a lot of intermediate truncate and extend operations.
1619 SextInReg.lowerFor({{S32}, {S64}});
1620 }
1621
1622 SextInReg
1623 .scalarize(0)
1624 .clampScalar(0, S32, S64)
1625 .lower();
1626
1627 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1628 .scalarize(0)
1629 .lower();
1630
1631 // TODO: Only try to form v2s16 with legal packed instructions.
1632 getActionDefinitionsBuilder(G_FSHR)
1633 .legalFor({{S32, S32}})
1634 .lowerFor({{V2S16, V2S16}})
1635 .clampMaxNumElementsStrict(0, S16, 2)
1636 .scalarize(0)
1637 .lower();
1638
1639 if (ST.hasVOP3PInsts()) {
1640 getActionDefinitionsBuilder(G_FSHL)
1641 .lowerFor({{V2S16, V2S16}})
1642 .clampMaxNumElementsStrict(0, S16, 2)
1643 .scalarize(0)
1644 .lower();
1645 } else {
1646 getActionDefinitionsBuilder(G_FSHL)
1647 .scalarize(0)
1648 .lower();
1649 }
1650
1651 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652 .legalFor({S64});
1653
1654 getActionDefinitionsBuilder(G_FENCE)
1655 .alwaysLegal();
1656
1657 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658 .scalarize(0)
1659 .minScalar(0, S32)
1660 .lower();
1661
1662 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663 .legalFor({{S32, S32}, {S64, S32}})
1664 .clampScalar(1, S32, S32)
1665 .clampScalar(0, S32, S64)
1666 .widenScalarToNextPow2(0)
1667 .scalarize(0);
1668
1669 getActionDefinitionsBuilder({
1670 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671 G_FCOPYSIGN,
1672
1673 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674 G_ATOMICRMW_NAND,
1675 G_ATOMICRMW_FSUB,
1676 G_READ_REGISTER,
1677 G_WRITE_REGISTER,
1678
1679 G_SADDO, G_SSUBO,
1680
1681 // TODO: Implement
1682 G_FMINIMUM, G_FMAXIMUM}).lower();
1683
1684 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1685 .lower();
1686
1687 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1688 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1689 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1690 .unsupported();
1691
1692 getLegacyLegalizerInfo().computeTables();
1693 verify(*ST.getInstrInfo());
1694}
1695
1696bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1697 MachineInstr &MI) const {
1698 MachineIRBuilder &B = Helper.MIRBuilder;
1699 MachineRegisterInfo &MRI = *B.getMRI();
1700
1701 switch (MI.getOpcode()) {
1702 case TargetOpcode::G_ADDRSPACE_CAST:
1703 return legalizeAddrSpaceCast(MI, MRI, B);
1704 case TargetOpcode::G_FRINT:
1705 return legalizeFrint(MI, MRI, B);
1706 case TargetOpcode::G_FCEIL:
1707 return legalizeFceil(MI, MRI, B);
1708 case TargetOpcode::G_FREM:
1709 return legalizeFrem(MI, MRI, B);
1710 case TargetOpcode::G_INTRINSIC_TRUNC:
1711 return legalizeIntrinsicTrunc(MI, MRI, B);
1712 case TargetOpcode::G_SITOFP:
1713 return legalizeITOFP(MI, MRI, B, true);
1714 case TargetOpcode::G_UITOFP:
1715 return legalizeITOFP(MI, MRI, B, false);
1716 case TargetOpcode::G_FPTOSI:
1717 return legalizeFPTOI(MI, MRI, B, true);
1718 case TargetOpcode::G_FPTOUI:
1719 return legalizeFPTOI(MI, MRI, B, false);
1720 case TargetOpcode::G_FMINNUM:
1721 case TargetOpcode::G_FMAXNUM:
1722 case TargetOpcode::G_FMINNUM_IEEE:
1723 case TargetOpcode::G_FMAXNUM_IEEE:
1724 return legalizeMinNumMaxNum(Helper, MI);
1725 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1726 return legalizeExtractVectorElt(MI, MRI, B);
1727 case TargetOpcode::G_INSERT_VECTOR_ELT:
1728 return legalizeInsertVectorElt(MI, MRI, B);
1729 case TargetOpcode::G_SHUFFLE_VECTOR:
1730 return legalizeShuffleVector(MI, MRI, B);
1731 case TargetOpcode::G_FSIN:
1732 case TargetOpcode::G_FCOS:
1733 return legalizeSinCos(MI, MRI, B);
1734 case TargetOpcode::G_GLOBAL_VALUE:
1735 return legalizeGlobalValue(MI, MRI, B);
1736 case TargetOpcode::G_LOAD:
1737 case TargetOpcode::G_SEXTLOAD:
1738 case TargetOpcode::G_ZEXTLOAD:
1739 return legalizeLoad(Helper, MI);
1740 case TargetOpcode::G_FMAD:
1741 return legalizeFMad(MI, MRI, B);
1742 case TargetOpcode::G_FDIV:
1743 return legalizeFDIV(MI, MRI, B);
1744 case TargetOpcode::G_UDIV:
1745 case TargetOpcode::G_UREM:
1746 case TargetOpcode::G_UDIVREM:
1747 return legalizeUnsignedDIV_REM(MI, MRI, B);
1748 case TargetOpcode::G_SDIV:
1749 case TargetOpcode::G_SREM:
1750 case TargetOpcode::G_SDIVREM:
1751 return legalizeSignedDIV_REM(MI, MRI, B);
1752 case TargetOpcode::G_ATOMIC_CMPXCHG:
1753 return legalizeAtomicCmpXChg(MI, MRI, B);
1754 case TargetOpcode::G_FLOG:
1755 return legalizeFlog(MI, B, numbers::ln2f);
1756 case TargetOpcode::G_FLOG10:
1757 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1758 case TargetOpcode::G_FEXP:
1759 return legalizeFExp(MI, B);
1760 case TargetOpcode::G_FPOW:
1761 return legalizeFPow(MI, B);
1762 case TargetOpcode::G_FFLOOR:
1763 return legalizeFFloor(MI, MRI, B);
1764 case TargetOpcode::G_BUILD_VECTOR:
1765 return legalizeBuildVector(MI, MRI, B);
1766 case TargetOpcode::G_CTLZ:
1767 case TargetOpcode::G_CTTZ:
1768 return legalizeCTLZ_CTTZ(MI, MRI, B);
1769 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1770 return legalizeFPTruncRound(MI, B);
1771 default:
1772 return false;
1773 }
1774
1775 llvm_unreachable("expected switch to return")::llvm::llvm_unreachable_internal("expected switch to return"
, "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 1775)
;
1776}
1777
1778Register AMDGPULegalizerInfo::getSegmentAperture(
1779 unsigned AS,
1780 MachineRegisterInfo &MRI,
1781 MachineIRBuilder &B) const {
1782 MachineFunction &MF = B.getMF();
1783 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1784 const LLT S32 = LLT::scalar(32);
1785
1786 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1787
1788 if (ST.hasApertureRegs()) {
1789 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1790 // getreg.
1791 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1792 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1793 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1794 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1795 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1796 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1797 unsigned Encoding =
1798 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1799 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1800 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1801
1802 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1803
1804 B.buildInstr(AMDGPU::S_GETREG_B32)
1805 .addDef(GetReg)
1806 .addImm(Encoding);
1807 MRI.setType(GetReg, S32);
1808
1809 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1810 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1811 }
1812
1813 // TODO: can we be smarter about machine pointer info?
1814 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1815 Register LoadAddr = MRI.createGenericVirtualRegister(
1816 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1817 // For code object version 5, private_base and shared_base are passed through
1818 // implicit kernargs.
1819 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1820 AMDGPUTargetLowering::ImplicitParameter Param =
1821 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1822 : AMDGPUTargetLowering::PRIVATE_BASE;
1823 uint64_t Offset =
1824 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1825
1826 Register KernargPtrReg = MRI.createGenericVirtualRegister(
1827 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1828
1829 if (!loadInputValue(KernargPtrReg, B,
1830 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1831 return Register();
1832
1833 MachineMemOperand *MMO = MF.getMachineMemOperand(
1834 PtrInfo,
1835 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1836 MachineMemOperand::MOInvariant,
1837 LLT::scalar(32), commonAlignment(Align(64), Offset));
1838
1839 // Pointer address
1840 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1841 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1842 // Load address
1843 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1844 }
1845
1846 Register QueuePtr = MRI.createGenericVirtualRegister(
1847 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1848
1849 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1850 return Register();
1851
1852 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1853 // private_segment_aperture_base_hi.
1854 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1855
1856 MachineMemOperand *MMO = MF.getMachineMemOperand(
1857 PtrInfo,
1858 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1859 MachineMemOperand::MOInvariant,
1860 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1861
1862 B.buildPtrAdd(LoadAddr, QueuePtr,
1863 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1864 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1865}
1866
1867/// Return true if the value is a known valid address, such that a null check is
1868/// not necessary.
1869static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1870 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1871 MachineInstr *Def = MRI.getVRegDef(Val);
1872 switch (Def->getOpcode()) {
1873 case AMDGPU::G_FRAME_INDEX:
1874 case AMDGPU::G_GLOBAL_VALUE:
1875 case AMDGPU::G_BLOCK_ADDR:
1876 return true;
1877 case AMDGPU::G_CONSTANT: {
1878 const ConstantInt *CI = Def->getOperand(1).getCImm();
1879 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1880 }
1881 default:
1882 return false;
1883 }
1884
1885 return false;
1886}
1887
1888bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1889 MachineInstr &MI, MachineRegisterInfo &MRI,
1890 MachineIRBuilder &B) const {
1891 MachineFunction &MF = B.getMF();
1892
1893 const LLT S32 = LLT::scalar(32);
1894 Register Dst = MI.getOperand(0).getReg();
1895 Register Src = MI.getOperand(1).getReg();
1896
1897 LLT DstTy = MRI.getType(Dst);
1898 LLT SrcTy = MRI.getType(Src);
1899 unsigned DestAS = DstTy.getAddressSpace();
1900 unsigned SrcAS = SrcTy.getAddressSpace();
1901
1902 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1903 // vector element.
1904 assert(!DstTy.isVector());
1905
1906 const AMDGPUTargetMachine &TM
1907 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1908
1909 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1910 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1911 return true;
1912 }
1913
1914 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1915 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1916 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1917 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1918 // Extract low 32-bits of the pointer.
1919 B.buildExtract(Dst, Src, 0);
1920 MI.eraseFromParent();
1921 return true;
1922 }
1923
1924 unsigned NullVal = TM.getNullPointerValue(DestAS);
1925
1926 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1927 auto FlatNull = B.buildConstant(SrcTy, 0);
1928
1929 // Extract low 32-bits of the pointer.
1930 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1931
1932 auto CmpRes =
1933 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1934 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1935
1936 MI.eraseFromParent();
1937 return true;
1938 }
1939
1940 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1941 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1942 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1943 if (!ST.hasFlatAddressSpace())
1944 return false;
1945
1946 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1947 if (!ApertureReg.isValid())
1948 return false;
1949
1950 // Coerce the type of the low half of the result so we can use merge_values.
1951 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1952
1953 // TODO: Should we allow mismatched types but matching sizes in merges to
1954 // avoid the ptrtoint?
1955 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1956
1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1958 B.buildCopy(Dst, BuildPtr);
1959 MI.eraseFromParent();
1960 return true;
1961 }
1962
1963 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1964 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1965
1966 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1967 SegmentNull.getReg(0));
1968
1969 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1970
1971 MI.eraseFromParent();
1972 return true;
1973 }
1974
1975 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1976 SrcTy.getSizeInBits() == 64) {
1977 // Truncate.
1978 B.buildExtract(Dst, Src, 0);
1979 MI.eraseFromParent();
1980 return true;
1981 }
1982
1983 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1984 DstTy.getSizeInBits() == 64) {
1985 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1986 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1987
1988 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1989 // another. Merge operands are required to be the same type, but creating an
1990 // extra ptrtoint would be kind of pointless.
1991 auto HighAddr = B.buildConstant(
1992 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1993 B.buildMerge(Dst, {Src, HighAddr});
1994 MI.eraseFromParent();
1995 return true;
1996 }
1997
1998 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1999 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2000
2001 LLVMContext &Ctx = MF.getFunction().getContext();
2002 Ctx.diagnose(InvalidAddrSpaceCast);
2003 B.buildUndef(Dst);
2004 MI.eraseFromParent();
2005 return true;
2006}
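A rough scalar model of the segment-to-flat branch above (an illustrative sketch; the helper name and the plain integer arguments are assumptions): the flat pointer is the aperture's high word placed on top of the 32-bit segment offset, and the segment null value maps to the flat null value.

#include <cstdint>

// Mirrors buildMerge(DstTy, {SrcAsInt, ApertureReg}) plus the null-check
// select emitted above; illustrative only.
static uint64_t segmentToFlatModel(uint32_t SegPtr, uint32_t ApertureHi,
                                   uint32_t SegmentNull, uint64_t FlatNull) {
  uint64_t Built = (uint64_t(ApertureHi) << 32) | SegPtr;
  return SegPtr != SegmentNull ? Built : FlatNull;
}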
2007
2008bool AMDGPULegalizerInfo::legalizeFrint(
2009 MachineInstr &MI, MachineRegisterInfo &MRI,
2010 MachineIRBuilder &B) const {
2011 Register Src = MI.getOperand(1).getReg();
2012 LLT Ty = MRI.getType(Src);
2013 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2014
2015 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2016 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2017
2018 auto C1 = B.buildFConstant(Ty, C1Val);
2019 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2020
2021 // TODO: Should this propagate fast-math-flags?
2022 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2023 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2024
2025 auto C2 = B.buildFConstant(Ty, C2Val);
2026 auto Fabs = B.buildFAbs(Ty, Src);
2027
2028 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2029 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2030 MI.eraseFromParent();
2031 return true;
2032}
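For reference, a minimal scalar sketch of the same magic-number rounding trick (illustrative only; here the add/sub pair relies on default rounding and a host compiler could fold it away, a caveat that does not apply to the explicit MIR above):

#include <cmath>

// Adding and subtracting copysign(2^52, x) forces rounding at integer
// granularity; values with |x| > 0x1.fffffffffffffp+51 are already integral
// and are passed through, matching the final select above.
static double frintModel(double X) {
  const double C = std::copysign(0x1.0p+52, X);
  const double Rounded = (X + C) - C;
  return std::fabs(X) > 0x1.fffffffffffffp+51 ? X : Rounded;
}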
2033
2034bool AMDGPULegalizerInfo::legalizeFceil(
2035 MachineInstr &MI, MachineRegisterInfo &MRI,
2036 MachineIRBuilder &B) const {
2037
2038 const LLT S1 = LLT::scalar(1);
2039 const LLT S64 = LLT::scalar(64);
2040
2041 Register Src = MI.getOperand(1).getReg();
2042 assert(MRI.getType(Src) == S64);
2043
2044 // result = trunc(src)
2045 // if (src > 0.0 && src != result)
2046 // result += 1.0
2047
2048 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2049
2050 const auto Zero = B.buildFConstant(S64, 0.0);
2051 const auto One = B.buildFConstant(S64, 1.0);
2052 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2053 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2054 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2055 auto Add = B.buildSelect(S64, And, One, Zero);
2056
2057 // TODO: Should this propagate fast-math-flags?
2058 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2059 MI.eraseFromParent();
2060 return true;
2061}
2062
2063bool AMDGPULegalizerInfo::legalizeFrem(
2064 MachineInstr &MI, MachineRegisterInfo &MRI,
2065 MachineIRBuilder &B) const {
2066 Register DstReg = MI.getOperand(0).getReg();
2067 Register Src0Reg = MI.getOperand(1).getReg();
2068 Register Src1Reg = MI.getOperand(2).getReg();
2069 auto Flags = MI.getFlags();
2070 LLT Ty = MRI.getType(DstReg);
2071
2072 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2073 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2074 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2075 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2076 MI.eraseFromParent();
2077 return true;
2078}
2079
2080static MachineInstrBuilder extractF64Exponent(Register Hi,
2081 MachineIRBuilder &B) {
2082 const unsigned FractBits = 52;
2083 const unsigned ExpBits = 11;
2084 LLT S32 = LLT::scalar(32);
2085
2086 auto Const0 = B.buildConstant(S32, FractBits - 32);
2087 auto Const1 = B.buildConstant(S32, ExpBits);
2088
2089 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2090 .addUse(Hi)
2091 .addUse(Const0.getReg(0))
2092 .addUse(Const1.getReg(0));
2093
2094 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2095}
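A scalar sketch of what the ubfe call computes here (illustrative; the helper name is invented): the 11-bit exponent field starts at bit 20 of the high word of an IEEE-754 double, and 1023 is the exponent bias.

#include <cstdint>

// Illustrative model of extractF64Exponent: bitfield-extract ExpBits bits
// starting at bit (FractBits - 32) of the high word, then remove the bias.
static int32_t f64ExponentModel(uint32_t HiWord) {
  uint32_t Biased = (HiWord >> (52 - 32)) & ((1u << 11) - 1);
  return int32_t(Biased) - 1023;
}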
2096
2097bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2098 MachineInstr &MI, MachineRegisterInfo &MRI,
2099 MachineIRBuilder &B) const {
2100 const LLT S1 = LLT::scalar(1);
2101 const LLT S32 = LLT::scalar(32);
2102 const LLT S64 = LLT::scalar(64);
2103
2104 Register Src = MI.getOperand(1).getReg();
2105 assert(MRI.getType(Src) == S64);
2106
2107 // TODO: Should this use extract since the low half is unused?
2108 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2109 Register Hi = Unmerge.getReg(1);
2110
2111 // Extract the upper half, since this is where we will find the sign and
2112 // exponent.
2113 auto Exp = extractF64Exponent(Hi, B);
2114
2115 const unsigned FractBits = 52;
2116
2117 // Extract the sign bit.
2118 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2119 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2120
2121 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2122
2123 const auto Zero32 = B.buildConstant(S32, 0);
2124
2125 // Extend back to 64-bits.
2126 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2127
2128 auto Shr = B.buildAShr(S64, FractMask, Exp);
2129 auto Not = B.buildNot(S64, Shr);
2130 auto Tmp0 = B.buildAnd(S64, Src, Not);
2131 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2132
2133 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2134 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2135
2136 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2137 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2138 MI.eraseFromParent();
2139 return true;
2140}
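The sequence above is the usual bit-level trunc for f64; a scalar sketch under the same exponent cases (illustrative only, not the target lowering itself):

#include <cstdint>
#include <cstring>

// For exponents in [0, 51] the fractional part occupies the low (52 - Exp)
// mantissa bits and is masked off; |x| < 1 collapses to a signed zero;
// exponents > 51 (including inf/nan) are already integral.
static double truncModel(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  int32_t Exp = int32_t((Bits >> 52) & 0x7ff) - 1023;
  if (Exp > 51)
    return X;
  if (Exp < 0)
    Bits &= uint64_t(1) << 63;
  else
    Bits &= ~(((uint64_t(1) << 52) - 1) >> Exp);
  double Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}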
2141
2142bool AMDGPULegalizerInfo::legalizeITOFP(
2143 MachineInstr &MI, MachineRegisterInfo &MRI,
2144 MachineIRBuilder &B, bool Signed) const {
2145
2146 Register Dst = MI.getOperand(0).getReg();
2147 Register Src = MI.getOperand(1).getReg();
2148
2149 const LLT S64 = LLT::scalar(64);
2150 const LLT S32 = LLT::scalar(32);
2151
2152 assert(MRI.getType(Src) == S64);
2153
2154 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2155 auto ThirtyTwo = B.buildConstant(S32, 32);
2156
2157 if (MRI.getType(Dst) == S64) {
2158 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2159 : B.buildUITOFP(S64, Unmerge.getReg(1));
2160
2161 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2162 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2163 .addUse(CvtHi.getReg(0))
2164 .addUse(ThirtyTwo.getReg(0));
2165
2166 // TODO: Should this propagate fast-math-flags?
2167 B.buildFAdd(Dst, LdExp, CvtLo);
2168 MI.eraseFromParent();
2169 return true;
2170 }
2171
2172 assert(MRI.getType(Dst) == S32);
2173
2174 auto One = B.buildConstant(S32, 1);
2175
2176 MachineInstrBuilder ShAmt;
2177 if (Signed) {
2178 auto ThirtyOne = B.buildConstant(S32, 31);
2179 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2180 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2181 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2182 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2183 /*HasSideEffects=*/false)
2184 .addUse(Unmerge.getReg(1));
2185 auto LS2 = B.buildSub(S32, LS, One);
2186 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2187 } else
2188 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2189 auto Norm = B.buildShl(S64, Src, ShAmt);
2190 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2191 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2192 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2193 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2194 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2195 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2196 /*HasSideEffects=*/false)
2197 .addUse(FVal.getReg(0))
2198 .addUse(Scale.getReg(0));
2199 MI.eraseFromParent();
2200 return true;
2201}
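A scalar sketch of the unsigned s64-to-f32 branch above (illustrative; the signed path differs only in how the shift amount is bounded): shift the value so its leading bit lands in bit 63, fold the discarded low word into one sticky bit, convert the high word, then scale back with ldexp.

#include <cmath>
#include <cstdint>

// Illustrative model of the G_UITOFP s64 -> s32 lowering above; __builtin_clz
// plays the role of G_CTLZ, which yields 32 for a zero high word.
static float u64ToF32Model(uint64_t X) {
  uint32_t Hi = uint32_t(X >> 32);
  unsigned Sh = Hi ? unsigned(__builtin_clz(Hi)) : 32;
  uint64_t Norm = X << Sh;                        // Sh <= 32, so well defined
  uint32_t Sticky = uint32_t(Norm) ? 1u : 0u;     // models umin(1, lo(Norm))
  float F = float(uint32_t(Norm >> 32) | Sticky);
  return std::ldexp(F, 32 - int(Sh));
}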
2202
2203// TODO: Copied from DAG implementation. Verify logic and document how this
2204// actually works.
2205bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2206 MachineRegisterInfo &MRI,
2207 MachineIRBuilder &B,
2208 bool Signed) const {
2209
2210 Register Dst = MI.getOperand(0).getReg();
2211 Register Src = MI.getOperand(1).getReg();
2212
2213 const LLT S64 = LLT::scalar(64);
2214 const LLT S32 = LLT::scalar(32);
2215
2216 const LLT SrcLT = MRI.getType(Src);
2217 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2218
2219 unsigned Flags = MI.getFlags();
2220
2221 // The basic idea of converting a floating point number into a pair of 32-bit
2222 // integers is illustrated as follows:
2223 //
2224 // tf := trunc(val);
2225 // hif := floor(tf * 2^-32);
2226 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2227 // hi := fptoi(hif);
2228 // lo := fptoi(lof);
2229 //
2230 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2231 MachineInstrBuilder Sign;
2232 if (Signed && SrcLT == S32) {
2233 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2234 // is not enough to hold all the significant bits of `lof` if val is
2235 // negative. To avoid the loss of precision, we need to take the absolute
2236 // value after truncating and flip the result back based on the original
2237 // signedness.
2238 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2239 Trunc = B.buildFAbs(S32, Trunc, Flags);
2240 }
2241 MachineInstrBuilder K0, K1;
2242 if (SrcLT == S64) {
2243 K0 = B.buildFConstant(S64,
2244 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2245 K1 = B.buildFConstant(S64,
2246 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2247 } else {
2248 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2249 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2250 }
2251
2252 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2253 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2254 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2255
2256 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2257 : B.buildFPTOUI(S32, FloorMul);
2258 auto Lo = B.buildFPTOUI(S32, Fma);
2259
2260 if (Signed && SrcLT == S32) {
2261 // Flip the result based on the signedness, which is either all 0s or 1s.
2262 Sign = B.buildMerge(S64, {Sign, Sign});
2263 // r := xor({lo, hi}, sign) - sign;
2264 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2265 } else
2266 B.buildMerge(Dst, {Lo, Hi});
2267 MI.eraseFromParent();
2268
2269 return true;
2270}
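The comment at the top of this function gives the split in pseudocode; a scalar sketch for the unsigned f64 case (illustrative only, using the same 2^-32 and -2^32 constants; the signed f64 case converts the high half with fptosi instead):

#include <cmath>
#include <cstdint>

// hif/lof are the high and low 32-bit halves of trunc(val); lof is formed with
// an fma against -2^32, so it is always non-negative.
static uint64_t fpToUI64Model(double Val) {
  double Tf = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1.0p-32);      // K0 = 2^-32
  double Lof = std::fma(Hif, -0x1.0p+32, Tf);   // K1 = -2^32
  uint32_t Hi = uint32_t(Hif);
  uint32_t Lo = uint32_t(Lof);
  return (uint64_t(Hi) << 32) | Lo;
}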
2271
2272bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2273 MachineInstr &MI) const {
2274 MachineFunction &MF = Helper.MIRBuilder.getMF();
2275 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2276
2277 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2278 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2279
2280 // With ieee_mode disabled, the instructions have the correct behavior
2281 // already for G_FMINNUM/G_FMAXNUM
2282 if (!MFI->getMode().IEEE)
2283 return !IsIEEEOp;
2284
2285 if (IsIEEEOp)
2286 return true;
2287
2288 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2289}
2290
2291bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2292 MachineInstr &MI, MachineRegisterInfo &MRI,
2293 MachineIRBuilder &B) const {
2294 // TODO: Should move some of this into LegalizerHelper.
2295
2296 // TODO: Promote dynamic indexing of s16 to s32
2297
2298 // FIXME: Artifact combiner probably should have replaced the truncated
2299 // constant before this, so we shouldn't need
2300 // getIConstantVRegValWithLookThrough.
2301 Optional<ValueAndVReg> MaybeIdxVal =
2302 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2303 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2304 return true;
2305 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2306
2307 Register Dst = MI.getOperand(0).getReg();
2308 Register Vec = MI.getOperand(1).getReg();
2309
2310 LLT VecTy = MRI.getType(Vec);
2311 LLT EltTy = VecTy.getElementType();
2312 assert(EltTy == MRI.getType(Dst));
2313
2314 if (IdxVal < VecTy.getNumElements()) {
2315 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2316 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2317 } else {
2318 B.buildUndef(Dst);
2319 }
2320
2321 MI.eraseFromParent();
2322 return true;
2323}
2324
2325bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2326 MachineInstr &MI, MachineRegisterInfo &MRI,
2327 MachineIRBuilder &B) const {
2328 // TODO: Should move some of this into LegalizerHelper.
2329
2330 // TODO: Promote dynamic indexing of s16 to s32
2331
2332 // FIXME: Artifact combiner probably should have replaced the truncated
2333 // constant before this, so we shouldn't need
2334 // getIConstantVRegValWithLookThrough.
2335 Optional<ValueAndVReg> MaybeIdxVal =
2336 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2337 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2338 return true;
2339
2340 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2341 Register Dst = MI.getOperand(0).getReg();
2342 Register Vec = MI.getOperand(1).getReg();
2343 Register Ins = MI.getOperand(2).getReg();
2344
2345 LLT VecTy = MRI.getType(Vec);
2346 LLT EltTy = VecTy.getElementType();
2347 assert(EltTy == MRI.getType(Ins));
2348 (void)Ins;
2349
2350 unsigned NumElts = VecTy.getNumElements();
2351 if (IdxVal < NumElts) {
2352 SmallVector<Register, 8> SrcRegs;
2353 for (unsigned i = 0; i < NumElts; ++i)
2354 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2355 B.buildUnmerge(SrcRegs, Vec);
2356
2357 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2358 B.buildMerge(Dst, SrcRegs);
2359 } else {
2360 B.buildUndef(Dst);
2361 }
2362
2363 MI.eraseFromParent();
2364 return true;
2365}
2366
2367bool AMDGPULegalizerInfo::legalizeShuffleVector(
2368 MachineInstr &MI, MachineRegisterInfo &MRI,
2369 MachineIRBuilder &B) const {
2370 const LLT V2S16 = LLT::fixed_vector(2, 16);
2371
2372 Register Dst = MI.getOperand(0).getReg();
2373 Register Src0 = MI.getOperand(1).getReg();
2374 LLT DstTy = MRI.getType(Dst);
2375 LLT SrcTy = MRI.getType(Src0);
2376
2377 if (SrcTy == V2S16 && DstTy == V2S16 &&
2378 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2379 return true;
2380
2381 MachineIRBuilder HelperBuilder(MI);
2382 GISelObserverWrapper DummyObserver;
2383 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2384 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2385}
2386
2387bool AMDGPULegalizerInfo::legalizeSinCos(
2388 MachineInstr &MI, MachineRegisterInfo &MRI,
2389 MachineIRBuilder &B) const {
2390
2391 Register DstReg = MI.getOperand(0).getReg();
2392 Register SrcReg = MI.getOperand(1).getReg();
2393 LLT Ty = MRI.getType(DstReg);
2394 unsigned Flags = MI.getFlags();
2395
2396 Register TrigVal;
2397 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2398 if (ST.hasTrigReducedRange()) {
2399 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2400 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2401 .addUse(MulVal.getReg(0))
2402 .setMIFlags(Flags).getReg(0);
2403 } else
2404 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2405
2406 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2407 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2408 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2409 .addUse(TrigVal)
2410 .setMIFlags(Flags);
2411 MI.eraseFromParent();
2412 return true;
2413}
2414
2415bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2416 MachineIRBuilder &B,
2417 const GlobalValue *GV,
2418 int64_t Offset,
2419 unsigned GAFlags) const {
2420 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2421 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2422 // to the following code sequence:
2423 //
2424 // For constant address space:
2425 // s_getpc_b64 s[0:1]
2426 // s_add_u32 s0, s0, $symbol
2427 // s_addc_u32 s1, s1, 0
2428 //
2429 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2430 // a fixup or relocation is emitted to replace $symbol with a literal
2431 // constant, which is a pc-relative offset from the encoding of the $symbol
2432 // operand to the global variable.
2433 //
2434 // For global address space:
2435 // s_getpc_b64 s[0:1]
2436 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2437 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2438 //
2439 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2440 // fixups or relocations are emitted to replace $symbol@*@lo and
2441 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2442 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2443 // operand to the global variable.
2444 //
2445 // What we want here is an offset from the value returned by s_getpc
2446 // (which is the address of the s_add_u32 instruction) to the global
2447 // variable, but since the encoding of $symbol starts 4 bytes after the start
2448 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2449 // small. This requires us to add 4 to the global variable offset in order to
2450 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2451 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2452 // instruction.
2453
2454 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2455
2456 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2457 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2458
2459 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2460 .addDef(PCReg);
2461
2462 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2463 if (GAFlags == SIInstrInfo::MO_NONE)
2464 MIB.addImm(0);
2465 else
2466 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2467
2468 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2469
2470 if (PtrTy.getSizeInBits() == 32)
2471 B.buildExtract(DstReg, PCReg, 0);
2472 return true;
2473 }
2474
2475bool AMDGPULegalizerInfo::legalizeGlobalValue(
2476 MachineInstr &MI, MachineRegisterInfo &MRI,
2477 MachineIRBuilder &B) const {
2478 Register DstReg = MI.getOperand(0).getReg();
2479 LLT Ty = MRI.getType(DstReg);
2480 unsigned AS = Ty.getAddressSpace();
2481
2482 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2483 MachineFunction &MF = B.getMF();
2484 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2485
2486 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2487 if (!MFI->isModuleEntryFunction() &&
2488 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2489 const Function &Fn = MF.getFunction();
2490 DiagnosticInfoUnsupported BadLDSDecl(
2491 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2492 DS_Warning);
2493 Fn.getContext().diagnose(BadLDSDecl);
2494
2495 // We currently don't have a way to correctly allocate LDS objects that
2496 // aren't directly associated with a kernel. We do force inlining of
2497 // functions that use local objects. However, if these dead functions are
2498 // not eliminated, we don't want a compile time error. Just emit a warning
2499 // and a trap, since there should be no callable path here.
2500 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2501 B.buildUndef(DstReg);
2502 MI.eraseFromParent();
2503 return true;
2504 }
2505
2506 // TODO: We could emit code to handle the initialization somewhere.
2507 // We ignore the initializer for now and legalize it to allow selection.
2508 // The initializer will anyway get errored out during assembly emission.
2509 const SITargetLowering *TLI = ST.getTargetLowering();
2510 if (!TLI->shouldUseLDSConstAddress(GV)) {
2511 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2512 return true; // Leave in place;
2513 }
2514
2515 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2516 Type *Ty = GV->getValueType();
2517 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2518 // zero-sized type in other languages to declare dynamic shared
2519 // memory whose size is not known at compile time. Such arrays are
2520 // allocated by the runtime and placed directly after the statically
2521 // allocated ones. They all share the same offset.
2522 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2523 // Adjust alignment for that dynamic shared memory array.
2524 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2525 LLT S32 = LLT::scalar(32);
2526 auto Sz =
2527 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2528 B.buildIntToPtr(DstReg, Sz);
2529 MI.eraseFromParent();
2530 return true;
2531 }
2532 }
2533
2534 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2535 *cast<GlobalVariable>(GV)));
2536 MI.eraseFromParent();
2537 return true;
2538 }
2539
2540 const SITargetLowering *TLI = ST.getTargetLowering();
2541
2542 if (TLI->shouldEmitFixup(GV)) {
2543 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2544 MI.eraseFromParent();
2545 return true;
2546 }
2547
2548 if (TLI->shouldEmitPCReloc(GV)) {
2549 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2550 MI.eraseFromParent();
2551 return true;
2552 }
2553
2554 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2555 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2556
2557 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2558 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2559 MachinePointerInfo::getGOT(MF),
2560 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2561 MachineMemOperand::MOInvariant,
2562 LoadTy, Align(8));
2563
2564 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2565
2566 if (Ty.getSizeInBits() == 32) {
2567 // Truncate if this is a 32-bit constant address.
2568 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2569 B.buildExtract(DstReg, Load, 0);
2570 } else
2571 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2572
2573 MI.eraseFromParent();
2574 return true;
2575}
2576
2577static LLT widenToNextPowerOf2(LLT Ty) {
2578 if (Ty.isVector())
2579 return Ty.changeElementCount(
2580 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2581 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2582}
2583
2584bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2585 MachineInstr &MI) const {
2586 MachineIRBuilder &B = Helper.MIRBuilder;
2587 MachineRegisterInfo &MRI = *B.getMRI();
2588 GISelChangeObserver &Observer = Helper.Observer;
2589
2590 Register PtrReg = MI.getOperand(1).getReg();
2591 LLT PtrTy = MRI.getType(PtrReg);
2592 unsigned AddrSpace = PtrTy.getAddressSpace();
2593
2594 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2595 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2596 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2597 Observer.changingInstr(MI);
2598 MI.getOperand(1).setReg(Cast.getReg(0));
2599 Observer.changedInstr(MI);
2600 return true;
2601 }
2602
2603 if (MI.getOpcode() != AMDGPU::G_LOAD)
2604 return false;
2605
2606 Register ValReg = MI.getOperand(0).getReg();
2607 LLT ValTy = MRI.getType(ValReg);
2608
2609 MachineMemOperand *MMO = *MI.memoperands_begin();
2610 const unsigned ValSize = ValTy.getSizeInBits();
2611 const LLT MemTy = MMO->getMemoryType();
2612 const Align MemAlign = MMO->getAlign();
2613 const unsigned MemSize = MemTy.getSizeInBits();
2614 const uint64_t AlignInBits = 8 * MemAlign.value();
2615
2616 // Widen non-power-of-2 loads to the alignment if needed
2617 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2618 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2619
2620 // This was already the correct extending load result type, so just adjust
2621 // the memory type.
2622 if (WideMemSize == ValSize) {
2623 MachineFunction &MF = B.getMF();
2624
2625 MachineMemOperand *WideMMO =
2626 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2627 Observer.changingInstr(MI);
2628 MI.setMemRefs(MF, {WideMMO});
2629 Observer.changedInstr(MI);
2630 return true;
2631 }
2632
2633 // Don't bother handling an edge case that should probably never be produced.
2634 if (ValSize > WideMemSize)
2635 return false;
2636
2637 LLT WideTy = widenToNextPowerOf2(ValTy);
2638
2639 Register WideLoad;
2640 if (!WideTy.isVector()) {
2641 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2642 B.buildTrunc(ValReg, WideLoad).getReg(0);
2643 } else {
2644 // Extract the subvector.
2645
2646 if (isRegisterType(ValTy)) {
2648 // If this is a case where G_EXTRACT is legal, use it.
2648 // (e.g. <3 x s32> -> <4 x s32>)
2649 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2650 B.buildExtract(ValReg, WideLoad, 0);
2651 } else {
2652 // For cases where the widened type isn't a nice register value, unmerge
2653 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2654 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2655 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2656 }
2657 }
2658
2659 MI.eraseFromParent();
2660 return true;
2661 }
2662
2663 return false;
2664}
2665
2666bool AMDGPULegalizerInfo::legalizeFMad(
2667 MachineInstr &MI, MachineRegisterInfo &MRI,
2668 MachineIRBuilder &B) const {
2669 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2670 assert(Ty.isScalar());
2671
2672 MachineFunction &MF = B.getMF();
2673 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2674
2675 // TODO: Always legal with future ftz flag.
2676 // FIXME: Do we need just output?
2677 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2678 return true;
2679 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2680 return true;
2681
2682 MachineIRBuilder HelperBuilder(MI);
2683 GISelObserverWrapper DummyObserver;
2684 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2685 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2686}
2687
2688bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2690 Register DstReg = MI.getOperand(0).getReg();
2691 Register PtrReg = MI.getOperand(1).getReg();
2692 Register CmpVal = MI.getOperand(2).getReg();
2693 Register NewVal = MI.getOperand(3).getReg();
2694
2695 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2696 "this should not have been custom lowered");
2697
2698 LLT ValTy = MRI.getType(CmpVal);
2699 LLT VecTy = LLT::fixed_vector(2, ValTy);
2700
2701 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2702
2703 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2704 .addDef(DstReg)
2705 .addUse(PtrReg)
2706 .addUse(PackedVal)
2707 .setMemRefs(MI.memoperands());
2708
2709 MI.eraseFromParent();
2710 return true;
2711}
2712
2713bool AMDGPULegalizerInfo::legalizeFlog(
2714 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2715 Register Dst = MI.getOperand(0).getReg();
2716 Register Src = MI.getOperand(1).getReg();
2717 LLT Ty = B.getMRI()->getType(Dst);
2718 unsigned Flags = MI.getFlags();
2719
2720 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2721 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2722
2723 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2724 MI.eraseFromParent();
2725 return true;
2726}
2727
2728bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2729 MachineIRBuilder &B) const {
2730 Register Dst = MI.getOperand(0).getReg();
2731 Register Src = MI.getOperand(1).getReg();
2732 unsigned Flags = MI.getFlags();
2733 LLT Ty = B.getMRI()->getType(Dst);
2734
2735 auto K = B.buildFConstant(Ty, numbers::log2e);
2736 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2737 B.buildFExp2(Dst, Mul, Flags);
2738 MI.eraseFromParent();
2739 return true;
2740}
2741
2742bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2743 MachineIRBuilder &B) const {
2744 Register Dst = MI.getOperand(0).getReg();
2745 Register Src0 = MI.getOperand(1).getReg();
2746 Register Src1 = MI.getOperand(2).getReg();
2747 unsigned Flags = MI.getFlags();
2748 LLT Ty = B.getMRI()->getType(Dst);
2749 const LLT S16 = LLT::scalar(16);
2750 const LLT S32 = LLT::scalar(32);
2751
2752 if (Ty == S32) {
2753 auto Log = B.buildFLog2(S32, Src0, Flags);
2754 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2755 .addUse(Log.getReg(0))
2756 .addUse(Src1)
2757 .setMIFlags(Flags);
2758 B.buildFExp2(Dst, Mul, Flags);
2759 } else if (Ty == S16) {
2760 // There's no f16 fmul_legacy, so we need to convert for it.
2761 auto Log = B.buildFLog2(S16, Src0, Flags);
2762 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2763 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2764 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2765 .addUse(Ext0.getReg(0))
2766 .addUse(Ext1.getReg(0))
2767 .setMIFlags(Flags);
2768
2769 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2770 } else
2771 return false;
2772
2773 MI.eraseFromParent();
2774 return true;
2775}
2776
2777// Find a source register, ignoring any possible source modifiers.
2778static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2779 Register ModSrc = OrigSrc;
2780 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2781 ModSrc = SrcFNeg->getOperand(1).getReg();
2782 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2783 ModSrc = SrcFAbs->getOperand(1).getReg();
2784 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2785 ModSrc = SrcFAbs->getOperand(1).getReg();
2786 return ModSrc;
2787}
2788
2789bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2790 MachineRegisterInfo &MRI,
2791 MachineIRBuilder &B) const {
2792
2793 const LLT S1 = LLT::scalar(1);
2794 const LLT S64 = LLT::scalar(64);
2795 Register Dst = MI.getOperand(0).getReg();
2796 Register OrigSrc = MI.getOperand(1).getReg();
2797 unsigned Flags = MI.getFlags();
2798 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2799 "this should not have been custom lowered");
2800
2801 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2802 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2803 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2804 // V_FRACT bug is:
2805 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2806 //
2807 // Convert floor(x) to (x - fract(x))
2808
2809 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2810 .addUse(OrigSrc)
2811 .setMIFlags(Flags);
2812
2813 // Give source modifier matching some assistance before obscuring a foldable
2814 // pattern.
2815
2816 // TODO: We can avoid the neg on the fract? The input sign to fract
2817 // shouldn't matter?
2818 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2819
2820 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2821
2822 Register Min = MRI.createGenericVirtualRegister(S64);
2823
2824 // We don't need to concern ourselves with the snan handling difference, so
2825 // use the one which will directly select.
2826 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2827 if (MFI->getMode().IEEE)
2828 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2829 else
2830 B.buildFMinNum(Min, Fract, Const, Flags);
2831
2832 Register CorrectedFract = Min;
2833 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2834 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2835 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2836 }
2837
2838 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2839 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2840
2841 MI.eraseFromParent();
2842 return true;
2843}
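A scalar sketch of the workaround described in the comment above (illustrative; fractHW stands in for the amdgcn.fract intrinsic and is an assumption, not a real API):

#include <cmath>

// Hypothetical stand-in for the hardware fract instruction.
static double fractHW(double X) { return X - std::floor(X); }

// fract(x) = isnan(x) ? x : min(fract_hw(x), 0x1.fffffffffffffp-1), then
// floor(x) = x - fract(x), mirroring the pseudocode in the comment above.
static double floorViaFractModel(double X) {
  double F = std::fmin(fractHW(X), 0x1.fffffffffffffp-1);
  if (std::isnan(X))
    F = X;                  // x - x stays NaN, so NaNs propagate unchanged
  return X - F;
}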
2844
2845// Turn an illegal packed v2s16 build vector into bit operations.
2846// TODO: This should probably be a bitcast action in LegalizerHelper.
2847bool AMDGPULegalizerInfo::legalizeBuildVector(
2848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2849 Register Dst = MI.getOperand(0).getReg();
2850 const LLT S32 = LLT::scalar(32);
2851 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2852
2853 Register Src0 = MI.getOperand(1).getReg();
2854 Register Src1 = MI.getOperand(2).getReg();
2855 assert(MRI.getType(Src0) == LLT::scalar(16));
2856
2857 auto Merge = B.buildMerge(S32, {Src0, Src1});
2858 B.buildBitcast(Dst, Merge);
2859
2860 MI.eraseFromParent();
2861 return true;
2862}
2863
2864// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2865// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2866// case with a single min instruction instead of a compare+select.
2867bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2868 MachineRegisterInfo &MRI,
2869 MachineIRBuilder &B) const {
2870 Register Dst = MI.getOperand(0).getReg();
2871 Register Src = MI.getOperand(1).getReg();
2872 LLT DstTy = MRI.getType(Dst);
2873 LLT SrcTy = MRI.getType(Src);
2874
2875 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2876 ? AMDGPU::G_AMDGPU_FFBH_U32
2877 : AMDGPU::G_AMDGPU_FFBL_B32;
2878 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2879 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2880
2881 MI.eraseFromParent();
2882 return true;
2883}
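A scalar sketch of why a single umin suffices (illustrative): the ffbh/ffbl instructions return all-ones for a zero input, and an unsigned min against the bit width clamps that to the value G_CTLZ/G_CTTZ expect.

#include <algorithm>
#include <cstdint>

// Illustrative model of the G_CTLZ lowering above for a 32-bit source;
// __builtin_clz is only defined for nonzero inputs, hence the guard.
static uint32_t ctlzModel(uint32_t X) {
  uint32_t Ffbh = X ? uint32_t(__builtin_clz(X)) : ~0u; // models G_AMDGPU_FFBH_U32
  return std::min(Ffbh, 32u);                           // zero input clamps to 32
}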
2884
2885// Check that this is a G_XOR x, -1
2886static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2887 if (MI.getOpcode() != TargetOpcode::G_XOR)
2888 return false;
2889 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2890 return ConstVal && *ConstVal == -1;
2891}
2892
2893// Return the use branch instruction, otherwise null if the usage is invalid.
2894static MachineInstr *
2895verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2896 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2897 Register CondDef = MI.getOperand(0).getReg();
2898 if (!MRI.hasOneNonDBGUse(CondDef))
2899 return nullptr;
2900
2901 MachineBasicBlock *Parent = MI.getParent();
2902 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2903
2904 if (isNot(MRI, *UseMI)) {
2905 Register NegatedCond = UseMI->getOperand(0).getReg();
2906 if (!MRI.hasOneNonDBGUse(NegatedCond))
2907 return nullptr;
2908
2909 // We're deleting the def of this value, so we need to remove it.
2910 eraseInstr(*UseMI, MRI);
2911
2912 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2913 Negated = true;
2914 }
2915
2916 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2917 return nullptr;
2918
2919 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2920 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2921 if (Next == Parent->end()) {
2922 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2923 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2924 return nullptr;
2925 UncondBrTarget = &*NextMBB;
2926 } else {
2927 if (Next->getOpcode() != AMDGPU::G_BR)
2928 return nullptr;
2929 Br = &*Next;
2930 UncondBrTarget = Br->getOperand(0).getMBB();
2931 }
2932
2933 return UseMI;
2934}
2935
2936bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2937 const ArgDescriptor *Arg,
2938 const TargetRegisterClass *ArgRC,
2939 LLT ArgTy) const {
2940 MCRegister SrcReg = Arg->getRegister();
2941 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
5
'?' condition is true
2942  assert(DstReg.isVirtual() && "Virtual register expected");
6
'?' condition is true
2943
2944 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2945 *ArgRC, B.getDebugLoc(), ArgTy);
2946 if (Arg->isMasked()) {
7
Taking true branch
2947 // TODO: Should we try to emit this once in the entry block?
2948 const LLT S32 = LLT::scalar(32);
2949 const unsigned Mask = Arg->getMask();
2950 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
8
Calling 'countTrailingZeros<unsigned int>'
15
Returning from 'countTrailingZeros<unsigned int>'
16
'Shift' initialized to 32
2951
2952 Register AndMaskSrc = LiveIn;
2953
2954 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2955 // 0.
2956     if (Shift != 0) {
16.1
'Shift' is not equal to 0
17
Taking true branch
2957 auto ShiftAmt = B.buildConstant(S32, Shift);
2958 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2959 }
2960
2961 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
18
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
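// The shift amount above comes straight from countTrailingZeros<unsigned>(Mask),
// which returns 32 (the bit width) when Mask == 0, making 'Mask >> Shift'
// undefined in that case. A minimal guard, sketched here under the assumption
// that a zero mask simply means "no bits requested", would clamp the shift:
//
//   const unsigned Shift = Mask ? countTrailingZeros<unsigned>(Mask) : 0;
//
// With Shift forced to 0 for an empty mask, 'Mask >> Shift' becomes 0 >> 0 and
// is well defined. Alternatively, if a masked ArgDescriptor can never carry a
// zero mask, an assert(Mask != 0 && "empty argument mask") before the shift
// would document that invariant instead.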
2962 } else {
2963 B.buildCopy(DstReg, LiveIn);
2964 }
2965
2966 return true;
2967}
2968
2969bool AMDGPULegalizerInfo::loadInputValue(
2970 Register DstReg, MachineIRBuilder &B,
2971 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2972 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2973 const ArgDescriptor *Arg;
2974 const TargetRegisterClass *ArgRC;
2975 LLT ArgTy;
2976 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2977
2978 if (!Arg) {
2
Assuming 'Arg' is non-null
2979 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2980 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2981 // case the pointer argument may be missing and we use null.
2982 B.buildConstant(DstReg, 0);
2983 return true;
2984 }
2985
2986 // It's undefined behavior if a function marked with the amdgpu-no-*
2987 // attributes uses the corresponding intrinsic.
2988 B.buildUndef(DstReg);
2989 return true;
2990 }
2991
2992 if (!Arg->isRegister() || !Arg->getRegister().isValid())
3
Taking false branch
2993 return false; // TODO: Handle these
2994 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4
Calling 'AMDGPULegalizerInfo::loadInputValue'
2995}
2996
2997bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2998 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2999 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3000 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3001 return false;
3002
3003 MI.eraseFromParent();
3004 return true;
3005}
3006
3007Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3008 int64_t Offset) const {
3009 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3010 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3011
3012 // TODO: If we passed in the base kernel offset we could have a better
3013 // alignment than 4, but we don't really need it.
3014 if (!loadInputValue(KernArgReg, B,
1
Calling 'AMDGPULegalizerInfo::loadInputValue'
3015 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3016    llvm_unreachable("failed to find kernarg segment ptr");
3017
3018 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3019 // TODO: Should get nuw
3020 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3021}
3022
3023/// Legalize a value that's loaded from kernel arguments. This is only used by
3024/// legacy intrinsics.
3025bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3026 MachineIRBuilder &B,
3027 uint64_t Offset,
3028 Align Alignment) const {
3029 Register DstReg = MI.getOperand(0).getReg();
3030
3031  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3032         "unexpected kernarg parameter type");
3033
3034 Register Ptr = getKernargParameterPtr(B, Offset);
3035 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3036 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3037 MachineMemOperand::MODereferenceable |
3038 MachineMemOperand::MOInvariant);
3039 MI.eraseFromParent();
3040 return true;
3041}
3042
3043bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3044 MachineRegisterInfo &MRI,
3045 MachineIRBuilder &B) const {
3046 Register Dst = MI.getOperand(0).getReg();
3047 LLT DstTy = MRI.getType(Dst);
3048 LLT S16 = LLT::scalar(16);
3049 LLT S32 = LLT::scalar(32);
3050 LLT S64 = LLT::scalar(64);
3051
3052 if (DstTy == S16)
3053 return legalizeFDIV16(MI, MRI, B);
3054 if (DstTy == S32)
3055 return legalizeFDIV32(MI, MRI, B);
3056 if (DstTy == S64)
3057 return legalizeFDIV64(MI, MRI, B);
3058
3059 return false;
3060}
3061
3062void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3063 Register DstDivReg,
3064 Register DstRemReg,
3065 Register X,
3066 Register Y) const {
3067 const LLT S1 = LLT::scalar(1);
3068 const LLT S32 = LLT::scalar(32);
3069
3070 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3071 // algorithm used here.
3072
3073 // Initial estimate of inv(y).
3074 auto FloatY = B.buildUITOFP(S32, Y);
3075 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3076 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3077 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3078 auto Z = B.buildFPTOUI(S32, ScaledY);
3079
3080 // One round of UNR.
3081 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3082 auto NegYZ = B.buildMul(S32, NegY, Z);
3083 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3084
3085 // Quotient/remainder estimate.
3086 auto Q = B.buildUMulH(S32, X, Z);
3087 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3088
3089 // First quotient/remainder refinement.
3090 auto One = B.buildConstant(S32, 1);
3091 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3092 if (DstDivReg)
3093 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3094 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3095
3096 // Second quotient/remainder refinement.
3097 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3098 if (DstDivReg)
3099 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3100
3101 if (DstRemReg)
3102 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3103}
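// A rough scalar model of the sequence above, given as a sketch under the
// assumptions that Y != 0 and that the V_RCP_IFLAG_F32 estimate behaves like
// 1.0f / Y (the 0x4f7ffffe scale is the float value 2^32 - 512); the helper
// name below is illustrative only and not part of this file:

#include <cstdint>
static void udivrem32Sketch(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  // Initial estimate of floor(2^32 / Y).
  uint32_t Z = (uint32_t)((1.0f / (float)Y) * 4294966784.0f);
  // One unsigned Newton-Raphson refinement: Z += umulh(Z, (0 - Y) * Z).
  uint32_t NegYZ = (0u - Y) * Z;                 // wraps mod 2^32
  Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);  // Z += umulh(Z, NegYZ)
  // Quotient/remainder estimate followed by two conditional corrections.
  Q = (uint32_t)(((uint64_t)X * Z) >> 32);       // Q = umulh(X, Z)
  R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }
  if (R >= Y) { ++Q; R -= Y; }
}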
3104
3105// Build integer reciprocal sequence around V_RCP_IFLAG_F32
3106//
3107// Return lo, hi of result
3108//
3109// %cvt.lo = G_UITOFP Val.lo
3110// %cvt.hi = G_UITOFP Val.hi
3111// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3112// %rcp = G_AMDGPU_RCP_IFLAG %mad
3113// %mul1 = G_FMUL %rcp, 0x5f7ffffc
3114// %mul2 = G_FMUL %mul1, 2**(-32)
3115// %trunc = G_INTRINSIC_TRUNC %mul2
3116// %mad2 = G_FMAD %trunc, -(2**32), %mul1
3117// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3118static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3119 Register Val) {
3120 const LLT S32 = LLT::scalar(32);
3121 auto Unmerge = B.buildUnmerge(S32, Val);
3122
3123 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3124 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3125
3126 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3127 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3128
3129 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3130 auto Mul1 =
3131 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3132
3133 // 2**(-32)
3134 auto Mul2 =
3135 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3136 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3137
3138 // -(2**32)
3139 auto Mad2 = B.buildFMAD(S32, Trunc,
3140 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3141
3142 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3143 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3144
3145 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3146}
3147
3148void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3149 Register DstDivReg,
3150 Register DstRemReg,
3151 Register Numer,
3152 Register Denom) const {
3153 const LLT S32 = LLT::scalar(32);
3154 const LLT S64 = LLT::scalar(64);
3155 const LLT S1 = LLT::scalar(1);
3156 Register RcpLo, RcpHi;
3157
3158 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3159
3160 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3161
3162 auto Zero64 = B.buildConstant(S64, 0);
3163 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3164
3165 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3166 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3167
3168 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3169 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3170 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3171
3172 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3173 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3174 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3175
3176 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3177 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3178 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3179 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3180 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3181
3182 auto Zero32 = B.buildConstant(S32, 0);
3183 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3184 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3185 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3186
3187 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3188 Register NumerLo = UnmergeNumer.getReg(0);
3189 Register NumerHi = UnmergeNumer.getReg(1);
3190
3191 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3192 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3193 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3194 Register Mul3_Lo = UnmergeMul3.getReg(0);
3195 Register Mul3_Hi = UnmergeMul3.getReg(1);
3196 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3197 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3198 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3199 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3200
3201 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3202 Register DenomLo = UnmergeDenom.getReg(0);
3203 Register DenomHi = UnmergeDenom.getReg(1);
3204
3205 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3206 auto C1 = B.buildSExt(S32, CmpHi);
3207
3208 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3209 auto C2 = B.buildSExt(S32, CmpLo);
3210
3211 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3212 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3213
3214 // TODO: Here and below portions of the code can be enclosed into if/endif.
3215 // Currently control flow is unconditional and we have 4 selects after
3216 // potential endif to substitute PHIs.
3217
3218 // if C3 != 0 ...
3219 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3220 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3221 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3222 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3223
3224 auto One64 = B.buildConstant(S64, 1);
3225 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3226
3227 auto C4 =
3228 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3229 auto C5 =
3230 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3231 auto C6 = B.buildSelect(
3232 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3233
3234 // if (C6 != 0)
3235 auto Add4 = B.buildAdd(S64, Add3, One64);
3236 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3237
3238 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3239 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3240 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3241
3242 // endif C6
3243 // endif C3
3244
3245 if (DstDivReg) {
3246 auto Sel1 = B.buildSelect(
3247 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3248 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3249 Sel1, MulHi3);
3250 }
3251
3252 if (DstRemReg) {
3253 auto Sel2 = B.buildSelect(
3254 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3255 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3256 Sel2, Sub1);
3257 }
3258}
3259
3260bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3261 MachineRegisterInfo &MRI,
3262 MachineIRBuilder &B) const {
3263 Register DstDivReg, DstRemReg;
3264 switch (MI.getOpcode()) {
3265 default:
3266    llvm_unreachable("Unexpected opcode!");
3267 case AMDGPU::G_UDIV: {
3268 DstDivReg = MI.getOperand(0).getReg();
3269 break;
3270 }
3271 case AMDGPU::G_UREM: {
3272 DstRemReg = MI.getOperand(0).getReg();
3273 break;
3274 }
3275 case AMDGPU::G_UDIVREM: {
3276 DstDivReg = MI.getOperand(0).getReg();
3277 DstRemReg = MI.getOperand(1).getReg();
3278 break;
3279 }
3280 }
3281
3282 const LLT S64 = LLT::scalar(64);
3283 const LLT S32 = LLT::scalar(32);
3284 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3285 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3286 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3287 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3288
3289 if (Ty == S32)
3290 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3291 else if (Ty == S64)
3292 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3293 else
3294 return false;
3295
3296 MI.eraseFromParent();
3297 return true;
3298}
3299
3300bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3301 MachineRegisterInfo &MRI,
3302 MachineIRBuilder &B) const {
3303 const LLT S64 = LLT::scalar(64);
3304 const LLT S32 = LLT::scalar(32);
3305
3306 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3307 if (Ty != S32 && Ty != S64)
3308 return false;
3309
3310 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3311 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3312 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3313
3314 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3315 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3316 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3317
3318 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3319 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3320
3321 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3322 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
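   // With Sign = Value >> (BitWidth - 1) (arithmetic shift), (Value + Sign) ^ Sign
   // is the two's-complement absolute value: Sign is 0 for non-negative inputs and
   // all ones for negative inputs, where the add subtracts one and the xor flips
   // every bit. The unsigned results are re-signed below via (Tmp ^ Sign) - Sign.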
3323
3324 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3325 switch (MI.getOpcode()) {
3326 default:
3327    llvm_unreachable("Unexpected opcode!");
3328 case AMDGPU::G_SDIV: {
3329 DstDivReg = MI.getOperand(0).getReg();
3330 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3331 break;
3332 }
3333 case AMDGPU::G_SREM: {
3334 DstRemReg = MI.getOperand(0).getReg();
3335 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3336 break;
3337 }
3338 case AMDGPU::G_SDIVREM: {
3339 DstDivReg = MI.getOperand(0).getReg();
3340 DstRemReg = MI.getOperand(1).getReg();
3341 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3342 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3343 break;
3344 }
3345 }
3346
3347 if (Ty == S32)
3348 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3349 else
3350 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3351
3352 if (DstDivReg) {
3353 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3354 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3355 B.buildSub(DstDivReg, SignXor, Sign);
3356 }
3357
3358 if (DstRemReg) {
3359 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3360 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3361 B.buildSub(DstRemReg, SignXor, Sign);
3362 }
3363
3364 MI.eraseFromParent();
3365 return true;
3366}
3367
3368bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3369 MachineRegisterInfo &MRI,
3370 MachineIRBuilder &B) const {
3371 Register Res = MI.getOperand(0).getReg();
3372 Register LHS = MI.getOperand(1).getReg();
3373 Register RHS = MI.getOperand(2).getReg();
3374 uint16_t Flags = MI.getFlags();
3375 LLT ResTy = MRI.getType(Res);
3376
3377 const MachineFunction &MF = B.getMF();
3378 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3379 MI.getFlag(MachineInstr::FmAfn);
3380
3381 if (!AllowInaccurateRcp)
3382 return false;
3383
3384 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3385 // 1 / x -> RCP(x)
3386 if (CLHS->isExactlyValue(1.0)) {
3387 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3388 .addUse(RHS)
3389 .setMIFlags(Flags);
3390
3391 MI.eraseFromParent();
3392 return true;
3393 }
3394
3395 // -1 / x -> RCP( FNEG(x) )
3396 if (CLHS->isExactlyValue(-1.0)) {
3397 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3398 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3399 .addUse(FNeg.getReg(0))
3400 .setMIFlags(Flags);
3401
3402 MI.eraseFromParent();
3403 return true;
3404 }
3405 }
3406
3407 // x / y -> x * (1.0 / y)
3408 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3409 .addUse(RHS)
3410 .setMIFlags(Flags);
3411 B.buildFMul(Res, LHS, RCP, Flags);
3412
3413 MI.eraseFromParent();
3414 return true;
3415}
3416
3417bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3418 MachineRegisterInfo &MRI,
3419 MachineIRBuilder &B) const {
3420 Register Res = MI.getOperand(0).getReg();
3421 Register X = MI.getOperand(1).getReg();
3422 Register Y = MI.getOperand(2).getReg();
3423 uint16_t Flags = MI.getFlags();
3424 LLT ResTy = MRI.getType(Res);
3425
3426 const MachineFunction &MF = B.getMF();
3427 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3428 MI.getFlag(MachineInstr::FmAfn);
3429
3430 if (!AllowInaccurateRcp)
3431 return false;
3432
3433 auto NegY = B.buildFNeg(ResTy, Y);
3434 auto One = B.buildFConstant(ResTy, 1.0);
3435
3436 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3437 .addUse(Y)
3438 .setMIFlags(Flags);
3439
3440 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3441 R = B.buildFMA(ResTy, Tmp0, R, R);
3442
3443 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3444 R = B.buildFMA(ResTy, Tmp1, R, R);
3445
3446 auto Ret = B.buildFMul(ResTy, X, R);
3447 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3448
3449 B.buildFMA(Res, Tmp2, R, Ret);
3450 MI.eraseFromParent();
3451 return true;
3452}
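// In scalar terms this is two Newton-Raphson steps on the reciprocal,
// r' = r + r * (1 - y * r), followed by one residual refinement of the
// quotient, q' = q + r * (x - y * q) with q = x * r; a sketch of the intent,
// not a statement about the rounding of the fused operations involved.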
3453
3454bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3455 MachineRegisterInfo &MRI,
3456 MachineIRBuilder &B) const {
3457 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3458 return true;
3459
3460 Register Res = MI.getOperand(0).getReg();
3461 Register LHS = MI.getOperand(1).getReg();
3462 Register RHS = MI.getOperand(2).getReg();
3463
3464 uint16_t Flags = MI.getFlags();
3465
3466 LLT S16 = LLT::scalar(16);
3467 LLT S32 = LLT::scalar(32);
3468
3469 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3470 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3471
3472 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3473 .addUse(RHSExt.getReg(0))
3474 .setMIFlags(Flags);
3475
3476 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3477 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3478
3479 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3480 .addUse(RDst.getReg(0))
3481 .addUse(RHS)
3482 .addUse(LHS)
3483 .setMIFlags(Flags);
3484
3485 MI.eraseFromParent();
3486 return true;
3487}
3488
3489// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3490// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3491static void toggleSPDenormMode(bool Enable,
3492 MachineIRBuilder &B,
3493 const GCNSubtarget &ST,
3494 AMDGPU::SIModeRegisterDefaults Mode) {
3495 // Set SP denorm mode to this value.
3496 unsigned SPDenormMode =
3497 Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();
3498
3499 if (ST.hasDenormModeInst()) {
3500 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3501 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3502
3503 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3504 B.buildInstr(AMDGPU::S_DENORM_MODE)
3505 .addImm(NewDenormModeValue);
3506
3507 } else {
3508 // Select FP32 bit field in mode register.
3509 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3510 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3511 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3512
3513 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3514 .addImm(SPDenormMode)
3515 .addImm(SPDenormModeBitField);
3516 }
3517}
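// The hwreg operand built above is assumed to encode hwreg(HW_REG_MODE, 4, 2),
// i.e. a 2-bit field at bit offset 4 of the MODE register holding the FP32
// denorm bits; on targets with S_DENORM_MODE the immediate instead packs the
// FP32 bits in [1:0] and the default FP64/FP16 bits in [3:2].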
3518
3519bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3520 MachineRegisterInfo &MRI,
3521 MachineIRBuilder &B) const {
3522 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3523 return true;
3524
3525 Register Res = MI.getOperand(0).getReg();
3526 Register LHS = MI.getOperand(1).getReg();
3527 Register RHS = MI.getOperand(2).getReg();
3528 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3529 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3530
3531 uint16_t Flags = MI.getFlags();
3532
3533 LLT S32 = LLT::scalar(32);
3534 LLT S1 = LLT::scalar(1);
3535
3536 auto One = B.buildFConstant(S32, 1.0f);
3537
3538 auto DenominatorScaled =
3539 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3540 .addUse(LHS)
3541 .addUse(RHS)
3542 .addImm(0)
3543 .setMIFlags(Flags);
3544 auto NumeratorScaled =
3545 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3546 .addUse(LHS)
3547 .addUse(RHS)
3548 .addImm(1)
3549 .setMIFlags(Flags);
3550
3551 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3552 .addUse(DenominatorScaled.getReg(0))
3553 .setMIFlags(Flags);
3554 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3555
3556 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3557 // aren't modeled as reading it.
3558 if (!Mode.allFP32Denormals())
3559 toggleSPDenormMode(true, B, ST, Mode);
3560
3561 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3562 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3563 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3564 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3565 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3566 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3567
3568 if (!Mode.allFP32Denormals())
3569 toggleSPDenormMode(false, B, ST, Mode);
3570
3571 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3572 .addUse(Fma4.getReg(0))
3573 .addUse(Fma1.getReg(0))
3574 .addUse(Fma3.getReg(0))
3575 .addUse(NumeratorScaled.getReg(1))
3576 .setMIFlags(Flags);
3577
3578 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3579 .addUse(Fmas.getReg(0))
3580 .addUse(RHS)
3581 .addUse(LHS)
3582 .setMIFlags(Flags);
3583
3584 MI.eraseFromParent();
3585 return true;
3586}
3587
3588bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3589 MachineRegisterInfo &MRI,
3590 MachineIRBuilder &B) const {
3591 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3592 return true;
3593
3594 Register Res = MI.getOperand(0).getReg();
3595 Register LHS = MI.getOperand(1).getReg();
3596 Register RHS = MI.getOperand(2).getReg();
3597
3598 uint16_t Flags = MI.getFlags();
3599
3600 LLT S64 = LLT::scalar(64);
3601 LLT S1 = LLT::scalar(1);
3602
3603 auto One = B.buildFConstant(S64, 1.0);
3604
3605 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3606 .addUse(LHS)
3607 .addUse(RHS)
3608 .addImm(0)
3609 .setMIFlags(Flags);
3610
3611 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3612
3613 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3614 .addUse(DivScale0.getReg(0))
3615 .setMIFlags(Flags);
3616
3617 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3618 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3619 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3620
3621 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3622 .addUse(LHS)
3623 .addUse(RHS)
3624 .addImm(1)
3625 .setMIFlags(Flags);
3626
3627 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3628 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3629 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3630
3631 Register Scale;
3632 if (!ST.hasUsableDivScaleConditionOutput()) {
3633    // Work around a hardware bug on SI where the condition output from div_scale
3634    // is not usable.
3635
3636 LLT S32 = LLT::scalar(32);
3637
3638 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3639 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3640 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3641 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3642
3643 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3644 Scale1Unmerge.getReg(1));
3645 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3646 Scale0Unmerge.getReg(1));
3647 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3648 } else {
3649 Scale = DivScale1.getReg(1);
3650 }
3651
3652 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3653 .addUse(Fma4.getReg(0))
3654 .addUse(Fma3.getReg(0))
3655 .addUse(Mul.getReg(0))
3656 .addUse(Scale)
3657 .setMIFlags(Flags);
3658
3659 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3660 .addUse(Fmas.getReg(0))
3661 .addUse(RHS)
3662 .addUse(LHS)
3663 .setMIFlags(Flags);
3664
3665 MI.eraseFromParent();
3666 return true;
3667}
3668
3669bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3670 MachineRegisterInfo &MRI,
3671 MachineIRBuilder &B) const {
3672 Register Res = MI.getOperand(0).getReg();
3673 Register LHS = MI.getOperand(2).getReg();
3674 Register RHS = MI.getOperand(3).getReg();
3675 uint16_t Flags = MI.getFlags();
3676
3677 LLT S32 = LLT::scalar(32);
3678 LLT S1 = LLT::scalar(1);
3679
3680 auto Abs = B.buildFAbs(S32, RHS, Flags);
3681 const APFloat C0Val(1.0f);
3682
3683 auto C0 = B.buildConstant(S32, 0x6f800000);
3684 auto C1 = B.buildConstant(S32, 0x2f800000);
3685 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3686
3687 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3688 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3689
3690 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3691
3692 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3693 .addUse(Mul0.getReg(0))
3694 .setMIFlags(Flags);
3695
3696 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3697
3698 B.buildFMul(Res, Sel, Mul1, Flags);
3699
3700 MI.eraseFromParent();
3701 return true;
3702}
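// The constants appear to be the float bit patterns for 2^96 (0x6f800000) and
// 2^-32 (0x2f800000): when |RHS| exceeds 2^96 the denominator is pre-scaled by
// 2^-32 so the reciprocal does not flush to zero, and the same scale is
// reapplied to the final product, leaving LHS * (1 / RHS) unchanged.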
3703
3704// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3705// FIXME: Why do we handle this one but not other removed instructions?
3706//
3707// Reciprocal square root. The clamp prevents infinite results, clamping
3708// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3709// +-max_float.
3710bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3711 MachineRegisterInfo &MRI,
3712 MachineIRBuilder &B) const {
3713 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3714 return true;
3715
3716 Register Dst = MI.getOperand(0).getReg();
3717 Register Src = MI.getOperand(2).getReg();
3718 auto Flags = MI.getFlags();
3719
3720 LLT Ty = MRI.getType(Dst);
3721
3722 const fltSemantics *FltSemantics;
3723 if (Ty == LLT::scalar(32))
3724 FltSemantics = &APFloat::IEEEsingle();
3725 else if (Ty == LLT::scalar(64))
3726 FltSemantics = &APFloat::IEEEdouble();
3727 else
3728 return false;
3729
3730 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3731 .addUse(Src)
3732 .setMIFlags(Flags);
3733
3734  // We don't need to concern ourselves with the snan handling difference,
3735  // since the rsq is quieted (or not); use the variant which will select directly.
3736 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3737 const bool UseIEEE = MFI->getMode().IEEE;
3738
3739 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3740 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3741 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3742
3743 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3744
3745 if (UseIEEE)
3746 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3747 else
3748 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3749 MI.eraseFromParent();
3750 return true;
3751}
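// Net effect: Dst = max(min(rsq(Src), +max_float), -max_float), using the
// IEEE-aware min/max variants when the function executes in IEEE mode.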
3752
3753static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3754 switch (IID) {
3755 case Intrinsic::amdgcn_ds_fadd:
3756 return AMDGPU::G_ATOMICRMW_FADD;
3757 case Intrinsic::amdgcn_ds_fmin:
3758 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3759 case Intrinsic::amdgcn_ds_fmax:
3760 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3761 default:
3762    llvm_unreachable("not a DS FP intrinsic");
3763 }
3764}
3765
3766bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3767 MachineInstr &MI,
3768 Intrinsic::ID IID) const {
3769 GISelChangeObserver &Observer = Helper.Observer;
3770 Observer.changingInstr(MI);
3771
3772 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3773
3774 // The remaining operands were used to set fields in the MemOperand on
3775 // construction.
3776 for (int I = 6; I > 3; --I)
3777 MI.removeOperand(I);
3778
3779 MI.removeOperand(1); // Remove the intrinsic ID.
3780 Observer.changedInstr(MI);
3781 return true;
3782}
3783
3784bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3785 MachineRegisterInfo &MRI,
3786 MachineIRBuilder &B) const {
3787 uint64_t Offset =
3788 ST.getTargetLowering()->getImplicitParameterOffset(
3789 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3790 LLT DstTy = MRI.getType(DstReg);
3791 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3792
3793 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3794 if (!loadInputValue(KernargPtrReg, B,
3795 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3796 return false;
3797
3798 // FIXME: This should be nuw
3799 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3800 return true;
3801}
3802
3803bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3804 MachineRegisterInfo &MRI,
3805 MachineIRBuilder &B) const {
3806 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3807 if (!MFI->isEntryFunction()) {
3808 return legalizePreloadedArgIntrin(MI, MRI, B,
3809 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3810 }
3811
3812 Register DstReg = MI.getOperand(0).getReg();
3813 if (!getImplicitArgPtr(DstReg, MRI, B))
3814 return false;
3815
3816 MI.eraseFromParent();
3817 return true;
3818}
3819
3820bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3821 MachineRegisterInfo &MRI,
3822 MachineIRBuilder &B,
3823 unsigned AddrSpace) const {
3824 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3825 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3826 Register Hi32 = Unmerge.getReg(1);
3827
3828 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3829 MI.eraseFromParent();
3830 return true;
3831}
3832
3833// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3834// offset (the offset that is included in bounds checking and swizzling, to be
3835// split between the instruction's voffset and immoffset fields) and soffset
3836// (the offset that is excluded from bounds checking and swizzling, to go in
3837// the instruction's soffset field). This function takes the first kind of
3838// offset and figures out how to split it between voffset and immoffset.
3839std::pair<Register, unsigned>
3840AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3841 Register OrigOffset) const {
3842 const unsigned MaxImm = 4095;
3843 Register BaseReg;
3844 unsigned ImmOffset;
3845 const LLT S32 = LLT::scalar(32);
3846 MachineRegisterInfo &MRI = *B.getMRI();
3847
3848 std::tie(BaseReg, ImmOffset) =
3849 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3850
3851 // If BaseReg is a pointer, convert it to int.
3852 if (MRI.getType(BaseReg).isPointer())
3853 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3854
3855 // If the immediate value is too big for the immoffset field, put the value
3856 // and -4096 into the immoffset field so that the value that is copied/added
3857 // for the voffset field is a multiple of 4096, and it stands more chance
3858 // of being CSEd with the copy/add for another similar load/store.
3859 // However, do not do that rounding down to a multiple of 4096 if that is a
3860 // negative number, as it appears to be illegal to have a negative offset
3861 // in the vgpr, even if adding the immediate offset makes it positive.
3862 unsigned Overflow = ImmOffset & ~MaxImm;
3863 ImmOffset -= Overflow;
3864 if ((int32_t)Overflow < 0) {
3865 Overflow += ImmOffset;
3866 ImmOffset = 0;
3867 }
3868
3869 if (Overflow != 0) {
3870 if (!BaseReg) {
3871 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3872 } else {
3873 auto OverflowVal = B.buildConstant(S32, Overflow);
3874 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3875 }
3876 }
3877
3878 if (!BaseReg)
3879 BaseReg = B.buildConstant(S32, 0).getReg(0);
3880
3881 return std::make_pair(BaseReg, ImmOffset);
3882}
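// Worked example with a hypothetical constant offset of 8200: ImmOffset
// becomes 8200 & 4095 = 8 and Overflow = 8192, so 8192 is folded into the
// voffset register and only 8 remains in the immediate field, keeping the
// voffset a multiple of 4096 and more likely to CSE with neighbouring accesses.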
3883
3884/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3885void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3886 Register VOffset, Register SOffset,
3887 unsigned ImmOffset, Register VIndex,
3888 MachineRegisterInfo &MRI) const {
3889 Optional<ValueAndVReg> MaybeVOffsetVal =
3890 getIConstantVRegValWithLookThrough(VOffset, MRI);
3891 Optional<ValueAndVReg> MaybeSOffsetVal =
3892 getIConstantVRegValWithLookThrough(SOffset, MRI);
3893 Optional<ValueAndVReg> MaybeVIndexVal =
3894 getIConstantVRegValWithLookThrough(VIndex, MRI);
3895 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3896 // update the MMO with that offset. The stride is unknown so we can only do
3897 // this if VIndex is constant 0.
3898 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3899 MaybeVIndexVal->Value == 0) {
3900 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3901 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3902 MMO->setOffset(TotalOffset);
3903 } else {
3904 // We don't have a constant combined offset to use in the MMO. Give up.
3905 MMO->setValue((Value *)nullptr);
3906 }
3907}
3908
3909/// Handle register layout difference for f16 images for some subtargets.
3910Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3911 MachineRegisterInfo &MRI,
3912 Register Reg,
3913 bool ImageStore) const {
3914 const LLT S16 = LLT::scalar(16);
3915 const LLT S32 = LLT::scalar(32);
3916 LLT StoreVT = MRI.getType(Reg);
3917  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3918
3919 if (ST.hasUnpackedD16VMem()) {
3920 auto Unmerge = B.buildUnmerge(S16, Reg);
3921
3922 SmallVector<Register, 4> WideRegs;
3923 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3924 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3925
3926 int NumElts = StoreVT.getNumElements();
3927
3928 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3929 .getReg(0);
3930 }
3931
3932 if (ImageStore && ST.hasImageStoreD16Bug()) {
3933 if (StoreVT.getNumElements() == 2) {
3934 SmallVector<Register, 4> PackedRegs;
3935 Reg = B.buildBitcast(S32, Reg).getReg(0);
3936 PackedRegs.push_back(Reg);
3937 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3938 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3939 .getReg(0);
3940 }
3941
3942 if (StoreVT.getNumElements() == 3) {
3943 SmallVector<Register, 4> PackedRegs;
3944 auto Unmerge = B.buildUnmerge(S16, Reg);
3945 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3946 PackedRegs.push_back(Unmerge.getReg(I));
3947 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3948 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3949 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3950 }
3951
3952 if (StoreVT.getNumElements() == 4) {
3953 SmallVector<Register, 4> PackedRegs;
3954 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3955 auto Unmerge = B.buildUnmerge(S32, Reg);
3956 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3957 PackedRegs.push_back(Unmerge.getReg(I));
3958 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3959 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3960 .getReg(0);
3961 }
3962
3963    llvm_unreachable("invalid data type");
3964 }
3965
3966 if (StoreVT == LLT::fixed_vector(3, S16)) {
3967 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3968 .getReg(0);
3969 }
3970 return Reg;
3971}
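// For example, a <3 x s16> store value on a subtarget without the D16 quirks is
// simply padded to <4 x s16> with an undef lane, while on unpacked-D16
// subtargets each s16 element is widened into its own s32 register instead.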
3972
3973Register AMDGPULegalizerInfo::fixStoreSourceType(
3974 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3975 MachineRegisterInfo *MRI = B.getMRI();
3976 LLT Ty = MRI->getType(VData);
3977
3978 const LLT S16 = LLT::scalar(16);
3979
3980 // Fixup illegal register types for i8 stores.
3981 if (Ty == LLT::scalar(8) || Ty == S16) {
3982 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3983 return AnyExt;
3984 }
3985
3986 if (Ty.isVector()) {
3987 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3988 if (IsFormat)
3989 return handleD16VData(B, *MRI, VData);
3990 }
3991 }
3992
3993 return VData;
3994}
3995
3996bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3997 MachineRegisterInfo &MRI,
3998 MachineIRBuilder &B,
3999 bool IsTyped,
4000 bool IsFormat) const {
4001 Register VData = MI.getOperand(1).getReg();
4002 LLT Ty = MRI.getType(VData);
4003 LLT EltTy = Ty.getScalarType();
4004 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4005 const LLT S32 = LLT::scalar(32);
4006
4007 VData = fixStoreSourceType(B, VData, IsFormat);
4008 Register RSrc = MI.getOperand(2).getReg();
4009
4010 MachineMemOperand *MMO = *MI.memoperands_begin();
4011 const int MemSize = MMO->getSize();
4012
4013 unsigned ImmOffset;
4014
4015 // The typed intrinsics add an immediate after the registers.
4016 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4017
4018 // The struct intrinsic variants add one additional operand over raw.
4019 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4020 Register VIndex;
4021 int OpOffset = 0;
4022 if (HasVIndex) {
4023 VIndex = MI.getOperand(3).getReg();
4024 OpOffset = 1;
4025 } else {
4026 VIndex = B.buildConstant(S32, 0).getReg(0);
4027 }
4028
4029 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4030 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4031
4032 unsigned Format = 0;
4033 if (IsTyped) {
4034 Format = MI.getOperand(5 + OpOffset).getImm();
4035 ++OpOffset;
4036 }
4037
4038 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4039
4040 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4041 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4042
4043 unsigned Opc;
4044 if (IsTyped) {
4045 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4046 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4047 } else if (IsFormat) {
4048 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4049 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4050 } else {
4051 switch (MemSize) {
4052 case 1:
4053 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4054 break;
4055 case 2:
4056 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4057 break;
4058 default:
4059 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4060 break;
4061 }
4062 }
4063
4064 auto MIB = B.buildInstr(Opc)
4065 .addUse(VData) // vdata
4066 .addUse(RSrc) // rsrc
4067 .addUse(VIndex) // vindex
4068 .addUse(VOffset) // voffset
4069 .addUse(SOffset) // soffset
4070 .addImm(ImmOffset); // offset(imm)
4071
4072 if (IsTyped)
4073 MIB.addImm(Format);
4074
4075 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4076 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4077 .addMemOperand(MMO);
4078
4079 MI.eraseFromParent();
4080 return true;
4081}
4082
4083bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4084 MachineRegisterInfo &MRI,
4085 MachineIRBuilder &B,
4086 bool IsFormat,
4087 bool IsTyped) const {
4088 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4089 MachineMemOperand *MMO = *MI.memoperands_begin();
4090 const LLT MemTy = MMO->getMemoryType();
4091 const LLT S32 = LLT::scalar(32);
4092
4093 Register Dst = MI.getOperand(0).getReg();
4094 Register RSrc = MI.getOperand(2).getReg();
4095
4096 // The typed intrinsics add an immediate after the registers.
4097 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4098
4099 // The struct intrinsic variants add one additional operand over raw.
4100 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4101 Register VIndex;
4102 int OpOffset = 0;
4103 if (HasVIndex) {
4104 VIndex = MI.getOperand(3).getReg();
4105 OpOffset = 1;
4106 } else {
4107 VIndex = B.buildConstant(S32, 0).getReg(0);
4108 }
4109
4110 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4111 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4112
4113 unsigned Format = 0;
4114 if (IsTyped) {
4115 Format = MI.getOperand(5 + OpOffset).getImm();
4116 ++OpOffset;
4117 }
4118
4119 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4120 unsigned ImmOffset;
4121
4122 LLT Ty = MRI.getType(Dst);
4123 LLT EltTy = Ty.getScalarType();
4124 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4125 const bool Unpacked = ST.hasUnpackedD16VMem();
4126
4127 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4128 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4129
4130 unsigned Opc;
4131
4132 if (IsTyped) {
4133 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4134 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4135 } else if (IsFormat) {
4136 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4137 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4138 } else {
4139 switch (MemTy.getSizeInBits()) {
4140 case 8:
4141 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4142 break;
4143 case 16:
4144 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4145 break;
4146 default:
4147 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4148 break;
4149 }
4150 }
4151
4152 Register LoadDstReg;
4153
4154 bool IsExtLoad =
4155 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4156 LLT UnpackedTy = Ty.changeElementSize(32);
4157
4158 if (IsExtLoad)
4159 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4160 else if (Unpacked && IsD16 && Ty.isVector())
4161 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4162 else
4163 LoadDstReg = Dst;
4164
4165 auto MIB = B.buildInstr(Opc)
4166 .addDef(LoadDstReg) // vdata
4167 .addUse(RSrc) // rsrc
4168 .addUse(VIndex) // vindex
4169 .addUse(VOffset) // voffset
4170 .addUse(SOffset) // soffset
4171 .addImm(ImmOffset); // offset(imm)
4172
4173 if (IsTyped)
4174 MIB.addImm(Format);
4175
4176 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4177 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4178 .addMemOperand(MMO);
4179
4180 if (LoadDstReg != Dst) {
4181 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4182
4183      // The result was widened for the extending load; truncate back to the
          // requested type.
4184 if (IsExtLoad)
4185 B.buildTrunc(Dst, LoadDstReg);
4186 else {
4187 // Repack to original 16-bit vector result
4188 // FIXME: G_TRUNC should work, but legalization currently fails
4189 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4190 SmallVector<Register, 4> Repack;
4191 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4192 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4193 B.buildMerge(Dst, Repack);
4194 }
4195 }
4196
4197 MI.eraseFromParent();
4198 return true;
4199}
4200
4201bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4202 MachineIRBuilder &B,
4203 bool IsInc) const {
4204 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4205 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4206 B.buildInstr(Opc)
4207 .addDef(MI.getOperand(0).getReg())
4208 .addUse(MI.getOperand(2).getReg())
4209 .addUse(MI.getOperand(3).getReg())
4210 .cloneMemRefs(MI);
4211 MI.eraseFromParent();
4212 return true;
4213}
4214
4215static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4216 switch (IntrID) {
4217 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4218 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4220 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4221 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4223 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4224 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4225 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4226 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4227 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4228 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4229 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4230 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4231 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4232 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4233 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4234 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4235 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4236 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4237 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4238 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4239 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4240 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4241 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4242 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4243 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4244 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4245 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4246 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4247 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4248 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4249 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4250 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4251 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4252 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4253 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4254 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4255 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4256 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4257 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4258 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4259 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4260 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4261 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4262 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4263 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4264 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4265 default:
4266    llvm_unreachable("unhandled atomic opcode");
4267 }
4268}
4269
4270bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4271 MachineIRBuilder &B,
4272 Intrinsic::ID IID) const {
4273 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4274 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4275 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4276
4277 Register Dst;
4278
4279 int OpOffset = 0;
4280 if (HasReturn) {
4281 // A few FP atomics do not support return values.
4282 Dst = MI.getOperand(0).getReg();
4283 } else {
4284 OpOffset = -1;
4285 }
4286
4287 Register VData = MI.getOperand(2 + OpOffset).getReg();
4288 Register CmpVal;
4289
4290 if (IsCmpSwap) {
4291 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4292 ++OpOffset;
4293 }
4294
4295 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4296 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4297
4298 // The struct intrinsic variants add one additional operand over raw.
4299 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4300 Register VIndex;
4301 if (HasVIndex) {
4302 VIndex = MI.getOperand(4 + OpOffset).getReg();
4303 ++OpOffset;
4304 } else {
4305 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4306 }
4307
4308 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4309 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4310 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4311
4312 MachineMemOperand *MMO = *MI.memoperands_begin();
4313
4314 unsigned ImmOffset;
4315 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4316 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4317
4318 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4319
4320 if (HasReturn)
4321 MIB.addDef(Dst);
4322
4323 MIB.addUse(VData); // vdata
4324
4325 if (IsCmpSwap)
4326 MIB.addReg(CmpVal);
4327
4328 MIB.addUse(RSrc) // rsrc
4329 .addUse(VIndex) // vindex
4330 .addUse(VOffset) // voffset
4331 .addUse(SOffset) // soffset
4332 .addImm(ImmOffset) // offset(imm)
4333 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4334 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4335 .addMemOperand(MMO);
4336
4337 MI.eraseFromParent();
4338 return true;
4339}
4340
4341/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4342/// vector with s16 typed elements.
4343static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4344 SmallVectorImpl<Register> &PackedAddrs,
4345 unsigned ArgOffset,
4346 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4347 bool IsA16, bool IsG16) {
4348 const LLT S16 = LLT::scalar(16);
4349 const LLT V2S16 = LLT::fixed_vector(2, 16);
4350 auto EndIdx = Intr->VAddrEnd;
4351
4352 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4353 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4354 if (!SrcOp.isReg())
4355 continue; // _L to _LZ may have eliminated this.
4356
4357 Register AddrReg = SrcOp.getReg();
4358
4359 if ((I < Intr->GradientStart) ||
4360 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4361 (I >= Intr->CoordStart && !IsA16)) {
4362 if ((I < Intr->GradientStart) && IsA16 &&
4363 (B.getMRI()->getType(AddrReg) == S16)) {
4364        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4365 // Special handling of bias when A16 is on. Bias is of type half but
4366 // occupies full 32-bit.
4367 PackedAddrs.push_back(
4368 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4369 .getReg(0));
4370 } else {
4371        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4372               "Bias needs to be converted to 16 bit in A16 mode");
4374 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4375 PackedAddrs.push_back(AddrReg);
4376 }
4377 } else {
4378 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4379 // derivatives dx/dh and dx/dv are packed with undef.
4380 if (((I + 1) >= EndIdx) ||
4381 ((Intr->NumGradients / 2) % 2 == 1 &&
4382 (I == static_cast<unsigned>(Intr->GradientStart +
4383 (Intr->NumGradients / 2) - 1) ||
4384 I == static_cast<unsigned>(Intr->GradientStart +
4385 Intr->NumGradients - 1))) ||
4386 // Check for _L to _LZ optimization
4387 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4388 PackedAddrs.push_back(
4389 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4390 .getReg(0));
4391 } else {
4392 PackedAddrs.push_back(
4393 B.buildBuildVector(
4394 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4395 .getReg(0));
4396 ++I;
4397 }
4398 }
4399 }
4400}
4401
4402/// Convert from separate vaddr components to a single vector address register,
4403/// and replace the remaining operands with $noreg.
4404static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4405 int DimIdx, int NumVAddrs) {
4406 const LLT S32 = LLT::scalar(32);
4407
4408 SmallVector<Register, 8> AddrRegs;
4409 for (int I = 0; I != NumVAddrs; ++I) {
4410 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4411 if (SrcOp.isReg()) {
4412 AddrRegs.push_back(SrcOp.getReg());
4413      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4414 }
4415 }
4416
4417 int NumAddrRegs = AddrRegs.size();
4418 if (NumAddrRegs != 1) {
4419 // Above 8 elements round up to next power of 2 (i.e. 16).
4420 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4421 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4422 auto Undef = B.buildUndef(S32);
4423 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4424 NumAddrRegs = RoundedNumRegs;
4425 }
4426
4427 auto VAddr =
4428 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4429 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4430 }
4431
4432 for (int I = 1; I != NumVAddrs; ++I) {
4433 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4434 if (SrcOp.isReg())
4435 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4436 }
4437}
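// E.g. nine s32 address components would be padded with undefs up to sixteen,
// merged into a single <16 x s32> vaddr operand, and the now-unused source
// operands replaced with $noreg.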
4438
4439/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4440///
4441/// Depending on the subtarget, load/store with 16-bit element data need to be
4442/// rewritten to use the low half of 32-bit registers, or directly use a packed
4443/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4444/// registers.
4445///
4446/// We don't want to directly select image instructions just yet, but also want
4447/// to expose all register repacking to the legalizer/combiners. We also don't
4448/// want a selected instruction entering RegBankSelect. In order to avoid
4449/// defining a multitude of intermediate image instructions, directly hack on
4450/// the intrinsic's arguments. In cases like a16 addresses, this requires
4451/// padding now unnecessary arguments with $noreg.
4452bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4453 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4454 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4455
4456 const unsigned NumDefs = MI.getNumExplicitDefs();
4457 const unsigned ArgOffset = NumDefs + 1;
4458 bool IsTFE = NumDefs == 2;
4459 // We are only processing the operands of d16 image operations on subtargets
4460 // that use the unpacked register layout, or need to repack the TFE result.
4461
4462 // TODO: Do we need to guard against already legalized intrinsics?
4463 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4464 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4465
4466 MachineRegisterInfo *MRI = B.getMRI();
4467 const LLT S32 = LLT::scalar(32);
4468 const LLT S16 = LLT::scalar(16);
4469 const LLT V2S16 = LLT::fixed_vector(2, 16);
4470
4471 unsigned DMask = 0;
4472 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4473 LLT Ty = MRI->getType(VData);
4474
4475 // Check for 16 bit addresses and pack if true.
4476 LLT GradTy =
4477 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4478 LLT AddrTy =
4479 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4480 const bool IsG16 = GradTy == S16;
4481 const bool IsA16 = AddrTy == S16;
4482 const bool IsD16 = Ty.getScalarType() == S16;
4483
4484 int DMaskLanes = 0;
4485 if (!BaseOpcode->Atomic) {
4486 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4487 if (BaseOpcode->Gather4) {
4488 DMaskLanes = 4;
4489 } else if (DMask != 0) {
4490 DMaskLanes = countPopulation(DMask);
4491 } else if (!IsTFE && !BaseOpcode->Store) {
4492 // If dmask is 0, this is a no-op load. This can be eliminated.
4493 B.buildUndef(MI.getOperand(0));
4494 MI.eraseFromParent();
4495 return true;
4496 }
4497 }
4498
4499 Observer.changingInstr(MI);
4500 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4501
4502 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4503 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4504 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4505 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4506 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4507
4508 // Track that we legalized this
4509 MI.setDesc(B.getTII().get(NewOpcode));
4510
4511 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4512 // dmask to be at least 1, otherwise the instruction will fail.
4513 if (IsTFE && DMask == 0) {
4514 DMask = 0x1;
4515 DMaskLanes = 1;
4516 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4517 }
4518
4519 if (BaseOpcode->Atomic) {
4520 Register VData0 = MI.getOperand(2).getReg();
4521 LLT Ty = MRI->getType(VData0);
4522
4523 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4524 if (Ty.isVector())
4525 return false;
4526
4527 if (BaseOpcode->AtomicX2) {
4528 Register VData1 = MI.getOperand(3).getReg();
4529 // The two values are packed in one register.
4530 LLT PackedTy = LLT::fixed_vector(2, Ty);
4531 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4532 MI.getOperand(2).setReg(Concat.getReg(0));
4533 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4534 }
4535 }
4536
4537 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4538
4539 // Rewrite the addressing register layout before doing anything else.
4540 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4541 // 16 bit gradients are supported, but are tied to the A16 control
4542 // so both gradients and addresses must be 16 bit
4543 return false;
4544 }
4545
4546 if (IsA16 && !ST.hasA16()) {
4547 // A16 not supported
4548 return false;
4549 }
4550
4551 if (IsA16 || IsG16) {
4552 if (Intr->NumVAddrs > 1) {
4553 SmallVector<Register, 4> PackedRegs;
4554
4555 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4556 IsG16);
4557
4558 // See also below in the non-a16 branch
4559 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4560 PackedRegs.size() <= ST.getNSAMaxSize();
4561
4562 if (!UseNSA && PackedRegs.size() > 1) {
4563 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4564 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4565 PackedRegs[0] = Concat.getReg(0);
4566 PackedRegs.resize(1);
4567 }
4568
4569 const unsigned NumPacked = PackedRegs.size();
4570 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4571 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4572 if (!SrcOp.isReg()) {
4573 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4574 continue;
4575 }
4576
4577 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4578
4579 if (I - Intr->VAddrStart < NumPacked)
4580 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4581 else
4582 SrcOp.setReg(AMDGPU::NoRegister);
4583 }
4584 }
4585 } else {
4586 // If the register allocator cannot place the address registers contiguously
4587 // without introducing moves, then using the non-sequential address encoding
4588 // is always preferable, since it saves VALU instructions and is usually a
4589 // wash in terms of code size or even better.
4590 //
4591 // However, we currently have no way of hinting to the register allocator
4592 // that MIMG addresses should be placed contiguously when it is possible to
4593 // do so, so force non-NSA for the common 2-address case as a heuristic.
4594 //
4595 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4596 // allocation when possible.
4597 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4598 CorrectedNumVAddrs <= ST.getNSAMaxSize();
4599
4600 if (!UseNSA && Intr->NumVAddrs > 1)
4601 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4602 Intr->NumVAddrs);
4603 }
4604
4605 int Flags = 0;
4606 if (IsA16)
4607 Flags |= 1;
4608 if (IsG16)
4609 Flags |= 2;
4610 MI.addOperand(MachineOperand::CreateImm(Flags));
4611
4612 if (BaseOpcode->Store) { // No TFE for stores?
4613 // TODO: Handle dmask trim
4614 if (!Ty.isVector() || !IsD16)
4615 return true;
4616
4617 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4618 if (RepackedReg != VData) {
4619 MI.getOperand(1).setReg(RepackedReg);
4620 }
4621
4622 return true;
4623 }
4624
4625 Register DstReg = MI.getOperand(0).getReg();
4626 const LLT EltTy = Ty.getScalarType();
4627 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4628
4629 // Confirm that the return type is large enough for the dmask specified
4630 if (NumElts < DMaskLanes)
4631 return false;
4632
4633 if (NumElts > 4 || DMaskLanes > 4)
4634 return false;
4635
4636 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4637 const LLT AdjustedTy =
4638 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4639
4640 // The raw dword aligned data component of the load. The only legal cases
4641 // where this matters should be when using the packed D16 format, for
4642 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4643 LLT RoundedTy;
4644
4645 // S32 vector to cover all data, plus TFE result element.
4646 LLT TFETy;
4647
4648 // Register type to use for each loaded component. Will be S32 or V2S16.
4649 LLT RegTy;
4650
4651 if (IsD16 && ST.hasUnpackedD16VMem()) {
4652 RoundedTy =
4653 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4654 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4655 RegTy = S32;
4656 } else {
4657 unsigned EltSize = EltTy.getSizeInBits();
4658 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4659 unsigned RoundedSize = 32 * RoundedElts;
4660 RoundedTy = LLT::scalarOrVector(
4661 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4662 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4663 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4664 }
4665
4666 // The return type does not need adjustment.
4667 // TODO: Should we change s16 case to s32 or <2 x s16>?
4668 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4669 return true;
4670
4671 Register Dst1Reg;
4672
4673 // Insert after the instruction.
4674 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4675
4676 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4677 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4678 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4679 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4680
4681 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4682
4683 MI.getOperand(0).setReg(NewResultReg);
4684
4685 // In the IR, TFE is supposed to be used with a 2 element struct return
4686 // type. The instruction really returns these two values in one contiguous
4687 // register, with one additional dword beyond the loaded data. Rewrite the
4688 // return type to use a single register result.
4689
4690 if (IsTFE) {
4691 Dst1Reg = MI.getOperand(1).getReg();
4692 if (MRI->getType(Dst1Reg) != S32)
4693 return false;
4694
4695 // TODO: Make sure the TFE operand bit is set.
4696 MI.removeOperand(1);
4697
4698 // Handle the easy case that requires no repack instructions.
4699 if (Ty == S32) {
4700 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4701 return true;
4702 }
4703 }
4704
4705 // Now figure out how to copy the new result register back into the old
4706 // result.
4707 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4708
4709 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4710
4711 if (ResultNumRegs == 1) {
4712 assert(!IsTFE);
4713 ResultRegs[0] = NewResultReg;
4714 } else {
4715 // We have to repack into a new vector of some kind.
4716 for (int I = 0; I != NumDataRegs; ++I)
4717 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4718 B.buildUnmerge(ResultRegs, NewResultReg);
4719
4720 // Drop the final TFE element to get the data part. The TFE result is
4721 // directly written to the right place already.
4722 if (IsTFE)
4723 ResultRegs.resize(NumDataRegs);
4724 }
4725
4726 // For an s16 scalar result, we form an s32 result with a truncate regardless
4727 // of packed vs. unpacked.
4728 if (IsD16 && !Ty.isVector()) {
4729 B.buildTrunc(DstReg, ResultRegs[0]);
4730 return true;
4731 }
4732
4733 // Avoid a build/concat_vector of 1 entry.
4734 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4735 B.buildBitcast(DstReg, ResultRegs[0]);
4736 return true;
4737 }
4738
4739 assert(Ty.isVector());
4740
4741 if (IsD16) {
4742 // For packed D16 results with TFE enabled, all the data components are
4743 // S32. Cast back to the expected type.
4744 //
4745 // TODO: We don't really need to load s32 elements. We would only need one
4746 // cast for the TFE result if a multiple of v2s16 was used.
4747 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4748 for (Register &Reg : ResultRegs)
4749 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4750 } else if (ST.hasUnpackedD16VMem()) {
4751 for (Register &Reg : ResultRegs)
4752 Reg = B.buildTrunc(S16, Reg).getReg(0);
4753 }
4754 }
4755
4756 auto padWithUndef = [&](LLT Ty, int NumElts) {
4757 if (NumElts == 0)
4758 return;
4759 Register Undef = B.buildUndef(Ty).getReg(0);
4760 for (int I = 0; I != NumElts; ++I)
4761 ResultRegs.push_back(Undef);
4762 };
4763
4764 // Pad out any elements eliminated due to the dmask.
4765 LLT ResTy = MRI->getType(ResultRegs[0]);
4766 if (!ResTy.isVector()) {
4767 padWithUndef(ResTy, NumElts - ResultRegs.size());
4768 B.buildBuildVector(DstReg, ResultRegs);
4769 return true;
4770 }
4771
4772 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4773 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4774
4775 // Deal with the one annoying legal case.
4776 const LLT V3S16 = LLT::fixed_vector(3, 16);
4777 if (Ty == V3S16) {
4778 if (IsTFE) {
4779 if (ResultRegs.size() == 1) {
4780 NewResultReg = ResultRegs[0];
4781 } else if (ResultRegs.size() == 2) {
4782 LLT V4S16 = LLT::fixed_vector(4, 16);
4783 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4784 } else {
4785 return false;
4786 }
4787 }
4788
4789 if (MRI->getType(DstReg).getNumElements() <
4790 MRI->getType(NewResultReg).getNumElements()) {
4791 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4792 } else {
4793 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4794 }
4795 return true;
4796 }
4797
4798 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4799 B.buildConcatVectors(DstReg, ResultRegs);
4800 return true;
4801}
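For the dmask handling at the top of this function, a small self-contained sketch (not the LLVM implementation; names are illustrative): gather4 always yields 4 lanes, otherwise the lane count is the population count of dmask, and a zero dmask on a plain non-TFE load marks the operation as a removable no-op.

#include <bitset>
#include <cstdio>

struct DMaskInfo { int Lanes; bool IsNoOpLoad; };

static DMaskInfo classifyDMask(unsigned DMask, bool Gather4, bool IsTFE,
                               bool IsStore) {
  if (Gather4)
    return {4, false};
  if (DMask != 0)
    return {static_cast<int>(std::bitset<32>(DMask).count()), false};
  // dmask == 0: a load with no TFE result reads nothing and can be dropped.
  return {0, !IsTFE && !IsStore};
}

int main() {
  DMaskInfo A = classifyDMask(0xF, false, false, false); // 4 lanes
  DMaskInfo B = classifyDMask(0x5, false, false, false); // 2 lanes
  DMaskInfo C = classifyDMask(0x0, false, false, false); // removable no-op load
  std::printf("%d %d %d (no-op=%d)\n", A.Lanes, B.Lanes, C.Lanes,
              (int)C.IsNoOpLoad);
  return 0;
}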
4802
4803bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4804 LegalizerHelper &Helper, MachineInstr &MI) const {
4805 MachineIRBuilder &B = Helper.MIRBuilder;
4806 GISelChangeObserver &Observer = Helper.Observer;
4807
4808 Register Dst = MI.getOperand(0).getReg();
4809 LLT Ty = B.getMRI()->getType(Dst);
4810 unsigned Size = Ty.getSizeInBits();
4811 MachineFunction &MF = B.getMF();
4812
4813 Observer.changingInstr(MI);
4814
4815 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4816 Ty = getBitcastRegisterType(Ty);
4817 Helper.bitcastDst(MI, Ty, 0);
4818 Dst = MI.getOperand(0).getReg();
4819 B.setInsertPt(B.getMBB(), MI);
4820 }
4821
4822 // FIXME: We don't really need this intermediate instruction. The intrinsic
4823 // should be fixed to have a memory operand. Since it's readnone, we're not
4824 // allowed to add one.
4825 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4826 MI.removeOperand(1); // Remove intrinsic ID
4827
4828 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4829 // TODO: Should this use datalayout alignment?
4830 const unsigned MemSize = (Size + 7) / 8;
4831 const Align MemAlign(4);
4832 MachineMemOperand *MMO = MF.getMachineMemOperand(
4833 MachinePointerInfo(),
4834 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4835 MachineMemOperand::MOInvariant,
4836 MemSize, MemAlign);
4837 MI.addMemOperand(MF, MMO);
4838
4839 // There are no 96-bit result scalar loads, but widening to 128-bit should
4840 // always be legal. We may need to restore this to a 96-bit result if it turns
4841 // out this needs to be converted to a vector load during RegBankSelect.
4842 if (!isPowerOf2_32(Size)) {
4843 if (Ty.isVector())
4844 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4845 else
4846 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4847 }
4848
4849 Observer.changedInstr(MI);
4850 return true;
4851}
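A rough sketch of the size handling in this function (standalone, with illustrative helper names): the memory operand size is the bit width rounded up to whole bytes, and a result width that is not a power of two (e.g. 96 bits) is widened to the next power of two (128 bits) for the load.

#include <cstdint>
#include <cstdio>

static bool isPowerOf2(uint32_t V) { return V && !(V & (V - 1)); }

static uint32_t widenToPow2Bits(uint32_t Bits) {
  uint32_t P = 1;
  while (P < Bits)
    P <<= 1;
  return P;
}

int main() {
  for (uint32_t SizeInBits : {32u, 64u, 96u, 128u}) {
    uint32_t MemSizeBytes = (SizeInBits + 7) / 8;
    uint32_t LoadBits =
        isPowerOf2(SizeInBits) ? SizeInBits : widenToPow2Bits(SizeInBits);
    std::printf("s%u: %u-byte MMO, loaded as s%u\n", SizeInBits, MemSizeBytes,
                LoadBits);
  }
  return 0;
}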
4852
4853// TODO: Move to selection
4854bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4855 MachineRegisterInfo &MRI,
4856 MachineIRBuilder &B) const {
4857 if (!ST.isTrapHandlerEnabled() ||
4858 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4859 return legalizeTrapEndpgm(MI, MRI, B);
4860
4861 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4862 switch (*HsaAbiVer) {
4863 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4864 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4865 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4866 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4867 case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4868 return ST.supportsGetDoorbellID() ?
4869 legalizeTrapHsa(MI, MRI, B) :
4870 legalizeTrapHsaQueuePtr(MI, MRI, B);
4871 }
4872 }
4873
4874 llvm_unreachable("Unknown trap handler");
4875}
4876
4877bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4878 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4879 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4880 MI.eraseFromParent();
4881 return true;
4882}
4883
4884bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4885 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4886 MachineFunction &MF = B.getMF();
4887 const LLT S64 = LLT::scalar(64);
4888
4889 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4890 // For code object version 5, queue_ptr is passed through implicit kernarg.
4891 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
4892 AMDGPUTargetLowering::ImplicitParameter Param =
4893 AMDGPUTargetLowering::QUEUE_PTR;
4894 uint64_t Offset =
4895 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
4896
4897 Register KernargPtrReg = MRI.createGenericVirtualRegister(
4898 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4899
4900 if (!loadInputValue(KernargPtrReg, B,
4901 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4902 return false;
4903
4904 // TODO: can we be smarter about machine pointer info?
4905 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4906 MachineMemOperand *MMO = MF.getMachineMemOperand(
4907 PtrInfo,
4908 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4909 MachineMemOperand::MOInvariant,
4910 LLT::scalar(64), commonAlignment(Align(64), Offset));
4911
4912 // Pointer address
4913 Register LoadAddr = MRI.createGenericVirtualRegister(
4914 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4915 B.buildPtrAdd(LoadAddr, KernargPtrReg,
4916 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
4917 // Load address
4918 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
4919 B.buildCopy(SGPR01, Temp);
4920 B.buildInstr(AMDGPU::S_TRAP)
4921 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4922 .addReg(SGPR01, RegState::Implicit);
4923 MI.eraseFromParent();
4924 return true;
4925 }
4926
4927 // Pass queue pointer to trap handler as input, and insert trap instruction
4928 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4929 Register LiveIn =
4930 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4931 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4932 return false;
4933
4934 B.buildCopy(SGPR01, LiveIn);
4935 B.buildInstr(AMDGPU::S_TRAP)
4936 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4937 .addReg(SGPR01, RegState::Implicit);
4938
4939 MI.eraseFromParent();
4940 return true;
4941}
4942
4943bool AMDGPULegalizerInfo::legalizeTrapHsa(
4944 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4945 B.buildInstr(AMDGPU::S_TRAP)
4946 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4947 MI.eraseFromParent();
4948 return true;
4949}
4950
4951bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4952 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4953 // Is non-HSA path or trap-handler disabled? Then, report a warning
4954 // accordingly
4955 if (!ST.isTrapHandlerEnabled() ||
4956 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4957 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4958 "debugtrap handler not supported",
4959 MI.getDebugLoc(), DS_Warning);
4960 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4961 Ctx.diagnose(NoTrap);
4962 } else {
4963 // Insert debug-trap instruction
4964 B.buildInstr(AMDGPU::S_TRAP)
4965 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4966 }
4967
4968 MI.eraseFromParent();
4969 return true;
4970}
4971
4972bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4973 MachineIRBuilder &B) const {
4974 MachineRegisterInfo &MRI = *B.getMRI();
4975 const LLT S16 = LLT::scalar(16);
4976 const LLT S32 = LLT::scalar(32);
4977
4978 Register DstReg = MI.getOperand(0).getReg();
4979 Register NodePtr = MI.getOperand(2).getReg();
4980 Register RayExtent = MI.getOperand(3).getReg();
4981 Register RayOrigin = MI.getOperand(4).getReg();
4982 Register RayDir = MI.getOperand(5).getReg();
4983 Register RayInvDir = MI.getOperand(6).getReg();
4984 Register TDescr = MI.getOperand(7).getReg();
4985
4986 if (!ST.hasGFX10_AEncoding()) {
4987 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4988 "intrinsic not supported on subtarget",
4989 MI.getDebugLoc());
4990 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4991 return false;
4992 }
4993
4994 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4995 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4996 const unsigned NumVDataDwords = 4;
4997 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4998 const bool UseNSA =
4999 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
5000 const unsigned BaseOpcodes[2][2] = {
5001 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5002 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5003 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5004 int Opcode;
5005 if (UseNSA) {
5006 Opcode =
5007 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
5008 NumVDataDwords, NumVAddrDwords);
5009 } else {
5010 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5011 AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
5012 PowerOf2Ceil(NumVAddrDwords));
5013 }
5014 assert(Opcode != -1);
5015
5016 SmallVector<Register, 12> Ops;
5017 if (Is64) {
5018 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5019 Ops.push_back(Unmerge.getReg(0));
5020 Ops.push_back(Unmerge.getReg(1));
5021 } else {
5022 Ops.push_back(NodePtr);
5023 }
5024 Ops.push_back(RayExtent);
5025
5026 auto packLanes = [&Ops, &S32, &B](Register Src) {
5027 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5028 Ops.push_back(Unmerge.getReg(0));
5029 Ops.push_back(Unmerge.getReg(1));
5030 Ops.push_back(Unmerge.getReg(2));
5031 };
5032
5033 packLanes(RayOrigin);
5034 if (IsA16) {
5035 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5036 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5037 Register R1 = MRI.createGenericVirtualRegister(S32);
5038 Register R2 = MRI.createGenericVirtualRegister(S32);
5039 Register R3 = MRI.createGenericVirtualRegister(S32);
5040 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5041 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5042 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5043 Ops.push_back(R1);
5044 Ops.push_back(R2);
5045 Ops.push_back(R3);
5046 } else {
5047 packLanes(RayDir);
5048 packLanes(RayInvDir);
5049 }
5050
5051 if (!UseNSA) {
5052 // Build a single vector containing all the operands so far prepared.
5053 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5054 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5055 Ops.clear();
5056 Ops.push_back(MergedOps);
5057 }
5058
5059 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5060 .addDef(DstReg)
5061 .addImm(Opcode);
5062
5063 for (Register R : Ops) {
5064 MIB.addUse(R);
5065 }
5066
5067 MIB.addUse(TDescr)
5068 .addImm(IsA16 ? 1 : 0)
5069 .cloneMemRefs(MI);
5070
5071 MI.eraseFromParent();
5072 return true;
5073}
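To make the vaddr sizing above concrete, a small sketch that reproduces the dword counts chosen for each node-pointer/A16 combination (the constants come from the code; the NSA maximum used here is only an assumed value for illustration).

#include <cstdio>

int main() {
  const unsigned AssumedNSAMaxSize = 13; // hypothetical subtarget limit
  for (int Is64 = 0; Is64 <= 1; ++Is64) {
    for (int IsA16 = 0; IsA16 <= 1; ++IsA16) {
      unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
      bool UseNSA = NumVAddrDwords <= AssumedNSAMaxSize;
      std::printf("Is64=%d IsA16=%d -> %u vaddr dwords, NSA=%d\n", Is64, IsA16,
                  NumVAddrDwords, (int)UseNSA);
    }
  }
  return 0;
}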
5074
5075static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
5076 B.buildConstant(MI.getOperand(0).getReg(), C);
5077 MI.eraseFromParent();
5078 return true;
5079}
5080
5081bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5082 MachineIRBuilder &B) const {
5083 unsigned Opc;
5084 int RoundMode = MI.getOperand(2).getImm();
5085
5086 if (RoundMode == (int)RoundingMode::TowardPositive)
5087 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5088 else if (RoundMode == (int)RoundingMode::TowardNegative)
5089 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5090 else
5091 return false;
5092
5093 B.buildInstr(Opc)
5094 .addDef(MI.getOperand(0).getReg())
5095 .addUse(MI.getOperand(1).getReg());
5096
5097 MI.eraseFromParent();
5098
5099 return true;
5100}
5101
5102bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5103 MachineInstr &MI) const {
5104 MachineIRBuilder &B = Helper.MIRBuilder;
5105 MachineRegisterInfo &MRI = *B.getMRI();
5106
5107 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
5108 auto IntrID = MI.getIntrinsicID();
5109 switch (IntrID) {
5110 case Intrinsic::amdgcn_if:
5111 case Intrinsic::amdgcn_else: {
5112 MachineInstr *Br = nullptr;
5113 MachineBasicBlock *UncondBrTarget = nullptr;
5114 bool Negated = false;
5115 if (MachineInstr *BrCond =
5116 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5117 const SIRegisterInfo *TRI
5118 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5119
5120 Register Def = MI.getOperand(1).getReg();
5121 Register Use = MI.getOperand(3).getReg();
5122
5123 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5124
5125 if (Negated)
5126 std::swap(CondBrTarget, UncondBrTarget);
5127
5128 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5129 if (IntrID == Intrinsic::amdgcn_if) {
5130 B.buildInstr(AMDGPU::SI_IF)
5131 .addDef(Def)
5132 .addUse(Use)
5133 .addMBB(UncondBrTarget);
5134 } else {
5135 B.buildInstr(AMDGPU::SI_ELSE)
5136 .addDef(Def)
5137 .addUse(Use)
5138 .addMBB(UncondBrTarget);
5139 }
5140
5141 if (Br) {
5142 Br->getOperand(0).setMBB(CondBrTarget);
5143 } else {
5144 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5145 // since we're swapping branch targets it needs to be reinserted.
5146 // FIXME: IRTranslator should probably not do this
5147 B.buildBr(*CondBrTarget);
5148 }
5149
5150 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5151 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5152 MI.eraseFromParent();
5153 BrCond->eraseFromParent();
5154 return true;
5155 }
5156
5157 return false;
5158 }
5159 case Intrinsic::amdgcn_loop: {
5160 MachineInstr *Br = nullptr;
5161 MachineBasicBlock *UncondBrTarget = nullptr;
5162 bool Negated = false;
5163 if (MachineInstr *BrCond =
5164 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5165 const SIRegisterInfo *TRI
5166 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5167
5168 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5169 Register Reg = MI.getOperand(2).getReg();
5170
5171 if (Negated)
5172 std::swap(CondBrTarget, UncondBrTarget);
5173
5174 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5175 B.buildInstr(AMDGPU::SI_LOOP)
5176 .addUse(Reg)
5177 .addMBB(UncondBrTarget);
5178
5179 if (Br)
5180 Br->getOperand(0).setMBB(CondBrTarget);
5181 else
5182 B.buildBr(*CondBrTarget);
5183
5184 MI.eraseFromParent();
5185 BrCond->eraseFromParent();
5186 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5187 return true;
5188 }
5189
5190 return false;
5191 }
5192 case Intrinsic::amdgcn_kernarg_segment_ptr:
5193 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5194 // This only makes sense to call in a kernel, so just lower to null.
5195 B.buildConstant(MI.getOperand(0).getReg(), 0);
5196 MI.eraseFromParent();
5197 return true;
5198 }
5199
5200 return legalizePreloadedArgIntrin(
5201 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5202 case Intrinsic::amdgcn_implicitarg_ptr:
5203 return legalizeImplicitArgPtr(MI, MRI, B);
5204 case Intrinsic::amdgcn_workitem_id_x:
5205 if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5206 return replaceWithConstant(B, MI, 0);
5207 return legalizePreloadedArgIntrin(MI, MRI, B,
5208 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5209 case Intrinsic::amdgcn_workitem_id_y:
5210 if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5211 return replaceWithConstant(B, MI, 0);
5212
5213 return legalizePreloadedArgIntrin(MI, MRI, B,
5214 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5215 case Intrinsic::amdgcn_workitem_id_z:
5216 if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5217 return replaceWithConstant(B, MI, 0);
5218
5219 return legalizePreloadedArgIntrin(MI, MRI, B,
5220 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5221 case Intrinsic::amdgcn_workgroup_id_x:
5222 return legalizePreloadedArgIntrin(MI, MRI, B,
5223 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5224 case Intrinsic::amdgcn_workgroup_id_y:
5225 return legalizePreloadedArgIntrin(MI, MRI, B,
5226 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5227 case Intrinsic::amdgcn_workgroup_id_z:
5228 return legalizePreloadedArgIntrin(MI, MRI, B,
5229 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5230 case Intrinsic::amdgcn_dispatch_ptr:
5231 return legalizePreloadedArgIntrin(MI, MRI, B,
5232 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5233 case Intrinsic::amdgcn_queue_ptr:
5234 return legalizePreloadedArgIntrin(MI, MRI, B,
5235 AMDGPUFunctionArgInfo::QUEUE_PTR);
5236 case Intrinsic::amdgcn_implicit_buffer_ptr:
5237 return legalizePreloadedArgIntrin(
5238 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5239 case Intrinsic::amdgcn_dispatch_id:
5240 return legalizePreloadedArgIntrin(MI, MRI, B,
5241 AMDGPUFunctionArgInfo::DISPATCH_ID);
5242 case Intrinsic::r600_read_ngroups_x:
5243 // TODO: Emit error for hsa
5244 return legalizeKernargMemParameter(MI, B,
5245 SI::KernelInputOffsets::NGROUPS_X);
5246 case Intrinsic::r600_read_ngroups_y:
5247 return legalizeKernargMemParameter(MI, B,
5248 SI::KernelInputOffsets::NGROUPS_Y);
5249 case Intrinsic::r600_read_ngroups_z:
5250 return legalizeKernargMemParameter(MI, B,
5251 SI::KernelInputOffsets::NGROUPS_Z);
5252 case Intrinsic::r600_read_local_size_x:
5253 // TODO: Could insert G_ASSERT_ZEXT from s16
5254 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5255 case Intrinsic::r600_read_local_size_y:
5256 // TODO: Could insert G_ASSERT_ZEXT from s16
5257 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5258 // TODO: Could insert G_ASSERT_ZEXT from s16
5259 case Intrinsic::r600_read_local_size_z:
5260 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5261 case Intrinsic::r600_read_global_size_x:
5262 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5263 case Intrinsic::r600_read_global_size_y:
5264 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5265 case Intrinsic::r600_read_global_size_z:
5266 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5267 case Intrinsic::amdgcn_fdiv_fast:
5268 return legalizeFDIVFastIntrin(MI, MRI, B);
5269 case Intrinsic::amdgcn_is_shared:
5270 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5271 case Intrinsic::amdgcn_is_private:
5272 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5273 case Intrinsic::amdgcn_wavefrontsize: {
5274 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5275 MI.eraseFromParent();
5276 return true;
5277 }
5278 case Intrinsic::amdgcn_s_buffer_load:
5279 return legalizeSBufferLoad(Helper, MI);
5280 case Intrinsic::amdgcn_raw_buffer_store:
5281 case Intrinsic::amdgcn_struct_buffer_store:
5282 return legalizeBufferStore(MI, MRI, B, false, false);
5283 case Intrinsic::amdgcn_raw_buffer_store_format:
5284 case Intrinsic::amdgcn_struct_buffer_store_format:
5285 return legalizeBufferStore(MI, MRI, B, false, true);
5286 case Intrinsic::amdgcn_raw_tbuffer_store:
5287 case Intrinsic::amdgcn_struct_tbuffer_store:
5288 return legalizeBufferStore(MI, MRI, B, true, true);
5289 case Intrinsic::amdgcn_raw_buffer_load:
5290 case Intrinsic::amdgcn_struct_buffer_load:
5291 return legalizeBufferLoad(MI, MRI, B, false, false);
5292 case Intrinsic::amdgcn_raw_buffer_load_format:
5293 case Intrinsic::amdgcn_struct_buffer_load_format:
5294 return legalizeBufferLoad(MI, MRI, B, true, false);
5295 case Intrinsic::amdgcn_raw_tbuffer_load:
5296 case Intrinsic::amdgcn_struct_tbuffer_load:
5297 return legalizeBufferLoad(MI, MRI, B, true, true);
5298 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5299 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5300 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5301 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5302 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5303 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5304 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5305 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5306 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5307 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5308 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5309 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5310 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5311 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5312 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5313 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5314 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5315 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5316 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5317 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5318 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5319 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5320 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5321 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5322 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5323 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5324 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5325 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5326 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5327 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5328 return legalizeBufferAtomic(MI, B, IntrID);
5329 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5330 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5331 Register DstReg = MI.getOperand(0).getReg();
5332 if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5333 Function &F = B.getMF().getFunction();
5334 DiagnosticInfoUnsupported NoFpRet(
5335 F, "return versions of fp atomics not supported", B.getDebugLoc(),
5336 DS_Error);
5337 F.getContext().diagnose(NoFpRet);
5338 B.buildUndef(DstReg);
5339 MI.eraseFromParent();
5340 return true;
5341 }
5342
5343 return legalizeBufferAtomic(MI, B, IntrID);
5344 }
5345 case Intrinsic::amdgcn_atomic_inc:
5346 return legalizeAtomicIncDec(MI, B, true);
5347 case Intrinsic::amdgcn_atomic_dec:
5348 return legalizeAtomicIncDec(MI, B, false);
5349 case Intrinsic::trap:
5350 return legalizeTrapIntrinsic(MI, MRI, B);
5351 case Intrinsic::debugtrap:
5352 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5353 case Intrinsic::amdgcn_rsq_clamp:
5354 return legalizeRsqClampIntrinsic(MI, MRI, B);
5355 case Intrinsic::amdgcn_ds_fadd:
5356 case Intrinsic::amdgcn_ds_fmin:
5357 case Intrinsic::amdgcn_ds_fmax:
5358 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5359 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5360 return legalizeBVHIntrinsic(MI, B);
5361 default: {
5362 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5363 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5364 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5365 return true;
5366 }
5367 }
5368
5369 return true;
5370}

/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
    9.1: 'ZB' is not equal to ZB_Undefined
    10: Assuming 'Val' is equal to 0
    11: Taking true branch
117 return 32;
    12: Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
    9: Calling 'TrailingZerosCounter::count'
    13: Returning from 'TrailingZerosCounter::count'
    14: Returning the value 32
161}
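A standalone illustration of the generic bisection counter listed above, with ZB_Width behaviour (an input of 0 yields the full bit width):

#include <cstdint>
#include <cstdio>
#include <limits>

template <typename T> static unsigned ctzBisect(T Val) {
  if (!Val)
    return std::numeric_limits<T>::digits; // ZB_Width behaviour
  unsigned ZeroBits = 0;
  T Shift = std::numeric_limits<T>::digits >> 1;
  T Mask = std::numeric_limits<T>::max() >> Shift;
  while (Shift) {
    if ((Val & Mask) == 0) {
      Val >>= Shift;
      ZeroBits |= Shift;
    }
    Shift >>= 1;
    Mask >>= Shift;
  }
  return ZeroBits;
}

int main() {
  std::printf("%u %u %u\n", ctzBisect<uint32_t>(0x8u), // 3
              ctzBisect<uint32_t>(1u),                 // 0
              ctzBisect<uint32_t>(0u));                // 32
  return 0;
}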
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
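A quick usage sketch of the mask helper above; the N == 0 branch matters because shifting T(-1) by the full width (Bits - 0) would be undefined behaviour.

#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename T> static T maskTrailingOnesSketch(unsigned N) {
  static_assert(std::is_unsigned<T>::value, "Invalid type!");
  const unsigned Bits = CHAR_BIT * sizeof(T);
  assert(N <= Bits && "Invalid bit index");
  return N == 0 ? 0 : (T(-1) >> (Bits - N));
}

int main() {
  std::printf("0x%08x 0x%08x 0x%08x\n",
              maskTrailingOnesSketch<uint32_t>(0),   // 0x00000000
              maskTrailingOnesSketch<uint32_t>(8),   // 0x000000ff
              maskTrailingOnesSketch<uint32_t>(32)); // 0xffffffff
  return 0;
}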
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
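A small check of the XOR trick used above: for a 32-bit value, clz ^ 31 equals 31 - clz whenever clz is in [0, 31], so it gives the index of the highest set bit (the clz helper below is an illustrative portable loop, not the LLVM one).

#include <cstdint>
#include <cstdio>

static unsigned clz32(uint32_t V) {
  unsigned N = 32;
  while (V) { V >>= 1; --N; }
  return N;
}

int main() {
  for (uint32_t V : {1u, 0x80u, 0x80000000u}) {
    unsigned ViaXor = clz32(V) ^ 31u;
    unsigned ViaSub = 31u - clz32(V);
    std::printf("0x%08x: %u %u\n", V, ViaXor, ViaSub); // 0, 7, 31
  }
  return 0;
}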
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
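A quick sketch of the shift trick above (UINT64_MAX >> (64 - N) sidesteps the undefined 1 << 64 when N == 64):

#include <cstdint>
#include <cstdio>

static uint64_t maxUIntNSketch(uint64_t N) {
  return UINT64_MAX >> (64 - N); // valid for 1 <= N <= 64
}

int main() {
  std::printf("%llu %llu %llu\n",
              (unsigned long long)maxUIntNSketch(8),   // 255
              (unsigned long long)maxUIntNSketch(16),  // 65535
              (unsigned long long)maxUIntNSketch(64)); // 18446744073709551615
  return 0;
}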
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
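A sketch of the shifted-mask test above: OR-ing Value with Value - 1 fills in the trailing zeros, so a shifted run of ones collapses to an ordinary trailing mask.

#include <cstdint>
#include <cstdio>

static bool isMask32(uint32_t Value) {
  return Value && ((Value + 1) & Value) == 0;
}

static bool isShiftedMask32(uint32_t Value) {
  return Value && isMask32((Value - 1) | Value);
}

int main() {
  std::printf("%d %d %d\n", (int)isShiftedMask32(0x0000FF00u), // 1
              (int)isShiftedMask32(0x0000FFFFu),               // 1
              (int)isShiftedMask32(0x00FF00FFu));              // 0
  return 0;
}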
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
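
An illustrative aside (not part of the header): countPopulation dispatches on sizeof(T) to the 32-bit or 64-bit counter above. A minimal runtime sketch:

  // Illustrative sketch only -- set-bit counts for 32- and 64-bit inputs.
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    assert(llvm::countPopulation(0xF000F000u) == 8);                 // 32-bit path
    assert(llvm::countPopulation(0xF000F000F000F000ULL) == 16);      // 64-bit path
    assert(llvm::countPopulation(0u) == 0);                          // zero input
    return 0;
  }
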
573
574/// Return true if the argument contains a non-empty sequence of ones with the
575/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
576/// If true, \p MaskIdx will specify the index of the lowest set bit and \p
577/// MaskLen is updated to specify the length of the mask, else neither are
578/// updated.
579inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx,
580 unsigned &MaskLen) {
581 if (!isShiftedMask_32(Value))
582 return false;
583 MaskIdx = countTrailingZeros(Value);
584 MaskLen = countPopulation(Value);
585 return true;
586}
587
588/// Return true if the argument contains a non-empty sequence of ones with the
589/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index
590/// of the lowest set bit and \p MaskLen is updated to specify the length of the
591/// mask, else neither are updated.
592inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx,
593 unsigned &MaskLen) {
594 if (!isShiftedMask_64(Value))
595 return false;
596 MaskIdx = countTrailingZeros(Value);
597 MaskLen = countPopulation(Value);
598 return true;
599}
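
An illustrative aside (not part of the header): the output-parameter overloads decompose a shifted mask into its start index (trailing zeros) and length (population count), and leave the outputs untouched on failure. A minimal runtime sketch:

  // Illustrative sketch only -- decomposing a shifted mask into (index, length).
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    unsigned Idx = 0, Len = 0;
    // 0x0000FF00 is eight ones starting at bit 8.
    assert(llvm::isShiftedMask_32(0x0000FF00u, Idx, Len) && Idx == 8 && Len == 8);
    // Two separate runs of ones fail the check and leave Idx/Len unchanged.
    assert(!llvm::isShiftedMask_32(0x00FF00FFu, Idx, Len) && Idx == 8 && Len == 8);
    return 0;
  }
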
600
601/// Compile time Log2.
602/// Valid only for positive powers of two.
603template <size_t kValue> constexpr inline size_t CTLog2() {
604 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
605 "Value is not a valid power of 2");
606 return 1 + CTLog2<kValue / 2>();
607}
608
609template <> constexpr inline size_t CTLog2<1>() { return 0; }
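
An illustrative aside (not part of the header): CTLog2 recurses down to the CTLog2<1> base case and folds entirely at compile time, rejecting non-powers of two via its static_assert. A minimal sketch:

  // Illustrative sketch only -- compile-time log2 of powers of two.
  #include "llvm/Support/MathExtras.h"

  static_assert(llvm::CTLog2<1>() == 0, "log2(1) == 0");
  static_assert(llvm::CTLog2<8>() == 3, "log2(8) == 3");
  static_assert(llvm::CTLog2<4096>() == 12, "log2(4096) == 12");
  // llvm::CTLog2<6>() would fail to compile: 6 is not a power of two.
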
610
611/// Return the log base 2 of the specified value.
612inline double Log2(double Value) {
613#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
614 return __builtin_log(Value) / __builtin_log(2.0);
615#else
616 return log2(Value);
617#endif
618}
619
620/// Return the floor log base 2 of the specified value, -1 if the value is zero.
621/// (32 bit edition.)
622/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
623inline unsigned Log2_32(uint32_t Value) {
624 return 31 - countLeadingZeros(Value);
625}
626
627/// Return the floor log base 2 of the specified value, -1 if the value is zero.
628/// (64 bit edition.)
629inline unsigned Log2_64(uint64_t Value) {
630 return 63 - countLeadingZeros(Value);
631}
632
633/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
634/// (32 bit edition).
635/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
636inline unsigned Log2_32_Ceil(uint32_t Value) {
637 return 32 - countLeadingZeros(Value - 1);
638}
639
640/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
641/// (64 bit edition.)
642inline unsigned Log2_64_Ceil(uint64_t Value) {
643 return 64 - countLeadingZeros(Value - 1);
644}
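
An illustrative aside (not part of the header): the floor and ceil variants differ only for non-powers of two and for zero, as the documented examples indicate. A minimal runtime sketch:

  // Illustrative sketch only -- floor vs. ceil integer log2.
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    assert(llvm::Log2_32(32u) == 5 && llvm::Log2_32(6u) == 2);            // floor
    assert(llvm::Log2_32_Ceil(32u) == 5 && llvm::Log2_32_Ceil(6u) == 3);  // ceil
    assert(llvm::Log2_32_Ceil(0u) == 32);  // documented zero case for the ceil form
    return 0;
  }
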
645
646/// Return the greatest common divisor of the values using Euclid's algorithm.
647template <typename T>
648inline T greatestCommonDivisor(T A, T B) {
649 while (B) {
650 T Tmp = B;
651 B = A % B;
652 A = Tmp;
653 }
654 return A;
655}
656
657inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
658 return greatestCommonDivisor<uint64_t>(A, B);
659}
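
An illustrative aside (not part of the header): the loop above is plain Euclid's algorithm, so gcd(0, N) comes out as N. A minimal runtime sketch:

  // Illustrative sketch only -- Euclid's algorithm via the helpers above.
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    assert(llvm::greatestCommonDivisor(12u, 18u) == 6u);
    assert(llvm::GreatestCommonDivisor64(0, 42) == 42);  // gcd(0, n) == n
    return 0;
  }
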
660
661/// This function takes a 64-bit integer and returns the bit equivalent double.
662inline double BitsToDouble(uint64_t Bits) {
663 double D;
664 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
665 memcpy(&D, &Bits, sizeof(Bits));
666 return D;
667}
668
669/// This function takes a 32-bit integer and returns the bit equivalent float.
670inline float BitsToFloat(uint32_t Bits) {
671 float F;
672 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
673 memcpy(&F, &Bits, sizeof(Bits));
674 return F;
675}
676
677/// This function takes a double and returns the bit equivalent 64-bit integer.
678/// Note that copying doubles around changes the bits of NaNs on some hosts,
679/// notably x86, so this routine cannot be used if these bits are needed.
680inline uint64_t DoubleToBits(double Double) {
681 uint64_t Bits;
682 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
683 memcpy(&Bits, &Double, sizeof(Double));
684 return Bits;
685}
686
687/// This function takes a float and returns the bit equivalent 32-bit integer.
688/// Note that copying floats around changes the bits of NaNs on some hosts,
689/// notably x86, so this routine cannot be used if these bits are needed.
690inline uint32_t FloatToBits(float Float) {
691 uint32_t Bits;
692 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
693 memcpy(&Bits, &Float, sizeof(Float));
694 return Bits;
695}
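
An illustrative aside (not part of the header): the memcpy-based conversions above round-trip bit patterns exactly (with the NaN caveat noted in the comments). A minimal runtime sketch:

  // Illustrative sketch only -- bit-level float/integer round trips.
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    // IEEE-754 single-precision 1.0f is 0x3F800000.
    assert(llvm::FloatToBits(1.0f) == 0x3F800000u);
    assert(llvm::BitsToFloat(0x3F800000u) == 1.0f);
    // Round-trip an arbitrary double through its bit pattern.
    const double D = 3.5;
    assert(llvm::BitsToDouble(llvm::DoubleToBits(D)) == D);
    return 0;
  }
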
696
697/// A and B are either alignments or offsets. Return the minimum alignment that
698/// may be assumed after adding the two together.
699constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
700 // The largest power of 2 that divides both A and B.
701 //
702 // Replace "-Value" by "1+~Value" in the following commented code to avoid
703 // MSVC warning C4146
704 // return (A | B) & -(A | B);
705 return (A | B) & (1 + ~(A | B));
706}
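
An illustrative aside (not part of the header): MinAlign isolates the lowest set bit of A | B, i.e. the largest power of two dividing both inputs, and it is constexpr. A minimal sketch:

  // Illustrative sketch only -- minimum alignment after adding two quantities.
  #include "llvm/Support/MathExtras.h"

  static_assert(llvm::MinAlign(8, 12) == 4, "4 is the largest power of two dividing both");
  static_assert(llvm::MinAlign(16, 16) == 16, "identical alignments are preserved");
  static_assert(llvm::MinAlign(16, 0) == 16, "a zero offset does not reduce alignment");
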
707
708/// Returns the next power of two (in 64-bits) that is strictly greater than A.
709/// Returns zero on overflow.
710constexpr inline uint64_t NextPowerOf2(uint64_t A) {
711 A |= (A >> 1);
712 A |= (A >> 2);
713 A |= (A >> 4);
714 A |= (A >> 8);
715 A |= (A >> 16);
716 A |= (A >> 32);
717 return A + 1;
718}
719
720/// Returns the power of two which is less than or equal to the given value.
721/// Essentially, it is a floor operation across the domain of powers of two.
722inline uint64_t PowerOf2Floor(uint64_t A) {
723 if (!A) return 0;
724 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
725}
726
727/// Returns the power of two which is greater than or equal to the given value.
728/// Essentially, it is a ceil operation across the domain of powers of two.
729inline uint64_t PowerOf2Ceil(uint64_t A) {
730 if (!A)
731 return 0;
732 return NextPowerOf2(A - 1);
733}
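
An illustrative aside (not part of the header): NextPowerOf2 is strictly greater than its argument, while PowerOf2Floor/PowerOf2Ceil round down and up within the powers of two. A minimal sketch mixing compile-time and runtime checks:

  // Illustrative sketch only -- rounding to powers of two.
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    static_assert(llvm::NextPowerOf2(5) == 8, "strictly greater than 5");
    static_assert(llvm::NextPowerOf2(8) == 16, "strictly greater, even for a power of two");
    assert(llvm::PowerOf2Floor(20) == 16);  // floor across powers of two
    assert(llvm::PowerOf2Ceil(20) == 32);   // ceil across powers of two
    assert(llvm::PowerOf2Ceil(16) == 16);   // already a power of two
    return 0;
  }
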
734
735/// Returns the next integer (mod 2**64) that is greater than or equal to
736/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
737///
738/// If non-zero \p Skew is specified, the return value will be a minimal
739/// integer that is greater than or equal to \p Value and equal to
740/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
741/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
742///
743/// Examples:
744/// \code
745/// alignTo(5, 8) = 8
746/// alignTo(17, 8) = 24
747/// alignTo(~0LL, 8) = 0
748/// alignTo(321, 255) = 510
749///
750/// alignTo(5, 8, 7) = 7
751/// alignTo(17, 8, 1) = 17
752/// alignTo(~0LL, 8, 3) = 3
753/// alignTo(321, 255, 42) = 552
754/// \endcode
755inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
756 assert(Align != 0u && "Align can't be 0.");
757 Skew %= Align;
758 return (Value + Align - 1 - Skew) / Align * Align + Skew;
759}
760
761/// Returns the next integer (mod 2**64) that is greater than or equal to
762/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
763template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
764 static_assert(Align != 0u, "Align must be non-zero");
765 return (Value + Align - 1) / Align * Align;
766}
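
An illustrative aside (not part of the header): the documented alignTo examples, including the skewed cases, can be checked directly; the template form additionally folds at compile time. A minimal sketch:

  // Illustrative sketch only -- alignTo with and without a skew.
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    assert(llvm::alignTo(5, 8) == 8);
    assert(llvm::alignTo(17, 8) == 24);
    assert(llvm::alignTo(5, 8, 7) == 7);        // skewed: 8*0 + 7 >= 5
    assert(llvm::alignTo(321, 255, 42) == 552); // skewed: 255*2 + 42
    static_assert(llvm::alignTo<8>(17) == 24, "compile-time variant");
    return 0;
  }
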
767
768/// Returns the integer ceil(Numerator / Denominator).
769inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
770 return alignTo(Numerator, Denominator) / Denominator;
771}
772
773/// Returns the integer nearest(Numerator / Denominator).
774inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
775