Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2898, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
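
Both C and C++ leave a shift undefined whenever the shift amount is greater than or equal to the bit width of the promoted left operand, so shifting a 32-bit 'unsigned int' by 32 is out of range even though it might look like it should simply yield zero. The sketch below is a generic illustration of this class of bug and of one common guard; it is not the code at line 2898 (which lies outside this excerpt), and the function names are hypothetical.

#include <cstdint>

// Undefined behavior: when Shift >= 32, the standard places no requirement on
// the result of shifting a 32-bit unsigned operand.
uint32_t lshrUnchecked(uint32_t Value, unsigned Shift) {
  return Value >> Shift;
}

// Guarded: the full-width case is handled explicitly, so the shift amount is
// always strictly smaller than the operand width.
uint32_t lshrGuarded(uint32_t Value, unsigned Shift) {
  return Shift >= 32 ? 0u : Value >> Shift;
}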

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 
-o /tmp/scan-build-2022-01-19-134126-35450-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use"
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
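// A worked example of the rounding done by the two helpers above, assuming
// LLVM's Log2_32_Ceil(N) returns ceil(log2(N)) for N > 0; the function below
// is a hypothetical sketch of the shared pattern, not part of the original
// file.
static unsigned roundUpToPowerOf2Sketch(unsigned N) {
  // N == 3  -> Log2_32_Ceil(3)  == 2, 1u << 2 == 4   (<3 x s16> becomes <4 x s16>)
  // N == 48 -> Log2_32_Ceil(48) == 6, 1u << 6 == 64  (s48 becomes s64)
  // The shift is only well defined while Log2_32_Ceil(N) < 32, the bit width
  // of 'unsigned int' -- the same rule behind the warning in this report.
  return 1u << Log2_32_Ceil(N);
}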
61
62/// \returns true if this is an odd sized vector which should widen by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx,
99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100 };
101}
102
103static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 unsigned Size = Ty.getSizeInBits();
108 unsigned Pieces = (Size + 63) / 64;
109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110 return std::make_pair(
111 TypeIdx,
112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113 };
114}
115
116// Increase the number of vector elements to reach the next multiple of 32-bit
117// type.
118static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132}
133
134static LLT getBitcastRegisterType(const LLT Ty) {
135 const unsigned Size = Ty.getSizeInBits();
136
137 LLT CoercedTy;
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
142 }
143
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145}
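// For the wider-than-32-bit branch above, the same coercion applied to
// register-sized vectors gives, for example:
//   <4 x s16> (64 bits)  -> <2 x s32>
//   <8 x s16> (128 bits) -> <4 x s32>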
146
147static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152}
153
154static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162}
163
164static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169}
170
171static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176}
177
178static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183}
184
185static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187}
188
189static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192}
193
194static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199}
200
201static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209}
210
211// Any combination of 32 or 64-bit elements up to the maximum register size, and
212// multiples of v2s16.
213static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217}
218
219static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227}
228
229// If we have a truncating store or an extending load with a data size larger
230// than 32-bits, we need to reduce to a 32-bit type.
231static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237}
238
239// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240// handle some operations by just promoting the register during
241// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265}
266
267static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289#if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293#endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
301
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
319 }
320
321 assert(RegSize >= MemSize);
322
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
328 }
329
330 return true;
331}
332
333// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334// workaround this. Eventually it should ignore the type for loads and only care
335// about the size. Return true in cases where we will workaround this for now by
336// bitcasting.
337static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353}
354
355static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359}
360
361/// Return true if a load or store of the type should be lowered with a bitcast
362/// to a different type.
363static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377}
378
379/// Return true if we should legalize a load by widening an odd sized memory
380/// access up to the alignment. Note this case when the memory access itself
381/// changes, not the size of the result register.
382static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 uint64_t AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414}
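// A worked example of the decision above, assuming a 96-bit (<3 x s32>) global
// load on a subtarget without dwordx3 load/store support: 96 is not a power of
// two, it is below maxSizeForAddrSpace (512 for global loads), and
// NextPowerOf2(96) == 128, so the access is widened to 128 bits only when
// AlignInBits >= 128 and the target reports the misaligned 128-bit access as
// fast.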
415
416static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424}
425
426AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534 .legalFor({S32, S16, V2S16})
535 .minScalar(0, S16)
536 .clampMaxNumElementsStrict(0, S16, 2)
537 .widenScalarToNextMultipleOf(0, 32)
538 .maxScalar(0, S32)
539 .scalarize(0);
540
541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542 .legalFor({S32, S16, V2S16}) // Clamp modifier
543 .minScalarOrElt(0, S16)
544 .clampMaxNumElementsStrict(0, S16, 2)
545 .scalarize(0)
546 .widenScalarToNextPow2(0, 32)
547 .lower();
548 } else if (ST.has16BitInsts()) {
549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550 .legalFor({S32, S16})
551 .minScalar(0, S16)
552 .widenScalarToNextMultipleOf(0, 32)
553 .maxScalar(0, S32)
554 .scalarize(0);
555
556 // Technically the saturating operations require clamp bit support, but this
557 // was introduced at the same time as 16-bit operations.
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559 .legalFor({S32, S16}) // Clamp modifier
560 .minScalar(0, S16)
561 .scalarize(0)
562 .widenScalarToNextPow2(0, 16)
563 .lower();
564
565 // We're just lowering this, but it helps get a better result to try to
566 // coerce to the desired type first.
567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568 .minScalar(0, S16)
569 .scalarize(0)
570 .lower();
571 } else {
572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573 .legalFor({S32})
574 .widenScalarToNextMultipleOf(0, 32)
575 .clampScalar(0, S32, S32)
576 .scalarize(0);
577
578 if (ST.hasIntClamp()) {
579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580 .legalFor({S32}) // Clamp modifier.
581 .scalarize(0)
582 .minScalarOrElt(0, S32)
583 .lower();
584 } else {
585 // Clamp bit support was added in VI, along with 16-bit operations.
586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587 .minScalar(0, S32)
588 .scalarize(0)
589 .lower();
590 }
591
592 // FIXME: DAG expansion gets better results. The widening uses the smaller
593 // range values and goes for the min/max lowering directly.
594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595 .minScalar(0, S32)
596 .scalarize(0)
597 .lower();
598 }
599
600 getActionDefinitionsBuilder(
601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602 .customFor({S32, S64})
603 .clampScalar(0, S32, S64)
604 .widenScalarToNextPow2(0, 32)
605 .scalarize(0);
606
607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608 .legalFor({S32})
609 .maxScalar(0, S32);
610
611 if (ST.hasVOP3PInsts()) {
612 Mulh
613 .clampMaxNumElements(0, S8, 2)
614 .lowerFor({V2S8});
615 }
616
617 Mulh
618 .scalarize(0)
619 .lower();
620
621 // Report legal for any types we can handle anywhere. For the cases only legal
622 // on the SALU, RegBankSelect will be able to re-legalize.
623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625 .clampScalar(0, S32, S64)
626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628 .widenScalarToNextPow2(0)
629 .scalarize(0);
630
631 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633 .legalFor({{S32, S1}, {S32, S32}})
634 .minScalar(0, S32)
635 // TODO: .scalarize(0)
636 .lower();
637
638 getActionDefinitionsBuilder(G_BITCAST)
639 // Don't worry about the size constraint.
640 .legalIf(all(isRegisterType(0), isRegisterType(1)))
641 .lower();
642
643
644 getActionDefinitionsBuilder(G_CONSTANT)
645 .legalFor({S1, S32, S64, S16, GlobalPtr,
646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647 .legalIf(isPointer(0))
648 .clampScalar(0, S32, S64)
649 .widenScalarToNextPow2(0);
650
651 getActionDefinitionsBuilder(G_FCONSTANT)
652 .legalFor({S32, S64, S16})
653 .clampScalar(0, S16, S64);
654
655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656 .legalIf(isRegisterType(0))
657 // s1 and s16 are special cases because they have legal operations on
658 // them, but don't really occupy registers in the normal way.
659 .legalFor({S1, S16})
660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661 .clampScalarOrElt(0, S32, MaxScalar)
662 .widenScalarToNextPow2(0, 32)
663 .clampMaxNumElements(0, S32, 16);
664
665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666
667 // If the amount is divergent, we have to do a wave reduction to get the
668 // maximum value, so this is expanded during RegBankSelect.
669 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670 .legalFor({{PrivatePtr, S32}});
671
672 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673 .customIf(typeIsNot(0, PrivatePtr));
674
675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676
677 auto &FPOpActions = getActionDefinitionsBuilder(
678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679 .legalFor({S32, S64});
680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681 .customFor({S32, S64});
682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683 .customFor({S32, S64});
684
685 if (ST.has16BitInsts()) {
686 if (ST.hasVOP3PInsts())
687 FPOpActions.legalFor({S16, V2S16});
688 else
689 FPOpActions.legalFor({S16});
690
691 TrigActions.customFor({S16});
692 FDIVActions.customFor({S16});
693 }
694
695 auto &MinNumMaxNum = getActionDefinitionsBuilder({
696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697
698 if (ST.hasVOP3PInsts()) {
699 MinNumMaxNum.customFor(FPTypesPK16)
700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701 .clampMaxNumElements(0, S16, 2)
702 .clampScalar(0, S16, S64)
703 .scalarize(0);
704 } else if (ST.has16BitInsts()) {
705 MinNumMaxNum.customFor(FPTypes16)
706 .clampScalar(0, S16, S64)
707 .scalarize(0);
708 } else {
709 MinNumMaxNum.customFor(FPTypesBase)
710 .clampScalar(0, S32, S64)
711 .scalarize(0);
712 }
713
714 if (ST.hasVOP3PInsts())
715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716
717 FPOpActions
718 .scalarize(0)
719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720
721 TrigActions
722 .scalarize(0)
723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724
725 FDIVActions
726 .scalarize(0)
727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728
729 getActionDefinitionsBuilder({G_FNEG, G_FABS})
730 .legalFor(FPTypesPK16)
731 .clampMaxNumElementsStrict(0, S16, 2)
732 .scalarize(0)
733 .clampScalar(0, S16, S64);
734
735 if (ST.has16BitInsts()) {
736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737 .legalFor({S32, S64, S16})
738 .scalarize(0)
739 .clampScalar(0, S16, S64);
740 } else {
741 getActionDefinitionsBuilder(G_FSQRT)
742 .legalFor({S32, S64})
743 .scalarize(0)
744 .clampScalar(0, S32, S64);
745
746 if (ST.hasFractBug()) {
747 getActionDefinitionsBuilder(G_FFLOOR)
748 .customFor({S64})
749 .legalFor({S32, S64})
750 .scalarize(0)
751 .clampScalar(0, S32, S64);
752 } else {
753 getActionDefinitionsBuilder(G_FFLOOR)
754 .legalFor({S32, S64})
755 .scalarize(0)
756 .clampScalar(0, S32, S64);
757 }
758 }
759
760 getActionDefinitionsBuilder(G_FPTRUNC)
761 .legalFor({{S32, S64}, {S16, S32}})
762 .scalarize(0)
763 .lower();
764
765 getActionDefinitionsBuilder(G_FPEXT)
766 .legalFor({{S64, S32}, {S32, S16}})
767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768 .scalarize(0);
769
770 getActionDefinitionsBuilder(G_FSUB)
771 // Use actual fsub instruction
772 .legalFor({S32})
773 // Must use fadd + fneg
774 .lowerFor({S64, S16, V2S16})
775 .scalarize(0)
776 .clampScalar(0, S32, S64);
777
778 // Whether this is legal depends on the floating point mode for the function.
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781 FMad.customFor({S32, S16});
782 else if (ST.hasMadMacF32Insts())
783 FMad.customFor({S32});
784 else if (ST.hasMadF16())
785 FMad.customFor({S16});
786 FMad.scalarize(0)
787 .lower();
788
789 auto &FRem = getActionDefinitionsBuilder(G_FREM);
790 if (ST.has16BitInsts()) {
791 FRem.customFor({S16, S32, S64});
792 } else {
793 FRem.minScalar(0, S32)
794 .customFor({S32, S64});
795 }
796 FRem.scalarize(0);
797
798 // TODO: Do we need to clamp maximum bitwidth?
799 getActionDefinitionsBuilder(G_TRUNC)
800 .legalIf(isScalar(0))
801 .legalFor({{V2S16, V2S32}})
802 .clampMaxNumElements(0, S16, 2)
803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804 // situations (like an invalid implicit use), we don't want to infinite loop
805 // in the legalizer.
806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807 .alwaysLegal();
808
809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811 {S32, S1}, {S64, S1}, {S16, S1}})
812 .scalarize(0)
813 .clampScalar(0, S32, S64)
814 .widenScalarToNextPow2(1, 32);
815
816 // TODO: Split s1->s64 during regbankselect for VALU.
817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819 .lowerIf(typeIs(1, S1))
820 .customFor({{S32, S64}, {S64, S64}});
821 if (ST.has16BitInsts())
822 IToFP.legalFor({{S16, S16}});
823 IToFP.clampScalar(1, S32, S64)
824 .minScalar(0, S32)
825 .scalarize(0)
826 .widenScalarToNextPow2(1);
827
828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830 .customFor({{S64, S32}, {S64, S64}})
831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832 if (ST.has16BitInsts())
833 FPToI.legalFor({{S16, S16}});
834 else
835 FPToI.minScalar(1, S32);
836
837 FPToI.minScalar(0, S32)
838 .widenScalarToNextPow2(0, 32)
839 .scalarize(0)
840 .lower();
841
842 // Lower roundeven into G_FRINT
843 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844 .scalarize(0)
845 .lower();
846
847 if (ST.has16BitInsts()) {
848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849 .legalFor({S16, S32, S64})
850 .clampScalar(0, S16, S64)
851 .scalarize(0);
852 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854 .legalFor({S32, S64})
855 .clampScalar(0, S32, S64)
856 .scalarize(0);
857 } else {
858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859 .legalFor({S32})
860 .customFor({S64})
861 .clampScalar(0, S32, S64)
862 .scalarize(0);
863 }
864
865 getActionDefinitionsBuilder(G_PTR_ADD)
866 .legalIf(all(isPointer(0), sameSize(0, 1)))
867 .scalarize(0)
868 .scalarSameSizeAs(1, 0);
869
870 getActionDefinitionsBuilder(G_PTRMASK)
871 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872 .scalarSameSizeAs(1, 0)
873 .scalarize(0);
874
875 auto &CmpBuilder =
876 getActionDefinitionsBuilder(G_ICMP)
877 // The compare output type differs based on the register bank of the output,
878 // so make both s1 and s32 legal.
879 //
880 // Scalar compares producing output in scc will be promoted to s32, as that
881 // is the allocatable register type that will be needed for the copy from
882 // scc. This will be promoted during RegBankSelect, and we assume something
883 // before that won't try to use s32 result types.
884 //
885 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886 // bank.
887 .legalForCartesianProduct(
888 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889 .legalForCartesianProduct(
890 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891 if (ST.has16BitInsts()) {
892 CmpBuilder.legalFor({{S1, S16}});
893 }
894
895 CmpBuilder
896 .widenScalarToNextPow2(1)
897 .clampScalar(1, S32, S64)
898 .scalarize(0)
899 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900
901 getActionDefinitionsBuilder(G_FCMP)
902 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903 .widenScalarToNextPow2(1)
904 .clampScalar(1, S32, S64)
905 .scalarize(0);
906
907 // FIXME: fpow has a selection pattern that should move to custom lowering.
908 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909 if (ST.has16BitInsts())
910 Exp2Ops.legalFor({S32, S16});
911 else
912 Exp2Ops.legalFor({S32});
913 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914 Exp2Ops.scalarize(0);
915
916 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917 if (ST.has16BitInsts())
918 ExpOps.customFor({{S32}, {S16}});
919 else
920 ExpOps.customFor({S32});
921 ExpOps.clampScalar(0, MinScalarFPTy, S32)
922 .scalarize(0);
923
924 getActionDefinitionsBuilder(G_FPOWI)
925 .clampScalar(0, MinScalarFPTy, S32)
926 .lower();
927
928 // The 64-bit versions produce 32-bit results, but only on the SALU.
929 getActionDefinitionsBuilder(G_CTPOP)
930 .legalFor({{S32, S32}, {S32, S64}})
931 .clampScalar(0, S32, S32)
932 .clampScalar(1, S32, S64)
933 .scalarize(0)
934 .widenScalarToNextPow2(0, 32)
935 .widenScalarToNextPow2(1, 32);
936
937 // The hardware instructions return a different result on 0 than the generic
938 // instructions expect. The hardware produces -1, but these produce the
939 // bitwidth.
940 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941 .scalarize(0)
942 .clampScalar(0, S32, S32)
943 .clampScalar(1, S32, S64)
944 .widenScalarToNextPow2(0, 32)
945 .widenScalarToNextPow2(1, 32)
946 .custom();
947
948 // The 64-bit versions produce 32-bit results, but only on the SALU.
949 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950 .legalFor({{S32, S32}, {S32, S64}})
951 .clampScalar(0, S32, S32)
952 .clampScalar(1, S32, S64)
953 .scalarize(0)
954 .widenScalarToNextPow2(0, 32)
955 .widenScalarToNextPow2(1, 32);
956
957 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958 // RegBankSelect.
959 getActionDefinitionsBuilder(G_BITREVERSE)
960 .legalFor({S32, S64})
961 .clampScalar(0, S32, S64)
962 .scalarize(0)
963 .widenScalarToNextPow2(0);
964
965 if (ST.has16BitInsts()) {
966 getActionDefinitionsBuilder(G_BSWAP)
967 .legalFor({S16, S32, V2S16})
968 .clampMaxNumElementsStrict(0, S16, 2)
969 // FIXME: Fixing non-power-of-2 before clamp is workaround for
970 // narrowScalar limitation.
971 .widenScalarToNextPow2(0)
972 .clampScalar(0, S16, S32)
973 .scalarize(0);
974
975 if (ST.hasVOP3PInsts()) {
976 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977 .legalFor({S32, S16, V2S16})
978 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979 .clampMaxNumElements(0, S16, 2)
980 .minScalar(0, S16)
981 .widenScalarToNextPow2(0)
982 .scalarize(0)
983 .lower();
984 } else {
985 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986 .legalFor({S32, S16})
987 .widenScalarToNextPow2(0)
988 .minScalar(0, S16)
989 .scalarize(0)
990 .lower();
991 }
992 } else {
993 // TODO: Should have same legality without v_perm_b32
994 getActionDefinitionsBuilder(G_BSWAP)
995 .legalFor({S32})
996 .lowerIf(scalarNarrowerThan(0, 32))
997 // FIXME: Fixing non-power-of-2 before clamp is workaround for
998 // narrowScalar limitation.
999 .widenScalarToNextPow2(0)
1000 .maxScalar(0, S32)
1001 .scalarize(0)
1002 .lower();
1003
1004 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005 .legalFor({S32})
1006 .minScalar(0, S32)
1007 .widenScalarToNextPow2(0)
1008 .scalarize(0)
1009 .lower();
1010 }
1011
1012 getActionDefinitionsBuilder(G_INTTOPTR)
1013 // List the common cases
1014 .legalForCartesianProduct(AddrSpaces64, {S64})
1015 .legalForCartesianProduct(AddrSpaces32, {S32})
1016 .scalarize(0)
1017 // Accept any address space as long as the size matches
1018 .legalIf(sameSize(0, 1))
1019 .widenScalarIf(smallerThan(1, 0),
1020 [](const LegalityQuery &Query) {
1021 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022 })
1023 .narrowScalarIf(largerThan(1, 0),
1024 [](const LegalityQuery &Query) {
1025 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026 });
1027
1028 getActionDefinitionsBuilder(G_PTRTOINT)
1029 // List the common cases
1030 .legalForCartesianProduct(AddrSpaces64, {S64})
1031 .legalForCartesianProduct(AddrSpaces32, {S32})
1032 .scalarize(0)
1033 // Accept any address space as long as the size matches
1034 .legalIf(sameSize(0, 1))
1035 .widenScalarIf(smallerThan(0, 1),
1036 [](const LegalityQuery &Query) {
1037 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038 })
1039 .narrowScalarIf(
1040 largerThan(0, 1),
1041 [](const LegalityQuery &Query) {
1042 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043 });
1044
1045 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046 .scalarize(0)
1047 .custom();
1048
1049 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050 bool IsLoad) -> bool {
1051 const LLT DstTy = Query.Types[0];
1052
1053 // Split vector extloads.
1054 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055
1056 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057 return true;
1058
1059 const LLT PtrTy = Query.Types[1];
1060 unsigned AS = PtrTy.getAddressSpace();
1061 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062 return true;
1063
1064 // Catch weird sized loads that don't evenly divide into the access sizes
1065 // TODO: May be able to widen depending on alignment etc.
1066 unsigned NumRegs = (MemSize + 31) / 32;
1067 if (NumRegs == 3) {
1068 if (!ST.hasDwordx3LoadStores())
1069 return true;
1070 } else {
1071 // If the alignment allows, these should have been widened.
1072 if (!isPowerOf2_32(NumRegs))
1073 return true;
1074 }
1075
1076 return false;
1077 };
1078
1079 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082
1083 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084 // LDS
1085 // TODO: Unsupported flat for SI.
1086
1087 for (unsigned Op : {G_LOAD, G_STORE}) {
1088 const bool IsStore = Op == G_STORE;
1089
1090 auto &Actions = getActionDefinitionsBuilder(Op);
1091 // Explicitly list some common cases.
1092 // TODO: Does this help compile time at all?
1093 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096 {S64, GlobalPtr, S64, GlobalAlign32},
1097 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099 {S32, GlobalPtr, S8, GlobalAlign8},
1100 {S32, GlobalPtr, S16, GlobalAlign16},
1101
1102 {S32, LocalPtr, S32, 32},
1103 {S64, LocalPtr, S64, 32},
1104 {V2S32, LocalPtr, V2S32, 32},
1105 {S32, LocalPtr, S8, 8},
1106 {S32, LocalPtr, S16, 16},
1107 {V2S16, LocalPtr, S32, 32},
1108
1109 {S32, PrivatePtr, S32, 32},
1110 {S32, PrivatePtr, S8, 8},
1111 {S32, PrivatePtr, S16, 16},
1112 {V2S16, PrivatePtr, S32, 32},
1113
1114 {S32, ConstantPtr, S32, GlobalAlign32},
1115 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117 {S64, ConstantPtr, S64, GlobalAlign32},
1118 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1119 Actions.legalIf(
1120 [=](const LegalityQuery &Query) -> bool {
1121 return isLoadStoreLegal(ST, Query);
1122 });
1123
1124 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125 // 64-bits.
1126 //
1127 // TODO: Should generalize bitcast action into coerce, which will also cover
1128 // inserting addrspacecasts.
1129 Actions.customIf(typeIs(1, Constant32Ptr));
1130
1131 // Turn any illegal element vectors into something easier to deal
1132 // with. These will ultimately produce 32-bit scalar shifts to extract the
1133 // parts anyway.
1134 //
1135 // For odd 16-bit element vectors, prefer to split those into pieces with
1136 // 16-bit vector parts.
1137 Actions.bitcastIf(
1138 [=](const LegalityQuery &Query) -> bool {
1139 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140 Query.MMODescrs[0].MemoryTy);
1141 }, bitcastToRegisterType(0));
1142
1143 if (!IsStore) {
1144 // Widen suitably aligned loads by loading extra bytes. The standard
1145 // legalization actions can't properly express widening memory operands.
1146 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147 return shouldWidenLoad(ST, Query, G_LOAD);
1148 });
1149 }
1150
1151 // FIXME: load/store narrowing should be moved to lower action
1152 Actions
1153 .narrowScalarIf(
1154 [=](const LegalityQuery &Query) -> bool {
1155 return !Query.Types[0].isVector() &&
1156 needToSplitMemOp(Query, Op == G_LOAD);
1157 },
1158 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159 const LLT DstTy = Query.Types[0];
1160 const LLT PtrTy = Query.Types[1];
1161
1162 const unsigned DstSize = DstTy.getSizeInBits();
1163 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164
1165 // Split extloads.
1166 if (DstSize > MemSize)
1167 return std::make_pair(0, LLT::scalar(MemSize));
1168
1169 unsigned MaxSize = maxSizeForAddrSpace(ST,
1170 PtrTy.getAddressSpace(),
1171 Op == G_LOAD);
1172 if (MemSize > MaxSize)
1173 return std::make_pair(0, LLT::scalar(MaxSize));
1174
1175 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1176 return std::make_pair(0, LLT::scalar(Align));
1177 })
1178 .fewerElementsIf(
1179 [=](const LegalityQuery &Query) -> bool {
1180 return Query.Types[0].isVector() &&
1181 needToSplitMemOp(Query, Op == G_LOAD);
1182 },
1183 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184 const LLT DstTy = Query.Types[0];
1185 const LLT PtrTy = Query.Types[1];
1186
1187 LLT EltTy = DstTy.getElementType();
1188 unsigned MaxSize = maxSizeForAddrSpace(ST,
1189 PtrTy.getAddressSpace(),
1190 Op == G_LOAD);
1191
1192 // FIXME: Handle widened to power of 2 results better. This ends
1193 // up scalarizing.
1194 // FIXME: 3 element stores scalarized on SI
1195
1196 // Split if it's too large for the address space.
1197 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198 if (MemSize > MaxSize) {
1199 unsigned NumElts = DstTy.getNumElements();
1200 unsigned EltSize = EltTy.getSizeInBits();
1201
1202 if (MaxSize % EltSize == 0) {
1203 return std::make_pair(
1204 0, LLT::scalarOrVector(
1205 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206 }
1207
1208 unsigned NumPieces = MemSize / MaxSize;
1209
1210 // FIXME: Refine when odd breakdowns handled
1211 // The scalars will need to be re-legalized.
1212 if (NumPieces == 1 || NumPieces >= NumElts ||
1213 NumElts % NumPieces != 0)
1214 return std::make_pair(0, EltTy);
1215
1216 return std::make_pair(
1217 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218 }
1219
1220 // FIXME: We could probably handle weird extending loads better.
1221 if (DstTy.getSizeInBits() > MemSize)
1222 return std::make_pair(0, EltTy);
1223
1224 unsigned EltSize = EltTy.getSizeInBits();
1225 unsigned DstSize = DstTy.getSizeInBits();
1226 if (!isPowerOf2_32(DstSize)) {
1227 // We're probably decomposing an odd sized store. Try to split
1228 // to the widest type. TODO: Account for alignment. As-is it
1229 // should be OK, since the new parts will be further legalized.
1230 unsigned FloorSize = PowerOf2Floor(DstSize);
1231 return std::make_pair(
1232 0, LLT::scalarOrVector(
1233 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234 }
1235
1236 // May need relegalization for the scalars.
1237 return std::make_pair(0, EltTy);
1238 })
1239 .minScalar(0, S32)
1240 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241 .widenScalarToNextPow2(0)
1242 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243 .lower();
1244 }
1245
1246 // FIXME: Unaligned accesses not lowered.
1247 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249 {S32, GlobalPtr, S16, 2 * 8},
1250 {S32, LocalPtr, S8, 8},
1251 {S32, LocalPtr, S16, 16},
1252 {S32, PrivatePtr, S8, 8},
1253 {S32, PrivatePtr, S16, 16},
1254 {S32, ConstantPtr, S8, 8},
1255 {S32, ConstantPtr, S16, 2 * 8}})
1256 .legalIf(
1257 [=](const LegalityQuery &Query) -> bool {
1258 return isLoadStoreLegal(ST, Query);
1259 });
1260
1261 if (ST.hasFlatAddressSpace()) {
1262 ExtLoads.legalForTypesWithMemDesc(
1263 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264 }
1265
1266 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267 // 64-bits.
1268 //
1269 // TODO: Should generalize bitcast action into coerce, which will also cover
1270 // inserting addrspacecasts.
1271 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272
1273 ExtLoads.clampScalar(0, S32, S32)
1274 .widenScalarToNextPow2(0)
1275 .lower();
1276
1277 auto &Atomics = getActionDefinitionsBuilder(
1278 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281 G_ATOMICRMW_UMIN})
1282 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283 {S64, GlobalPtr}, {S64, LocalPtr},
1284 {S32, RegionPtr}, {S64, RegionPtr}});
1285 if (ST.hasFlatAddressSpace()) {
1286 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287 }
1288
1289 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290 if (ST.hasLDSFPAtomicAdd()) {
1291 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292 if (ST.hasGFX90AInsts())
1293 Atomic.legalFor({{S64, LocalPtr}});
1294 }
1295 if (ST.hasAtomicFaddInsts())
1296 Atomic.legalFor({{S32, GlobalPtr}});
1297
1298 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299 // demarshalling
1300 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302 {S32, FlatPtr}, {S64, FlatPtr}})
1303 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304 {S32, RegionPtr}, {S64, RegionPtr}});
1305 // TODO: Pointer types, any 32-bit or 64-bit vector
1306
1307 // Condition should be s32 for scalar, s1 for vector.
1308 getActionDefinitionsBuilder(G_SELECT)
1309 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310 LocalPtr, FlatPtr, PrivatePtr,
1311 LLT::fixed_vector(2, LocalPtr),
1312 LLT::fixed_vector(2, PrivatePtr)},
1313 {S1, S32})
1314 .clampScalar(0, S16, S64)
1315 .scalarize(1)
1316 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318 .clampMaxNumElements(0, S32, 2)
1319 .clampMaxNumElements(0, LocalPtr, 2)
1320 .clampMaxNumElements(0, PrivatePtr, 2)
1321 .scalarize(0)
1322 .widenScalarToNextPow2(0)
1323 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324
1325 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326 // be more flexible with the shift amount type.
1327 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328 .legalFor({{S32, S32}, {S64, S32}});
1329 if (ST.has16BitInsts()) {
1330 if (ST.hasVOP3PInsts()) {
1331 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332 .clampMaxNumElements(0, S16, 2);
1333 } else
1334 Shifts.legalFor({{S16, S16}});
1335
1336 // TODO: Support 16-bit shift amounts for all types
1337 Shifts.widenScalarIf(
1338 [=](const LegalityQuery &Query) {
1339 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340 // 32-bit amount.
1341 const LLT ValTy = Query.Types[0];
1342 const LLT AmountTy = Query.Types[1];
1343 return ValTy.getSizeInBits() <= 16 &&
1344 AmountTy.getSizeInBits() < 16;
1345 }, changeTo(1, S16));
1346 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347 Shifts.clampScalar(1, S32, S32);
1348 Shifts.widenScalarToNextPow2(0, 16);
1349 Shifts.clampScalar(0, S16, S64);
1350
1351 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352 .minScalar(0, S16)
1353 .scalarize(0)
1354 .lower();
1355 } else {
1356 // Make sure we legalize the shift amount type first, as the general
1357 // expansion for the shifted type will produce much worse code if it hasn't
1358 // been truncated already.
1359 Shifts.clampScalar(1, S32, S32);
1360 Shifts.widenScalarToNextPow2(0, 32);
1361 Shifts.clampScalar(0, S32, S64);
1362
1363 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364 .minScalar(0, S32)
1365 .scalarize(0)
1366 .lower();
1367 }
1368 Shifts.scalarize(0);
1369
1370 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373 unsigned IdxTypeIdx = 2;
1374
1375 getActionDefinitionsBuilder(Op)
1376 .customIf([=](const LegalityQuery &Query) {
1377 const LLT EltTy = Query.Types[EltTypeIdx];
1378 const LLT VecTy = Query.Types[VecTypeIdx];
1379 const LLT IdxTy = Query.Types[IdxTypeIdx];
1380 const unsigned EltSize = EltTy.getSizeInBits();
1381 return (EltSize == 32 || EltSize == 64) &&
1382 VecTy.getSizeInBits() % 32 == 0 &&
1383 VecTy.getSizeInBits() <= MaxRegisterSize &&
1384 IdxTy.getSizeInBits() == 32;
1385 })
1386 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1387 bitcastToVectorElement32(VecTypeIdx))
1388 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389 .bitcastIf(
1390 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391 [=](const LegalityQuery &Query) {
1392 // For > 64-bit element types, try to turn this into a 64-bit
1393 // element vector since we may be able to do better indexing
1394 // if this is scalar. If not, fall back to 32.
1395 const LLT EltTy = Query.Types[EltTypeIdx];
1396 const LLT VecTy = Query.Types[VecTypeIdx];
1397 const unsigned DstEltSize = EltTy.getSizeInBits();
1398 const unsigned VecSize = VecTy.getSizeInBits();
1399
1400 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401 return std::make_pair(
1402 VecTypeIdx,
1403 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404 })
1405 .clampScalar(EltTypeIdx, S32, S64)
1406 .clampScalar(VecTypeIdx, S32, S64)
1407 .clampScalar(IdxTypeIdx, S32, S32)
1408 .clampMaxNumElements(VecTypeIdx, S32, 32)
1409 // TODO: Clamp elements for 64-bit vectors?
1410 // It should only be necessary with variable indexes.
1411 // As a last resort, lower to the stack
1412 .lower();
1413 }
1414
1415 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416 .unsupportedIf([=](const LegalityQuery &Query) {
1417 const LLT &EltTy = Query.Types[1].getElementType();
1418 return Query.Types[0] != EltTy;
1419 });
1420
1421 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424
1425 // FIXME: Doesn't handle extract of illegal sizes.
1426 getActionDefinitionsBuilder(Op)
1427 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428 .lowerIf([=](const LegalityQuery &Query) {
1429 // Sub-vector(or single element) insert and extract.
1430 // TODO: verify immediate offset here since lower only works with
1431 // whole elements.
1432 const LLT BigTy = Query.Types[BigTyIdx];
1433 return BigTy.isVector();
1434 })
1435 // FIXME: Multiples of 16 should not be legal.
1436 .legalIf([=](const LegalityQuery &Query) {
1437 const LLT BigTy = Query.Types[BigTyIdx];
1438 const LLT LitTy = Query.Types[LitTyIdx];
1439 return (BigTy.getSizeInBits() % 32 == 0) &&
1440 (LitTy.getSizeInBits() % 16 == 0);
1441 })
1442 .widenScalarIf(
1443 [=](const LegalityQuery &Query) {
1444 const LLT BigTy = Query.Types[BigTyIdx];
1445 return (BigTy.getScalarSizeInBits() < 16);
1446 },
1447 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448 .widenScalarIf(
1449 [=](const LegalityQuery &Query) {
1450 const LLT LitTy = Query.Types[LitTyIdx];
1451 return (LitTy.getScalarSizeInBits() < 16);
1452 },
1453 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455 .widenScalarToNextPow2(BigTyIdx, 32);
1456
1457 }
1458
1459 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460 .legalForCartesianProduct(AllS32Vectors, {S32})
1461 .legalForCartesianProduct(AllS64Vectors, {S64})
1462 .clampNumElements(0, V16S32, V32S32)
1463 .clampNumElements(0, V2S64, V16S64)
1464 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465
1466 if (ST.hasScalarPackInsts()) {
1467 BuildVector
1468 // FIXME: Should probably widen s1 vectors straight to s32
1469 .minScalarOrElt(0, S16)
1470 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471 .minScalar(1, S32);
1472
1473 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474 .legalFor({V2S16, S32})
1475 .lower();
1476 BuildVector.minScalarOrElt(0, S32);
1477 } else {
1478 BuildVector.customFor({V2S16, S16});
1479 BuildVector.minScalarOrElt(0, S32);
1480
1481 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482 .customFor({V2S16, S32})
1483 .lower();
1484 }
1485
1486 BuildVector.legalIf(isRegisterType(0));
1487
1488 // FIXME: Clamp maximum size
1489 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491 .clampMaxNumElements(0, S32, 32)
1492 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493 .clampMaxNumElements(0, S16, 64);
1494
1495 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496 // pre-legalize.
1497 if (ST.hasVOP3PInsts()) {
1498 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499 .customFor({V2S16, V2S16})
1500 .lower();
1501 } else
1502 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503
1504 // Merge/Unmerge
1505 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508
1509 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510 const LLT Ty = Query.Types[TypeIdx];
1511 if (Ty.isVector()) {
1512 const LLT &EltTy = Ty.getElementType();
1513 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514 return true;
1515 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516 return true;
1517 }
1518 return false;
1519 };
1520
1521 auto &Builder = getActionDefinitionsBuilder(Op)
1522 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523 .lowerFor({{S16, V2S16}})
1524 .lowerIf([=](const LegalityQuery &Query) {
1525 const LLT BigTy = Query.Types[BigTyIdx];
1526 return BigTy.getSizeInBits() == 32;
1527 })
1528 // Try to widen to s16 first for small types.
1529 // TODO: Only do this on targets with legal s16 shifts
1530 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534 elementTypeIs(1, S16)),
1535 changeTo(1, V2S16))
1536 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1537 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1538 // valid.
1539 .clampScalar(LitTyIdx, S32, S512)
1540 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541 // Break up vectors with weird elements into scalars
1542 .fewerElementsIf(
1543 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544 scalarize(0))
1545 .fewerElementsIf(
1546 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547 scalarize(1))
1548 .clampScalar(BigTyIdx, S32, MaxScalar);
1549
1550 if (Op == G_MERGE_VALUES) {
1551 Builder.widenScalarIf(
1552 // TODO: Use 16-bit shifts if legal for 8-bit values?
1553 [=](const LegalityQuery &Query) {
1554 const LLT Ty = Query.Types[LitTyIdx];
1555 return Ty.getSizeInBits() < 32;
1556 },
1557 changeTo(LitTyIdx, S32));
1558 }
1559
1560 Builder.widenScalarIf(
1561 [=](const LegalityQuery &Query) {
1562 const LLT Ty = Query.Types[BigTyIdx];
1563 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564 Ty.getSizeInBits() % 16 != 0;
1565 },
1566 [=](const LegalityQuery &Query) {
1567 // Pick the next power of 2, or a multiple of 64 over 128.
1568 // Whichever is smaller.
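         // For example, a 260-bit big type (not a power of 2 and not a
         // multiple of 16) would widen to 512 as the next power of 2, but
         // alignTo<64>(261) = 320 is smaller, so s320 is chosen instead.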
1569 const LLT &Ty = Query.Types[BigTyIdx];
1570 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571 if (NewSizeInBits >= 256) {
1572 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573 if (RoundedTo < NewSizeInBits)
1574 NewSizeInBits = RoundedTo;
1575 }
1576 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577 })
1578 // Any vectors left are the wrong size. Scalarize them.
1579 .scalarize(0)
1580 .scalarize(1);
1581 }
1582
1583 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584 // RegBankSelect.
1585 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586 .legalFor({{S32}, {S64}});
1587
1588 if (ST.hasVOP3PInsts()) {
1589 SextInReg.lowerFor({{V2S16}})
1590 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1591 // get more vector shift opportunities, since we'll get those when
1592 // expanded.
1593 .clampMaxNumElementsStrict(0, S16, 2);
1594 } else if (ST.has16BitInsts()) {
1595 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596 } else {
1597 // Prefer to promote to s32 before lowering if we don't have 16-bit
1598    // shifts. This avoids a lot of intermediate truncate and extend operations.
1599 SextInReg.lowerFor({{S32}, {S64}});
1600 }
1601
1602 SextInReg
1603 .scalarize(0)
1604 .clampScalar(0, S32, S64)
1605 .lower();
1606
1607 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608 .scalarize(0)
1609 .lower();
1610
1611  // TODO: Only try to form v2s16 with legal packed instructions.
1612 getActionDefinitionsBuilder(G_FSHR)
1613 .legalFor({{S32, S32}})
1614 .lowerFor({{V2S16, V2S16}})
1615 .clampMaxNumElementsStrict(0, S16, 2)
1616 .scalarize(0)
1617 .lower();
1618
1619 if (ST.hasVOP3PInsts()) {
1620 getActionDefinitionsBuilder(G_FSHL)
1621 .lowerFor({{V2S16, V2S16}})
1622 .clampMaxNumElementsStrict(0, S16, 2)
1623 .scalarize(0)
1624 .lower();
1625 } else {
1626 getActionDefinitionsBuilder(G_FSHL)
1627 .scalarize(0)
1628 .lower();
1629 }
1630
1631 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632 .legalFor({S64});
1633
1634 getActionDefinitionsBuilder(G_FENCE)
1635 .alwaysLegal();
1636
1637 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638 .scalarize(0)
1639 .minScalar(0, S32)
1640 .lower();
1641
1642 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643 .legalFor({{S32, S32}, {S64, S32}})
1644 .clampScalar(1, S32, S32)
1645 .clampScalar(0, S32, S64)
1646 .widenScalarToNextPow2(0)
1647 .scalarize(0);
1648
1649 getActionDefinitionsBuilder({
1650 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651 G_FCOPYSIGN,
1652
1653 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654 G_ATOMICRMW_NAND,
1655 G_ATOMICRMW_FSUB,
1656 G_READ_REGISTER,
1657 G_WRITE_REGISTER,
1658
1659 G_SADDO, G_SSUBO,
1660
1661 // TODO: Implement
1662 G_FMINIMUM, G_FMAXIMUM}).lower();
1663
1664 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665 .lower();
1666
1667 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670 .unsupported();
1671
1672 getLegacyLegalizerInfo().computeTables();
1673 verify(*ST.getInstrInfo());
1674}
1675
1676bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677 MachineInstr &MI) const {
1678 MachineIRBuilder &B = Helper.MIRBuilder;
1679 MachineRegisterInfo &MRI = *B.getMRI();
1680
1681 switch (MI.getOpcode()) {
1682 case TargetOpcode::G_ADDRSPACE_CAST:
1683 return legalizeAddrSpaceCast(MI, MRI, B);
1684 case TargetOpcode::G_FRINT:
1685 return legalizeFrint(MI, MRI, B);
1686 case TargetOpcode::G_FCEIL:
1687 return legalizeFceil(MI, MRI, B);
1688 case TargetOpcode::G_FREM:
1689 return legalizeFrem(MI, MRI, B);
1690 case TargetOpcode::G_INTRINSIC_TRUNC:
1691 return legalizeIntrinsicTrunc(MI, MRI, B);
1692 case TargetOpcode::G_SITOFP:
1693 return legalizeITOFP(MI, MRI, B, true);
1694 case TargetOpcode::G_UITOFP:
1695 return legalizeITOFP(MI, MRI, B, false);
1696 case TargetOpcode::G_FPTOSI:
1697 return legalizeFPTOI(MI, MRI, B, true);
1698 case TargetOpcode::G_FPTOUI:
1699 return legalizeFPTOI(MI, MRI, B, false);
1700 case TargetOpcode::G_FMINNUM:
1701 case TargetOpcode::G_FMAXNUM:
1702 case TargetOpcode::G_FMINNUM_IEEE:
1703 case TargetOpcode::G_FMAXNUM_IEEE:
1704 return legalizeMinNumMaxNum(Helper, MI);
1705 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706 return legalizeExtractVectorElt(MI, MRI, B);
1707 case TargetOpcode::G_INSERT_VECTOR_ELT:
1708 return legalizeInsertVectorElt(MI, MRI, B);
1709 case TargetOpcode::G_SHUFFLE_VECTOR:
1710 return legalizeShuffleVector(MI, MRI, B);
1711 case TargetOpcode::G_FSIN:
1712 case TargetOpcode::G_FCOS:
1713 return legalizeSinCos(MI, MRI, B);
1714 case TargetOpcode::G_GLOBAL_VALUE:
1715 return legalizeGlobalValue(MI, MRI, B);
1716 case TargetOpcode::G_LOAD:
1717 case TargetOpcode::G_SEXTLOAD:
1718 case TargetOpcode::G_ZEXTLOAD:
1719 return legalizeLoad(Helper, MI);
1720 case TargetOpcode::G_FMAD:
1721 return legalizeFMad(MI, MRI, B);
1722 case TargetOpcode::G_FDIV:
1723 return legalizeFDIV(MI, MRI, B);
1724 case TargetOpcode::G_UDIV:
1725 case TargetOpcode::G_UREM:
1726 case TargetOpcode::G_UDIVREM:
1727 return legalizeUnsignedDIV_REM(MI, MRI, B);
1728 case TargetOpcode::G_SDIV:
1729 case TargetOpcode::G_SREM:
1730 case TargetOpcode::G_SDIVREM:
1731 return legalizeSignedDIV_REM(MI, MRI, B);
1732 case TargetOpcode::G_ATOMIC_CMPXCHG:
1733 return legalizeAtomicCmpXChg(MI, MRI, B);
1734 case TargetOpcode::G_FLOG:
1735 return legalizeFlog(MI, B, numbers::ln2f);
1736 case TargetOpcode::G_FLOG10:
1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738 case TargetOpcode::G_FEXP:
1739 return legalizeFExp(MI, B);
1740 case TargetOpcode::G_FPOW:
1741 return legalizeFPow(MI, B);
1742 case TargetOpcode::G_FFLOOR:
1743 return legalizeFFloor(MI, MRI, B);
1744 case TargetOpcode::G_BUILD_VECTOR:
1745 return legalizeBuildVector(MI, MRI, B);
1746 case TargetOpcode::G_CTLZ:
1747 case TargetOpcode::G_CTTZ:
1748 return legalizeCTLZ_CTTZ(MI, MRI, B);
1749 default:
1750 return false;
1751 }
1752
1753  llvm_unreachable("expected switch to return");
1754}
1755
1756Register AMDGPULegalizerInfo::getSegmentAperture(
1757 unsigned AS,
1758 MachineRegisterInfo &MRI,
1759 MachineIRBuilder &B) const {
1760 MachineFunction &MF = B.getMF();
1761 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762 const LLT S32 = LLT::scalar(32);
1763
1764  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765
1766 if (ST.hasApertureRegs()) {
1767 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768 // getreg.
1769 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1775 unsigned Encoding =
1776 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779
1780 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781
1782 B.buildInstr(AMDGPU::S_GETREG_B32)
1783 .addDef(GetReg)
1784 .addImm(Encoding);
1785 MRI.setType(GetReg, S32);
1786
1787 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789 }
1790
1791 Register QueuePtr = MRI.createGenericVirtualRegister(
1792 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793
1794 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795 return Register();
1796
1797 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798 // private_segment_aperture_base_hi.
1799 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800
1801 // TODO: can we be smarter about machine pointer info?
1802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803 MachineMemOperand *MMO = MF.getMachineMemOperand(
1804 PtrInfo,
1805 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806 MachineMemOperand::MOInvariant,
1807 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808
1809 Register LoadAddr;
1810
1811 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813}
1814
1815/// Return true if the value is a known valid address, such that a null check is
1816/// not necessary.
1817static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1818 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1819 MachineInstr *Def = MRI.getVRegDef(Val);
1820 switch (Def->getOpcode()) {
1821 case AMDGPU::G_FRAME_INDEX:
1822 case AMDGPU::G_GLOBAL_VALUE:
1823 case AMDGPU::G_BLOCK_ADDR:
1824 return true;
1825 case AMDGPU::G_CONSTANT: {
1826 const ConstantInt *CI = Def->getOperand(1).getCImm();
1827 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1828 }
1829 default:
1830 return false;
1831 }
1832
1833 return false;
1834}
1835
1836bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1837 MachineInstr &MI, MachineRegisterInfo &MRI,
1838 MachineIRBuilder &B) const {
1839 MachineFunction &MF = B.getMF();
1840
1841 const LLT S32 = LLT::scalar(32);
1842 Register Dst = MI.getOperand(0).getReg();
1843 Register Src = MI.getOperand(1).getReg();
1844
1845 LLT DstTy = MRI.getType(Dst);
1846 LLT SrcTy = MRI.getType(Src);
1847 unsigned DestAS = DstTy.getAddressSpace();
1848 unsigned SrcAS = SrcTy.getAddressSpace();
1849
1850 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1851 // vector element.
1852  assert(!DstTy.isVector());
1853
1854 const AMDGPUTargetMachine &TM
1855 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1856
1857 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1858 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1859 return true;
1860 }
1861
1862 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863 // Truncate.
1864 B.buildExtract(Dst, Src, 0);
1865 MI.eraseFromParent();
1866 return true;
1867 }
1868
1869 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1870 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1871 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1872
1873 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1874 // another. Merge operands are required to be the same type, but creating an
1875 // extra ptrtoint would be kind of pointless.
1876 auto HighAddr = B.buildConstant(
1877 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1878 B.buildMerge(Dst, {Src, HighAddr});
1879 MI.eraseFromParent();
1880 return true;
1881 }
1882
1883 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1884    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1885           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1886
1887 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1888 // Extract low 32-bits of the pointer.
1889 B.buildExtract(Dst, Src, 0);
1890 MI.eraseFromParent();
1891 return true;
1892 }
1893
1894 unsigned NullVal = TM.getNullPointerValue(DestAS);
1895
1896 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1897 auto FlatNull = B.buildConstant(SrcTy, 0);
1898
1899 // Extract low 32-bits of the pointer.
1900 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1901
1902 auto CmpRes =
1903 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1904 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1905
1906 MI.eraseFromParent();
1907 return true;
1908 }
1909
1910 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1911 return false;
1912
1913 if (!ST.hasFlatAddressSpace())
1914 return false;
1915
1916 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1917 if (!ApertureReg.isValid())
1918 return false;
1919
1920 // Coerce the type of the low half of the result so we can use merge_values.
1921 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1922
1923 // TODO: Should we allow mismatched types but matching sizes in merges to
1924 // avoid the ptrtoint?
1925 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1926
1927 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1928 B.buildCopy(Dst, BuildPtr);
1929 MI.eraseFromParent();
1930 return true;
1931 }
1932
1933 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1934 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1935
1936 auto CmpRes =
1937 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1938
1939 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1940
1941 MI.eraseFromParent();
1942 return true;
1943}
1944
1945bool AMDGPULegalizerInfo::legalizeFrint(
1946 MachineInstr &MI, MachineRegisterInfo &MRI,
1947 MachineIRBuilder &B) const {
1948 Register Src = MI.getOperand(1).getReg();
1949 LLT Ty = MRI.getType(Src);
1950  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1951
1952 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1953 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
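  // Adding and then subtracting a copy of 2^52 that carries the source's sign
  // forces the fractional bits to be rounded away, since any double with a
  // magnitude of at least 2^52 is already integral. C2 (2^52 - 0.5) is the
  // largest magnitude below 2^52, so inputs above it are selected through
  // unchanged at the end.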
1954
1955 auto C1 = B.buildFConstant(Ty, C1Val);
1956 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1957
1958 // TODO: Should this propagate fast-math-flags?
1959 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1960 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1961
1962 auto C2 = B.buildFConstant(Ty, C2Val);
1963 auto Fabs = B.buildFAbs(Ty, Src);
1964
1965 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1966 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1967 MI.eraseFromParent();
1968 return true;
1969}
1970
1971bool AMDGPULegalizerInfo::legalizeFceil(
1972 MachineInstr &MI, MachineRegisterInfo &MRI,
1973 MachineIRBuilder &B) const {
1974
1975 const LLT S1 = LLT::scalar(1);
1976 const LLT S64 = LLT::scalar(64);
1977
1978 Register Src = MI.getOperand(1).getReg();
1979  assert(MRI.getType(Src) == S64);
1980
1981 // result = trunc(src)
1982 // if (src > 0.0 && src != result)
1983 // result += 1.0
1984
1985 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1986
1987 const auto Zero = B.buildFConstant(S64, 0.0);
1988 const auto One = B.buildFConstant(S64, 1.0);
1989 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1990 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1991 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1992 auto Add = B.buildSelect(S64, And, One, Zero);
1993
1994 // TODO: Should this propagate fast-math-flags?
1995 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1996 MI.eraseFromParent();
1997 return true;
1998}
1999
2000bool AMDGPULegalizerInfo::legalizeFrem(
2001 MachineInstr &MI, MachineRegisterInfo &MRI,
2002 MachineIRBuilder &B) const {
2003 Register DstReg = MI.getOperand(0).getReg();
2004 Register Src0Reg = MI.getOperand(1).getReg();
2005 Register Src1Reg = MI.getOperand(2).getReg();
2006 auto Flags = MI.getFlags();
2007 LLT Ty = MRI.getType(DstReg);
2008
2009 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2010 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2011 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2012 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2013 MI.eraseFromParent();
2014 return true;
2015}
2016
2017static MachineInstrBuilder extractF64Exponent(Register Hi,
2018 MachineIRBuilder &B) {
2019 const unsigned FractBits = 52;
2020 const unsigned ExpBits = 11;
2021 LLT S32 = LLT::scalar(32);
2022
2023 auto Const0 = B.buildConstant(S32, FractBits - 32);
2024 auto Const1 = B.buildConstant(S32, ExpBits);
2025
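  // In the high 32-bit word of an IEEE-754 double, the biased exponent
  // occupies bits [20, 30] (bits [52, 62] of the full value), so extract
  // ExpBits bits starting at FractBits - 32 and remove the 1023 bias below.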
2026 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2027 .addUse(Hi)
2028 .addUse(Const0.getReg(0))
2029 .addUse(Const1.getReg(0));
2030
2031 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2032}
2033
2034bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2035 MachineInstr &MI, MachineRegisterInfo &MRI,
2036 MachineIRBuilder &B) const {
2037 const LLT S1 = LLT::scalar(1);
2038 const LLT S32 = LLT::scalar(32);
2039 const LLT S64 = LLT::scalar(64);
2040
2041 Register Src = MI.getOperand(1).getReg();
2042  assert(MRI.getType(Src) == S64);
2043
2044 // TODO: Should this use extract since the low half is unused?
2045 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2046 Register Hi = Unmerge.getReg(1);
2047
2048 // Extract the upper half, since this is where we will find the sign and
2049 // exponent.
2050 auto Exp = extractF64Exponent(Hi, B);
2051
2052 const unsigned FractBits = 52;
2053
2054 // Extract the sign bit.
2055  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2056 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2057
2058  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2059
2060 const auto Zero32 = B.buildConstant(S32, 0);
2061
2062 // Extend back to 64-bits.
2063 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2064
2065 auto Shr = B.buildAShr(S64, FractMask, Exp);
2066 auto Not = B.buildNot(S64, Shr);
2067 auto Tmp0 = B.buildAnd(S64, Src, Not);
2068 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2069
2070 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2071 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2072
2073 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2074 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2075 MI.eraseFromParent();
2076 return true;
2077}
2078
2079bool AMDGPULegalizerInfo::legalizeITOFP(
2080 MachineInstr &MI, MachineRegisterInfo &MRI,
2081 MachineIRBuilder &B, bool Signed) const {
2082
2083 Register Dst = MI.getOperand(0).getReg();
2084 Register Src = MI.getOperand(1).getReg();
2085
2086 const LLT S64 = LLT::scalar(64);
2087 const LLT S32 = LLT::scalar(32);
2088
2089  assert(MRI.getType(Src) == S64);
2090
2091 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2092 auto ThirtyTwo = B.buildConstant(S32, 32);
2093
2094 if (MRI.getType(Dst) == S64) {
2095 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2096 : B.buildUITOFP(S64, Unmerge.getReg(1));
2097
2098 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2099 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2100 .addUse(CvtHi.getReg(0))
2101 .addUse(ThirtyTwo.getReg(0));
2102
2103 // TODO: Should this propagate fast-math-flags?
2104 B.buildFAdd(Dst, LdExp, CvtLo);
2105 MI.eraseFromParent();
2106 return true;
2107 }
2108
2109  assert(MRI.getType(Dst) == S32);
2110
2111 auto One = B.buildConstant(S32, 1);
2112
2113 MachineInstrBuilder ShAmt;
2114 if (Signed) {
2115 auto ThirtyOne = B.buildConstant(S32, 31);
2116 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2117 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2118 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2119 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2120 /*HasSideEffects=*/false)
2121 .addUse(Unmerge.getReg(1));
2122 auto LS2 = B.buildSub(S32, LS, One);
2123 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2124 } else
2125 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2126 auto Norm = B.buildShl(S64, Src, ShAmt);
2127 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2128 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2129 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2130 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2131 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2132 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2133 /*HasSideEffects=*/false)
2134 .addUse(FVal.getReg(0))
2135 .addUse(Scale.getReg(0));
2136 MI.eraseFromParent();
2137 return true;
2138}
2139
2140// TODO: Copied from DAG implementation. Verify logic and document how this
2141// actually works.
2142bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2143 MachineRegisterInfo &MRI,
2144 MachineIRBuilder &B,
2145 bool Signed) const {
2146
2147 Register Dst = MI.getOperand(0).getReg();
2148 Register Src = MI.getOperand(1).getReg();
2149
2150 const LLT S64 = LLT::scalar(64);
2151 const LLT S32 = LLT::scalar(32);
2152
2153 const LLT SrcLT = MRI.getType(Src);
2154  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2155
2156 unsigned Flags = MI.getFlags();
2157
2158 // The basic idea of converting a floating point number into a pair of 32-bit
2159 // integers is illustrated as follows:
2160 //
2161 // tf := trunc(val);
2162 // hif := floor(tf * 2^-32);
2163 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2164 // hi := fptoi(hif);
2165 // lo := fptoi(lof);
2166 //
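  // For example, with val = 2^40 + 7 (exactly representable as a double):
  //   tf  = 2^40 + 7
  //   hif = floor(tf * 2^-32) = 256
  //   lof = tf - 256 * 2^32   = 7
  // so hi = 256, lo = 7, and (hi << 32) | lo reassembles the original value.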
2167 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2168 MachineInstrBuilder Sign;
2169 if (Signed && SrcLT == S32) {
2170     // However, a 32-bit floating point number has only a 23-bit mantissa,
2171     // which is not enough to hold all the significant bits of `lof` if val
2172     // is negative. To avoid losing precision, we take the absolute value
2173     // after truncating and flip the result back based on the original
2174     // signedness.
2175 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2176 Trunc = B.buildFAbs(S32, Trunc, Flags);
2177 }
2178 MachineInstrBuilder K0, K1;
2179 if (SrcLT == S64) {
2180 K0 = B.buildFConstant(S64,
2181                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2182 K1 = B.buildFConstant(S64,
2183                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2184 } else {
2185     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2186     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2187 }
2188
2189 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2190 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2191 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2192
2193 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2194 : B.buildFPTOUI(S32, FloorMul);
2195 auto Lo = B.buildFPTOUI(S32, Fma);
2196
2197 if (Signed && SrcLT == S32) {
2198 // Flip the result based on the signedness, which is either all 0s or 1s.
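    // ((v ^ s) - s) is the usual two's-complement identity: it negates v when
    // s is all ones and leaves v unchanged when s is zero.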
2199 Sign = B.buildMerge(S64, {Sign, Sign});
2200 // r := xor({lo, hi}, sign) - sign;
2201 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2202 } else
2203 B.buildMerge(Dst, {Lo, Hi});
2204 MI.eraseFromParent();
2205
2206 return true;
2207}
2208
2209bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2210 MachineInstr &MI) const {
2211 MachineFunction &MF = Helper.MIRBuilder.getMF();
2212 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2213
2214 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2215 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2216
2217 // With ieee_mode disabled, the instructions have the correct behavior
2218 // already for G_FMINNUM/G_FMAXNUM
2219 if (!MFI->getMode().IEEE)
2220 return !IsIEEEOp;
2221
2222 if (IsIEEEOp)
2223 return true;
2224
2225 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2226}
2227
2228bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2229 MachineInstr &MI, MachineRegisterInfo &MRI,
2230 MachineIRBuilder &B) const {
2231 // TODO: Should move some of this into LegalizerHelper.
2232
2233 // TODO: Promote dynamic indexing of s16 to s32
2234
2235 // FIXME: Artifact combiner probably should have replaced the truncated
2236 // constant before this, so we shouldn't need
2237 // getIConstantVRegValWithLookThrough.
2238 Optional<ValueAndVReg> MaybeIdxVal =
2239 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2240 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2241 return true;
2242 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2243
2244 Register Dst = MI.getOperand(0).getReg();
2245 Register Vec = MI.getOperand(1).getReg();
2246
2247 LLT VecTy = MRI.getType(Vec);
2248 LLT EltTy = VecTy.getElementType();
2249  assert(EltTy == MRI.getType(Dst));
2250
2251 if (IdxVal < VecTy.getNumElements()) {
2252 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2253 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2254 } else {
2255 B.buildUndef(Dst);
2256 }
2257
2258 MI.eraseFromParent();
2259 return true;
2260}
2261
2262bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2263 MachineInstr &MI, MachineRegisterInfo &MRI,
2264 MachineIRBuilder &B) const {
2265 // TODO: Should move some of this into LegalizerHelper.
2266
2267 // TODO: Promote dynamic indexing of s16 to s32
2268
2269 // FIXME: Artifact combiner probably should have replaced the truncated
2270 // constant before this, so we shouldn't need
2271 // getIConstantVRegValWithLookThrough.
2272 Optional<ValueAndVReg> MaybeIdxVal =
2273 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2274 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2275 return true;
2276
2277 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2278 Register Dst = MI.getOperand(0).getReg();
2279 Register Vec = MI.getOperand(1).getReg();
2280 Register Ins = MI.getOperand(2).getReg();
2281
2282 LLT VecTy = MRI.getType(Vec);
2283 LLT EltTy = VecTy.getElementType();
2284  assert(EltTy == MRI.getType(Ins));
2285 (void)Ins;
2286
2287 unsigned NumElts = VecTy.getNumElements();
2288 if (IdxVal < NumElts) {
2289 SmallVector<Register, 8> SrcRegs;
2290 for (unsigned i = 0; i < NumElts; ++i)
2291 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2292 B.buildUnmerge(SrcRegs, Vec);
2293
2294 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2295 B.buildMerge(Dst, SrcRegs);
2296 } else {
2297 B.buildUndef(Dst);
2298 }
2299
2300 MI.eraseFromParent();
2301 return true;
2302}
2303
2304bool AMDGPULegalizerInfo::legalizeShuffleVector(
2305 MachineInstr &MI, MachineRegisterInfo &MRI,
2306 MachineIRBuilder &B) const {
2307 const LLT V2S16 = LLT::fixed_vector(2, 16);
2308
2309 Register Dst = MI.getOperand(0).getReg();
2310 Register Src0 = MI.getOperand(1).getReg();
2311 LLT DstTy = MRI.getType(Dst);
2312 LLT SrcTy = MRI.getType(Src0);
2313
2314 if (SrcTy == V2S16 && DstTy == V2S16 &&
2315 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2316 return true;
2317
2318 MachineIRBuilder HelperBuilder(MI);
2319 GISelObserverWrapper DummyObserver;
2320 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2321 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2322}
2323
2324bool AMDGPULegalizerInfo::legalizeSinCos(
2325 MachineInstr &MI, MachineRegisterInfo &MRI,
2326 MachineIRBuilder &B) const {
2327
2328 Register DstReg = MI.getOperand(0).getReg();
2329 Register SrcReg = MI.getOperand(1).getReg();
2330 LLT Ty = MRI.getType(DstReg);
2331 unsigned Flags = MI.getFlags();
2332
2333 Register TrigVal;
2334 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2335 if (ST.hasTrigReducedRange()) {
2336 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2337 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2338 .addUse(MulVal.getReg(0))
2339 .setMIFlags(Flags).getReg(0);
2340 } else
2341 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2342
2343 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2344 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2345 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2346 .addUse(TrigVal)
2347 .setMIFlags(Flags);
2348 MI.eraseFromParent();
2349 return true;
2350}
2351
2352bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2353 MachineIRBuilder &B,
2354 const GlobalValue *GV,
2355 int64_t Offset,
2356 unsigned GAFlags) const {
2357  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2358 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2359 // to the following code sequence:
2360 //
2361 // For constant address space:
2362 // s_getpc_b64 s[0:1]
2363 // s_add_u32 s0, s0, $symbol
2364 // s_addc_u32 s1, s1, 0
2365 //
2366 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2367 // a fixup or relocation is emitted to replace $symbol with a literal
2368 // constant, which is a pc-relative offset from the encoding of the $symbol
2369 // operand to the global variable.
2370 //
2371 // For global address space:
2372 // s_getpc_b64 s[0:1]
2373 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2374 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2375 //
2376 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2377 // fixups or relocations are emitted to replace $symbol@*@lo and
2378 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2379 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2380 // operand to the global variable.
2381 //
2382 // What we want here is an offset from the value returned by s_getpc
2383 // (which is the address of the s_add_u32 instruction) to the global
2384 // variable, but since the encoding of $symbol starts 4 bytes after the start
2385 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2386 // small. This requires us to add 4 to the global variable offset in order to
2387 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2388 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2389 // instruction.
2390
2391 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2392
2393 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2394 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2395
2396 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2397 .addDef(PCReg);
2398
2399 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2400 if (GAFlags == SIInstrInfo::MO_NONE)
2401 MIB.addImm(0);
2402 else
2403 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2404
2405 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2406
2407 if (PtrTy.getSizeInBits() == 32)
2408 B.buildExtract(DstReg, PCReg, 0);
2409 return true;
2410 }
2411
2412bool AMDGPULegalizerInfo::legalizeGlobalValue(
2413 MachineInstr &MI, MachineRegisterInfo &MRI,
2414 MachineIRBuilder &B) const {
2415 Register DstReg = MI.getOperand(0).getReg();
2416 LLT Ty = MRI.getType(DstReg);
2417 unsigned AS = Ty.getAddressSpace();
2418
2419 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2420 MachineFunction &MF = B.getMF();
2421 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2422
2423 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2424 if (!MFI->isModuleEntryFunction() &&
2425 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2426 const Function &Fn = MF.getFunction();
2427 DiagnosticInfoUnsupported BadLDSDecl(
2428 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2429 DS_Warning);
2430 Fn.getContext().diagnose(BadLDSDecl);
2431
2432 // We currently don't have a way to correctly allocate LDS objects that
2433 // aren't directly associated with a kernel. We do force inlining of
2434 // functions that use local objects. However, if these dead functions are
2435 // not eliminated, we don't want a compile time error. Just emit a warning
2436 // and a trap, since there should be no callable path here.
2437 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2438 B.buildUndef(DstReg);
2439 MI.eraseFromParent();
2440 return true;
2441 }
2442
2443 // TODO: We could emit code to handle the initialization somewhere.
2444 // We ignore the initializer for now and legalize it to allow selection.
2445 // The initializer will anyway get errored out during assembly emission.
2446 const SITargetLowering *TLI = ST.getTargetLowering();
2447 if (!TLI->shouldUseLDSConstAddress(GV)) {
2448 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2449 return true; // Leave in place;
2450 }
2451
2452 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2453 Type *Ty = GV->getValueType();
2454       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2455       // zero-sized type in other languages to declare dynamic shared
2456       // memory whose size is not known at compile time. Such arrays are
2457       // allocated by the runtime and placed directly after the statically
2458       // allocated ones, and they all share the same offset.
2459 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2460 // Adjust alignment for that dynamic shared memory array.
2461 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2462 LLT S32 = LLT::scalar(32);
2463 auto Sz =
2464 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2465 B.buildIntToPtr(DstReg, Sz);
2466 MI.eraseFromParent();
2467 return true;
2468 }
2469 }
2470
2471 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2472 *cast<GlobalVariable>(GV)));
2473 MI.eraseFromParent();
2474 return true;
2475 }
2476
2477 const SITargetLowering *TLI = ST.getTargetLowering();
2478
2479 if (TLI->shouldEmitFixup(GV)) {
2480 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2481 MI.eraseFromParent();
2482 return true;
2483 }
2484
2485 if (TLI->shouldEmitPCReloc(GV)) {
2486 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2487 MI.eraseFromParent();
2488 return true;
2489 }
2490
2491 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2492 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2493
2494 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2495 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2496 MachinePointerInfo::getGOT(MF),
2497 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2498 MachineMemOperand::MOInvariant,
2499 LoadTy, Align(8));
2500
2501 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2502
2503 if (Ty.getSizeInBits() == 32) {
2504 // Truncate if this is a 32-bit constant address.
2505 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2506 B.buildExtract(DstReg, Load, 0);
2507 } else
2508 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2509
2510 MI.eraseFromParent();
2511 return true;
2512}
2513
2514static LLT widenToNextPowerOf2(LLT Ty) {
2515 if (Ty.isVector())
2516 return Ty.changeElementCount(
2517 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2518 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2519}
2520
2521bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2522 MachineInstr &MI) const {
2523 MachineIRBuilder &B = Helper.MIRBuilder;
2524 MachineRegisterInfo &MRI = *B.getMRI();
2525 GISelChangeObserver &Observer = Helper.Observer;
2526
2527 Register PtrReg = MI.getOperand(1).getReg();
2528 LLT PtrTy = MRI.getType(PtrReg);
2529 unsigned AddrSpace = PtrTy.getAddressSpace();
2530
2531 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2532 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2533 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2534 Observer.changingInstr(MI);
2535 MI.getOperand(1).setReg(Cast.getReg(0));
2536 Observer.changedInstr(MI);
2537 return true;
2538 }
2539
2540 if (MI.getOpcode() != AMDGPU::G_LOAD)
2541 return false;
2542
2543 Register ValReg = MI.getOperand(0).getReg();
2544 LLT ValTy = MRI.getType(ValReg);
2545
2546 MachineMemOperand *MMO = *MI.memoperands_begin();
2547 const unsigned ValSize = ValTy.getSizeInBits();
2548 const LLT MemTy = MMO->getMemoryType();
2549 const Align MemAlign = MMO->getAlign();
2550 const unsigned MemSize = MemTy.getSizeInBits();
2551 const uint64_t AlignInBits = 8 * MemAlign.value();
2552
2553 // Widen non-power-of-2 loads to the alignment if needed
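  // For example, a 96-bit <3 x s32> load can be widened to a 128-bit memory
  // access, with the original value re-extracted from the wider result below
  // (<3 x s32> via <4 x s32>).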
2554 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2555 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2556
2557 // This was already the correct extending load result type, so just adjust
2558 // the memory type.
2559 if (WideMemSize == ValSize) {
2560 MachineFunction &MF = B.getMF();
2561
2562 MachineMemOperand *WideMMO =
2563 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2564 Observer.changingInstr(MI);
2565 MI.setMemRefs(MF, {WideMMO});
2566 Observer.changedInstr(MI);
2567 return true;
2568 }
2569
2570    // Don't bother handling an edge case that should probably never be produced.
2571 if (ValSize > WideMemSize)
2572 return false;
2573
2574 LLT WideTy = widenToNextPowerOf2(ValTy);
2575
2576 Register WideLoad;
2577 if (!WideTy.isVector()) {
2578 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2579 B.buildTrunc(ValReg, WideLoad).getReg(0);
2580 } else {
2581 // Extract the subvector.
2582
2583 if (isRegisterType(ValTy)) {
2584 // If this a case where G_EXTRACT is legal, use it.
2585 // (e.g. <3 x s32> -> <4 x s32>)
2586 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2587 B.buildExtract(ValReg, WideLoad, 0);
2588 } else {
2589 // For cases where the widened type isn't a nice register value, unmerge
2590 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2591 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2592 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2593 }
2594 }
2595
2596 MI.eraseFromParent();
2597 return true;
2598 }
2599
2600 return false;
2601}
2602
2603bool AMDGPULegalizerInfo::legalizeFMad(
2604 MachineInstr &MI, MachineRegisterInfo &MRI,
2605 MachineIRBuilder &B) const {
2606 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2607  assert(Ty.isScalar());
2608
2609 MachineFunction &MF = B.getMF();
2610 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2611
2612 // TODO: Always legal with future ftz flag.
2613 // FIXME: Do we need just output?
2614 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2615 return true;
2616 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2617 return true;
2618
2619 MachineIRBuilder HelperBuilder(MI);
2620 GISelObserverWrapper DummyObserver;
2621 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2622 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2623}
2624
2625bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2626 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2627 Register DstReg = MI.getOperand(0).getReg();
2628 Register PtrReg = MI.getOperand(1).getReg();
2629 Register CmpVal = MI.getOperand(2).getReg();
2630 Register NewVal = MI.getOperand(3).getReg();
2631
2632  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2633         "this should not have been custom lowered");
2634
2635 LLT ValTy = MRI.getType(CmpVal);
2636 LLT VecTy = LLT::fixed_vector(2, ValTy);
2637
2638 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2639
2640 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2641 .addDef(DstReg)
2642 .addUse(PtrReg)
2643 .addUse(PackedVal)
2644 .setMemRefs(MI.memoperands());
2645
2646 MI.eraseFromParent();
2647 return true;
2648}
2649
2650bool AMDGPULegalizerInfo::legalizeFlog(
2651 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2652 Register Dst = MI.getOperand(0).getReg();
2653 Register Src = MI.getOperand(1).getReg();
2654 LLT Ty = B.getMRI()->getType(Dst);
2655 unsigned Flags = MI.getFlags();
2656
2657 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2658 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2659
2660 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2661 MI.eraseFromParent();
2662 return true;
2663}
2664
2665bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2666 MachineIRBuilder &B) const {
2667 Register Dst = MI.getOperand(0).getReg();
2668 Register Src = MI.getOperand(1).getReg();
2669 unsigned Flags = MI.getFlags();
2670 LLT Ty = B.getMRI()->getType(Dst);
2671
2672 auto K = B.buildFConstant(Ty, numbers::log2e);
2673 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2674 B.buildFExp2(Dst, Mul, Flags);
2675 MI.eraseFromParent();
2676 return true;
2677}
2678
2679bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2680 MachineIRBuilder &B) const {
2681 Register Dst = MI.getOperand(0).getReg();
2682 Register Src0 = MI.getOperand(1).getReg();
2683 Register Src1 = MI.getOperand(2).getReg();
2684 unsigned Flags = MI.getFlags();
2685 LLT Ty = B.getMRI()->getType(Dst);
2686 const LLT S16 = LLT::scalar(16);
2687 const LLT S32 = LLT::scalar(32);
2688
2689 if (Ty == S32) {
2690 auto Log = B.buildFLog2(S32, Src0, Flags);
2691 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2692 .addUse(Log.getReg(0))
2693 .addUse(Src1)
2694 .setMIFlags(Flags);
2695 B.buildFExp2(Dst, Mul, Flags);
2696 } else if (Ty == S16) {
2697 // There's no f16 fmul_legacy, so we need to convert for it.
2698 auto Log = B.buildFLog2(S16, Src0, Flags);
2699 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2700 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2701 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2702 .addUse(Ext0.getReg(0))
2703 .addUse(Ext1.getReg(0))
2704 .setMIFlags(Flags);
2705
2706 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2707 } else
2708 return false;
2709
2710 MI.eraseFromParent();
2711 return true;
2712}
2713
2714// Find a source register, ignoring any possible source modifiers.
2715static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2716 Register ModSrc = OrigSrc;
2717 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2718 ModSrc = SrcFNeg->getOperand(1).getReg();
2719 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2720 ModSrc = SrcFAbs->getOperand(1).getReg();
2721 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2722 ModSrc = SrcFAbs->getOperand(1).getReg();
2723 return ModSrc;
2724}
2725
2726bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2727 MachineRegisterInfo &MRI,
2728 MachineIRBuilder &B) const {
2729
2730 const LLT S1 = LLT::scalar(1);
2731 const LLT S64 = LLT::scalar(64);
2732 Register Dst = MI.getOperand(0).getReg();
2733 Register OrigSrc = MI.getOperand(1).getReg();
2734 unsigned Flags = MI.getFlags();
2735  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2736         "this should not have been custom lowered");
2737
2738 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2739 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2740 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2741 // V_FRACT bug is:
2742 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2743 //
2744 // Convert floor(x) to (x - fract(x))
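  // (The clamp constant used below, 0x3fefffffffffffff, is the largest double
  // strictly less than 1.0, i.e. 1.0 - 2^-53.)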
2745
2746 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2747 .addUse(OrigSrc)
2748 .setMIFlags(Flags);
2749
2750 // Give source modifier matching some assistance before obscuring a foldable
2751 // pattern.
2752
2753 // TODO: We can avoid the neg on the fract? The input sign to fract
2754 // shouldn't matter?
2755 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2756
2757 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2758
2759 Register Min = MRI.createGenericVirtualRegister(S64);
2760
2761 // We don't need to concern ourselves with the snan handling difference, so
2762 // use the one which will directly select.
2763 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2764 if (MFI->getMode().IEEE)
2765 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2766 else
2767 B.buildFMinNum(Min, Fract, Const, Flags);
2768
2769 Register CorrectedFract = Min;
2770 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2771 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2772 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2773 }
2774
2775 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2776 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2777
2778 MI.eraseFromParent();
2779 return true;
2780}
2781
2782// Turn an illegal packed v2s16 build vector into bit operations.
2783// TODO: This should probably be a bitcast action in LegalizerHelper.
2784bool AMDGPULegalizerInfo::legalizeBuildVector(
2785 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2786 Register Dst = MI.getOperand(0).getReg();
2787 const LLT S32 = LLT::scalar(32);
2788  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2789
2790 Register Src0 = MI.getOperand(1).getReg();
2791 Register Src1 = MI.getOperand(2).getReg();
2792  assert(MRI.getType(Src0) == LLT::scalar(16));
2793
2794 auto Merge = B.buildMerge(S32, {Src0, Src1});
2795 B.buildBitcast(Dst, Merge);
2796
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2801// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2802// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2803// case with a single min instruction instead of a compare+select.
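// The ffbh/ffbl operations return -1 (all ones) for a zero input, so taking
// the unsigned minimum with the source bit width also produces the expected
// ctlz/cttz result for zero.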
2804bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2805 MachineRegisterInfo &MRI,
2806 MachineIRBuilder &B) const {
2807 Register Dst = MI.getOperand(0).getReg();
2808 Register Src = MI.getOperand(1).getReg();
2809 LLT DstTy = MRI.getType(Dst);
2810 LLT SrcTy = MRI.getType(Src);
2811
2812 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2813 ? AMDGPU::G_AMDGPU_FFBH_U32
2814 : AMDGPU::G_AMDGPU_FFBL_B32;
2815 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2816 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2817
2818 MI.eraseFromParent();
2819 return true;
2820}
2821
2822// Check that this is a G_XOR x, -1
2823static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2824 if (MI.getOpcode() != TargetOpcode::G_XOR)
2825 return false;
2826 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2827 return ConstVal && *ConstVal == -1;
2828}
2829
2830// Return the use branch instruction, otherwise null if the usage is invalid.
2831static MachineInstr *
2832verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2833 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2834 Register CondDef = MI.getOperand(0).getReg();
2835 if (!MRI.hasOneNonDBGUse(CondDef))
2836 return nullptr;
2837
2838 MachineBasicBlock *Parent = MI.getParent();
2839 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2840
2841 if (isNot(MRI, *UseMI)) {
2842 Register NegatedCond = UseMI->getOperand(0).getReg();
2843 if (!MRI.hasOneNonDBGUse(NegatedCond))
2844 return nullptr;
2845
2846 // We're deleting the def of this value, so we need to remove it.
2847 eraseInstr(*UseMI, MRI);
2848
2849 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2850 Negated = true;
2851 }
2852
2853 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2854 return nullptr;
2855
2856 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2857 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2858 if (Next == Parent->end()) {
2859 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2860 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2861 return nullptr;
2862 UncondBrTarget = &*NextMBB;
2863 } else {
2864 if (Next->getOpcode() != AMDGPU::G_BR)
2865 return nullptr;
2866 Br = &*Next;
2867 UncondBrTarget = Br->getOperand(0).getMBB();
2868 }
2869
2870 return UseMI;
2871}
2872
2873bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2874 const ArgDescriptor *Arg,
2875 const TargetRegisterClass *ArgRC,
2876 LLT ArgTy) const {
2877 MCRegister SrcReg = Arg->getRegister();
2878  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
7
'?' condition is true
2879  assert(DstReg.isVirtual() && "Virtual register expected");
8
'?' condition is true
2880
2881 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2882 *ArgRC, B.getDebugLoc(), ArgTy);
2883 if (Arg->isMasked()) {
9
Calling 'ArgDescriptor::isMasked'
12
Returning from 'ArgDescriptor::isMasked'
13
Taking true branch
2884 // TODO: Should we try to emit this once in the entry block?
2885 const LLT S32 = LLT::scalar(32);
2886 const unsigned Mask = Arg->getMask();
2887 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
14
Calling 'countTrailingZeros<unsigned int>'
21
Returning from 'countTrailingZeros<unsigned int>'
22
'Shift' initialized to 32
2888
2889 Register AndMaskSrc = LiveIn;
2890
2891 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2892 // 0.
2893    if (Shift != 0) {
22.1
'Shift' is not equal to 0
23
Taking true branch
2894 auto ShiftAmt = B.buildConstant(S32, Shift);
2895 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2896 }
2897
2898 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
24
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
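One way to keep this shift well defined is to treat an all-zero mask explicitly, so countTrailingZeros never reports the full bit width. The sketch below is illustrative only (it is not the upstream fix, and shiftedMaskOrZero is a hypothetical helper, not an LLVM API):

#include <cstdint>

// Hypothetical helper: computes Mask >> countTrailingZeros(Mask) without
// undefined behavior when Mask == 0.
static uint32_t shiftedMaskOrZero(uint32_t Mask) {
  if (Mask == 0)
    return 0;                            // a shift by 32 would be undefined
  unsigned Shift = __builtin_ctz(Mask);  // well defined because Mask != 0
  return Mask >> Shift;
}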
2899 } else {
2900 B.buildCopy(DstReg, LiveIn);
2901 }
2902
2903 return true;
2904}
2905
2906bool AMDGPULegalizerInfo::loadInputValue(
2907 Register DstReg, MachineIRBuilder &B,
2908 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2909 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2910 const ArgDescriptor *Arg;
2911 const TargetRegisterClass *ArgRC;
2912 LLT ArgTy;
2913 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2914
2915 if (!Arg) {
4
Assuming 'Arg' is non-null
2916 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2917 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2918 // case the pointer argument may be missing and we use null.
2919 B.buildConstant(DstReg, 0);
2920 return true;
2921 }
2922
2923 // It's undefined behavior if a function marked with the amdgpu-no-*
2924 // attributes uses the corresponding intrinsic.
2925 B.buildUndef(DstReg);
2926 return true;
2927 }
2928
2929 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5
Taking false branch
2930 return false; // TODO: Handle these
2931 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
6
Calling 'AMDGPULegalizerInfo::loadInputValue'
2932}
2933
2934bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2935 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2936 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2937 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3
Calling 'AMDGPULegalizerInfo::loadInputValue'
2938 return false;
2939
2940 MI.eraseFromParent();
2941 return true;
2942}
2943
2944bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2945 MachineRegisterInfo &MRI,
2946 MachineIRBuilder &B) const {
2947 Register Dst = MI.getOperand(0).getReg();
2948 LLT DstTy = MRI.getType(Dst);
2949 LLT S16 = LLT::scalar(16);
2950 LLT S32 = LLT::scalar(32);
2951 LLT S64 = LLT::scalar(64);
2952
2953 if (DstTy == S16)
2954 return legalizeFDIV16(MI, MRI, B);
2955 if (DstTy == S32)
2956 return legalizeFDIV32(MI, MRI, B);
2957 if (DstTy == S64)
2958 return legalizeFDIV64(MI, MRI, B);
2959
2960 return false;
2961}
2962
2963void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2964 Register DstDivReg,
2965 Register DstRemReg,
2966 Register X,
2967 Register Y) const {
2968 const LLT S1 = LLT::scalar(1);
2969 const LLT S32 = LLT::scalar(32);
2970
2971 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2972 // algorithm used here.
2973
2974 // Initial estimate of inv(y).
2975 auto FloatY = B.buildUITOFP(S32, Y);
2976 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2977 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2978 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2979 auto Z = B.buildFPTOUI(S32, ScaledY);
2980
2981 // One round of UNR.
2982 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2983 auto NegYZ = B.buildMul(S32, NegY, Z);
2984 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2985
2986 // Quotient/remainder estimate.
2987 auto Q = B.buildUMulH(S32, X, Z);
2988 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2989
2990 // First quotient/remainder refinement.
2991 auto One = B.buildConstant(S32, 1);
2992 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2993 if (DstDivReg)
2994 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2995 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2996
2997 // Second quotient/remainder refinement.
2998 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2999 if (DstDivReg)
3000 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3001
3002 if (DstRemReg)
3003 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3004}
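The builder calls above implement the expandDivRem32 scheme referenced in the comment: a float reciprocal estimate, one unsigned Newton-Raphson step, a quotient/remainder estimate, and two conditional corrections. Below is a scalar C++ sketch of the same arithmetic; the function name is invented, and host float division stands in for G_AMDGPU_RCP_IFLAG (an assumption for illustration, so rounding may differ slightly from the hardware estimate).

  #include <cstdint>
  #include <cstring>

  static void udivrem32Model(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
    // Initial estimate of 2^32 / Y. 0x4f7ffffe is just below 2^32 as a float.
    float Rcp = 1.0f / static_cast<float>(Y); // stands in for RCP_IFLAG
    uint32_t ScaleBits = 0x4f7ffffe;
    float Scale;
    std::memcpy(&Scale, &ScaleBits, sizeof(Scale));
    uint32_t Z = static_cast<uint32_t>(Rcp * Scale);

    // One round of unsigned Newton-Raphson refinement of Z ~= 2^32 / Y.
    uint32_t NegY = 0u - Y;
    uint32_t NegYZ = NegY * Z;
    Z += static_cast<uint32_t>((static_cast<uint64_t>(Z) * NegYZ) >> 32);

    // Quotient/remainder estimate, then two conditional corrections.
    Q = static_cast<uint32_t>((static_cast<uint64_t>(X) * Z) >> 32);
    R = X - Q * Y;
    if (R >= Y) { ++Q; R -= Y; }
    if (R >= Y) { ++Q; R -= Y; }
  }

The two `if (R >= Y)` fix-ups correspond to the two buildSelect refinement stages emitted above.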
3005
3006// Build integer reciprocal sequence around V_RCP_IFLAG_F32
3007//
3008// Return lo, hi of result
3009//
3010// %cvt.lo = G_UITOFP Val.lo
3011// %cvt.hi = G_UITOFP Val.hi
3012// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3013// %rcp = G_AMDGPU_RCP_IFLAG %mad
3014// %mul1 = G_FMUL %rcp, 0x5f7ffffc
3015// %mul2 = G_FMUL %mul1, 2**(-32)
3016// %trunc = G_INTRINSIC_TRUNC %mul2
3017// %mad2 = G_FMAD %trunc, -(2**32), %mul1
3018// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3019static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3020 Register Val) {
3021 const LLT S32 = LLT::scalar(32);
3022 auto Unmerge = B.buildUnmerge(S32, Val);
3023
3024 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3025 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3026
3027 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3028 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3029
3030 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3031 auto Mul1 =
3032 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3033
3034 // 2**(-32)
3035 auto Mul2 =
3036 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3037 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3038
3039 // -(2**32)
3040 auto Mad2 = B.buildFMAD(S32, Trunc,
3041 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3042
3043 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3044 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3045
3046 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3047}
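The hex constants fed to BitsToFloat in emitReciprocalU64 are IEEE-754 single-precision bit patterns. A tiny standalone check (assuming an IEEE-754 binary32 host float; bitsToFloat here is a local stand-in for LLVM's BitsToFloat) prints the values the comments claim.

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  static float bitsToFloat(uint32_t Bits) {
    float F;
    std::memcpy(&F, &Bits, sizeof(F));
    return F;
  }

  int main() {
    std::printf("%g\n", bitsToFloat(0x4f800000)); // 2^32
    std::printf("%g\n", bitsToFloat(0x2f800000)); // 2^-32
    std::printf("%g\n", bitsToFloat(0xcf800000)); // -(2^32)
    std::printf("%g\n", bitsToFloat(0x5f7ffffc)); // just below 2^64 (the scale)
    return 0;
  }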
3048
3049void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3050 Register DstDivReg,
3051 Register DstRemReg,
3052 Register Numer,
3053 Register Denom) const {
3054 const LLT S32 = LLT::scalar(32);
3055 const LLT S64 = LLT::scalar(64);
3056 const LLT S1 = LLT::scalar(1);
3057 Register RcpLo, RcpHi;
3058
3059 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3060
3061 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3062
3063 auto Zero64 = B.buildConstant(S64, 0);
3064 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3065
3066 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3067 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3068
3069 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3070 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3071 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3072
3073 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3074 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3075 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3076
3077 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3078 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3079 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3080 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3081 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3082
3083 auto Zero32 = B.buildConstant(S32, 0);
3084 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3085 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3086 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3087
3088 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3089 Register NumerLo = UnmergeNumer.getReg(0);
3090 Register NumerHi = UnmergeNumer.getReg(1);
3091
3092 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3093 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3094 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3095 Register Mul3_Lo = UnmergeMul3.getReg(0);
3096 Register Mul3_Hi = UnmergeMul3.getReg(1);
3097 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3098 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3099 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3100 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3101
3102 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3103 Register DenomLo = UnmergeDenom.getReg(0);
3104 Register DenomHi = UnmergeDenom.getReg(1);
3105
3106 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3107 auto C1 = B.buildSExt(S32, CmpHi);
3108
3109 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3110 auto C2 = B.buildSExt(S32, CmpLo);
3111
3112 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3113 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3114
3115 // TODO: Here and below portions of the code can be enclosed into if/endif.
3116 // Currently control flow is unconditional and we have 4 selects after
3117 // potential endif to substitute PHIs.
3118
3119 // if C3 != 0 ...
3120 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3121 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3122 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3123 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3124
3125 auto One64 = B.buildConstant(S64, 1);
3126 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3127
3128 auto C4 =
3129 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3130 auto C5 =
3131 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3132 auto C6 = B.buildSelect(
3133 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3134
3135 // if (C6 != 0)
3136 auto Add4 = B.buildAdd(S64, Add3, One64);
3137 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3138
3139 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3140 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3141 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3142
3143 // endif C6
3144 // endif C3
3145
3146 if (DstDivReg) {
3147 auto Sel1 = B.buildSelect(
3148 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3149 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3150 Sel1, MulHi3);
3151 }
3152
3153 if (DstRemReg) {
3154 auto Sel2 = B.buildSelect(
3155 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3156 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3157 Sel2, Sub1);
3158 }
3159}
3160
3161bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3162 MachineRegisterInfo &MRI,
3163 MachineIRBuilder &B) const {
3164 Register DstDivReg, DstRemReg;
3165 switch (MI.getOpcode()) {
3166 default:
3167 llvm_unreachable("Unexpected opcode!");
3168 case AMDGPU::G_UDIV: {
3169 DstDivReg = MI.getOperand(0).getReg();
3170 break;
3171 }
3172 case AMDGPU::G_UREM: {
3173 DstRemReg = MI.getOperand(0).getReg();
3174 break;
3175 }
3176 case AMDGPU::G_UDIVREM: {
3177 DstDivReg = MI.getOperand(0).getReg();
3178 DstRemReg = MI.getOperand(1).getReg();
3179 break;
3180 }
3181 }
3182
3183 const LLT S64 = LLT::scalar(64);
3184 const LLT S32 = LLT::scalar(32);
3185 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3186 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3187 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3188 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3189
3190 if (Ty == S32)
3191 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3192 else if (Ty == S64)
3193 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3194 else
3195 return false;
3196
3197 MI.eraseFromParent();
3198 return true;
3199}
3200
3201bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3202 MachineRegisterInfo &MRI,
3203 MachineIRBuilder &B) const {
3204 const LLT S64 = LLT::scalar(64);
3205 const LLT S32 = LLT::scalar(32);
3206
3207 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3208 if (Ty != S32 && Ty != S64)
3209 return false;
3210
3211 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3212 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3213 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3214
3215 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3216 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3217 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3218
3219 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3220 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3221
3222 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3223 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3224
3225 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3226 switch (MI.getOpcode()) {
3227 default:
3228 llvm_unreachable("Unexpected opcode!");
3229 case AMDGPU::G_SDIV: {
3230 DstDivReg = MI.getOperand(0).getReg();
3231 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3232 break;
3233 }
3234 case AMDGPU::G_SREM: {
3235 DstRemReg = MI.getOperand(0).getReg();
3236 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3237 break;
3238 }
3239 case AMDGPU::G_SDIVREM: {
3240 DstDivReg = MI.getOperand(0).getReg();
3241 DstRemReg = MI.getOperand(1).getReg();
3242 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3243 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3244 break;
3245 }
3246 }
3247
3248 if (Ty == S32)
3249 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3250 else
3251 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3252
3253 if (DstDivReg) {
3254 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3255 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3256 B.buildSub(DstDivReg, SignXor, Sign);
3257 }
3258
3259 if (DstRemReg) {
3260 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3261 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3262 B.buildSub(DstRemReg, SignXor, Sign);
3263 }
3264
3265 MI.eraseFromParent();
3266 return true;
3267}
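The signed case above reduces to the unsigned expansion with the usual sign trick: an arithmetic shift produces 0 or -1, (x + sign) ^ sign gives |x|, and the results' signs are restored with the same xor/sub pattern afterwards. A scalar sketch (invented name, assumes two's complement, arithmetic right shift, and inputs other than INT32_MIN, whose absolute value overflows in plain C++):

  #include <cstdint>

  static void sdivrem32Model(int32_t LHS, int32_t RHS,
                             int32_t &Div, int32_t &Rem) {
    int32_t LHSign = LHS >> 31;  // 0 or -1
    int32_t RHSign = RHS >> 31;

    // abs(x) as (x + sign) ^ sign, matching the G_ADD/G_XOR pair above.
    uint32_t UL = static_cast<uint32_t>((LHS + LHSign) ^ LHSign);
    uint32_t UR = static_cast<uint32_t>((RHS + RHSign) ^ RHSign);

    uint32_t UQ = UL / UR;   // stands in for the unsigned expansion
    uint32_t URem = UL % UR;

    int32_t DSign = LHSign ^ RHSign;  // quotient sign
    Div = (static_cast<int32_t>(UQ) ^ DSign) - DSign;
    Rem = (static_cast<int32_t>(URem) ^ LHSign) - LHSign; // sign follows LHS
  }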
3268
3269bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3270 MachineRegisterInfo &MRI,
3271 MachineIRBuilder &B) const {
3272 Register Res = MI.getOperand(0).getReg();
3273 Register LHS = MI.getOperand(1).getReg();
3274 Register RHS = MI.getOperand(2).getReg();
3275 uint16_t Flags = MI.getFlags();
3276 LLT ResTy = MRI.getType(Res);
3277
3278 const MachineFunction &MF = B.getMF();
3279 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3280 MI.getFlag(MachineInstr::FmAfn);
3281
3282 if (!AllowInaccurateRcp)
3283 return false;
3284
3285 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3286 // 1 / x -> RCP(x)
3287 if (CLHS->isExactlyValue(1.0)) {
3288 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3289 .addUse(RHS)
3290 .setMIFlags(Flags);
3291
3292 MI.eraseFromParent();
3293 return true;
3294 }
3295
3296 // -1 / x -> RCP( FNEG(x) )
3297 if (CLHS->isExactlyValue(-1.0)) {
3298 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3299 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3300 .addUse(FNeg.getReg(0))
3301 .setMIFlags(Flags);
3302
3303 MI.eraseFromParent();
3304 return true;
3305 }
3306 }
3307
3308 // x / y -> x * (1.0 / y)
3309 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3310 .addUse(RHS)
3311 .setMIFlags(Flags);
3312 B.buildFMul(Res, LHS, RCP, Flags);
3313
3314 MI.eraseFromParent();
3315 return true;
3316}
3317
3318bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3319 MachineRegisterInfo &MRI,
3320 MachineIRBuilder &B) const {
3321 Register Res = MI.getOperand(0).getReg();
3322 Register X = MI.getOperand(1).getReg();
3323 Register Y = MI.getOperand(2).getReg();
3324 uint16_t Flags = MI.getFlags();
3325 LLT ResTy = MRI.getType(Res);
3326
3327 const MachineFunction &MF = B.getMF();
3328 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3329 MI.getFlag(MachineInstr::FmAfn);
3330
3331 if (!AllowInaccurateRcp)
3332 return false;
3333
3334 auto NegY = B.buildFNeg(ResTy, Y);
3335 auto One = B.buildFConstant(ResTy, 1.0);
3336
3337 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3338 .addUse(Y)
3339 .setMIFlags(Flags);
3340
3341 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3342 R = B.buildFMA(ResTy, Tmp0, R, R);
3343
3344 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3345 R = B.buildFMA(ResTy, Tmp1, R, R);
3346
3347 auto Ret = B.buildFMul(ResTy, X, R);
3348 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3349
3350 B.buildFMA(Res, Tmp2, R, Ret);
3351 MI.eraseFromParent();
3352 return true;
3353}
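The fast-math 64-bit path above is a reciprocal estimate refined by two Newton steps, followed by a quotient correction. A scalar double-precision sketch of the same FMA sequence, with a host 1.0/y standing in for amdgcn_rcp (an assumption; the hardware estimate is less accurate, which is what the refinements compensate for):

  #include <cmath>

  static double fastFDivModel(double X, double Y) {
    double NegY = -Y;
    double R = 1.0 / Y;                           // rcp estimate
    R = std::fma(std::fma(NegY, R, 1.0), R, R);   // first Newton step
    R = std::fma(std::fma(NegY, R, 1.0), R, R);   // second Newton step
    double Ret = X * R;
    return std::fma(std::fma(NegY, Ret, X), R, Ret); // quotient correction
  }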
3354
3355bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3356 MachineRegisterInfo &MRI,
3357 MachineIRBuilder &B) const {
3358 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3359 return true;
3360
3361 Register Res = MI.getOperand(0).getReg();
3362 Register LHS = MI.getOperand(1).getReg();
3363 Register RHS = MI.getOperand(2).getReg();
3364
3365 uint16_t Flags = MI.getFlags();
3366
3367 LLT S16 = LLT::scalar(16);
3368 LLT S32 = LLT::scalar(32);
3369
3370 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3371 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3372
3373 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3374 .addUse(RHSExt.getReg(0))
3375 .setMIFlags(Flags);
3376
3377 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3378 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3379
3380 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3381 .addUse(RDst.getReg(0))
3382 .addUse(RHS)
3383 .addUse(LHS)
3384 .setMIFlags(Flags);
3385
3386 MI.eraseFromParent();
3387 return true;
3388}
3389
3390// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3391// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3392static void toggleSPDenormMode(bool Enable,
3393 MachineIRBuilder &B,
3394 const GCNSubtarget &ST,
3395 AMDGPU::SIModeRegisterDefaults Mode) {
3396 // Set SP denorm mode to this value.
3397 unsigned SPDenormMode =
Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3399
3400 if (ST.hasDenormModeInst()) {
3401 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3402 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3403
3404 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3405 B.buildInstr(AMDGPU::S_DENORM_MODE)
3406 .addImm(NewDenormModeValue);
3407
3408 } else {
3409 // Select FP32 bit field in mode register.
3410 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3411 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3412 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3413
3414 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3415 .addImm(SPDenormMode)
3416 .addImm(SPDenormModeBitField);
3417 }
3418}
3419
3420bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3421 MachineRegisterInfo &MRI,
3422 MachineIRBuilder &B) const {
3423 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3424 return true;
3425
3426 Register Res = MI.getOperand(0).getReg();
3427 Register LHS = MI.getOperand(1).getReg();
3428 Register RHS = MI.getOperand(2).getReg();
3429 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3430 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3431
3432 uint16_t Flags = MI.getFlags();
3433
3434 LLT S32 = LLT::scalar(32);
3435 LLT S1 = LLT::scalar(1);
3436
3437 auto One = B.buildFConstant(S32, 1.0f);
3438
3439 auto DenominatorScaled =
3440 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3441 .addUse(LHS)
3442 .addUse(RHS)
3443 .addImm(0)
3444 .setMIFlags(Flags);
3445 auto NumeratorScaled =
3446 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3447 .addUse(LHS)
3448 .addUse(RHS)
3449 .addImm(1)
3450 .setMIFlags(Flags);
3451
3452 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3453 .addUse(DenominatorScaled.getReg(0))
3454 .setMIFlags(Flags);
3455 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3456
3457 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3458 // aren't modeled as reading it.
3459 if (!Mode.allFP32Denormals())
3460 toggleSPDenormMode(true, B, ST, Mode);
3461
3462 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3463 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3464 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3465 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3466 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3467 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3468
3469 if (!Mode.allFP32Denormals())
3470 toggleSPDenormMode(false, B, ST, Mode);
3471
3472 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3473 .addUse(Fma4.getReg(0))
3474 .addUse(Fma1.getReg(0))
3475 .addUse(Fma3.getReg(0))
3476 .addUse(NumeratorScaled.getReg(1))
3477 .setMIFlags(Flags);
3478
3479 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3480 .addUse(Fmas.getReg(0))
3481 .addUse(RHS)
3482 .addUse(LHS)
3483 .setMIFlags(Flags);
3484
3485 MI.eraseFromParent();
3486 return true;
3487}
3488
3489bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3490 MachineRegisterInfo &MRI,
3491 MachineIRBuilder &B) const {
3492 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3493 return true;
3494
3495 Register Res = MI.getOperand(0).getReg();
3496 Register LHS = MI.getOperand(1).getReg();
3497 Register RHS = MI.getOperand(2).getReg();
3498
3499 uint16_t Flags = MI.getFlags();
3500
3501 LLT S64 = LLT::scalar(64);
3502 LLT S1 = LLT::scalar(1);
3503
3504 auto One = B.buildFConstant(S64, 1.0);
3505
3506 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3507 .addUse(LHS)
3508 .addUse(RHS)
3509 .addImm(0)
3510 .setMIFlags(Flags);
3511
3512 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3513
3514 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3515 .addUse(DivScale0.getReg(0))
3516 .setMIFlags(Flags);
3517
3518 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3519 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3520 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3521
3522 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3523 .addUse(LHS)
3524 .addUse(RHS)
3525 .addImm(1)
3526 .setMIFlags(Flags);
3527
3528 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3529 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3530 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3531
3532 Register Scale;
3533 if (!ST.hasUsableDivScaleConditionOutput()) {
3534 // Workaround a hardware bug on SI where the condition output from div_scale
3535 // is not usable.
3536
3537 LLT S32 = LLT::scalar(32);
3538
3539 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3540 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3541 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3542 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3543
3544 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3545 Scale1Unmerge.getReg(1));
3546 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3547 Scale0Unmerge.getReg(1));
3548 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3549 } else {
3550 Scale = DivScale1.getReg(1);
3551 }
3552
3553 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3554 .addUse(Fma4.getReg(0))
3555 .addUse(Fma3.getReg(0))
3556 .addUse(Mul.getReg(0))
3557 .addUse(Scale)
3558 .setMIFlags(Flags);
3559
3560 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3561 .addUse(Fmas.getReg(0))
3562 .addUse(RHS)
3563 .addUse(LHS)
3564 .setMIFlags(Flags);
3565
3566 MI.eraseFromParent();
3567 return true;
3568}
3569
3570bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3571 MachineRegisterInfo &MRI,
3572 MachineIRBuilder &B) const {
3573 Register Res = MI.getOperand(0).getReg();
3574 Register LHS = MI.getOperand(2).getReg();
3575 Register RHS = MI.getOperand(3).getReg();
3576 uint16_t Flags = MI.getFlags();
3577
3578 LLT S32 = LLT::scalar(32);
3579 LLT S1 = LLT::scalar(1);
3580
3581 auto Abs = B.buildFAbs(S32, RHS, Flags);
3582 const APFloat C0Val(1.0f);
3583
3584 auto C0 = B.buildConstant(S32, 0x6f800000);
3585 auto C1 = B.buildConstant(S32, 0x2f800000);
3586 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3587
3588 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3589 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3590
3591 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3592
3593 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3594 .addUse(Mul0.getReg(0))
3595 .setMIFlags(Flags);
3596
3597 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3598
3599 B.buildFMul(Res, Sel, Mul1, Flags);
3600
3601 MI.eraseFromParent();
3602 return true;
3603}
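The div_fast expansion above guards the reciprocal against extreme denominators: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as floats, so a huge |RHS| is pre-scaled by 2^-32 before the rcp and the result is rescaled by the same factor. A scalar sketch (invented name; host 1.0f/y stands in for amdgcn_rcp, and the hex-float literals assume a C++17 compiler):

  #include <cmath>

  static float fdivFastModel(float X, float Y) {
    const float Big = 0x1.0p96f;                                  // 0x6f800000
    const float Scale = std::fabs(Y) > Big ? 0x1.0p-32f : 1.0f;   // 0x2f800000 or 1.0
    float Mul0 = Y * Scale;
    float Rcp = 1.0f / Mul0;   // rcp estimate
    float Mul1 = X * Rcp;
    return Scale * Mul1;
  }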
3604
3605// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3606// FIXME: Why do we handle this one but not other removed instructions?
3607//
3608// Reciprocal square root. The clamp prevents infinite results, clamping
3609// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3610// +-max_float.
3611bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3612 MachineRegisterInfo &MRI,
3613 MachineIRBuilder &B) const {
3614 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3615 return true;
3616
3617 Register Dst = MI.getOperand(0).getReg();
3618 Register Src = MI.getOperand(2).getReg();
3619 auto Flags = MI.getFlags();
3620
3621 LLT Ty = MRI.getType(Dst);
3622
3623 const fltSemantics *FltSemantics;
3624 if (Ty == LLT::scalar(32))
3625 FltSemantics = &APFloat::IEEEsingle();
3626 else if (Ty == LLT::scalar(64))
3627 FltSemantics = &APFloat::IEEEdouble();
3628 else
3629 return false;
3630
3631 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3632 .addUse(Src)
3633 .setMIFlags(Flags);
3634
3635 // We don't need to concern ourselves with the snan handling difference, since
3636 // the rsq has already quieted (or not) any snan, so use the variant that will select directly.
3637 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3638 const bool UseIEEE = MFI->getMode().IEEE;
3639
3640 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3641 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3642 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3643
3644 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3645
3646 if (UseIEEE)
3647 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3648 else
3649 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3650 MI.eraseFromParent();
3651 return true;
3652}
3653
3654static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3655 switch (IID) {
3656 case Intrinsic::amdgcn_ds_fadd:
3657 return AMDGPU::G_ATOMICRMW_FADD;
3658 case Intrinsic::amdgcn_ds_fmin:
3659 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3660 case Intrinsic::amdgcn_ds_fmax:
3661 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3662 default:
3663 llvm_unreachable("not a DS FP intrinsic");
3664 }
3665}
3666
3667bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3668 MachineInstr &MI,
3669 Intrinsic::ID IID) const {
3670 GISelChangeObserver &Observer = Helper.Observer;
3671 Observer.changingInstr(MI);
3672
3673 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3674
3675 // The remaining operands were used to set fields in the MemOperand on
3676 // construction.
3677 for (int I = 6; I > 3; --I)
3678 MI.RemoveOperand(I);
3679
3680 MI.RemoveOperand(1); // Remove the intrinsic ID.
3681 Observer.changedInstr(MI);
3682 return true;
3683}
3684
3685bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3686 MachineRegisterInfo &MRI,
3687 MachineIRBuilder &B) const {
3688 uint64_t Offset =
3689 ST.getTargetLowering()->getImplicitParameterOffset(
3690 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3691 LLT DstTy = MRI.getType(DstReg);
3692 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3693
3694 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3695 if (!loadInputValue(KernargPtrReg, B,
3696 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3697 return false;
3698
3699 // FIXME: This should be nuw
3700 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3701 return true;
3702}
3703
3704bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3705 MachineRegisterInfo &MRI,
3706 MachineIRBuilder &B) const {
3707 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3708 if (!MFI->isEntryFunction()) {
3709 return legalizePreloadedArgIntrin(MI, MRI, B,
3710 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3711 }
3712
3713 Register DstReg = MI.getOperand(0).getReg();
3714 if (!getImplicitArgPtr(DstReg, MRI, B))
3715 return false;
3716
3717 MI.eraseFromParent();
3718 return true;
3719}
3720
3721bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3722 MachineRegisterInfo &MRI,
3723 MachineIRBuilder &B,
3724 unsigned AddrSpace) const {
3725 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3726 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3727 Register Hi32 = Unmerge.getReg(1);
3728
3729 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3730 MI.eraseFromParent();
3731 return true;
3732}
3733
3734// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3735// offset (the offset that is included in bounds checking and swizzling, to be
3736// split between the instruction's voffset and immoffset fields) and soffset
3737// (the offset that is excluded from bounds checking and swizzling, to go in
3738// the instruction's soffset field). This function takes the first kind of
3739// offset and figures out how to split it between voffset and immoffset.
3740std::pair<Register, unsigned>
3741AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3742 Register OrigOffset) const {
3743 const unsigned MaxImm = 4095;
3744 Register BaseReg;
3745 unsigned ImmOffset;
3746 const LLT S32 = LLT::scalar(32);
3747 MachineRegisterInfo &MRI = *B.getMRI();
3748
3749 std::tie(BaseReg, ImmOffset) =
3750 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3751
3752 // If BaseReg is a pointer, convert it to int.
3753 if (MRI.getType(BaseReg).isPointer())
3754 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3755
3756 // If the immediate value is too big for the immoffset field, keep only its
3757 // low 12 bits in the immoffset field so that the value that is copied/added
3758 // for the voffset field is a multiple of 4096, and it stands more chance
3759 // of being CSEd with the copy/add for another similar load/store.
3760 // However, do not do that rounding down to a multiple of 4096 if that is a
3761 // negative number, as it appears to be illegal to have a negative offset
3762 // in the vgpr, even if adding the immediate offset makes it positive.
3763 unsigned Overflow = ImmOffset & ~MaxImm;
3764 ImmOffset -= Overflow;
3765 if ((int32_t)Overflow < 0) {
3766 Overflow += ImmOffset;
3767 ImmOffset = 0;
3768 }
3769
3770 if (Overflow != 0) {
3771 if (!BaseReg) {
3772 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3773 } else {
3774 auto OverflowVal = B.buildConstant(S32, Overflow);
3775 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3776 }
3777 }
3778
3779 if (!BaseReg)
3780 BaseReg = B.buildConstant(S32, 0).getReg(0);
3781
3782 return std::make_pair(BaseReg, ImmOffset);
3783}
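The arithmetic above splits a constant buffer offset between the 12-bit immoffset field (max 4095) and the voffset add, with a special case so a negative multiple of 4096 never lands in the VGPR. A scalar sketch of just that arithmetic (invented name; register creation and CSE concerns are omitted):

  #include <cstdint>
  #include <utility>

  // Returns {amount to add to voffset, value for the immoffset field}.
  static std::pair<uint32_t, uint32_t> splitBufferOffsetModel(uint32_t ImmOffset) {
    const uint32_t MaxImm = 4095;
    uint32_t Overflow = ImmOffset & ~MaxImm; // multiple of 4096 for the voffset
    ImmOffset -= Overflow;                   // low 12 bits stay in immoffset
    if (static_cast<int32_t>(Overflow) < 0) {
      // Don't leave a negative value in the vgpr offset; fold everything back.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    return {Overflow, ImmOffset};
  }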
3784
3785/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3786void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3787 Register VOffset, Register SOffset,
3788 unsigned ImmOffset, Register VIndex,
3789 MachineRegisterInfo &MRI) const {
3790 Optional<ValueAndVReg> MaybeVOffsetVal =
3791 getIConstantVRegValWithLookThrough(VOffset, MRI);
3792 Optional<ValueAndVReg> MaybeSOffsetVal =
3793 getIConstantVRegValWithLookThrough(SOffset, MRI);
3794 Optional<ValueAndVReg> MaybeVIndexVal =
3795 getIConstantVRegValWithLookThrough(VIndex, MRI);
3796 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3797 // update the MMO with that offset. The stride is unknown so we can only do
3798 // this if VIndex is constant 0.
3799 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3800 MaybeVIndexVal->Value == 0) {
3801 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3802 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3803 MMO->setOffset(TotalOffset);
3804 } else {
3805 // We don't have a constant combined offset to use in the MMO. Give up.
3806 MMO->setValue((Value *)nullptr);
3807 }
3808}
3809
3810/// Handle register layout difference for f16 images for some subtargets.
3811Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3812 MachineRegisterInfo &MRI,
3813 Register Reg,
3814 bool ImageStore) const {
3815 const LLT S16 = LLT::scalar(16);
3816 const LLT S32 = LLT::scalar(32);
3817 LLT StoreVT = MRI.getType(Reg);
3818 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3819
3820 if (ST.hasUnpackedD16VMem()) {
3821 auto Unmerge = B.buildUnmerge(S16, Reg);
3822
3823 SmallVector<Register, 4> WideRegs;
3824 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3825 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3826
3827 int NumElts = StoreVT.getNumElements();
3828
3829 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3830 .getReg(0);
3831 }
3832
3833 if (ImageStore && ST.hasImageStoreD16Bug()) {
3834 if (StoreVT.getNumElements() == 2) {
3835 SmallVector<Register, 4> PackedRegs;
3836 Reg = B.buildBitcast(S32, Reg).getReg(0);
3837 PackedRegs.push_back(Reg);
3838 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3839 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3840 .getReg(0);
3841 }
3842
3843 if (StoreVT.getNumElements() == 3) {
3844 SmallVector<Register, 4> PackedRegs;
3845 auto Unmerge = B.buildUnmerge(S16, Reg);
3846 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3847 PackedRegs.push_back(Unmerge.getReg(I));
3848 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3849 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3850 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3851 }
3852
3853 if (StoreVT.getNumElements() == 4) {
3854 SmallVector<Register, 4> PackedRegs;
3855 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3856 auto Unmerge = B.buildUnmerge(S32, Reg);
3857 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3858 PackedRegs.push_back(Unmerge.getReg(I));
3859 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3860 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3861 .getReg(0);
3862 }
3863
3864 llvm_unreachable("invalid data type");
3865 }
3866
3867 if (StoreVT == LLT::fixed_vector(3, S16)) {
3868 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3869 .getReg(0);
3870 }
3871 return Reg;
3872}
3873
3874Register AMDGPULegalizerInfo::fixStoreSourceType(
3875 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3876 MachineRegisterInfo *MRI = B.getMRI();
3877 LLT Ty = MRI->getType(VData);
3878
3879 const LLT S16 = LLT::scalar(16);
3880
3881 // Fixup illegal register types for i8 stores.
3882 if (Ty == LLT::scalar(8) || Ty == S16) {
3883 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3884 return AnyExt;
3885 }
3886
3887 if (Ty.isVector()) {
3888 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3889 if (IsFormat)
3890 return handleD16VData(B, *MRI, VData);
3891 }
3892 }
3893
3894 return VData;
3895}
3896
3897bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3898 MachineRegisterInfo &MRI,
3899 MachineIRBuilder &B,
3900 bool IsTyped,
3901 bool IsFormat) const {
3902 Register VData = MI.getOperand(1).getReg();
3903 LLT Ty = MRI.getType(VData);
3904 LLT EltTy = Ty.getScalarType();
3905 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3906 const LLT S32 = LLT::scalar(32);
3907
3908 VData = fixStoreSourceType(B, VData, IsFormat);
3909 Register RSrc = MI.getOperand(2).getReg();
3910
3911 MachineMemOperand *MMO = *MI.memoperands_begin();
3912 const int MemSize = MMO->getSize();
3913
3914 unsigned ImmOffset;
3915
3916 // The typed intrinsics add an immediate after the registers.
3917 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3918
3919 // The struct intrinsic variants add one additional operand over raw.
3920 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3921 Register VIndex;
3922 int OpOffset = 0;
3923 if (HasVIndex) {
3924 VIndex = MI.getOperand(3).getReg();
3925 OpOffset = 1;
3926 } else {
3927 VIndex = B.buildConstant(S32, 0).getReg(0);
3928 }
3929
3930 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3931 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3932
3933 unsigned Format = 0;
3934 if (IsTyped) {
3935 Format = MI.getOperand(5 + OpOffset).getImm();
3936 ++OpOffset;
3937 }
3938
3939 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3940
3941 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3942 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3943
3944 unsigned Opc;
3945 if (IsTyped) {
3946 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3947 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3948 } else if (IsFormat) {
3949 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3950 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3951 } else {
3952 switch (MemSize) {
3953 case 1:
3954 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3955 break;
3956 case 2:
3957 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3958 break;
3959 default:
3960 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3961 break;
3962 }
3963 }
3964
3965 auto MIB = B.buildInstr(Opc)
3966 .addUse(VData) // vdata
3967 .addUse(RSrc) // rsrc
3968 .addUse(VIndex) // vindex
3969 .addUse(VOffset) // voffset
3970 .addUse(SOffset) // soffset
3971 .addImm(ImmOffset); // offset(imm)
3972
3973 if (IsTyped)
3974 MIB.addImm(Format);
3975
3976 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3977 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3978 .addMemOperand(MMO);
3979
3980 MI.eraseFromParent();
3981 return true;
3982}
3983
3984bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3985 MachineRegisterInfo &MRI,
3986 MachineIRBuilder &B,
3987 bool IsFormat,
3988 bool IsTyped) const {
3989 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3990 MachineMemOperand *MMO = *MI.memoperands_begin();
3991 const LLT MemTy = MMO->getMemoryType();
3992 const LLT S32 = LLT::scalar(32);
3993
3994 Register Dst = MI.getOperand(0).getReg();
3995 Register RSrc = MI.getOperand(2).getReg();
3996
3997 // The typed intrinsics add an immediate after the registers.
3998 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3999
4000 // The struct intrinsic variants add one additional operand over raw.
4001 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4002 Register VIndex;
4003 int OpOffset = 0;
4004 if (HasVIndex) {
4005 VIndex = MI.getOperand(3).getReg();
4006 OpOffset = 1;
4007 } else {
4008 VIndex = B.buildConstant(S32, 0).getReg(0);
4009 }
4010
4011 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4012 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4013
4014 unsigned Format = 0;
4015 if (IsTyped) {
4016 Format = MI.getOperand(5 + OpOffset).getImm();
4017 ++OpOffset;
4018 }
4019
4020 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4021 unsigned ImmOffset;
4022
4023 LLT Ty = MRI.getType(Dst);
4024 LLT EltTy = Ty.getScalarType();
4025 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4026 const bool Unpacked = ST.hasUnpackedD16VMem();
4027
4028 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4029 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4030
4031 unsigned Opc;
4032
4033 if (IsTyped) {
4034 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4035 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4036 } else if (IsFormat) {
4037 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4038 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4039 } else {
4040 switch (MemTy.getSizeInBits()) {
4041 case 8:
4042 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4043 break;
4044 case 16:
4045 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4046 break;
4047 default:
4048 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4049 break;
4050 }
4051 }
4052
4053 Register LoadDstReg;
4054
4055 bool IsExtLoad =
4056 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4057 LLT UnpackedTy = Ty.changeElementSize(32);
4058
4059 if (IsExtLoad)
4060 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4061 else if (Unpacked && IsD16 && Ty.isVector())
4062 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4063 else
4064 LoadDstReg = Dst;
4065
4066 auto MIB = B.buildInstr(Opc)
4067 .addDef(LoadDstReg) // vdata
4068 .addUse(RSrc) // rsrc
4069 .addUse(VIndex) // vindex
4070 .addUse(VOffset) // voffset
4071 .addUse(SOffset) // soffset
4072 .addImm(ImmOffset); // offset(imm)
4073
4074 if (IsTyped)
4075 MIB.addImm(Format);
4076
4077 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4078 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4079 .addMemOperand(MMO);
4080
4081 if (LoadDstReg != Dst) {
4082 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4083
4084 // The result for an extending load was widened; truncate it back down.
4085 if (IsExtLoad)
4086 B.buildTrunc(Dst, LoadDstReg);
4087 else {
4088 // Repack to original 16-bit vector result
4089 // FIXME: G_TRUNC should work, but legalization currently fails
4090 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4091 SmallVector<Register, 4> Repack;
4092 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4093 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4094 B.buildMerge(Dst, Repack);
4095 }
4096 }
4097
4098 MI.eraseFromParent();
4099 return true;
4100}
4101
4102bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4103 MachineIRBuilder &B,
4104 bool IsInc) const {
4105 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4106 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4107 B.buildInstr(Opc)
4108 .addDef(MI.getOperand(0).getReg())
4109 .addUse(MI.getOperand(2).getReg())
4110 .addUse(MI.getOperand(3).getReg())
4111 .cloneMemRefs(MI);
4112 MI.eraseFromParent();
4113 return true;
4114}
4115
4116static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4117 switch (IntrID) {
4118 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4119 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4120 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4121 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4122 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4123 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4124 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4125 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4126 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4127 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4128 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4130 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4131 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4132 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4133 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4134 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4135 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4136 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4137 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4138 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4139 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4140 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4141 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4142 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4143 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4145 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4146 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4147 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4148 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4149 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4150 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4151 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4152 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4153 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4154 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4155 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4156 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4157 case Intrinsic::amdgcn_buffer_atomic_fadd:
4158 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4159 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4160 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4161 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4162 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4163 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4164 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4165 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4166 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4167 default:
4168 llvm_unreachable("unhandled atomic opcode");
4169 }
4170}
4171
4172bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4173 MachineIRBuilder &B,
4174 Intrinsic::ID IID) const {
4175 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4176 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4177 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4178
4179 Register Dst;
4180
4181 int OpOffset = 0;
4182 if (HasReturn) {
4183 // A few FP atomics do not support return values.
4184 Dst = MI.getOperand(0).getReg();
4185 } else {
4186 OpOffset = -1;
4187 }
4188
4189 Register VData = MI.getOperand(2 + OpOffset).getReg();
4190 Register CmpVal;
4191
4192 if (IsCmpSwap) {
4193 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4194 ++OpOffset;
4195 }
4196
4197 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4198 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4199
4200 // The struct intrinsic variants add one additional operand over raw.
4201 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4202 Register VIndex;
4203 if (HasVIndex) {
4204 VIndex = MI.getOperand(4 + OpOffset).getReg();
4205 ++OpOffset;
4206 } else {
4207 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4208 }
4209
4210 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4211 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4212 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4213
4214 MachineMemOperand *MMO = *MI.memoperands_begin();
4215
4216 unsigned ImmOffset;
4217 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4218 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4219
4220 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4221
4222 if (HasReturn)
4223 MIB.addDef(Dst);
4224
4225 MIB.addUse(VData); // vdata
4226
4227 if (IsCmpSwap)
4228 MIB.addReg(CmpVal);
4229
4230 MIB.addUse(RSrc) // rsrc
4231 .addUse(VIndex) // vindex
4232 .addUse(VOffset) // voffset
4233 .addUse(SOffset) // soffset
4234 .addImm(ImmOffset) // offset(imm)
4235 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4236 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4237 .addMemOperand(MMO);
4238
4239 MI.eraseFromParent();
4240 return true;
4241}
4242
4243/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4244/// vector with s16 typed elements.
4245static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4246 SmallVectorImpl<Register> &PackedAddrs,
4247 unsigned ArgOffset,
4248 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4249 bool IsA16, bool IsG16) {
4250 const LLT S16 = LLT::scalar(16);
4251 const LLT V2S16 = LLT::fixed_vector(2, 16);
4252 auto EndIdx = Intr->VAddrEnd;
4253
4254 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4255 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4256 if (!SrcOp.isReg())
4257 continue; // _L to _LZ may have eliminated this.
4258
4259 Register AddrReg = SrcOp.getReg();
4260
4261 if ((I < Intr->GradientStart) ||
4262 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4263 (I >= Intr->CoordStart && !IsA16)) {
4264 // Handle any gradient or coordinate operands that should not be packed
4265 if ((I < Intr->GradientStart) && IsA16 &&
4266 (B.getMRI()->getType(AddrReg) == S16)) {
4267 // Special handling of bias when A16 is on. Bias is of type half but
4268 // occupies full 32-bit.
4269 PackedAddrs.push_back(
4270 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4271 .getReg(0));
4272 } else {
4273 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4274 PackedAddrs.push_back(AddrReg);
4275 }
4276 } else {
4277 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4278 // derivatives dx/dh and dx/dv are packed with undef.
4279 if (((I + 1) >= EndIdx) ||
4280 ((Intr->NumGradients / 2) % 2 == 1 &&
4281 (I == static_cast<unsigned>(Intr->GradientStart +
4282 (Intr->NumGradients / 2) - 1) ||
4283 I == static_cast<unsigned>(Intr->GradientStart +
4284 Intr->NumGradients - 1))) ||
4285 // Check for _L to _LZ optimization
4286 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4287 PackedAddrs.push_back(
4288 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4289 .getReg(0));
4290 } else {
4291 PackedAddrs.push_back(
4292 B.buildBuildVector(
4293 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4294 .getReg(0));
4295 ++I;
4296 }
4297 }
4298 }
4299}
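At its core, the packing above pairs consecutive s16 address operands into <2 x s16> registers, padding a trailing odd element with an undefined lane. A deliberately simplified sketch of just that pairing (invented name; it ignores the bias special case and the gradient/coordinate splitting handled above, and uses 0 as a stand-in for the undef lane):

  #include <cstdint>
  #include <vector>

  static std::vector<uint32_t> packA16Model(const std::vector<uint16_t> &Addrs) {
    std::vector<uint32_t> Packed;
    for (size_t I = 0; I < Addrs.size(); I += 2) {
      uint16_t Lo = Addrs[I];
      uint16_t Hi = (I + 1 < Addrs.size()) ? Addrs[I + 1] : 0; // "undef" lane
      Packed.push_back(static_cast<uint32_t>(Lo) |
                       (static_cast<uint32_t>(Hi) << 16));
    }
    return Packed;
  }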
4300
4301/// Convert from separate vaddr components to a single vector address register,
4302/// and replace the remaining operands with $noreg.
4303static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4304 int DimIdx, int NumVAddrs) {
4305 const LLT S32 = LLT::scalar(32);
4306
4307 SmallVector<Register, 8> AddrRegs;
4308 for (int I = 0; I != NumVAddrs; ++I) {
4309 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4310 if (SrcOp.isReg()) {
4311 AddrRegs.push_back(SrcOp.getReg());
4312 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4313 }
4314 }
4315
4316 int NumAddrRegs = AddrRegs.size();
4317 if (NumAddrRegs != 1) {
4318 // Above 8 elements round up to next power of 2 (i.e. 16).
4319 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4320 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4321 auto Undef = B.buildUndef(S32);
4322 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4323 NumAddrRegs = RoundedNumRegs;
4324 }
4325
4326 auto VAddr =
4327 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4328 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4329 }
4330
4331 for (int I = 1; I != NumVAddrs; ++I) {
4332 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4333 if (SrcOp.isReg())
4334 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4335 }
4336}
4337
4338/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4339///
4340/// Depending on the subtarget, load/store with 16-bit element data need to be
4341/// rewritten to use the low half of 32-bit registers, or directly use a packed
4342/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4343/// registers.
4344///
4345/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
4348/// defining a multitude of intermediate image instructions, directly hack on
4349/// the intrinsic's arguments. In cases like a16 addresses, this requires
4350/// padding now unnecessary arguments with $noreg.
4351bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4352 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4353 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4354
4355 const unsigned NumDefs = MI.getNumExplicitDefs();
4356 const unsigned ArgOffset = NumDefs + 1;
4357 bool IsTFE = NumDefs == 2;
4358 // We are only processing the operands of d16 image operations on subtargets
4359 // that use the unpacked register layout, or need to repack the TFE result.
4360
4361 // TODO: Do we need to guard against already legalized intrinsics?
4362 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4363 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4364
4365 MachineRegisterInfo *MRI = B.getMRI();
4366 const LLT S32 = LLT::scalar(32);
4367 const LLT S16 = LLT::scalar(16);
4368 const LLT V2S16 = LLT::fixed_vector(2, 16);
4369
4370 unsigned DMask = 0;
4371 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4372 LLT Ty = MRI->getType(VData);
4373
4374 // Check for 16 bit addresses and pack if true.
4375 LLT GradTy =
4376 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4377 LLT AddrTy =
4378 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4379 const bool IsG16 = GradTy == S16;
4380 const bool IsA16 = AddrTy == S16;
4381 const bool IsD16 = Ty.getScalarType() == S16;
4382
4383 int DMaskLanes = 0;
4384 if (!BaseOpcode->Atomic) {
4385 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4386 if (BaseOpcode->Gather4) {
4387 DMaskLanes = 4;
4388 } else if (DMask != 0) {
4389 DMaskLanes = countPopulation(DMask);
4390 } else if (!IsTFE && !BaseOpcode->Store) {
4391 // If dmask is 0, this is a no-op load. This can be eliminated.
4392 B.buildUndef(MI.getOperand(0));
4393 MI.eraseFromParent();
4394 return true;
4395 }
4396 }
4397
4398 Observer.changingInstr(MI);
4399 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4400
4401 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4402 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4403 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4404 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4405 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4406
4407 // Track that we legalized this
4408 MI.setDesc(B.getTII().get(NewOpcode));
4409
4410 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4411 // dmask to be at least 1, otherwise the instruction will fail.
4412 if (IsTFE && DMask == 0) {
4413 DMask = 0x1;
4414 DMaskLanes = 1;
4415 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4416 }
4417
4418 if (BaseOpcode->Atomic) {
4419 Register VData0 = MI.getOperand(2).getReg();
4420 LLT Ty = MRI->getType(VData0);
4421
4422 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4423 if (Ty.isVector())
4424 return false;
4425
4426 if (BaseOpcode->AtomicX2) {
4427 Register VData1 = MI.getOperand(3).getReg();
4428 // The two values are packed in one register.
4429 LLT PackedTy = LLT::fixed_vector(2, Ty);
4430 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4431 MI.getOperand(2).setReg(Concat.getReg(0));
4432 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4433 }
4434 }
4435
4436 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4437
4438 // Optimize _L to _LZ when _L is zero
4439 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4440 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4441 const ConstantFP *ConstantLod;
4442
4443 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4444 m_GFCst(ConstantLod))) {
4445 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4446 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4447 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4448 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4449 Intr->Dim);
4450
4451 // The starting indexes should remain in the same place.
4452 --CorrectedNumVAddrs;
4453
4454 MI.getOperand(NumDefs).setIntrinsicID(
4455 static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4456 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4457 Intr = NewImageDimIntr;
4458 }
4459 }
4460 }
4461
4462 // Optimize _mip away, when 'lod' is zero
4463 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4464 int64_t ConstantLod;
4465 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4466 m_ICst(ConstantLod))) {
4467 if (ConstantLod == 0) {
4468 // TODO: Change the intrinsic opcode and remove the operand instead of replacing
4469 // it with 0, as the _L to _LZ handling is done above.
4470 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4471 --CorrectedNumVAddrs;
4472 }
4473 }
4474 }
4475
4476 // Rewrite the addressing register layout before doing anything else.
4477 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4478 // 16 bit gradients are supported, but are tied to the A16 control
4479 // so both gradients and addresses must be 16 bit
4480 return false;
4481 }
4482
4483 if (IsA16 && !ST.hasA16()) {
4484 // A16 not supported
4485 return false;
4486 }
4487
4488 if (IsA16 || IsG16) {
4489 if (Intr->NumVAddrs > 1) {
4490 SmallVector<Register, 4> PackedRegs;
4491
4492 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4493 IsG16);
4494
4495 // See also below in the non-a16 branch
4496 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4497 PackedRegs.size() <= ST.getNSAMaxSize();
4498
4499 if (!UseNSA && PackedRegs.size() > 1) {
4500 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4501 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4502 PackedRegs[0] = Concat.getReg(0);
4503 PackedRegs.resize(1);
4504 }
4505
4506 const unsigned NumPacked = PackedRegs.size();
4507 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4508 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4509 if (!SrcOp.isReg()) {
4510 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4511 continue;
4512 }
4513
4514 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4515
4516 if (I - Intr->VAddrStart < NumPacked)
4517 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4518 else
4519 SrcOp.setReg(AMDGPU::NoRegister);
4520 }
4521 }
4522 } else {
4523 // If the register allocator cannot place the address registers contiguously
4524 // without introducing moves, then using the non-sequential address encoding
4525 // is always preferable, since it saves VALU instructions and is usually a
4526 // wash in terms of code size or even better.
4527 //
4528 // However, we currently have no way of hinting to the register allocator
4529 // that MIMG addresses should be placed contiguously when it is possible to
4530 // do so, so force non-NSA for the common 2-address case as a heuristic.
4531 //
4532 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4533 // allocation when possible.
4534 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4535 CorrectedNumVAddrs <= ST.getNSAMaxSize();
4536
4537 if (!UseNSA && Intr->NumVAddrs > 1)
4538 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4539 Intr->NumVAddrs);
4540 }
4541
4542 int Flags = 0;
4543 if (IsA16)
4544 Flags |= 1;
4545 if (IsG16)
4546 Flags |= 2;
4547 MI.addOperand(MachineOperand::CreateImm(Flags));
4548
4549 if (BaseOpcode->Store) { // No TFE for stores?
4550 // TODO: Handle dmask trim
4551 if (!Ty.isVector() || !IsD16)
4552 return true;
4553
4554 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4555 if (RepackedReg != VData) {
4556 MI.getOperand(1).setReg(RepackedReg);
4557 }
4558
4559 return true;
4560 }
4561
4562 Register DstReg = MI.getOperand(0).getReg();
4563 const LLT EltTy = Ty.getScalarType();
4564 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4565
4566 // Confirm that the return type is large enough for the dmask specified
4567 if (NumElts < DMaskLanes)
4568 return false;
4569
4570 if (NumElts > 4 || DMaskLanes > 4)
4571 return false;
4572
4573 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4574 const LLT AdjustedTy =
4575 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4576
4577 // The raw dword aligned data component of the load. The only legal cases
4578 // where this matters should be when using the packed D16 format, for
4579 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4580 LLT RoundedTy;
4581
4582 // S32 vector to cover all data, plus TFE result element.
4583 LLT TFETy;
4584
4585 // Register type to use for each loaded component. Will be S32 or V2S16.
4586 LLT RegTy;
4587
4588 if (IsD16 && ST.hasUnpackedD16VMem()) {
4589 RoundedTy =
4590 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4591 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4592 RegTy = S32;
4593 } else {
4594 unsigned EltSize = EltTy.getSizeInBits();
4595 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4596 unsigned RoundedSize = 32 * RoundedElts;
4597 RoundedTy = LLT::scalarOrVector(
4598 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4599 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4600 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4601 }
4602
4603 // The return type does not need adjustment.
4604 // TODO: Should we change s16 case to s32 or <2 x s16>?
4605 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4606 return true;
4607
4608 Register Dst1Reg;
4609
4610 // Insert after the instruction.
4611 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4612
4613 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4614 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4615 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4616 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4617
4618 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4619
4620 MI.getOperand(0).setReg(NewResultReg);
4621
4622 // In the IR, TFE is supposed to be used with a 2 element struct return
4623 // type. The instruction really returns these two values in one contiguous
4624 // register, with one additional dword beyond the loaded data. Rewrite the
4625 // return type to use a single register result.
4626
4627 if (IsTFE) {
4628 Dst1Reg = MI.getOperand(1).getReg();
4629 if (MRI->getType(Dst1Reg) != S32)
4630 return false;
4631
4632 // TODO: Make sure the TFE operand bit is set.
4633 MI.RemoveOperand(1);
4634
4635 // Handle the easy case that requires no repack instructions.
4636 if (Ty == S32) {
4637 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4638 return true;
4639 }
4640 }
4641
4642 // Now figure out how to copy the new result register back into the old
4643 // result.
4644 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4645
4646 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4647
4648 if (ResultNumRegs == 1) {
4649 assert(!IsTFE);
4650 ResultRegs[0] = NewResultReg;
4651 } else {
4652 // We have to repack into a new vector of some kind.
4653 for (int I = 0; I != NumDataRegs; ++I)
4654 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4655 B.buildUnmerge(ResultRegs, NewResultReg);
4656
4657 // Drop the final TFE element to get the data part. The TFE result is
4658 // directly written to the right place already.
4659 if (IsTFE)
4660 ResultRegs.resize(NumDataRegs);
4661 }
4662
4663 // For an s16 scalar result, we form an s32 result with a truncate regardless
4664 // of packed vs. unpacked.
4665 if (IsD16 && !Ty.isVector()) {
4666 B.buildTrunc(DstReg, ResultRegs[0]);
4667 return true;
4668 }
4669
4670 // Avoid a build/concat_vector of 1 entry.
4671 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4672 B.buildBitcast(DstReg, ResultRegs[0]);
4673 return true;
4674 }
4675
4676 assert(Ty.isVector());
4677
4678 if (IsD16) {
4679 // For packed D16 results with TFE enabled, all the data components are
4680 // S32. Cast back to the expected type.
4681 //
4682 // TODO: We don't really need to load s32 elements. We would only need one
4683 // cast for the TFE result if a multiple of v2s16 was used.
4684 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4685 for (Register &Reg : ResultRegs)
4686 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4687 } else if (ST.hasUnpackedD16VMem()) {
4688 for (Register &Reg : ResultRegs)
4689 Reg = B.buildTrunc(S16, Reg).getReg(0);
4690 }
4691 }
4692
4693 auto padWithUndef = [&](LLT Ty, int NumElts) {
4694 if (NumElts == 0)
4695 return;
4696 Register Undef = B.buildUndef(Ty).getReg(0);
4697 for (int I = 0; I != NumElts; ++I)
4698 ResultRegs.push_back(Undef);
4699 };
4700
4701 // Pad out any elements eliminated due to the dmask.
4702 LLT ResTy = MRI->getType(ResultRegs[0]);
4703 if (!ResTy.isVector()) {
4704 padWithUndef(ResTy, NumElts - ResultRegs.size());
4705 B.buildBuildVector(DstReg, ResultRegs);
4706 return true;
4707 }
4708
4709 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4710 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4711
4712 // Deal with the one annoying legal case.
4713 const LLT V3S16 = LLT::fixed_vector(3, 16);
4714 if (Ty == V3S16) {
4715 if (IsTFE) {
4716 if (ResultRegs.size() == 1) {
4717 NewResultReg = ResultRegs[0];
4718 } else if (ResultRegs.size() == 2) {
4719 LLT V4S16 = LLT::fixed_vector(4, 16);
4720 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4721 } else {
4722 return false;
4723 }
4724 }
4725
4726 if (MRI->getType(DstReg).getNumElements() <
4727 MRI->getType(NewResultReg).getNumElements()) {
4728 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4729 } else {
4730 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4731 }
4732 return true;
4733 }
4734
4735 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4736 B.buildConcatVectors(DstReg, ResultRegs);
4737 return true;
4738}
4739
4740bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4741 LegalizerHelper &Helper, MachineInstr &MI) const {
4742 MachineIRBuilder &B = Helper.MIRBuilder;
4743 GISelChangeObserver &Observer = Helper.Observer;
4744
4745 Register Dst = MI.getOperand(0).getReg();
4746 LLT Ty = B.getMRI()->getType(Dst);
4747 unsigned Size = Ty.getSizeInBits();
4748 MachineFunction &MF = B.getMF();
4749
4750 Observer.changingInstr(MI);
4751
4752 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4753 Ty = getBitcastRegisterType(Ty);
4754 Helper.bitcastDst(MI, Ty, 0);
4755 Dst = MI.getOperand(0).getReg();
4756 B.setInsertPt(B.getMBB(), MI);
4757 }
4758
4759 // FIXME: We don't really need this intermediate instruction. The intrinsic
4760 // should be fixed to have a memory operand. Since it's readnone, we're not
4761 // allowed to add one.
4762 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4763 MI.RemoveOperand(1); // Remove intrinsic ID
4764
4765 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4766 // TODO: Should this use datalayout alignment?
4767 const unsigned MemSize = (Size + 7) / 8;
4768 const Align MemAlign(4);
4769 MachineMemOperand *MMO = MF.getMachineMemOperand(
4770 MachinePointerInfo(),
4771 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4772 MachineMemOperand::MOInvariant,
4773 MemSize, MemAlign);
4774 MI.addMemOperand(MF, MMO);
4775
4776 // There are no 96-bit result scalar loads, but widening to 128-bit should
4777 // always be legal. We may need to restore this to a 96-bit result if it turns
4778 // out this needs to be converted to a vector load during RegBankSelect.
4779 if (!isPowerOf2_32(Size)) {
4780 if (Ty.isVector())
4781 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4782 else
4783 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4784 }
4785
4786 Observer.changedInstr(MI);
4787 return true;
4788}
4789
4790// TODO: Move to selection
4791bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4792 MachineRegisterInfo &MRI,
4793 MachineIRBuilder &B) const {
4794 if (!ST.isTrapHandlerEnabled() ||
4795 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4796 return legalizeTrapEndpgm(MI, MRI, B);
4797
4798 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4799 switch (*HsaAbiVer) {
4800 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4801 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4802 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4803 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4804 return ST.supportsGetDoorbellID() ?
4805 legalizeTrapHsa(MI, MRI, B) :
4806 legalizeTrapHsaQueuePtr(MI, MRI, B);
4807 }
4808 }
4809
4810 llvm_unreachable("Unknown trap handler")::llvm::llvm_unreachable_internal("Unknown trap handler", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 4810)
;
4811}
4812
4813bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4814 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4815 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4816 MI.eraseFromParent();
4817 return true;
4818}
4819
4820bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4821 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4822 // Pass queue pointer to trap handler as input, and insert trap instruction
4823 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4824 Register LiveIn =
4825 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4826 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4827 return false;
4828
4829 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4830 B.buildCopy(SGPR01, LiveIn);
4831 B.buildInstr(AMDGPU::S_TRAP)
4832 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4833 .addReg(SGPR01, RegState::Implicit);
4834
4835 MI.eraseFromParent();
4836 return true;
4837}
4838
4839bool AMDGPULegalizerInfo::legalizeTrapHsa(
4840 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4841 B.buildInstr(AMDGPU::S_TRAP)
4842 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4843 MI.eraseFromParent();
4844 return true;
4845}
4846
4847bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4849 // If this is a non-HSA path or the trap handler is disabled, report a
4850 // warning accordingly
4851 if (!ST.isTrapHandlerEnabled() ||
4852 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4853 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4854 "debugtrap handler not supported",
4855 MI.getDebugLoc(), DS_Warning);
4856 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4857 Ctx.diagnose(NoTrap);
4858 } else {
4859 // Insert debug-trap instruction
4860 B.buildInstr(AMDGPU::S_TRAP)
4861 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4862 }
4863
4864 MI.eraseFromParent();
4865 return true;
4866}
4867
4868bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4869 MachineIRBuilder &B) const {
4870 MachineRegisterInfo &MRI = *B.getMRI();
4871 const LLT S16 = LLT::scalar(16);
4872 const LLT S32 = LLT::scalar(32);
4873
4874 Register DstReg = MI.getOperand(0).getReg();
4875 Register NodePtr = MI.getOperand(2).getReg();
4876 Register RayExtent = MI.getOperand(3).getReg();
4877 Register RayOrigin = MI.getOperand(4).getReg();
4878 Register RayDir = MI.getOperand(5).getReg();
4879 Register RayInvDir = MI.getOperand(6).getReg();
4880 Register TDescr = MI.getOperand(7).getReg();
4881
4882 if (!ST.hasGFX10_AEncoding()) {
4883 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4884 "intrinsic not supported on subtarget",
4885 MI.getDebugLoc());
4886 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4887 return false;
4888 }
4889
4890 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4891 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4892 const unsigned NumVDataDwords = 4;
4893 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4894 const bool UseNSA =
4895 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4896 const unsigned BaseOpcodes[2][2] = {
4897 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4898 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4899 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4900 int Opcode;
4901 if (UseNSA) {
4902 Opcode =
4903 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4904 NumVDataDwords, NumVAddrDwords);
4905 } else {
4906 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4907 AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4908 PowerOf2Ceil(NumVAddrDwords));
4909 }
4910 assert(Opcode != -1);
4911
4912 SmallVector<Register, 12> Ops;
4913 if (Is64) {
4914 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4915 Ops.push_back(Unmerge.getReg(0));
4916 Ops.push_back(Unmerge.getReg(1));
4917 } else {
4918 Ops.push_back(NodePtr);
4919 }
4920 Ops.push_back(RayExtent);
4921
4922 auto packLanes = [&Ops, &S32, &B](Register Src) {
4923 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4924 Ops.push_back(Unmerge.getReg(0));
4925 Ops.push_back(Unmerge.getReg(1));
4926 Ops.push_back(Unmerge.getReg(2));
4927 };
4928
4929 packLanes(RayOrigin);
4930 if (IsA16) {
4931 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4932 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4933 Register R1 = MRI.createGenericVirtualRegister(S32);
4934 Register R2 = MRI.createGenericVirtualRegister(S32);
4935 Register R3 = MRI.createGenericVirtualRegister(S32);
4936 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4937 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4938 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4939 Ops.push_back(R1);
4940 Ops.push_back(R2);
4941 Ops.push_back(R3);
4942 } else {
4943 packLanes(RayDir);
4944 packLanes(RayInvDir);
4945 }
4946
4947 if (!UseNSA) {
4948 // Build a single vector containing all the operands so far prepared.
4949 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4950 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4951 Ops.clear();
4952 Ops.push_back(MergedOps);
4953 }
4954
4955 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4956 .addDef(DstReg)
4957 .addImm(Opcode);
4958
4959 for (Register R : Ops) {
4960 MIB.addUse(R);
4961 }
4962
4963 MIB.addUse(TDescr)
4964 .addImm(IsA16 ? 1 : 0)
4965 .cloneMemRefs(MI);
4966
4967 MI.eraseFromParent();
4968 return true;
4969}
4970
4971static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4972 B.buildConstant(MI.getOperand(0).getReg(), C);
4973 MI.eraseFromParent();
4974 return true;
4975}
4976
4977bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4978 MachineInstr &MI) const {
4979 MachineIRBuilder &B = Helper.MIRBuilder;
4980 MachineRegisterInfo &MRI = *B.getMRI();
4981
4982 // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
4983 auto IntrID = MI.getIntrinsicID();
4984 switch (IntrID) {
1
Control jumps to 'case amdgcn_dispatch_id:' at line 5114
4985 case Intrinsic::amdgcn_if:
4986 case Intrinsic::amdgcn_else: {
4987 MachineInstr *Br = nullptr;
4988 MachineBasicBlock *UncondBrTarget = nullptr;
4989 bool Negated = false;
4990 if (MachineInstr *BrCond =
4991 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4992 const SIRegisterInfo *TRI
4993 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4994
4995 Register Def = MI.getOperand(1).getReg();
4996 Register Use = MI.getOperand(3).getReg();
4997
4998 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4999
5000 if (Negated)
5001 std::swap(CondBrTarget, UncondBrTarget);
5002
5003 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5004 if (IntrID == Intrinsic::amdgcn_if) {
5005 B.buildInstr(AMDGPU::SI_IF)
5006 .addDef(Def)
5007 .addUse(Use)
5008 .addMBB(UncondBrTarget);
5009 } else {
5010 B.buildInstr(AMDGPU::SI_ELSE)
5011 .addDef(Def)
5012 .addUse(Use)
5013 .addMBB(UncondBrTarget);
5014 }
5015
5016 if (Br) {
5017 Br->getOperand(0).setMBB(CondBrTarget);
5018 } else {
5019 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5020 // since we're swapping branch targets it needs to be reinserted.
5021 // FIXME: IRTranslator should probably not do this
5022 B.buildBr(*CondBrTarget);
5023 }
5024
5025 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5026 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5027 MI.eraseFromParent();
5028 BrCond->eraseFromParent();
5029 return true;
5030 }
5031
5032 return false;
5033 }
5034 case Intrinsic::amdgcn_loop: {
5035 MachineInstr *Br = nullptr;
5036 MachineBasicBlock *UncondBrTarget = nullptr;
5037 bool Negated = false;
5038 if (MachineInstr *BrCond =
5039 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5040 const SIRegisterInfo *TRI
5041 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5042
5043 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5044 Register Reg = MI.getOperand(2).getReg();
5045
5046 if (Negated)
5047 std::swap(CondBrTarget, UncondBrTarget);
5048
5049 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5050 B.buildInstr(AMDGPU::SI_LOOP)
5051 .addUse(Reg)
5052 .addMBB(UncondBrTarget);
5053
5054 if (Br)
5055 Br->getOperand(0).setMBB(CondBrTarget);
5056 else
5057 B.buildBr(*CondBrTarget);
5058
5059 MI.eraseFromParent();
5060 BrCond->eraseFromParent();
5061 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5062 return true;
5063 }
5064
5065 return false;
5066 }
5067 case Intrinsic::amdgcn_kernarg_segment_ptr:
5068 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5069 // This only makes sense to call in a kernel, so just lower to null.
5070 B.buildConstant(MI.getOperand(0).getReg(), 0);
5071 MI.eraseFromParent();
5072 return true;
5073 }
5074
5075 return legalizePreloadedArgIntrin(
5076 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5077 case Intrinsic::amdgcn_implicitarg_ptr:
5078 return legalizeImplicitArgPtr(MI, MRI, B);
5079 case Intrinsic::amdgcn_workitem_id_x:
5080 if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5081 return replaceWithConstant(B, MI, 0);
5082 return legalizePreloadedArgIntrin(MI, MRI, B,
5083 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5084 case Intrinsic::amdgcn_workitem_id_y:
5085 if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5086 return replaceWithConstant(B, MI, 0);
5087
5088 return legalizePreloadedArgIntrin(MI, MRI, B,
5089 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5090 case Intrinsic::amdgcn_workitem_id_z:
5091 if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5092 return replaceWithConstant(B, MI, 0);
5093
5094 return legalizePreloadedArgIntrin(MI, MRI, B,
5095 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5096 case Intrinsic::amdgcn_workgroup_id_x:
5097 return legalizePreloadedArgIntrin(MI, MRI, B,
5098 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5099 case Intrinsic::amdgcn_workgroup_id_y:
5100 return legalizePreloadedArgIntrin(MI, MRI, B,
5101 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5102 case Intrinsic::amdgcn_workgroup_id_z:
5103 return legalizePreloadedArgIntrin(MI, MRI, B,
5104 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5105 case Intrinsic::amdgcn_dispatch_ptr:
5106 return legalizePreloadedArgIntrin(MI, MRI, B,
5107 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5108 case Intrinsic::amdgcn_queue_ptr:
5109 return legalizePreloadedArgIntrin(MI, MRI, B,
5110 AMDGPUFunctionArgInfo::QUEUE_PTR);
5111 case Intrinsic::amdgcn_implicit_buffer_ptr:
5112 return legalizePreloadedArgIntrin(
5113 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5114 case Intrinsic::amdgcn_dispatch_id:
5115 return legalizePreloadedArgIntrin(MI, MRI, B,
2
Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
5116 AMDGPUFunctionArgInfo::DISPATCH_ID);
5117 case Intrinsic::amdgcn_fdiv_fast:
5118 return legalizeFDIVFastIntrin(MI, MRI, B);
5119 case Intrinsic::amdgcn_is_shared:
5120 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5121 case Intrinsic::amdgcn_is_private:
5122 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5123 case Intrinsic::amdgcn_wavefrontsize: {
5124 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5125 MI.eraseFromParent();
5126 return true;
5127 }
5128 case Intrinsic::amdgcn_s_buffer_load:
5129 return legalizeSBufferLoad(Helper, MI);
5130 case Intrinsic::amdgcn_raw_buffer_store:
5131 case Intrinsic::amdgcn_struct_buffer_store:
5132 return legalizeBufferStore(MI, MRI, B, false, false);
5133 case Intrinsic::amdgcn_raw_buffer_store_format:
5134 case Intrinsic::amdgcn_struct_buffer_store_format:
5135 return legalizeBufferStore(MI, MRI, B, false, true);
5136 case Intrinsic::amdgcn_raw_tbuffer_store:
5137 case Intrinsic::amdgcn_struct_tbuffer_store:
5138 return legalizeBufferStore(MI, MRI, B, true, true);
5139 case Intrinsic::amdgcn_raw_buffer_load:
5140 case Intrinsic::amdgcn_struct_buffer_load:
5141 return legalizeBufferLoad(MI, MRI, B, false, false);
5142 case Intrinsic::amdgcn_raw_buffer_load_format:
5143 case Intrinsic::amdgcn_struct_buffer_load_format:
5144 return legalizeBufferLoad(MI, MRI, B, true, false);
5145 case Intrinsic::amdgcn_raw_tbuffer_load:
5146 case Intrinsic::amdgcn_struct_tbuffer_load:
5147 return legalizeBufferLoad(MI, MRI, B, true, true);
5148 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5149 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5150 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5151 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5152 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5153 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5154 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5155 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5156 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5157 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5158 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5159 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5160 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5161 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5162 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5163 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5164 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5165 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5166 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5167 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5168 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5169 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5170 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5171 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5172 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5173 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5174 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5175 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5176 case Intrinsic::amdgcn_buffer_atomic_fadd:
5177 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5178 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5179 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5180 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5181 return legalizeBufferAtomic(MI, B, IntrID);
5182 case Intrinsic::amdgcn_atomic_inc:
5183 return legalizeAtomicIncDec(MI, B, true);
5184 case Intrinsic::amdgcn_atomic_dec:
5185 return legalizeAtomicIncDec(MI, B, false);
5186 case Intrinsic::trap:
5187 return legalizeTrapIntrinsic(MI, MRI, B);
5188 case Intrinsic::debugtrap:
5189 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5190 case Intrinsic::amdgcn_rsq_clamp:
5191 return legalizeRsqClampIntrinsic(MI, MRI, B);
5192 case Intrinsic::amdgcn_ds_fadd:
5193 case Intrinsic::amdgcn_ds_fmin:
5194 case Intrinsic::amdgcn_ds_fmax:
5195 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5196 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5197 return legalizeBVHIntrinsic(MI, B);
5198 default: {
5199 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5200 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5201 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5202 return true;
5203 }
5204 }
5205
5206 return true;
5207}

/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
10
Assuming the condition is true
11
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
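The path notes above (steps 10 and 11) record isMasked() returning true for the argument descriptor being traced. For readers unfamiliar with how masked descriptors are consumed, here is a hypothetical sketch of unpacking a masked register argument; the helper name, include path, and guard are assumptions for illustration and are not the code this report analyzes. It also shows why a zero mask is the dangerous input: countTrailingZeros(0u) yields the full 32-bit width, which is not a valid shift amount for a 32-bit value.

// Hypothetical sketch only -- names and call site are invented for illustration.
#include "AMDGPUArgumentUsageInfo.h"       // assumed include path
#include "llvm/Support/MathExtras.h"

static unsigned extractMaskedArg(unsigned RegValue, const llvm::ArgDescriptor &Arg) {
  if (!Arg.isMasked())
    return RegValue;                       // whole register holds the value
  unsigned Mask = Arg.getMask();
  if (Mask == 0)                           // guard: countTrailingZeros(0u) == 32,
    return 0;                              // and shifting a 32-bit value by 32 is UB
  unsigned Shift = llvm::countTrailingZeros(Mask);
  return (RegValue & Mask) >> Shift;
}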
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
147 // into v0, 10 bits per dimension if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
15.1
'ZB' is not equal to ZB_Undefined
16
Assuming 'Val' is equal to 0
17
Taking true branch
117 return 32;
18
Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
15
Calling 'TrailingZerosCounter::count'
19
Returning from 'TrailingZerosCounter::count'
20
Returning the value 32
161}
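The path steps above (15 through 20) show count() returning 32 here because the traced value is zero and the default ZeroBehavior is ZB_Width. A small standalone check of that boundary behavior (illustrative only; assumes LLVM's MathExtras.h is on the include path):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  // With the default ZB_Width behavior, a zero input returns the type width.
  assert(llvm::countTrailingZeros(0u) == 32);
  assert(llvm::countTrailingZeros(uint64_t(0)) == 64);
  assert(llvm::countTrailingZeros(0x8u) == 3);
  // Feeding the 32 back in as a shift amount on a 32-bit value (x >> 32)
  // is undefined behavior, so callers must rule out a zero input first.
  return 0;
}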
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
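The N == 0 special case above exists because T(-1) >> Bits would itself be a full-width shift. A brief illustrative check of the edge cases (assumes MathExtras.h and <cassert> are available; the function name is ours):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void maskTrailingOnesEdgeCases() {
  assert(llvm::maskTrailingOnes<uint32_t>(0) == 0u);            // guarded case
  assert(llvm::maskTrailingOnes<uint32_t>(8) == 0xFFu);         // low byte
  assert(llvm::maskTrailingOnes<uint32_t>(32) == 0xFFFFFFFFu);  // full width
}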
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if a unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if an signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
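countPopulation is the routine used earlier in this report to derive DMaskLanes from a dmask value. A few representative values (illustrative only; assumes MathExtras.h is on the include path):

#include "llvm/Support/MathExtras.h"
#include <cassert>

void dmaskLaneCounts() {
  assert(llvm::countPopulation(0x0u) == 0); // no components requested
  assert(llvm::countPopulation(0x1u) == 1); // single component
  assert(llvm::countPopulation(0x5u) == 2); // components 0 and 2
  assert(llvm::countPopulation(0xFu) == 4); // full RGBA dmask
}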
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
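Illustration only (not part of the header; names are hypothetical): a portable sketch showing the floor/ceil relationship the four Log2 helpers above express, i.e. the ceiling equals the floor unless the input is already a power of two. Unlike the header versions, this sketch does not handle a zero input.

// Sketch: floor and ceiling log2 for nonzero 32-bit values.
#include <cassert>
#include <cstdint>

static unsigned log2FloorPortable(uint32_t V) {
  unsigned N = 0;
  while (V >>= 1)          // count how many times V can be halved
    ++N;
  return N;
}

static unsigned log2CeilPortable(uint32_t V) {
  unsigned Floor = log2FloorPortable(V);
  // Add one unless V is an exact power of two.
  return ((V & (V - 1)) == 0) ? Floor : Floor + 1;
}

int main() {
  assert(log2FloorPortable(32) == 5 && log2FloorPortable(6) == 2);
  assert(log2CeilPortable(32) == 5 && log2CeilPortable(6) == 3);
  assert(log2CeilPortable(1) == 0);
  return 0;
}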
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
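Illustration only (not part of the header; names are hypothetical): the same iterative Euclid loop as greatestCommonDivisor above, with one run traced in the comments.

// Sketch: Euclid's algorithm, e.g. (48, 18) -> (18, 12) -> (12, 6) -> (6, 0).
#include <cassert>
#include <cstdint>

static uint64_t gcdSketch(uint64_t A, uint64_t B) {
  while (B) {
    uint64_t Tmp = B;
    B = A % B;
    A = Tmp;
  }
  return A;
}

int main() {
  assert(gcdSketch(48, 18) == 6);
  assert(gcdSketch(7, 0) == 7);    // gcd(x, 0) == x
  assert(gcdSketch(0, 9) == 9);
  return 0;
}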
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
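Illustration only (not part of the header; names are hypothetical, and the concrete bit patterns assume IEEE-754 single-precision floats): the four helpers above type-pun through memcpy, which is the well-defined way to reinterpret bits; the sketch round-trips a float the same way.

// Sketch: float <-> bit-pattern conversion via memcpy.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t floatToBitsSketch(float F) {
  uint32_t Bits;
  static_assert(sizeof(uint32_t) == sizeof(float), "unexpected type sizes");
  std::memcpy(&Bits, &F, sizeof(F));
  return Bits;
}

static float bitsToFloatSketch(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(Bits));
  return F;
}

int main() {
  assert(floatToBitsSketch(1.0f) == 0x3F800000u);   // IEEE-754 binary32 for 1.0
  assert(bitsToFloatSketch(0x40000000u) == 2.0f);   // and for 2.0
  assert(bitsToFloatSketch(floatToBitsSketch(3.5f)) == 3.5f);
  return 0;
}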
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
680
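Illustration only (not part of the header; names are hypothetical): MinAlign works by isolating the lowest set bit of (A | B), which is the largest power of two dividing both values; (1 + ~X) is simply the two's-complement negation written to sidestep the MSVC warning mentioned above.

// Sketch: lowest-set-bit isolation as used by MinAlign.
#include <cassert>
#include <cstdint>

static constexpr uint64_t minAlignSketch(uint64_t A, uint64_t B) {
  // Keep only the lowest set bit of (A | B).
  return (A | B) & (1 + ~(A | B));
}

int main() {
  assert(minAlignSketch(16, 24) == 8);     // both divisible by 8, not 16
  assert(minAlignSketch(4096, 4096) == 4096);
  assert(minAlignSketch(12, 20) == 4);     // both divisible by 4, not 8
  return 0;
}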
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
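Illustration only (not part of the header; names are hypothetical): NextPowerOf2 above smears the highest set bit into every lower position with the cascade of shifts, so that adding one carries up to the next power of two; the sketch repeats the cascade and checks the documented edge cases.

// Sketch: bit-smearing next-power-of-two.
#include <cassert>
#include <cstdint>

static uint64_t nextPow2Sketch(uint64_t A) {
  A |= (A >> 1);
  A |= (A >> 2);
  A |= (A >> 4);
  A |= (A >> 8);
  A |= (A >> 16);
  A |= (A >> 32);   // now every bit at or below the MSB of A is set
  return A + 1;
}

int main() {
  assert(nextPow2Sketch(0) == 1);
  assert(nextPow2Sketch(5) == 8);
  assert(nextPow2Sketch(8) == 16);        // strictly greater than the input
  assert(nextPow2Sketch(~0ULL) == 0);     // wraps to zero on overflow
  return 0;
}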
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 asse