Bug Summary

File: build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 3340, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
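
For context, this is the class of defect the checker reports: shifting a 32-bit unsigned value by 32 (or more) is undefined behaviour in C++, because the shift amount must be strictly less than the operand's bit width. Below is a minimal self-contained sketch, assuming a 32-bit unsigned int; the helper names are hypothetical illustrations and are not the code at line 3340.

  // Hypothetical illustration of the reported defect class, not the code at line 3340.
  #include <cstdint>

  uint32_t lowBitsUndefined(unsigned Width) {
    // When Width == 0 this shifts a 32-bit value by 32, which C++ leaves undefined.
    return 0xFFFFFFFFu >> (32 - Width);
  }

  uint32_t lowBitsGuarded(unsigned Width) {
    // Guarding the degenerate case keeps the shift amount within [0, 31].
    return Width == 0 ? 0u : 0xFFFFFFFFu >> (32 - Width);
  }

The usual in-tree remedy is to guard the zero/full-width case explicitly, or to use an LLVM MathExtras helper such as maskTrailingOnes<uint32_t>(), which handles the zero-width boundary internally.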

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
30
31#define DEBUG_TYPE "amdgpu-legalinfo"
32
33using namespace llvm;
34using namespace LegalizeActions;
35using namespace LegalizeMutations;
36using namespace LegalityPredicates;
37using namespace MIPatternMatch;
38
39// Hack until load/store selection patterns support any tuple of legal types.
40static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use "
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47static constexpr unsigned MaxRegisterSize = 1024;
48
49// Round the number of elements to the next power of two elements
50static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54}
55
56// Round the number of bits to the next power of two bits
57static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61}
62
63/// \returns true if this is an odd sized vector which should widen by adding an
64/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65/// excludes s1 vectors, which should always be scalarized.
66static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78}
79
80static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85}
86
87static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93}
94
95static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::pair(TypeIdx,
100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101 };
102}
103
104static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::pair(TypeIdx, LLT::scalarOrVector(
112 ElementCount::getFixed(NewNumElts), EltTy));
113 };
114}
115
116// Increase the number of vector elements to reach the next multiple of 32-bit
117// type.
118static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132}
133
134// Increase the number of vector elements to reach the next legal RegClass.
135static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
138 const unsigned NumElts = Ty.getNumElements();
139 const unsigned EltSize = Ty.getElementType().getSizeInBits();
140 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
141
142 assert(EltSize == 32 || EltSize == 64);
143 assert(Ty.getSizeInBits() < MaxRegisterSize);
144
145 unsigned NewNumElts;
146 // Find the nearest legal RegClass that is larger than the current type.
147 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
148 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
149 break;
150 }
151
152 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
153 };
154}
155
156static LLT getBitcastRegisterType(const LLT Ty) {
157 const unsigned Size = Ty.getSizeInBits();
158
159 if (Size <= 32) {
160 // <2 x s8> -> s16
161 // <4 x s8> -> s32
162 return LLT::scalar(Size);
163 }
164
165 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
166}
167
168static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
169 return [=](const LegalityQuery &Query) {
170 const LLT Ty = Query.Types[TypeIdx];
171 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
172 };
173}
174
175static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
176 return [=](const LegalityQuery &Query) {
177 const LLT Ty = Query.Types[TypeIdx];
178 unsigned Size = Ty.getSizeInBits();
179 assert(Size % 32 == 0);
180 return std::pair(
181 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
182 };
183}
184
185static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
186 return [=](const LegalityQuery &Query) {
187 const LLT QueryTy = Query.Types[TypeIdx];
188 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
189 };
190}
191
192static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
193 return [=](const LegalityQuery &Query) {
194 const LLT QueryTy = Query.Types[TypeIdx];
195 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
196 };
197}
198
199static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT QueryTy = Query.Types[TypeIdx];
202 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
203 };
204}
205
206static bool isRegisterSize(unsigned Size) {
207 return Size % 32 == 0 && Size <= MaxRegisterSize;
208}
209
210static bool isRegisterVectorElementType(LLT EltTy) {
211 const int EltSize = EltTy.getSizeInBits();
212 return EltSize == 16 || EltSize % 32 == 0;
213}
214
215static bool isRegisterVectorType(LLT Ty) {
216 const int EltSize = Ty.getElementType().getSizeInBits();
217 return EltSize == 32 || EltSize == 64 ||
218 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
219 EltSize == 128 || EltSize == 256;
220}
221
222static bool isRegisterType(LLT Ty) {
223 if (!isRegisterSize(Ty.getSizeInBits()))
224 return false;
225
226 if (Ty.isVector())
227 return isRegisterVectorType(Ty);
228
229 return true;
230}
231
232// Any combination of 32 or 64-bit elements up to the maximum register size, and
233// multiples of v2s16.
234static LegalityPredicate isRegisterType(unsigned TypeIdx) {
235 return [=](const LegalityQuery &Query) {
236 return isRegisterType(Query.Types[TypeIdx]);
237 };
238}
239
240// RegisterType that doesn't have a corresponding RegClass.
241static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
242 return [=](const LegalityQuery &Query) {
243 LLT Ty = Query.Types[TypeIdx];
244 return isRegisterType(Ty) &&
245 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
246 };
247}
248
249static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
250 return [=](const LegalityQuery &Query) {
251 const LLT QueryTy = Query.Types[TypeIdx];
252 if (!QueryTy.isVector())
253 return false;
254 const LLT EltTy = QueryTy.getElementType();
255 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
256 };
257}
258
259// If we have a truncating store or an extending load with a data size larger
260// than 32-bits, we need to reduce to a 32-bit type.
261static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
262 return [=](const LegalityQuery &Query) {
263 const LLT Ty = Query.Types[TypeIdx];
264 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
265 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
266 };
267}
268
269// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
270// handle some operations by just promoting the register during
271// selection. There are also d16 loads on GFX9+ which preserve the high bits.
272static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
273 bool IsLoad, bool IsAtomic) {
274 switch (AS) {
275 case AMDGPUAS::PRIVATE_ADDRESS:
276 // FIXME: Private element size.
277 return ST.enableFlatScratch() ? 128 : 32;
278 case AMDGPUAS::LOCAL_ADDRESS:
279 return ST.useDS128() ? 128 : 64;
280 case AMDGPUAS::GLOBAL_ADDRESS:
281 case AMDGPUAS::CONSTANT_ADDRESS:
282 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
283 // Treat constant and global as identical. SMRD loads are sometimes usable for
284 // global loads (ideally constant address space should be eliminated)
285 // depending on the context. Legality cannot be context dependent, but
286 // RegBankSelect can split the load as necessary depending on the pointer
287 // register bank/uniformity and if the memory is invariant or not written in a
288 // kernel.
289 return IsLoad ? 512 : 128;
290 default:
291 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
292 // if they may alias scratch depending on the subtarget. This needs to be
293 // moved to custom handling to use addressMayBeAccessedAsPrivate
294 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
295 }
296}
297
298static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
299 const LegalityQuery &Query) {
300 const LLT Ty = Query.Types[0];
301
302 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
303 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
304
305 unsigned RegSize = Ty.getSizeInBits();
306 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
307 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
308 unsigned AS = Query.Types[1].getAddressSpace();
309
310 // All of these need to be custom lowered to cast the pointer operand.
311 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
312 return false;
313
314 // Do not handle extending vector loads.
315 if (Ty.isVector() && MemSize != RegSize)
316 return false;
317
318 // TODO: We should be able to widen loads if the alignment is high enough, but
319 // we also need to modify the memory access size.
320#if 0
321 // Accept widening loads based on alignment.
322 if (IsLoad && MemSize < Size)
323 MemSize = std::max(MemSize, Align);
324#endif
325
326 // Only 1-byte and 2-byte to 32-bit extloads are valid.
327 if (MemSize != RegSize && RegSize != 32)
328 return false;
329
330 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
331 Query.MMODescrs[0].Ordering !=
332 AtomicOrdering::NotAtomic))
333 return false;
334
335 switch (MemSize) {
336 case 8:
337 case 16:
338 case 32:
339 case 64:
340 case 128:
341 break;
342 case 96:
343 if (!ST.hasDwordx3LoadStores())
344 return false;
345 break;
346 case 256:
347 case 512:
348 // These may contextually need to be broken down.
349 break;
350 default:
351 return false;
352 }
353
354 assert(RegSize >= MemSize);
355
356 if (AlignBits < MemSize) {
357 const SITargetLowering *TLI = ST.getTargetLowering();
358 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
359 Align(AlignBits / 8)))
360 return false;
361 }
362
363 return true;
364}
365
366// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
367// workaround this. Eventually it should ignore the type for loads and only care
368// about the size. Return true in cases where we will workaround this for now by
369// bitcasting.
370static bool loadStoreBitcastWorkaround(const LLT Ty) {
371 if (EnableNewLegality)
372 return false;
373
374 const unsigned Size = Ty.getSizeInBits();
375 if (Size <= 64)
376 return false;
377 if (!Ty.isVector())
378 return true;
379
380 LLT EltTy = Ty.getElementType();
381 if (EltTy.isPointer())
382 return true;
383
384 unsigned EltSize = EltTy.getSizeInBits();
385 return EltSize != 32 && EltSize != 64;
386}
387
388static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
389 const LLT Ty = Query.Types[0];
390 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
391 !loadStoreBitcastWorkaround(Ty);
392}
393
394/// Return true if a load or store of the type should be lowered with a bitcast
395/// to a different type.
396static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
397 const LLT MemTy) {
398 const unsigned MemSizeInBits = MemTy.getSizeInBits();
399 const unsigned Size = Ty.getSizeInBits();
400 if (Size != MemSizeInBits)
401 return Size <= 32 && Ty.isVector();
402
403 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
404 return true;
405
406 // Don't try to handle bitcasting vector ext loads for now.
407 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
408 (Size <= 32 || isRegisterSize(Size)) &&
409 !isRegisterVectorElementType(Ty.getElementType());
410}
411
412/// Return true if we should legalize a load by widening an odd sized memory
413/// access up to the alignment. Note that in this case the memory access itself
414/// changes, not the size of the result register.
415static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
416 uint64_t AlignInBits, unsigned AddrSpace,
417 unsigned Opcode) {
418 unsigned SizeInBits = MemoryTy.getSizeInBits();
419 // We don't want to widen cases that are naturally legal.
420 if (isPowerOf2_32(SizeInBits))
421 return false;
422
423 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
424 // end up widening these for a scalar load during RegBankSelect, since there
425 // aren't 96-bit scalar loads.
426 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
427 return false;
428
429 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
430 return false;
431
432 // A load is known dereferenceable up to the alignment, so it's legal to widen
433 // to it.
434 //
435 // TODO: Could check dereferenceable for less aligned cases.
436 unsigned RoundedSize = NextPowerOf2(SizeInBits);
437 if (AlignInBits < RoundedSize)
438 return false;
439
440 // Do not widen if it would introduce a slow unaligned load.
441 const SITargetLowering *TLI = ST.getTargetLowering();
442 unsigned Fast = 0;
443 return TLI->allowsMisalignedMemoryAccessesImpl(
444 RoundedSize, AddrSpace, Align(AlignInBits / 8),
445 MachineMemOperand::MOLoad, &Fast) &&
446 Fast;
447}
448
449static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
450 unsigned Opcode) {
451 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
452 return false;
453
454 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
455 Query.MMODescrs[0].AlignInBits,
456 Query.Types[1].getAddressSpace(), Opcode);
457}
458
459AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
460 const GCNTargetMachine &TM)
461 : ST(ST_) {
462 using namespace TargetOpcode;
463
464 auto GetAddrSpacePtr = [&TM](unsigned AS) {
465 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
466 };
467
468 const LLT S1 = LLT::scalar(1);
469 const LLT S8 = LLT::scalar(8);
470 const LLT S16 = LLT::scalar(16);
471 const LLT S32 = LLT::scalar(32);
472 const LLT S64 = LLT::scalar(64);
473 const LLT S128 = LLT::scalar(128);
474 const LLT S256 = LLT::scalar(256);
475 const LLT S512 = LLT::scalar(512);
476 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
477
478 const LLT V2S8 = LLT::fixed_vector(2, 8);
479 const LLT V2S16 = LLT::fixed_vector(2, 16);
480 const LLT V4S16 = LLT::fixed_vector(4, 16);
481
482 const LLT V2S32 = LLT::fixed_vector(2, 32);
483 const LLT V3S32 = LLT::fixed_vector(3, 32);
484 const LLT V4S32 = LLT::fixed_vector(4, 32);
485 const LLT V5S32 = LLT::fixed_vector(5, 32);
486 const LLT V6S32 = LLT::fixed_vector(6, 32);
487 const LLT V7S32 = LLT::fixed_vector(7, 32);
488 const LLT V8S32 = LLT::fixed_vector(8, 32);
489 const LLT V9S32 = LLT::fixed_vector(9, 32);
490 const LLT V10S32 = LLT::fixed_vector(10, 32);
491 const LLT V11S32 = LLT::fixed_vector(11, 32);
492 const LLT V12S32 = LLT::fixed_vector(12, 32);
493 const LLT V13S32 = LLT::fixed_vector(13, 32);
494 const LLT V14S32 = LLT::fixed_vector(14, 32);
495 const LLT V15S32 = LLT::fixed_vector(15, 32);
496 const LLT V16S32 = LLT::fixed_vector(16, 32);
497 const LLT V32S32 = LLT::fixed_vector(32, 32);
498
499 const LLT V2S64 = LLT::fixed_vector(2, 64);
500 const LLT V3S64 = LLT::fixed_vector(3, 64);
501 const LLT V4S64 = LLT::fixed_vector(4, 64);
502 const LLT V5S64 = LLT::fixed_vector(5, 64);
503 const LLT V6S64 = LLT::fixed_vector(6, 64);
504 const LLT V7S64 = LLT::fixed_vector(7, 64);
505 const LLT V8S64 = LLT::fixed_vector(8, 64);
506 const LLT V16S64 = LLT::fixed_vector(16, 64);
507
508 std::initializer_list<LLT> AllS32Vectors =
509 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
510 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
511 std::initializer_list<LLT> AllS64Vectors =
512 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
513
514 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
515 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
516 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
517 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
518 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
519 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
520 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
521
522 const LLT CodePtr = FlatPtr;
523
524 const std::initializer_list<LLT> AddrSpaces64 = {
525 GlobalPtr, ConstantPtr, FlatPtr
526 };
527
528 const std::initializer_list<LLT> AddrSpaces32 = {
529 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
530 };
531
532 const std::initializer_list<LLT> FPTypesBase = {
533 S32, S64
534 };
535
536 const std::initializer_list<LLT> FPTypes16 = {
537 S32, S64, S16
538 };
539
540 const std::initializer_list<LLT> FPTypesPK16 = {
541 S32, S64, S16, V2S16
542 };
543
544 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
545
546 // s1 for VCC branches, s32 for SCC branches.
547 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
548
549 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
550 // elements for v3s16
551 getActionDefinitionsBuilder(G_PHI)
552 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
553 .legalFor(AllS32Vectors)
554 .legalFor(AllS64Vectors)
555 .legalFor(AddrSpaces64)
556 .legalFor(AddrSpaces32)
557 .legalIf(isPointer(0))
558 .clampScalar(0, S16, S256)
559 .widenScalarToNextPow2(0, 32)
560 .clampMaxNumElements(0, S32, 16)
561 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
562 .scalarize(0);
563
564 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
565 // Full set of gfx9 features.
566 getActionDefinitionsBuilder({G_ADD, G_SUB})
567 .legalFor({S32, S16, V2S16})
568 .clampMaxNumElementsStrict(0, S16, 2)
569 .scalarize(0)
570 .minScalar(0, S16)
571 .widenScalarToNextMultipleOf(0, 32)
572 .maxScalar(0, S32);
573
574 getActionDefinitionsBuilder(G_MUL)
575 .legalFor({S32, S16, V2S16})
576 .clampMaxNumElementsStrict(0, S16, 2)
577 .scalarize(0)
578 .minScalar(0, S16)
579 .widenScalarToNextMultipleOf(0, 32)
580 .custom();
581 assert(ST.hasMad64_32());
582
583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
584 .legalFor({S32, S16, V2S16}) // Clamp modifier
585 .minScalarOrElt(0, S16)
586 .clampMaxNumElementsStrict(0, S16, 2)
587 .scalarize(0)
588 .widenScalarToNextPow2(0, 32)
589 .lower();
590 } else if (ST.has16BitInsts()) {
591 getActionDefinitionsBuilder({G_ADD, G_SUB})
592 .legalFor({S32, S16})
593 .minScalar(0, S16)
594 .widenScalarToNextMultipleOf(0, 32)
595 .maxScalar(0, S32)
596 .scalarize(0);
597
598 getActionDefinitionsBuilder(G_MUL)
599 .legalFor({S32, S16})
600 .scalarize(0)
601 .minScalar(0, S16)
602 .widenScalarToNextMultipleOf(0, 32)
603 .custom();
604 assert(ST.hasMad64_32());
605
606 // Technically the saturating operations require clamp bit support, but this
607 // was introduced at the same time as 16-bit operations.
608 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
609 .legalFor({S32, S16}) // Clamp modifier
610 .minScalar(0, S16)
611 .scalarize(0)
612 .widenScalarToNextPow2(0, 16)
613 .lower();
614
615 // We're just lowering this, but it helps get a better result to try to
616 // coerce to the desired type first.
617 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
618 .minScalar(0, S16)
619 .scalarize(0)
620 .lower();
621 } else {
622 getActionDefinitionsBuilder({G_ADD, G_SUB})
623 .legalFor({S32})
624 .widenScalarToNextMultipleOf(0, 32)
625 .clampScalar(0, S32, S32)
626 .scalarize(0);
627
628 auto &Mul = getActionDefinitionsBuilder(G_MUL)
629 .legalFor({S32})
630 .scalarize(0)
631 .minScalar(0, S32)
632 .widenScalarToNextMultipleOf(0, 32);
633
634 if (ST.hasMad64_32())
635 Mul.custom();
636 else
637 Mul.maxScalar(0, S32);
638
639 if (ST.hasIntClamp()) {
640 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
641 .legalFor({S32}) // Clamp modifier.
642 .scalarize(0)
643 .minScalarOrElt(0, S32)
644 .lower();
645 } else {
646 // Clamp bit support was added in VI, along with 16-bit operations.
647 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
648 .minScalar(0, S32)
649 .scalarize(0)
650 .lower();
651 }
652
653 // FIXME: DAG expansion gets better results. The widening uses the smaller
654 // range values and goes for the min/max lowering directly.
655 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
656 .minScalar(0, S32)
657 .scalarize(0)
658 .lower();
659 }
660
661 getActionDefinitionsBuilder(
662 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
663 .customFor({S32, S64})
664 .clampScalar(0, S32, S64)
665 .widenScalarToNextPow2(0, 32)
666 .scalarize(0);
667
668 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
669 .legalFor({S32})
670 .maxScalar(0, S32);
671
672 if (ST.hasVOP3PInsts()) {
673 Mulh
674 .clampMaxNumElements(0, S8, 2)
675 .lowerFor({V2S8});
676 }
677
678 Mulh
679 .scalarize(0)
680 .lower();
681
682 // Report legal for any types we can handle anywhere. For the cases only legal
683 // on the SALU, RegBankSelect will be able to re-legalize.
684 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
685 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
686 .clampScalar(0, S32, S64)
687 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
688 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
689 .widenScalarToNextPow2(0)
690 .scalarize(0);
691
692 getActionDefinitionsBuilder(
693 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
694 .legalFor({{S32, S1}, {S32, S32}})
695 .clampScalar(0, S32, S32)
696 .scalarize(0);
697
698 getActionDefinitionsBuilder(G_BITCAST)
699 // Don't worry about the size constraint.
700 .legalIf(all(isRegisterType(0), isRegisterType(1)))
701 .lower();
702
703
704 getActionDefinitionsBuilder(G_CONSTANT)
705 .legalFor({S1, S32, S64, S16, GlobalPtr,
706 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
707 .legalIf(isPointer(0))
708 .clampScalar(0, S32, S64)
709 .widenScalarToNextPow2(0);
710
711 getActionDefinitionsBuilder(G_FCONSTANT)
712 .legalFor({S32, S64, S16})
713 .clampScalar(0, S16, S64);
714
715 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
716 .legalIf(isRegisterType(0))
717 // s1 and s16 are special cases because they have legal operations on
718 // them, but don't really occupy registers in the normal way.
719 .legalFor({S1, S16})
720 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
721 .clampScalarOrElt(0, S32, MaxScalar)
722 .widenScalarToNextPow2(0, 32)
723 .clampMaxNumElements(0, S32, 16);
724
725 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
726
727 // If the amount is divergent, we have to do a wave reduction to get the
728 // maximum value, so this is expanded during RegBankSelect.
729 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
730 .legalFor({{PrivatePtr, S32}});
731
732 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
733 .customIf(typeIsNot(0, PrivatePtr));
734
735 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
736
737 auto &FPOpActions = getActionDefinitionsBuilder(
738 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
739 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
740 .legalFor({S32, S64});
741 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
742 .customFor({S32, S64});
743 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
744 .customFor({S32, S64});
745
746 if (ST.has16BitInsts()) {
747 if (ST.hasVOP3PInsts())
748 FPOpActions.legalFor({S16, V2S16});
749 else
750 FPOpActions.legalFor({S16});
751
752 TrigActions.customFor({S16});
753 FDIVActions.customFor({S16});
754 }
755
756 auto &MinNumMaxNum = getActionDefinitionsBuilder({
757 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
758
759 if (ST.hasVOP3PInsts()) {
760 MinNumMaxNum.customFor(FPTypesPK16)
761 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
762 .clampMaxNumElements(0, S16, 2)
763 .clampScalar(0, S16, S64)
764 .scalarize(0);
765 } else if (ST.has16BitInsts()) {
766 MinNumMaxNum.customFor(FPTypes16)
767 .clampScalar(0, S16, S64)
768 .scalarize(0);
769 } else {
770 MinNumMaxNum.customFor(FPTypesBase)
771 .clampScalar(0, S32, S64)
772 .scalarize(0);
773 }
774
775 if (ST.hasVOP3PInsts())
776 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
777
778 FPOpActions
779 .scalarize(0)
780 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
781
782 TrigActions
783 .scalarize(0)
784 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
785
786 FDIVActions
787 .scalarize(0)
788 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
789
790 getActionDefinitionsBuilder({G_FNEG, G_FABS})
791 .legalFor(FPTypesPK16)
792 .clampMaxNumElementsStrict(0, S16, 2)
793 .scalarize(0)
794 .clampScalar(0, S16, S64);
795
796 if (ST.has16BitInsts()) {
797 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
798 .legalFor({S32, S64, S16})
799 .scalarize(0)
800 .clampScalar(0, S16, S64);
801 } else {
802 getActionDefinitionsBuilder(G_FSQRT)
803 .legalFor({S32, S64})
804 .scalarize(0)
805 .clampScalar(0, S32, S64);
806
807 if (ST.hasFractBug()) {
808 getActionDefinitionsBuilder(G_FFLOOR)
809 .customFor({S64})
810 .legalFor({S32, S64})
811 .scalarize(0)
812 .clampScalar(0, S32, S64);
813 } else {
814 getActionDefinitionsBuilder(G_FFLOOR)
815 .legalFor({S32, S64})
816 .scalarize(0)
817 .clampScalar(0, S32, S64);
818 }
819 }
820
821 getActionDefinitionsBuilder(G_FPTRUNC)
822 .legalFor({{S32, S64}, {S16, S32}})
823 .scalarize(0)
824 .lower();
825
826 getActionDefinitionsBuilder(G_FPEXT)
827 .legalFor({{S64, S32}, {S32, S16}})
828 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
829 .scalarize(0);
830
831 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
832 if (ST.has16BitInsts()) {
833 FSubActions
834 // Use actual fsub instruction
835 .legalFor({S32, S16})
836 // Must use fadd + fneg
837 .lowerFor({S64, V2S16});
838 } else {
839 FSubActions
840 // Use actual fsub instruction
841 .legalFor({S32})
842 // Must use fadd + fneg
843 .lowerFor({S64, S16, V2S16});
844 }
845
846 FSubActions
847 .scalarize(0)
848 .clampScalar(0, S32, S64);
849
850 // Whether this is legal depends on the floating point mode for the function.
851 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
852 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
853 FMad.customFor({S32, S16});
854 else if (ST.hasMadMacF32Insts())
855 FMad.customFor({S32});
856 else if (ST.hasMadF16())
857 FMad.customFor({S16});
858 FMad.scalarize(0)
859 .lower();
860
861 auto &FRem = getActionDefinitionsBuilder(G_FREM);
862 if (ST.has16BitInsts()) {
863 FRem.customFor({S16, S32, S64});
864 } else {
865 FRem.minScalar(0, S32)
866 .customFor({S32, S64});
867 }
868 FRem.scalarize(0);
869
870 // TODO: Do we need to clamp maximum bitwidth?
871 getActionDefinitionsBuilder(G_TRUNC)
872 .legalIf(isScalar(0))
873 .legalFor({{V2S16, V2S32}})
874 .clampMaxNumElements(0, S16, 2)
875 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
876 // situations (like an invalid implicit use), we don't want to infinite loop
877 // in the legalizer.
878 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
879 .alwaysLegal();
880
881 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
882 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
883 {S32, S1}, {S64, S1}, {S16, S1}})
884 .scalarize(0)
885 .clampScalar(0, S32, S64)
886 .widenScalarToNextPow2(1, 32);
887
888 // TODO: Split s1->s64 during regbankselect for VALU.
889 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
890 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
891 .lowerIf(typeIs(1, S1))
892 .customFor({{S32, S64}, {S64, S64}});
893 if (ST.has16BitInsts())
894 IToFP.legalFor({{S16, S16}});
895 IToFP.clampScalar(1, S32, S64)
896 .minScalar(0, S32)
897 .scalarize(0)
898 .widenScalarToNextPow2(1);
899
900 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
901 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
902 .customFor({{S64, S32}, {S64, S64}})
903 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
904 if (ST.has16BitInsts())
905 FPToI.legalFor({{S16, S16}});
906 else
907 FPToI.minScalar(1, S32);
908
909 FPToI.minScalar(0, S32)
910 .widenScalarToNextPow2(0, 32)
911 .scalarize(0)
912 .lower();
913
914 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
915 .customFor({S16, S32})
916 .scalarize(0)
917 .lower();
918
919 // Lower roundeven into G_FRINT
920 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
921 .scalarize(0)
922 .lower();
923
924 if (ST.has16BitInsts()) {
925 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
926 .legalFor({S16, S32, S64})
927 .clampScalar(0, S16, S64)
928 .scalarize(0);
929 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
930 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
931 .legalFor({S32, S64})
932 .clampScalar(0, S32, S64)
933 .scalarize(0);
934 } else {
935 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
936 .legalFor({S32})
937 .customFor({S64})
938 .clampScalar(0, S32, S64)
939 .scalarize(0);
940 }
941
942 getActionDefinitionsBuilder(G_PTR_ADD)
943 .legalIf(all(isPointer(0), sameSize(0, 1)))
944 .scalarize(0)
945 .scalarSameSizeAs(1, 0);
946
947 getActionDefinitionsBuilder(G_PTRMASK)
948 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
949 .scalarSameSizeAs(1, 0)
950 .scalarize(0);
951
952 auto &CmpBuilder =
953 getActionDefinitionsBuilder(G_ICMP)
954 // The compare output type differs based on the register bank of the output,
955 // so make both s1 and s32 legal.
956 //
957 // Scalar compares producing output in scc will be promoted to s32, as that
958 // is the allocatable register type that will be needed for the copy from
959 // scc. This will be promoted during RegBankSelect, and we assume something
960 // before that won't try to use s32 result types.
961 //
962 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
963 // bank.
964 .legalForCartesianProduct(
965 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
966 .legalForCartesianProduct(
967 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
968 if (ST.has16BitInsts()) {
969 CmpBuilder.legalFor({{S1, S16}});
970 }
971
972 CmpBuilder
973 .widenScalarToNextPow2(1)
974 .clampScalar(1, S32, S64)
975 .scalarize(0)
976 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
977
978 getActionDefinitionsBuilder(G_FCMP)
979 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
980 .widenScalarToNextPow2(1)
981 .clampScalar(1, S32, S64)
982 .scalarize(0);
983
984 // FIXME: fpow has a selection pattern that should move to custom lowering.
985 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
986 if (ST.has16BitInsts())
987 Exp2Ops.legalFor({S32, S16});
988 else
989 Exp2Ops.legalFor({S32});
990 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
991 Exp2Ops.scalarize(0);
992
993 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
994 if (ST.has16BitInsts())
995 ExpOps.customFor({{S32}, {S16}});
996 else
997 ExpOps.customFor({S32});
998 ExpOps.clampScalar(0, MinScalarFPTy, S32)
999 .scalarize(0);
1000
1001 getActionDefinitionsBuilder(G_FPOWI)
1002 .clampScalar(0, MinScalarFPTy, S32)
1003 .lower();
1004
1005 // The 64-bit versions produce 32-bit results, but only on the SALU.
1006 getActionDefinitionsBuilder(G_CTPOP)
1007 .legalFor({{S32, S32}, {S32, S64}})
1008 .clampScalar(0, S32, S32)
1009 .widenScalarToNextPow2(1, 32)
1010 .clampScalar(1, S32, S64)
1011 .scalarize(0)
1012 .widenScalarToNextPow2(0, 32);
1013
1014 // If no 16 bit instr is available, lower into different instructions.
1015 if (ST.has16BitInsts())
1016 getActionDefinitionsBuilder(G_IS_FPCLASS)
1017 .legalForCartesianProduct({S1}, FPTypes16)
1018 .widenScalarToNextPow2(1)
1019 .scalarize(0)
1020 .lower();
1021 else
1022 getActionDefinitionsBuilder(G_IS_FPCLASS)
1023 .legalForCartesianProduct({S1}, FPTypesBase)
1024 .lowerFor({S1, S16})
1025 .widenScalarToNextPow2(1)
1026 .scalarize(0)
1027 .lower();
1028
1029 // The hardware instructions return a different result on 0 than the generic
1030 // instructions expect. The hardware produces -1, but these produce the
1031 // bitwidth.
1032 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1033 .scalarize(0)
1034 .clampScalar(0, S32, S32)
1035 .clampScalar(1, S32, S64)
1036 .widenScalarToNextPow2(0, 32)
1037 .widenScalarToNextPow2(1, 32)
1038 .custom();
1039
1040 // The 64-bit versions produce 32-bit results, but only on the SALU.
1041 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1042 .legalFor({{S32, S32}, {S32, S64}})
1043 .clampScalar(0, S32, S32)
1044 .clampScalar(1, S32, S64)
1045 .scalarize(0)
1046 .widenScalarToNextPow2(0, 32)
1047 .widenScalarToNextPow2(1, 32);
1048
1049 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1050 // RegBankSelect.
1051 getActionDefinitionsBuilder(G_BITREVERSE)
1052 .legalFor({S32, S64})
1053 .clampScalar(0, S32, S64)
1054 .scalarize(0)
1055 .widenScalarToNextPow2(0);
1056
1057 if (ST.has16BitInsts()) {
1058 getActionDefinitionsBuilder(G_BSWAP)
1059 .legalFor({S16, S32, V2S16})
1060 .clampMaxNumElementsStrict(0, S16, 2)
1061 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1062 // narrowScalar limitation.
1063 .widenScalarToNextPow2(0)
1064 .clampScalar(0, S16, S32)
1065 .scalarize(0);
1066
1067 if (ST.hasVOP3PInsts()) {
1068 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1069 .legalFor({S32, S16, V2S16})
1070 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1071 .clampMaxNumElements(0, S16, 2)
1072 .minScalar(0, S16)
1073 .widenScalarToNextPow2(0)
1074 .scalarize(0)
1075 .lower();
1076 } else {
1077 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1078 .legalFor({S32, S16})
1079 .widenScalarToNextPow2(0)
1080 .minScalar(0, S16)
1081 .scalarize(0)
1082 .lower();
1083 }
1084 } else {
1085 // TODO: Should have same legality without v_perm_b32
1086 getActionDefinitionsBuilder(G_BSWAP)
1087 .legalFor({S32})
1088 .lowerIf(scalarNarrowerThan(0, 32))
1089 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1090 // narrowScalar limitation.
1091 .widenScalarToNextPow2(0)
1092 .maxScalar(0, S32)
1093 .scalarize(0)
1094 .lower();
1095
1096 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1097 .legalFor({S32})
1098 .minScalar(0, S32)
1099 .widenScalarToNextPow2(0)
1100 .scalarize(0)
1101 .lower();
1102 }
1103
1104 getActionDefinitionsBuilder(G_INTTOPTR)
1105 // List the common cases
1106 .legalForCartesianProduct(AddrSpaces64, {S64})
1107 .legalForCartesianProduct(AddrSpaces32, {S32})
1108 .scalarize(0)
1109 // Accept any address space as long as the size matches
1110 .legalIf(sameSize(0, 1))
1111 .widenScalarIf(smallerThan(1, 0),
1112 [](const LegalityQuery &Query) {
1113 return std::pair(
1114 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1115 })
1116 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1117 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1118 });
1119
1120 getActionDefinitionsBuilder(G_PTRTOINT)
1121 // List the common cases
1122 .legalForCartesianProduct(AddrSpaces64, {S64})
1123 .legalForCartesianProduct(AddrSpaces32, {S32})
1124 .scalarize(0)
1125 // Accept any address space as long as the size matches
1126 .legalIf(sameSize(0, 1))
1127 .widenScalarIf(smallerThan(0, 1),
1128 [](const LegalityQuery &Query) {
1129 return std::pair(
1130 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1131 })
1132 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1133 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1134 });
1135
1136 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1137 .scalarize(0)
1138 .custom();
1139
1140 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1141 bool IsLoad) -> bool {
1142 const LLT DstTy = Query.Types[0];
1143
1144 // Split vector extloads.
1145 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1146
1147 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1148 return true;
1149
1150 const LLT PtrTy = Query.Types[1];
1151 unsigned AS = PtrTy.getAddressSpace();
1152 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1153 Query.MMODescrs[0].Ordering !=
1154 AtomicOrdering::NotAtomic))
1155 return true;
1156
1157 // Catch weird sized loads that don't evenly divide into the access sizes
1158 // TODO: May be able to widen depending on alignment etc.
1159 unsigned NumRegs = (MemSize + 31) / 32;
1160 if (NumRegs == 3) {
1161 if (!ST.hasDwordx3LoadStores())
1162 return true;
1163 } else {
1164 // If the alignment allows, these should have been widened.
1165 if (!isPowerOf2_32(NumRegs))
1166 return true;
1167 }
1168
1169 return false;
1170 };
1171
1172 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1173 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1174 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1175
1176 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1177 // LDS
1178 // TODO: Unsupported flat for SI.
1179
1180 for (unsigned Op : {G_LOAD, G_STORE}) {
1181 const bool IsStore = Op == G_STORE;
1182
1183 auto &Actions = getActionDefinitionsBuilder(Op);
1184 // Explicitly list some common cases.
1185 // TODO: Does this help compile time at all?
1186 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1187 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1188 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1189 {S64, GlobalPtr, S64, GlobalAlign32},
1190 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1191 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1192 {S32, GlobalPtr, S8, GlobalAlign8},
1193 {S32, GlobalPtr, S16, GlobalAlign16},
1194
1195 {S32, LocalPtr, S32, 32},
1196 {S64, LocalPtr, S64, 32},
1197 {V2S32, LocalPtr, V2S32, 32},
1198 {S32, LocalPtr, S8, 8},
1199 {S32, LocalPtr, S16, 16},
1200 {V2S16, LocalPtr, S32, 32},
1201
1202 {S32, PrivatePtr, S32, 32},
1203 {S32, PrivatePtr, S8, 8},
1204 {S32, PrivatePtr, S16, 16},
1205 {V2S16, PrivatePtr, S32, 32},
1206
1207 {S32, ConstantPtr, S32, GlobalAlign32},
1208 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1209 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1210 {S64, ConstantPtr, S64, GlobalAlign32},
1211 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1212 Actions.legalIf(
1213 [=](const LegalityQuery &Query) -> bool {
1214 return isLoadStoreLegal(ST, Query);
1215 });
1216
1217 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1218 // 64-bits.
1219 //
1220 // TODO: Should generalize bitcast action into coerce, which will also cover
1221 // inserting addrspacecasts.
1222 Actions.customIf(typeIs(1, Constant32Ptr));
1223
1224 // Turn any illegal element vectors into something easier to deal
1225 // with. These will ultimately produce 32-bit scalar shifts to extract the
1226 // parts anyway.
1227 //
1228 // For odd 16-bit element vectors, prefer to split those into pieces with
1229 // 16-bit vector parts.
1230 Actions.bitcastIf(
1231 [=](const LegalityQuery &Query) -> bool {
1232 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1233 Query.MMODescrs[0].MemoryTy);
1234 }, bitcastToRegisterType(0));
1235
1236 if (!IsStore) {
1237 // Widen suitably aligned loads by loading extra bytes. The standard
1238 // legalization actions can't properly express widening memory operands.
1239 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1240 return shouldWidenLoad(ST, Query, G_LOAD);
1241 });
1242 }
1243
1244 // FIXME: load/store narrowing should be moved to lower action
1245 Actions
1246 .narrowScalarIf(
1247 [=](const LegalityQuery &Query) -> bool {
1248 return !Query.Types[0].isVector() &&
1249 needToSplitMemOp(Query, Op == G_LOAD);
1250 },
1251 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1252 const LLT DstTy = Query.Types[0];
1253 const LLT PtrTy = Query.Types[1];
1254
1255 const unsigned DstSize = DstTy.getSizeInBits();
1256 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1257
1258 // Split extloads.
1259 if (DstSize > MemSize)
1260 return std::pair(0, LLT::scalar(MemSize));
1261
1262 unsigned MaxSize = maxSizeForAddrSpace(
1263 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1264 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1265 if (MemSize > MaxSize)
1266 return std::pair(0, LLT::scalar(MaxSize));
1267
1268 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1269 return std::pair(0, LLT::scalar(Align));
1270 })
1271 .fewerElementsIf(
1272 [=](const LegalityQuery &Query) -> bool {
1273 return Query.Types[0].isVector() &&
1274 needToSplitMemOp(Query, Op == G_LOAD);
1275 },
1276 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1277 const LLT DstTy = Query.Types[0];
1278 const LLT PtrTy = Query.Types[1];
1279
1280 LLT EltTy = DstTy.getElementType();
1281 unsigned MaxSize = maxSizeForAddrSpace(
1282 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1283 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1284
1285 // FIXME: Handle widened to power of 2 results better. This ends
1286 // up scalarizing.
1287 // FIXME: 3 element stores scalarized on SI
1288
1289 // Split if it's too large for the address space.
1290 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1291 if (MemSize > MaxSize) {
1292 unsigned NumElts = DstTy.getNumElements();
1293 unsigned EltSize = EltTy.getSizeInBits();
1294
1295 if (MaxSize % EltSize == 0) {
1296 return std::pair(
1297 0, LLT::scalarOrVector(
1298 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1299 }
1300
1301 unsigned NumPieces = MemSize / MaxSize;
1302
1303 // FIXME: Refine when odd breakdowns handled
1304 // The scalars will need to be re-legalized.
1305 if (NumPieces == 1 || NumPieces >= NumElts ||
1306 NumElts % NumPieces != 0)
1307 return std::pair(0, EltTy);
1308
1309 return std::pair(0,
1310 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1311 }
1312
1313 // FIXME: We could probably handle weird extending loads better.
1314 if (DstTy.getSizeInBits() > MemSize)
1315 return std::pair(0, EltTy);
1316
1317 unsigned EltSize = EltTy.getSizeInBits();
1318 unsigned DstSize = DstTy.getSizeInBits();
1319 if (!isPowerOf2_32(DstSize)) {
1320 // We're probably decomposing an odd sized store. Try to split
1321 // to the widest type. TODO: Account for alignment. As-is it
1322 // should be OK, since the new parts will be further legalized.
1323 unsigned FloorSize = llvm::bit_floor(DstSize);
1324 return std::pair(
1325 0, LLT::scalarOrVector(
1326 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1327 }
1328
1329 // May need relegalization for the scalars.
1330 return std::pair(0, EltTy);
1331 })
1332 .minScalar(0, S32)
1333 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1334 .widenScalarToNextPow2(0)
1335 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1336 .lower();
1337 }
1338
1339 // FIXME: Unaligned accesses not lowered.
1340 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1341 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1342 {S32, GlobalPtr, S16, 2 * 8},
1343 {S32, LocalPtr, S8, 8},
1344 {S32, LocalPtr, S16, 16},
1345 {S32, PrivatePtr, S8, 8},
1346 {S32, PrivatePtr, S16, 16},
1347 {S32, ConstantPtr, S8, 8},
1348 {S32, ConstantPtr, S16, 2 * 8}})
1349 .legalIf(
1350 [=](const LegalityQuery &Query) -> bool {
1351 return isLoadStoreLegal(ST, Query);
1352 });
1353
1354 if (ST.hasFlatAddressSpace()) {
1355 ExtLoads.legalForTypesWithMemDesc(
1356 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1357 }
1358
1359 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1360 // 64-bits.
1361 //
1362 // TODO: Should generalize bitcast action into coerce, which will also cover
1363 // inserting addrspacecasts.
1364 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1365
1366 ExtLoads.clampScalar(0, S32, S32)
1367 .widenScalarToNextPow2(0)
1368 .lower();
1369
1370 auto &Atomics = getActionDefinitionsBuilder(
1371 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1372 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1373 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1374 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1375 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1376 {S64, GlobalPtr}, {S64, LocalPtr},
1377 {S32, RegionPtr}, {S64, RegionPtr}});
1378 if (ST.hasFlatAddressSpace()) {
1379 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1380 }
1381
1382 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1383 if (ST.hasLDSFPAtomicAdd()) {
1384 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1385 if (ST.hasGFX90AInsts())
1386 Atomic.legalFor({{S64, LocalPtr}});
1387 if (ST.hasAtomicDsPkAdd16Insts())
1388 Atomic.legalFor({{V2S16, LocalPtr}});
1389 }
1390 if (ST.hasAtomicFaddInsts())
1391 Atomic.legalFor({{S32, GlobalPtr}});
1392 if (ST.hasFlatAtomicFaddF32Inst())
1393 Atomic.legalFor({{S32, FlatPtr}});
1394
1395 if (ST.hasGFX90AInsts()) {
1396 // These are legal with some caveats, and should have undergone expansion in
1397 // the IR in most situations
1398 // TODO: Move atomic expansion into legalizer
1399 Atomic.legalFor({
1400 {S32, GlobalPtr},
1401 {S64, GlobalPtr},
1402 {S64, FlatPtr}
1403 });
1404 }
1405
1406 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1407 // demarshalling
1408 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1409 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1410 {S32, FlatPtr}, {S64, FlatPtr}})
1411 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1412 {S32, RegionPtr}, {S64, RegionPtr}});
1413 // TODO: Pointer types, any 32-bit or 64-bit vector
1414
1415 // Condition should be s32 for scalar, s1 for vector.
1416 getActionDefinitionsBuilder(G_SELECT)
1417 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1418 LocalPtr, FlatPtr, PrivatePtr,
1419 LLT::fixed_vector(2, LocalPtr),
1420 LLT::fixed_vector(2, PrivatePtr)},
1421 {S1, S32})
1422 .clampScalar(0, S16, S64)
1423 .scalarize(1)
1424 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1425 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1426 .clampMaxNumElements(0, S32, 2)
1427 .clampMaxNumElements(0, LocalPtr, 2)
1428 .clampMaxNumElements(0, PrivatePtr, 2)
1429 .scalarize(0)
1430 .widenScalarToNextPow2(0)
1431 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1432
1433 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1434 // be more flexible with the shift amount type.
1435 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1436 .legalFor({{S32, S32}, {S64, S32}});
1437 if (ST.has16BitInsts()) {
1438 if (ST.hasVOP3PInsts()) {
1439 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1440 .clampMaxNumElements(0, S16, 2);
1441 } else
1442 Shifts.legalFor({{S16, S16}});
1443
1444 // TODO: Support 16-bit shift amounts for all types
1445 Shifts.widenScalarIf(
1446 [=](const LegalityQuery &Query) {
1447 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1448 // 32-bit amount.
1449 const LLT ValTy = Query.Types[0];
1450 const LLT AmountTy = Query.Types[1];
1451 return ValTy.getSizeInBits() <= 16 &&
1452 AmountTy.getSizeInBits() < 16;
1453 }, changeTo(1, S16));
1454 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1455 Shifts.clampScalar(1, S32, S32);
1456 Shifts.widenScalarToNextPow2(0, 16);
1457 Shifts.clampScalar(0, S16, S64);
1458
1459 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1460 .minScalar(0, S16)
1461 .scalarize(0)
1462 .lower();
1463 } else {
1464 // Make sure we legalize the shift amount type first, as the general
1465 // expansion for the shifted type will produce much worse code if it hasn't
1466 // been truncated already.
1467 Shifts.clampScalar(1, S32, S32);
1468 Shifts.widenScalarToNextPow2(0, 32);
1469 Shifts.clampScalar(0, S32, S64);
1470
1471 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1472 .minScalar(0, S32)
1473 .scalarize(0)
1474 .lower();
1475 }
1476 Shifts.scalarize(0);
1477
1478 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1479 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1480 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1481 unsigned IdxTypeIdx = 2;
1482
1483 getActionDefinitionsBuilder(Op)
1484 .customIf([=](const LegalityQuery &Query) {
1485 const LLT EltTy = Query.Types[EltTypeIdx];
1486 const LLT VecTy = Query.Types[VecTypeIdx];
1487 const LLT IdxTy = Query.Types[IdxTypeIdx];
1488 const unsigned EltSize = EltTy.getSizeInBits();
1489 const bool isLegalVecType =
1490 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1491 return (EltSize == 32 || EltSize == 64) &&
1492 VecTy.getSizeInBits() % 32 == 0 &&
1493 VecTy.getSizeInBits() <= MaxRegisterSize &&
1494 IdxTy.getSizeInBits() == 32 &&
1495 isLegalVecType;
1496 })
1497 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1498 bitcastToVectorElement32(VecTypeIdx))
1499 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1500 .bitcastIf(
1501 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1502 [=](const LegalityQuery &Query) {
1503 // For > 64-bit element types, try to turn this into a 64-bit
1504 // element vector since we may be able to do better indexing
1505 // if this is scalar. If not, fall back to 32.
1506 const LLT EltTy = Query.Types[EltTypeIdx];
1507 const LLT VecTy = Query.Types[VecTypeIdx];
1508 const unsigned DstEltSize = EltTy.getSizeInBits();
1509 const unsigned VecSize = VecTy.getSizeInBits();
1510
1511 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1512 return std::pair(
1513 VecTypeIdx,
1514 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1515 })
1516 .clampScalar(EltTypeIdx, S32, S64)
1517 .clampScalar(VecTypeIdx, S32, S64)
1518 .clampScalar(IdxTypeIdx, S32, S32)
1519 .clampMaxNumElements(VecTypeIdx, S32, 32)
1520 // TODO: Clamp elements for 64-bit vectors?
1521 .moreElementsIf(
1522 isIllegalRegisterType(VecTypeIdx),
1523 moreElementsToNextExistingRegClass(VecTypeIdx))
1524 // It should only be necessary with variable indexes.
1525 // As a last resort, lower to the stack
1526 .lower();
1527 }
1528
1529 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1530 .unsupportedIf([=](const LegalityQuery &Query) {
1531 const LLT &EltTy = Query.Types[1].getElementType();
1532 return Query.Types[0] != EltTy;
1533 });
1534
1535 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1536 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1537 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1538
1539 // FIXME: Doesn't handle extract of illegal sizes.
1540 getActionDefinitionsBuilder(Op)
1541 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1542 .lowerIf([=](const LegalityQuery &Query) {
1543 // Sub-vector (or single element) insert and extract.
1544 // TODO: verify immediate offset here since lower only works with
1545 // whole elements.
1546 const LLT BigTy = Query.Types[BigTyIdx];
1547 return BigTy.isVector();
1548 })
1549 // FIXME: Multiples of 16 should not be legal.
1550 .legalIf([=](const LegalityQuery &Query) {
1551 const LLT BigTy = Query.Types[BigTyIdx];
1552 const LLT LitTy = Query.Types[LitTyIdx];
1553 return (BigTy.getSizeInBits() % 32 == 0) &&
1554 (LitTy.getSizeInBits() % 16 == 0);
1555 })
1556 .widenScalarIf(
1557 [=](const LegalityQuery &Query) {
1558 const LLT BigTy = Query.Types[BigTyIdx];
1559 return (BigTy.getScalarSizeInBits() < 16);
1560 },
1561 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1562 .widenScalarIf(
1563 [=](const LegalityQuery &Query) {
1564 const LLT LitTy = Query.Types[LitTyIdx];
1565 return (LitTy.getScalarSizeInBits() < 16);
1566 },
1567 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1568 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1569 .widenScalarToNextPow2(BigTyIdx, 32);
1570
1571 }
1572
1573 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1574 .legalForCartesianProduct(AllS32Vectors, {S32})
1575 .legalForCartesianProduct(AllS64Vectors, {S64})
1576 .clampNumElements(0, V16S32, V32S32)
1577 .clampNumElements(0, V2S64, V16S64)
1578 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1579 .moreElementsIf(
1580 isIllegalRegisterType(0),
1581 moreElementsToNextExistingRegClass(0));
1582
1583 if (ST.hasScalarPackInsts()) {
1584 BuildVector
1585 // FIXME: Should probably widen s1 vectors straight to s32
1586 .minScalarOrElt(0, S16)
1587 .minScalar(1, S16);
1588
1589 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1590 .legalFor({V2S16, S32})
1591 .lower();
1592 } else {
1593 BuildVector.customFor({V2S16, S16});
1594 BuildVector.minScalarOrElt(0, S32);
1595
1596 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1597 .customFor({V2S16, S32})
1598 .lower();
1599 }
1600
1601 BuildVector.legalIf(isRegisterType(0));
1602
1603 // FIXME: Clamp maximum size
1604 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1605 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1606 .clampMaxNumElements(0, S32, 32)
1607 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1608 .clampMaxNumElements(0, S16, 64);
1609
1610 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1611
1612 // Merge/Unmerge
1613 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1614 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1615 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1616
1617 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1618 const LLT Ty = Query.Types[TypeIdx];
1619 if (Ty.isVector()) {
1620 const LLT &EltTy = Ty.getElementType();
1621 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1622 return true;
1623 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1624 return true;
1625 }
1626 return false;
1627 };
1628
1629 auto &Builder = getActionDefinitionsBuilder(Op)
1630 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1631 .lowerFor({{S16, V2S16}})
1632 .lowerIf([=](const LegalityQuery &Query) {
1633 const LLT BigTy = Query.Types[BigTyIdx];
1634 return BigTy.getSizeInBits() == 32;
1635 })
1636 // Try to widen to s16 first for small types.
1637 // TODO: Only do this on targets with legal s16 shifts
1638 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1639 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1640 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1641 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1642 elementTypeIs(1, S16)),
1643 changeTo(1, V2S16))
1644 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1645 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1646 // valid.
1647 .clampScalar(LitTyIdx, S32, S512)
1648 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1649 // Break up vectors with weird elements into scalars
1650 .fewerElementsIf(
1651 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1652 scalarize(0))
1653 .fewerElementsIf(
1654 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1655 scalarize(1))
1656 .clampScalar(BigTyIdx, S32, MaxScalar);
1657
1658 if (Op == G_MERGE_VALUES) {
1659 Builder.widenScalarIf(
1660 // TODO: Use 16-bit shifts if legal for 8-bit values?
1661 [=](const LegalityQuery &Query) {
1662 const LLT Ty = Query.Types[LitTyIdx];
1663 return Ty.getSizeInBits() < 32;
1664 },
1665 changeTo(LitTyIdx, S32));
1666 }
1667
1668 Builder.widenScalarIf(
1669 [=](const LegalityQuery &Query) {
1670 const LLT Ty = Query.Types[BigTyIdx];
1671 return Ty.getSizeInBits() % 16 != 0;
1672 },
1673 [=](const LegalityQuery &Query) {
1674 // Pick the next power of 2, or a multiple of 64 over 128,
1675 // whichever is smaller.
1676 const LLT &Ty = Query.Types[BigTyIdx];
1677 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1678 if (NewSizeInBits >= 256) {
1679 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1680 if (RoundedTo < NewSizeInBits)
1681 NewSizeInBits = RoundedTo;
1682 }
1683 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1684 })
1685 // Any vectors left are the wrong size. Scalarize them.
1686 .scalarize(0)
1687 .scalarize(1);
1688 }
1689
1690 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1691 // RegBankSelect.
1692 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1693 .legalFor({{S32}, {S64}});
1694
1695 if (ST.hasVOP3PInsts()) {
1696 SextInReg.lowerFor({{V2S16}})
1697 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1698 // get more vector shift opportunities, since we'll get those when
1699 // expanded.
1700 .clampMaxNumElementsStrict(0, S16, 2);
1701 } else if (ST.has16BitInsts()) {
1702 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1703 } else {
1704 // Prefer to promote to s32 before lowering if we don't have 16-bit
1705 // shifts. This avoids a lot of intermediate truncate and extend operations.
1706 SextInReg.lowerFor({{S32}, {S64}});
1707 }
1708
1709 SextInReg
1710 .scalarize(0)
1711 .clampScalar(0, S32, S64)
1712 .lower();
1713
1714 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1715 .scalarize(0)
1716 .lower();
1717
1718 // TODO: Only try to form v2s16 with legal packed instructions.
1719 getActionDefinitionsBuilder(G_FSHR)
1720 .legalFor({{S32, S32}})
1721 .lowerFor({{V2S16, V2S16}})
1722 .clampMaxNumElementsStrict(0, S16, 2)
1723 .scalarize(0)
1724 .lower();
1725
1726 if (ST.hasVOP3PInsts()) {
1727 getActionDefinitionsBuilder(G_FSHL)
1728 .lowerFor({{V2S16, V2S16}})
1729 .clampMaxNumElementsStrict(0, S16, 2)
1730 .scalarize(0)
1731 .lower();
1732 } else {
1733 getActionDefinitionsBuilder(G_FSHL)
1734 .scalarize(0)
1735 .lower();
1736 }
1737
1738 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1739 .legalFor({S64});
1740
1741 getActionDefinitionsBuilder(G_FENCE)
1742 .alwaysLegal();
1743
1744 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1745 .scalarize(0)
1746 .minScalar(0, S32)
1747 .lower();
1748
1749 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1750 .legalFor({{S32, S32}, {S64, S32}})
1751 .clampScalar(1, S32, S32)
1752 .clampScalar(0, S32, S64)
1753 .widenScalarToNextPow2(0)
1754 .scalarize(0);
1755
1756 getActionDefinitionsBuilder({
1757 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1758 G_FCOPYSIGN,
1759
1760 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1761 G_ATOMICRMW_NAND,
1762 G_ATOMICRMW_FSUB,
1763 G_READ_REGISTER,
1764 G_WRITE_REGISTER,
1765
1766 G_SADDO, G_SSUBO,
1767
1768 // TODO: Implement
1769 G_FMINIMUM, G_FMAXIMUM}).lower();
1770
1771 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1772 .lower();
1773
1774 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1775 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1776 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1777 .unsupported();
1778
1779 getLegacyLegalizerInfo().computeTables();
1780 verify(*ST.getInstrInfo());
1781}
1782
1783bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1784 MachineInstr &MI) const {
1785 MachineIRBuilder &B = Helper.MIRBuilder;
1786 MachineRegisterInfo &MRI = *B.getMRI();
1787
1788 switch (MI.getOpcode()) {
1789 case TargetOpcode::G_ADDRSPACE_CAST:
1790 return legalizeAddrSpaceCast(MI, MRI, B);
1791 case TargetOpcode::G_FRINT:
1792 return legalizeFrint(MI, MRI, B);
1793 case TargetOpcode::G_FCEIL:
1794 return legalizeFceil(MI, MRI, B);
1795 case TargetOpcode::G_FREM:
1796 return legalizeFrem(MI, MRI, B);
1797 case TargetOpcode::G_INTRINSIC_TRUNC:
1798 return legalizeIntrinsicTrunc(MI, MRI, B);
1799 case TargetOpcode::G_SITOFP:
1800 return legalizeITOFP(MI, MRI, B, true);
1801 case TargetOpcode::G_UITOFP:
1802 return legalizeITOFP(MI, MRI, B, false);
1803 case TargetOpcode::G_FPTOSI:
1804 return legalizeFPTOI(MI, MRI, B, true);
1805 case TargetOpcode::G_FPTOUI:
1806 return legalizeFPTOI(MI, MRI, B, false);
1807 case TargetOpcode::G_FMINNUM:
1808 case TargetOpcode::G_FMAXNUM:
1809 case TargetOpcode::G_FMINNUM_IEEE:
1810 case TargetOpcode::G_FMAXNUM_IEEE:
1811 return legalizeMinNumMaxNum(Helper, MI);
1812 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1813 return legalizeExtractVectorElt(MI, MRI, B);
1814 case TargetOpcode::G_INSERT_VECTOR_ELT:
1815 return legalizeInsertVectorElt(MI, MRI, B);
1816 case TargetOpcode::G_FSIN:
1817 case TargetOpcode::G_FCOS:
1818 return legalizeSinCos(MI, MRI, B);
1819 case TargetOpcode::G_GLOBAL_VALUE:
1820 return legalizeGlobalValue(MI, MRI, B);
1821 case TargetOpcode::G_LOAD:
1822 case TargetOpcode::G_SEXTLOAD:
1823 case TargetOpcode::G_ZEXTLOAD:
1824 return legalizeLoad(Helper, MI);
1825 case TargetOpcode::G_FMAD:
1826 return legalizeFMad(MI, MRI, B);
1827 case TargetOpcode::G_FDIV:
1828 return legalizeFDIV(MI, MRI, B);
1829 case TargetOpcode::G_UDIV:
1830 case TargetOpcode::G_UREM:
1831 case TargetOpcode::G_UDIVREM:
1832 return legalizeUnsignedDIV_REM(MI, MRI, B);
1833 case TargetOpcode::G_SDIV:
1834 case TargetOpcode::G_SREM:
1835 case TargetOpcode::G_SDIVREM:
1836 return legalizeSignedDIV_REM(MI, MRI, B);
1837 case TargetOpcode::G_ATOMIC_CMPXCHG:
1838 return legalizeAtomicCmpXChg(MI, MRI, B);
1839 case TargetOpcode::G_FLOG:
1840 return legalizeFlog(MI, B, numbers::ln2f);
1841 case TargetOpcode::G_FLOG10:
1842 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1843 case TargetOpcode::G_FEXP:
1844 return legalizeFExp(MI, B);
1845 case TargetOpcode::G_FPOW:
1846 return legalizeFPow(MI, B);
1847 case TargetOpcode::G_FFLOOR:
1848 return legalizeFFloor(MI, MRI, B);
1849 case TargetOpcode::G_BUILD_VECTOR:
1850 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
1851 return legalizeBuildVector(MI, MRI, B);
1852 case TargetOpcode::G_MUL:
1853 return legalizeMul(Helper, MI);
1854 case TargetOpcode::G_CTLZ:
1855 case TargetOpcode::G_CTTZ:
1856 return legalizeCTLZ_CTTZ(MI, MRI, B);
1857 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1858 return legalizeFPTruncRound(MI, B);
1859 default:
1860 return false;
1861 }
1862
1863 llvm_unreachable("expected switch to return");
1864}
1865
1866Register AMDGPULegalizerInfo::getSegmentAperture(
1867 unsigned AS,
1868 MachineRegisterInfo &MRI,
1869 MachineIRBuilder &B) const {
1870 MachineFunction &MF = B.getMF();
1871 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1872 const LLT S32 = LLT::scalar(32);
1873 const LLT S64 = LLT::scalar(64);
1874
1875 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1876
1877 if (ST.hasApertureRegs()) {
1878 // Note: this register is somewhat broken. When used as a 32-bit operand,
1879 // it only returns zeroes. The real value is in the upper 32 bits.
1880 // Thus, we must extract the high 32 bits.
1881 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
1882 ? AMDGPU::SRC_SHARED_BASE
1883 : AMDGPU::SRC_PRIVATE_BASE;
1884 // FIXME: It would be more natural to emit a COPY here, but then copy
1885 // coalescing would kick in and it would think it's okay to use the "HI"
1886 // subregister (instead of extracting the HI 32 bits) which is an artificial
1887 // (unusable) register.
1888 // Register TableGen definitions would need an overhaul to get rid of the
1889 // artificial "HI" aperture registers and prevent this kind of issue from
1890 // happening.
1891 Register Dst = MRI.createGenericVirtualRegister(S64);
1892 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
1893 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
1894 return B.buildUnmerge(S32, Dst).getReg(1);
1895 }
1896
1897 // TODO: can we be smarter about machine pointer info?
1898 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1899 Register LoadAddr = MRI.createGenericVirtualRegister(
1900 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1901 // For code object version 5, private_base and shared_base are passed through
1902 // implicit kernargs.
1903 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
1904 AMDGPU::AMDHSA_COV5) {
1905 AMDGPUTargetLowering::ImplicitParameter Param =
1906 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1907 : AMDGPUTargetLowering::PRIVATE_BASE;
1908 uint64_t Offset =
1909 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1910
1911 Register KernargPtrReg = MRI.createGenericVirtualRegister(
1912 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1913
1914 if (!loadInputValue(KernargPtrReg, B,
1915 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1916 return Register();
1917
1918 MachineMemOperand *MMO = MF.getMachineMemOperand(
1919 PtrInfo,
1920 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1921 MachineMemOperand::MOInvariant,
1922 LLT::scalar(32), commonAlignment(Align(64), Offset));
1923
1924 // Pointer address
1925 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1926 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1927 // Load address
1928 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1929 }
1930
1931 Register QueuePtr = MRI.createGenericVirtualRegister(
1932 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1933
1934 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1935 return Register();
1936
1937 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1938 // private_segment_aperture_base_hi.
1939 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1940
1941 MachineMemOperand *MMO = MF.getMachineMemOperand(
1942 PtrInfo,
1943 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1944 MachineMemOperand::MOInvariant,
1945 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1946
1947 B.buildPtrAdd(LoadAddr, QueuePtr,
1948 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1949 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1950}
1951
1952/// Return true if the value is a known valid address, such that a null check is
1953/// not necessary.
1954static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1955 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1956 MachineInstr *Def = MRI.getVRegDef(Val);
1957 switch (Def->getOpcode()) {
1958 case AMDGPU::G_FRAME_INDEX:
1959 case AMDGPU::G_GLOBAL_VALUE:
1960 case AMDGPU::G_BLOCK_ADDR:
1961 return true;
1962 case AMDGPU::G_CONSTANT: {
1963 const ConstantInt *CI = Def->getOperand(1).getCImm();
1964 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1965 }
1966 default:
1967 return false;
1968 }
1969
1970 return false;
1971}
1972
1973bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1974 MachineInstr &MI, MachineRegisterInfo &MRI,
1975 MachineIRBuilder &B) const {
1976 MachineFunction &MF = B.getMF();
1977
1978 const LLT S32 = LLT::scalar(32);
1979 Register Dst = MI.getOperand(0).getReg();
1980 Register Src = MI.getOperand(1).getReg();
1981
1982 LLT DstTy = MRI.getType(Dst);
1983 LLT SrcTy = MRI.getType(Src);
1984 unsigned DestAS = DstTy.getAddressSpace();
1985 unsigned SrcAS = SrcTy.getAddressSpace();
1986
1987 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1988 // vector element.
1989 assert(!DstTy.isVector());
1990
1991 const AMDGPUTargetMachine &TM
1992 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1993
1994 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1995 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1996 return true;
1997 }
1998
1999 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2000 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2001 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2002 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2003 // Extract low 32-bits of the pointer.
2004 B.buildExtract(Dst, Src, 0);
2005 MI.eraseFromParent();
2006 return true;
2007 }
2008
2009 unsigned NullVal = TM.getNullPointerValue(DestAS);
2010
2011 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2012 auto FlatNull = B.buildConstant(SrcTy, 0);
2013
2014 // Extract low 32-bits of the pointer.
2015 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2016
2017 auto CmpRes =
2018 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2019 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2020
2021 MI.eraseFromParent();
2022 return true;
2023 }
2024
2025 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2026 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2027 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2028 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2029 if (!ApertureReg.isValid())
2030 return false;
2031
2032 // Coerce the type of the low half of the result so we can use merge_values.
2033 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2034
2035 // TODO: Should we allow mismatched types but matching sizes in merges to
2036 // avoid the ptrtoint?
2037 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2038
2039 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2040 B.buildCopy(Dst, BuildPtr);
2041 MI.eraseFromParent();
2042 return true;
2043 }
2044
2045 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2046 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2047
2048 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2049 SegmentNull.getReg(0));
2050
2051 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2052
2053 MI.eraseFromParent();
2054 return true;
2055 }
2056
2057 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2058 SrcTy.getSizeInBits() == 64) {
2059 // Truncate.
2060 B.buildExtract(Dst, Src, 0);
2061 MI.eraseFromParent();
2062 return true;
2063 }
2064
2065 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2066 DstTy.getSizeInBits() == 64) {
2067 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2068 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2069 auto PtrLo = B.buildPtrToInt(S32, Src);
2070 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2071 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2072 MI.eraseFromParent();
2073 return true;
2074 }
2075
2076 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2077 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2078
2079 LLVMContext &Ctx = MF.getFunction().getContext();
2080 Ctx.diagnose(InvalidAddrSpaceCast);
2081 B.buildUndef(Dst);
2082 MI.eraseFromParent();
2083 return true;
2084}
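
// Conceptual scalar forms of the two casts built above (illustrative sketch;
// SegNull/FlatNull stand for the target's null pointer encodings and
// ApertureHi for the 32-bit aperture value loaded by getSegmentAperture):
static uint32_t flatToSegmentSketch(uint64_t FlatPtr, uint32_t SegNull) {
  // flat -> local/private: keep the low 32 bits, but map flat null (0) to the
  // segment's null value.
  return FlatPtr != 0 ? static_cast<uint32_t>(FlatPtr) : SegNull;
}

static uint64_t segmentToFlatSketch(uint32_t SegPtr, uint32_t ApertureHi,
                                    uint32_t SegNull, uint64_t FlatNull) {
  // local/private -> flat: the aperture supplies the high half of the 64-bit
  // flat address, and the segment null maps to the flat null.
  if (SegPtr == SegNull)
    return FlatNull;
  return (static_cast<uint64_t>(ApertureHi) << 32) | SegPtr;
}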
2085
2086bool AMDGPULegalizerInfo::legalizeFrint(
2087 MachineInstr &MI, MachineRegisterInfo &MRI,
2088 MachineIRBuilder &B) const {
2089 Register Src = MI.getOperand(1).getReg();
2090 LLT Ty = MRI.getType(Src);
2091 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2092
2093 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2094 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2095
2096 auto C1 = B.buildFConstant(Ty, C1Val);
2097 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2098
2099 // TODO: Should this propagate fast-math-flags?
2100 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2101 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2102
2103 auto C2 = B.buildFConstant(Ty, C2Val);
2104 auto Fabs = B.buildFAbs(Ty, Src);
2105
2106 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2107 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2108 MI.eraseFromParent();
2109 return true;
2110}
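
// Illustrative scalar sketch of the 2^52 magic-constant rounding used above,
// assuming <cmath>, default round-to-nearest FP semantics, and no fast-math
// reassociation (the real lowering builds the equivalent G_* nodes):
static double frintScalarSketch(double Src) {
  const double C1 = std::copysign(0x1.0p+52, Src);  // 2^52 with Src's sign
  const double Rounded = (Src + C1) - C1;           // forces rounding to integer
  // Values with |Src| > 0x1.fffffffffffffp+51 are already integral.
  return std::fabs(Src) > 0x1.fffffffffffffp+51 ? Src : Rounded;
}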
2111
2112bool AMDGPULegalizerInfo::legalizeFceil(
2113 MachineInstr &MI, MachineRegisterInfo &MRI,
2114 MachineIRBuilder &B) const {
2115
2116 const LLT S1 = LLT::scalar(1);
2117 const LLT S64 = LLT::scalar(64);
2118
2119 Register Src = MI.getOperand(1).getReg();
2120 assert(MRI.getType(Src) == S64);
2121
2122 // result = trunc(src)
2123 // if (src > 0.0 && src != result)
2124 // result += 1.0
2125
2126 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2127
2128 const auto Zero = B.buildFConstant(S64, 0.0);
2129 const auto One = B.buildFConstant(S64, 1.0);
2130 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2131 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2132 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2133 auto Add = B.buildSelect(S64, And, One, Zero);
2134
2135 // TODO: Should this propagate fast-math-flags?
2136 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2137 MI.eraseFromParent();
2138 return true;
2139}
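
// Minimal scalar sketch of the trunc-based ceil expansion described in the
// comment above (illustrative only; assumes <cmath>):
static double fceilScalarSketch(double Src) {
  double Result = std::trunc(Src);
  if (Src > 0.0 && Src != Result)
    Result += 1.0;
  return Result;
}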
2140
2141bool AMDGPULegalizerInfo::legalizeFrem(
2142 MachineInstr &MI, MachineRegisterInfo &MRI,
2143 MachineIRBuilder &B) const {
2144 Register DstReg = MI.getOperand(0).getReg();
2145 Register Src0Reg = MI.getOperand(1).getReg();
2146 Register Src1Reg = MI.getOperand(2).getReg();
2147 auto Flags = MI.getFlags();
2148 LLT Ty = MRI.getType(DstReg);
2149
2150 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2151 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2152 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2153 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2154 MI.eraseFromParent();
2155 return true;
2156}
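
// Scalar form of the frem expansion built above: rem = a - trunc(a / b) * b,
// with the final step written as an FMA (illustrative sketch, assumes <cmath>):
static double fremScalarSketch(double A, double B) {
  return std::fma(-std::trunc(A / B), B, A);
}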
2157
2158static MachineInstrBuilder extractF64Exponent(Register Hi,
2159 MachineIRBuilder &B) {
2160 const unsigned FractBits = 52;
2161 const unsigned ExpBits = 11;
2162 LLT S32 = LLT::scalar(32);
2163
2164 auto Const0 = B.buildConstant(S32, FractBits - 32);
2165 auto Const1 = B.buildConstant(S32, ExpBits);
2166
2167 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2168 .addUse(Hi)
2169 .addUse(Const0.getReg(0))
2170 .addUse(Const1.getReg(0));
2171
2172 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2173}
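
// Scalar equivalent of the ubfe + bias subtraction above (illustrative): the
// 11-bit exponent field sits at bits [20..30] of the high word of an f64, and
// 1023 is the IEEE-754 double exponent bias.
static int32_t f64ExponentFromHi32(uint32_t Hi) {
  return static_cast<int32_t>((Hi >> 20) & 0x7ff) - 1023;
}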
2174
2175bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2176 MachineInstr &MI, MachineRegisterInfo &MRI,
2177 MachineIRBuilder &B) const {
2178 const LLT S1 = LLT::scalar(1);
2179 const LLT S32 = LLT::scalar(32);
2180 const LLT S64 = LLT::scalar(64);
2181
2182 Register Src = MI.getOperand(1).getReg();
2183 assert(MRI.getType(Src) == S64);
2184
2185 // TODO: Should this use extract since the low half is unused?
2186 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2187 Register Hi = Unmerge.getReg(1);
2188
2189 // Extract the upper half, since this is where we will find the sign and
2190 // exponent.
2191 auto Exp = extractF64Exponent(Hi, B);
2192
2193 const unsigned FractBits = 52;
2194
2195 // Extract the sign bit.
2196 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2197 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2198
2199 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2200
2201 const auto Zero32 = B.buildConstant(S32, 0);
2202
2203 // Extend back to 64-bits.
2204 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2205
2206 auto Shr = B.buildAShr(S64, FractMask, Exp);
2207 auto Not = B.buildNot(S64, Shr);
2208 auto Tmp0 = B.buildAnd(S64, Src, Not);
2209 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2210
2211 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2212 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2213
2214 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2215 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2216 MI.eraseFromParent();
2217 return true;
2218}
2219
2220bool AMDGPULegalizerInfo::legalizeITOFP(
2221 MachineInstr &MI, MachineRegisterInfo &MRI,
2222 MachineIRBuilder &B, bool Signed) const {
2223
2224 Register Dst = MI.getOperand(0).getReg();
2225 Register Src = MI.getOperand(1).getReg();
2226
2227 const LLT S64 = LLT::scalar(64);
2228 const LLT S32 = LLT::scalar(32);
2229
2230 assert(MRI.getType(Src) == S64);
2231
2232 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2233 auto ThirtyTwo = B.buildConstant(S32, 32);
2234
2235 if (MRI.getType(Dst) == S64) {
2236 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2237 : B.buildUITOFP(S64, Unmerge.getReg(1));
2238
2239 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2240 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2241 .addUse(CvtHi.getReg(0))
2242 .addUse(ThirtyTwo.getReg(0));
2243
2244 // TODO: Should this propagate fast-math-flags?
2245 B.buildFAdd(Dst, LdExp, CvtLo);
2246 MI.eraseFromParent();
2247 return true;
2248 }
2249
2250 assert(MRI.getType(Dst) == S32);
2251
2252 auto One = B.buildConstant(S32, 1);
2253
2254 MachineInstrBuilder ShAmt;
2255 if (Signed) {
2256 auto ThirtyOne = B.buildConstant(S32, 31);
2257 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2258 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2259 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2260 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2261 /*HasSideEffects=*/false)
2262 .addUse(Unmerge.getReg(1));
2263 auto LS2 = B.buildSub(S32, LS, One);
2264 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2265 } else
2266 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2267 auto Norm = B.buildShl(S64, Src, ShAmt);
2268 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2269 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2270 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2271 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2272 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2273 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2274 /*HasSideEffects=*/false)
2275 .addUse(FVal.getReg(0))
2276 .addUse(Scale.getReg(0));
2277 MI.eraseFromParent();
2278 return true;
2279}
2280
2281// TODO: Copied from DAG implementation. Verify logic and document how this
2282// actually works.
2283bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2284 MachineRegisterInfo &MRI,
2285 MachineIRBuilder &B,
2286 bool Signed) const {
2287
2288 Register Dst = MI.getOperand(0).getReg();
2289 Register Src = MI.getOperand(1).getReg();
2290
2291 const LLT S64 = LLT::scalar(64);
2292 const LLT S32 = LLT::scalar(32);
2293
2294 const LLT SrcLT = MRI.getType(Src);
2295 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2296
2297 unsigned Flags = MI.getFlags();
2298
2299 // The basic idea of converting a floating point number into a pair of 32-bit
2300 // integers is illustrated as follows:
2301 //
2302 // tf := trunc(val);
2303 // hif := floor(tf * 2^-32);
2304 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2305 // hi := fptoi(hif);
2306 // lo := fptoi(lof);
2307 //
2308 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2309 MachineInstrBuilder Sign;
2310 if (Signed && SrcLT == S32) {
2311 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2312 // is not enough to hold all the significant bits of `lof` if val is
2313 // negative. To avoid the loss of precision, we need to take the absolute
2314 // value after truncating and flip the result back based on the original
2315 // signedness.
2316 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2317 Trunc = B.buildFAbs(S32, Trunc, Flags);
2318 }
2319 MachineInstrBuilder K0, K1;
2320 if (SrcLT == S64) {
2321 K0 = B.buildFConstant(
2322 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2323 K1 = B.buildFConstant(
2324 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2325 } else {
2326 K0 = B.buildFConstant(
2327 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2328 K1 = B.buildFConstant(
2329 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2330 }
2331
2332 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2333 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2334 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2335
2336 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2337 : B.buildFPTOUI(S32, FloorMul);
2338 auto Lo = B.buildFPTOUI(S32, Fma);
2339
2340 if (Signed && SrcLT == S32) {
2341 // Flip the result based on the signedness, which is either all 0s or 1s.
2342 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2343 // r := xor({lo, hi}, sign) - sign;
2344 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2345 Sign);
2346 } else
2347 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2348 MI.eraseFromParent();
2349
2350 return true;
2351}
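
// Scalar model of the tf/hif/lof split described in the comment at the top of
// this function, for the simple unsigned case (illustrative sketch, assumes
// <cmath> and a value that fits in 64 bits):
static void fpToUI64Sketch(double Val, uint32_t &Hi, uint32_t &Lo) {
  double Tf = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1p-32);    // tf * 2^-32
  double Lof = std::fma(Hif, -0x1p+32, Tf); // tf - hif * 2^32, non-negative
  Hi = static_cast<uint32_t>(Hif);
  Lo = static_cast<uint32_t>(Lof);
}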
2352
2353bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2354 MachineInstr &MI) const {
2355 MachineFunction &MF = Helper.MIRBuilder.getMF();
2356 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2357
2358 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2359 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2360
2361 // With ieee_mode disabled, the instructions have the correct behavior
2362 // already for G_FMINNUM/G_FMAXNUM
2363 if (!MFI->getMode().IEEE)
2364 return !IsIEEEOp;
2365
2366 if (IsIEEEOp)
2367 return true;
2368
2369 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2370}
2371
2372bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2373 MachineInstr &MI, MachineRegisterInfo &MRI,
2374 MachineIRBuilder &B) const {
2375 // TODO: Should move some of this into LegalizerHelper.
2376
2377 // TODO: Promote dynamic indexing of s16 to s32
2378
2379 // FIXME: Artifact combiner probably should have replaced the truncated
2380 // constant before this, so we shouldn't need
2381 // getIConstantVRegValWithLookThrough.
2382 std::optional<ValueAndVReg> MaybeIdxVal =
2383 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2384 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2385 return true;
2386 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2387
2388 Register Dst = MI.getOperand(0).getReg();
2389 Register Vec = MI.getOperand(1).getReg();
2390
2391 LLT VecTy = MRI.getType(Vec);
2392 LLT EltTy = VecTy.getElementType();
2393 assert(EltTy == MRI.getType(Dst));
2394
2395 if (IdxVal < VecTy.getNumElements()) {
2396 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2397 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2398 } else {
2399 B.buildUndef(Dst);
2400 }
2401
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2406bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2407 MachineInstr &MI, MachineRegisterInfo &MRI,
2408 MachineIRBuilder &B) const {
2409 // TODO: Should move some of this into LegalizerHelper.
2410
2411 // TODO: Promote dynamic indexing of s16 to s32
2412
2413 // FIXME: Artifact combiner probably should have replaced the truncated
2414 // constant before this, so we shouldn't need
2415 // getIConstantVRegValWithLookThrough.
2416 std::optional<ValueAndVReg> MaybeIdxVal =
2417 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2418 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2419 return true;
2420
2421 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2422 Register Dst = MI.getOperand(0).getReg();
2423 Register Vec = MI.getOperand(1).getReg();
2424 Register Ins = MI.getOperand(2).getReg();
2425
2426 LLT VecTy = MRI.getType(Vec);
2427 LLT EltTy = VecTy.getElementType();
2428 assert(EltTy == MRI.getType(Ins));
2429 (void)Ins;
2430
2431 unsigned NumElts = VecTy.getNumElements();
2432 if (IdxVal < NumElts) {
2433 SmallVector<Register, 8> SrcRegs;
2434 for (unsigned i = 0; i < NumElts; ++i)
2435 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2436 B.buildUnmerge(SrcRegs, Vec);
2437
2438 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2439 B.buildMergeLikeInstr(Dst, SrcRegs);
2440 } else {
2441 B.buildUndef(Dst);
2442 }
2443
2444 MI.eraseFromParent();
2445 return true;
2446}
2447
2448bool AMDGPULegalizerInfo::legalizeSinCos(
2449 MachineInstr &MI, MachineRegisterInfo &MRI,
2450 MachineIRBuilder &B) const {
2451
2452 Register DstReg = MI.getOperand(0).getReg();
2453 Register SrcReg = MI.getOperand(1).getReg();
2454 LLT Ty = MRI.getType(DstReg);
2455 unsigned Flags = MI.getFlags();
2456
2457 Register TrigVal;
2458 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2459 if (ST.hasTrigReducedRange()) {
2460 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2461 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2462 .addUse(MulVal.getReg(0))
2463 .setMIFlags(Flags).getReg(0);
2464 } else
2465 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2466
2467 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2468 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2469 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
2470 .addUse(TrigVal)
2471 .setMIFlags(Flags);
2472 MI.eraseFromParent();
2473 return true;
2474}
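
// Scalar view of the scaling above (illustrative sketch, assumes <cmath>): the
// hardware sin/cos intrinsics take the angle scaled by 1/(2*pi), and targets
// with a reduced input range additionally pass it through fract().
static double sinCosInputSketch(double Radians, bool HasTrigReducedRange) {
  const double OneOver2Pi = 0.15915494309189535; // 0.5 * numbers::inv_pi
  double Scaled = Radians * OneOver2Pi;
  return HasTrigReducedRange ? Scaled - std::floor(Scaled) : Scaled;
}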
2475
2476bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2477 MachineIRBuilder &B,
2478 const GlobalValue *GV,
2479 int64_t Offset,
2480 unsigned GAFlags) const {
2481 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2482 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2483 // to the following code sequence:
2484 //
2485 // For constant address space:
2486 // s_getpc_b64 s[0:1]
2487 // s_add_u32 s0, s0, $symbol
2488 // s_addc_u32 s1, s1, 0
2489 //
2490 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2491 // a fixup or relocation is emitted to replace $symbol with a literal
2492 // constant, which is a pc-relative offset from the encoding of the $symbol
2493 // operand to the global variable.
2494 //
2495 // For global address space:
2496 // s_getpc_b64 s[0:1]
2497 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2498 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2499 //
2500 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2501 // fixups or relocations are emitted to replace $symbol@*@lo and
2502 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2503 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2504 // operand to the global variable.
2505 //
2506 // What we want here is an offset from the value returned by s_getpc
2507 // (which is the address of the s_add_u32 instruction) to the global
2508 // variable, but since the encoding of $symbol starts 4 bytes after the start
2509 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2510 // small. This requires us to add 4 to the global variable offset in order to
2511 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2512 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2513 // instruction.
2514
2515 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2516
2517 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2518 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2519
2520 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2521 .addDef(PCReg);
2522
2523 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2524 if (GAFlags == SIInstrInfo::MO_NONE)
2525 MIB.addImm(0);
2526 else
2527 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2528
2529 if (!B.getMRI()->getRegClassOrNull(PCReg))
2530 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2531
2532 if (PtrTy.getSizeInBits() == 32)
2533 B.buildExtract(DstReg, PCReg, 0);
2534 return true;
2535 }
2536
2537bool AMDGPULegalizerInfo::legalizeGlobalValue(
2538 MachineInstr &MI, MachineRegisterInfo &MRI,
2539 MachineIRBuilder &B) const {
2540 Register DstReg = MI.getOperand(0).getReg();
2541 LLT Ty = MRI.getType(DstReg);
2542 unsigned AS = Ty.getAddressSpace();
2543
2544 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2545 MachineFunction &MF = B.getMF();
2546 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2547
2548 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2549 if (!MFI->isModuleEntryFunction() &&
2550 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2551 const Function &Fn = MF.getFunction();
2552 DiagnosticInfoUnsupported BadLDSDecl(
2553 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2554 DS_Warning);
2555 Fn.getContext().diagnose(BadLDSDecl);
2556
2557 // We currently don't have a way to correctly allocate LDS objects that
2558 // aren't directly associated with a kernel. We do force inlining of
2559 // functions that use local objects. However, if these dead functions are
2560 // not eliminated, we don't want a compile time error. Just emit a warning
2561 // and a trap, since there should be no callable path here.
2562 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2563 B.buildUndef(DstReg);
2564 MI.eraseFromParent();
2565 return true;
2566 }
2567
2568 // TODO: We could emit code to handle the initialization somewhere.
2569 // We ignore the initializer for now and legalize it to allow selection.
2571 // The initializer will be rejected during assembly emission anyway.
2571 const SITargetLowering *TLI = ST.getTargetLowering();
2572 if (!TLI->shouldUseLDSConstAddress(GV)) {
2573 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2574 return true; // Leave in place;
2575 }
2576
2577 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2578 Type *Ty = GV->getValueType();
2579 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2580 // zero-sized type in other languages to declare the dynamic shared
2581 // memory whose size is not known at compile time. They will be
2582 // allocated by the runtime and placed directly after the statically
2583 // allocated ones. They all share the same offset.
2584 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2585 // Adjust alignment for that dynamic shared memory array.
2586 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2587 LLT S32 = LLT::scalar(32);
2588 auto Sz =
2589 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2590 B.buildIntToPtr(DstReg, Sz);
2591 MI.eraseFromParent();
2592 return true;
2593 }
2594 }
2595
2596 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2597 *cast<GlobalVariable>(GV)));
2598 MI.eraseFromParent();
2599 return true;
2600 }
2601
2602 const SITargetLowering *TLI = ST.getTargetLowering();
2603
2604 if (TLI->shouldEmitFixup(GV)) {
2605 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2606 MI.eraseFromParent();
2607 return true;
2608 }
2609
2610 if (TLI->shouldEmitPCReloc(GV)) {
2611 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2612 MI.eraseFromParent();
2613 return true;
2614 }
2615
2616 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2617 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2618
2619 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2620 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2621 MachinePointerInfo::getGOT(MF),
2622 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2623 MachineMemOperand::MOInvariant,
2624 LoadTy, Align(8));
2625
2626 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2627
2628 if (Ty.getSizeInBits() == 32) {
2629 // Truncate if this is a 32-bit constant address.
2630 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2631 B.buildExtract(DstReg, Load, 0);
2632 } else
2633 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2634
2635 MI.eraseFromParent();
2636 return true;
2637}
2638
2639static LLT widenToNextPowerOf2(LLT Ty) {
2640 if (Ty.isVector())
2641 return Ty.changeElementCount(
2642 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2643 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2644}
2645
2646bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2647 MachineInstr &MI) const {
2648 MachineIRBuilder &B = Helper.MIRBuilder;
2649 MachineRegisterInfo &MRI = *B.getMRI();
2650 GISelChangeObserver &Observer = Helper.Observer;
2651
2652 Register PtrReg = MI.getOperand(1).getReg();
2653 LLT PtrTy = MRI.getType(PtrReg);
2654 unsigned AddrSpace = PtrTy.getAddressSpace();
2655
2656 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2657 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2658 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2659 Observer.changingInstr(MI);
2660 MI.getOperand(1).setReg(Cast.getReg(0));
2661 Observer.changedInstr(MI);
2662 return true;
2663 }
2664
2665 if (MI.getOpcode() != AMDGPU::G_LOAD)
2666 return false;
2667
2668 Register ValReg = MI.getOperand(0).getReg();
2669 LLT ValTy = MRI.getType(ValReg);
2670
2671 MachineMemOperand *MMO = *MI.memoperands_begin();
2672 const unsigned ValSize = ValTy.getSizeInBits();
2673 const LLT MemTy = MMO->getMemoryType();
2674 const Align MemAlign = MMO->getAlign();
2675 const unsigned MemSize = MemTy.getSizeInBits();
2676 const uint64_t AlignInBits = 8 * MemAlign.value();
2677
2678 // Widen non-power-of-2 loads to the alignment if needed
2679 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2680 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2681
2682 // This was already the correct extending load result type, so just adjust
2683 // the memory type.
2684 if (WideMemSize == ValSize) {
2685 MachineFunction &MF = B.getMF();
2686
2687 MachineMemOperand *WideMMO =
2688 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2689 Observer.changingInstr(MI);
2690 MI.setMemRefs(MF, {WideMMO});
2691 Observer.changedInstr(MI);
2692 return true;
2693 }
2694
2695 // Don't bother handling an edge case that should probably never be produced.
2696 if (ValSize > WideMemSize)
2697 return false;
2698
2699 LLT WideTy = widenToNextPowerOf2(ValTy);
2700
2701 Register WideLoad;
2702 if (!WideTy.isVector()) {
2703 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2704 B.buildTrunc(ValReg, WideLoad).getReg(0);
2705 } else {
2706 // Extract the subvector.
2707
2708 if (isRegisterType(ValTy)) {
2709 // If this a case where G_EXTRACT is legal, use it.
2710 // (e.g. <3 x s32> -> <4 x s32>)
2711 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2712 B.buildExtract(ValReg, WideLoad, 0);
2713 } else {
2714 // For cases where the widened type isn't a nice register value, unmerge
2715 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2716 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2717 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2718 }
2719 }
2720
2721 MI.eraseFromParent();
2722 return true;
2723 }
2724
2725 return false;
2726}
2727
2728bool AMDGPULegalizerInfo::legalizeFMad(
2729 MachineInstr &MI, MachineRegisterInfo &MRI,
2730 MachineIRBuilder &B) const {
2731 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2732 assert(Ty.isScalar())(static_cast <bool> (Ty.isScalar()) ? void (0) : __assert_fail
("Ty.isScalar()", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 2732, __extension__ __PRETTY_FUNCTION__))
;
2733
2734 MachineFunction &MF = B.getMF();
2735 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2736
2737 // TODO: Always legal with future ftz flag.
2738 // FIXME: Do we need just output?
2739 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2740 return true;
2741 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2742 return true;
2743
2744 MachineIRBuilder HelperBuilder(MI);
2745 GISelObserverWrapper DummyObserver;
2746 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2747 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2748}
2749
2750bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2751 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2752 Register DstReg = MI.getOperand(0).getReg();
2753 Register PtrReg = MI.getOperand(1).getReg();
2754 Register CmpVal = MI.getOperand(2).getReg();
2755 Register NewVal = MI.getOperand(3).getReg();
2756
2757 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2758 "this should not have been custom lowered");
2759
2760 LLT ValTy = MRI.getType(CmpVal);
2761 LLT VecTy = LLT::fixed_vector(2, ValTy);
2762
2763 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2764
2765 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2766 .addDef(DstReg)
2767 .addUse(PtrReg)
2768 .addUse(PackedVal)
2769 .setMemRefs(MI.memoperands());
2770
2771 MI.eraseFromParent();
2772 return true;
2773}
2774
2775bool AMDGPULegalizerInfo::legalizeFlog(
2776 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2777 Register Dst = MI.getOperand(0).getReg();
2778 Register Src = MI.getOperand(1).getReg();
2779 LLT Ty = B.getMRI()->getType(Dst);
2780 unsigned Flags = MI.getFlags();
2781
2782 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2783 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2784
2785 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2786 MI.eraseFromParent();
2787 return true;
2788}
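
// Scalar form of the rewrite above (illustrative sketch, assumes <cmath>):
// log_b(x) = log2(x) * Log2BaseInverted, where the callers pass ln(2) for
// G_FLOG and ln(2)/ln(10) for G_FLOG10.
static float flogScalarSketch(float X, float Log2BaseInverted) {
  return std::log2(X) * Log2BaseInverted;
}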
2789
2790bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2791 MachineIRBuilder &B) const {
2792 Register Dst = MI.getOperand(0).getReg();
2793 Register Src = MI.getOperand(1).getReg();
2794 unsigned Flags = MI.getFlags();
2795 LLT Ty = B.getMRI()->getType(Dst);
2796
2797 auto K = B.buildFConstant(Ty, numbers::log2e);
2798 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2799 B.buildFExp2(Dst, Mul, Flags);
2800 MI.eraseFromParent();
2801 return true;
2802}
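
// Scalar form of the expansion above (illustrative sketch, assumes <cmath>):
// exp(x) = exp2(x * log2(e)).
static float fexpScalarSketch(float X) {
  const float Log2E = 1.44269504088896340736f; // numbers::log2e
  return std::exp2(X * Log2E);
}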
2803
2804bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2805 MachineIRBuilder &B) const {
2806 Register Dst = MI.getOperand(0).getReg();
2807 Register Src0 = MI.getOperand(1).getReg();
2808 Register Src1 = MI.getOperand(2).getReg();
2809 unsigned Flags = MI.getFlags();
2810 LLT Ty = B.getMRI()->getType(Dst);
2811 const LLT S16 = LLT::scalar(16);
2812 const LLT S32 = LLT::scalar(32);
2813
2814 if (Ty == S32) {
2815 auto Log = B.buildFLog2(S32, Src0, Flags);
2816 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2817 .addUse(Log.getReg(0))
2818 .addUse(Src1)
2819 .setMIFlags(Flags);
2820 B.buildFExp2(Dst, Mul, Flags);
2821 } else if (Ty == S16) {
2822 // There's no f16 fmul_legacy, so we need to convert for it.
2823 auto Log = B.buildFLog2(S16, Src0, Flags);
2824 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2825 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2826 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2827 .addUse(Ext0.getReg(0))
2828 .addUse(Ext1.getReg(0))
2829 .setMIFlags(Flags);
2830
2831 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2832 } else
2833 return false;
2834
2835 MI.eraseFromParent();
2836 return true;
2837}
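
// Scalar form of the expansion above (illustrative sketch, assumes <cmath>):
// pow(x, y) = exp2(log2(x) * y). The real lowering does the multiply with
// v_mul_legacy semantics and, for f16, performs it at f32 precision.
static float fpowScalarSketch(float X, float Y) {
  return std::exp2(std::log2(X) * Y);
}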
2838
2839// Find a source register, ignoring any possible source modifiers.
2840static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2841 Register ModSrc = OrigSrc;
2842 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2843 ModSrc = SrcFNeg->getOperand(1).getReg();
2844 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2845 ModSrc = SrcFAbs->getOperand(1).getReg();
2846 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2847 ModSrc = SrcFAbs->getOperand(1).getReg();
2848 return ModSrc;
2849}
2850
2851bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2852 MachineRegisterInfo &MRI,
2853 MachineIRBuilder &B) const {
2854
2855 const LLT S1 = LLT::scalar(1);
2856 const LLT S64 = LLT::scalar(64);
2857 Register Dst = MI.getOperand(0).getReg();
2858 Register OrigSrc = MI.getOperand(1).getReg();
2859 unsigned Flags = MI.getFlags();
2860 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2861 "this should not have been custom lowered");
2862
2863 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2864 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2865 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2866 // V_FRACT bug is:
2867 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2868 //
2869 // Convert floor(x) to (x - fract(x))
2870
2871 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2872 .addUse(OrigSrc)
2873 .setMIFlags(Flags);
2874
2875 // Give source modifier matching some assistance before obscuring a foldable
2876 // pattern.
2877
2878 // TODO: We can avoid the neg on the fract? The input sign to fract
2879 // shouldn't matter?
2880 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2881
2882 auto Const =
2883 B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
2884
2885 Register Min = MRI.createGenericVirtualRegister(S64);
2886
2887 // We don't need to concern ourselves with the snan handling difference, so
2888 // use the one which will directly select.
2889 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2890 if (MFI->getMode().IEEE)
2891 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2892 else
2893 B.buildFMinNum(Min, Fract, Const, Flags);
2894
2895 Register CorrectedFract = Min;
2896 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2897 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2898 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2899 }
2900
2901 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2902 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2903
2904 MI.eraseFromParent();
2905 return true;
2906}
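
// Scalar model of the workaround described above (illustrative sketch,
// assumes <cmath>; FractHW stands for the hardware V_FRACT_F64 result):
static double ffloorScalarSketch(double X, double FractHW) {
  // Clamp fract to just below 1.0 and pass NaN inputs through, then
  // floor(x) = x - fract(x).
  double Clamped = std::fmin(FractHW, 0x1.fffffffffffffp-1);
  double Corrected = std::isnan(X) ? X : Clamped;
  return X - Corrected;
}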
2907
2908// Turn an illegal packed v2s16 build vector into bit operations.
2909// TODO: This should probably be a bitcast action in LegalizerHelper.
2910bool AMDGPULegalizerInfo::legalizeBuildVector(
2911 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2912 Register Dst = MI.getOperand(0).getReg();
2913 const LLT S32 = LLT::scalar(32);
2914 const LLT S16 = LLT::scalar(16);
2915 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2916
2917 Register Src0 = MI.getOperand(1).getReg();
2918 Register Src1 = MI.getOperand(2).getReg();
2919
2920 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2921 assert(MRI.getType(Src0) == S32);
2922 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
2923 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
2924 }
2925
2926 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
2927 B.buildBitcast(Dst, Merge);
2928
2929 MI.eraseFromParent();
2930 return true;
2931}
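
// Scalar view of the pack built above (illustrative sketch): two 16-bit
// elements become one 32-bit value, with element 0 in the low half.
static uint32_t packV2S16Sketch(uint16_t Elt0, uint16_t Elt1) {
  return static_cast<uint32_t>(Elt0) | (static_cast<uint32_t>(Elt1) << 16);
}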
2932
2933// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2934//
2935// Source and accumulation registers must all be 32-bits.
2936//
2937// TODO: When the multiply is uniform, we should produce a code sequence
2938// that is better suited to instruction selection on the SALU. Instead of
2939// the outer loop going over parts of the result, the outer loop should go
2940// over parts of one of the factors. This should result in instruction
2941// selection that makes full use of S_ADDC_U32 instructions.
2942void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
2943 MutableArrayRef<Register> Accum,
2944 ArrayRef<Register> Src0,
2945 ArrayRef<Register> Src1,
2946 bool UsePartialMad64_32,
2947 bool SeparateOddAlignedProducts) const {
2948 // Use (possibly empty) vectors of S1 registers to represent the set of
2949 // carries from one pair of positions to the next.
2950 using Carry = SmallVector<Register, 2>;
2951
2952 MachineIRBuilder &B = Helper.MIRBuilder;
2953 GISelKnownBits &KB = *Helper.getKnownBits();
2954
2955 const LLT S1 = LLT::scalar(1);
2956 const LLT S32 = LLT::scalar(32);
2957 const LLT S64 = LLT::scalar(64);
2958
2959 Register Zero32;
2960 Register Zero64;
2961
2962 auto getZero32 = [&]() -> Register {
2963 if (!Zero32)
2964 Zero32 = B.buildConstant(S32, 0).getReg(0);
2965 return Zero32;
2966 };
2967 auto getZero64 = [&]() -> Register {
2968 if (!Zero64)
2969 Zero64 = B.buildConstant(S64, 0).getReg(0);
2970 return Zero64;
2971 };
2972
2973 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
2974 for (unsigned i = 0; i < Src0.size(); ++i) {
2975 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
2976 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
2977 }
2978
2979 // Merge the given carries into the 32-bit LocalAccum, which is modified
2980 // in-place.
2981 //
2982 // Returns the carry-out, which is a single S1 register or null.
2983 auto mergeCarry =
2984 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2985 if (CarryIn.empty())
2986 return Register();
2987
2988 bool HaveCarryOut = true;
2989 Register CarryAccum;
2990 if (CarryIn.size() == 1) {
2991 if (!LocalAccum) {
2992 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2993 return Register();
2994 }
2995
2996 CarryAccum = getZero32();
2997 } else {
2998 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2999 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3000 CarryAccum =
3001 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3002 .getReg(0);
3003 }
3004
3005 if (!LocalAccum) {
3006 LocalAccum = getZero32();
3007 HaveCarryOut = false;
3008 }
3009 }
3010
3011 auto Add =
3012 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3013 LocalAccum = Add.getReg(0);
3014 return HaveCarryOut ? Add.getReg(1) : Register();
3015 };
3016
3017 // Build a multiply-add chain to compute
3018 //
3019 // LocalAccum + (partial products at DstIndex)
3020 // + (opportunistic subset of CarryIn)
3021 //
3022 // LocalAccum is an array of one or two 32-bit registers that are updated
3023 // in-place. The incoming registers may be null.
3024 //
3025 // In some edge cases, carry-ins can be consumed "for free". In that case,
3026 // the consumed carry bits are removed from CarryIn in-place.
3027 auto buildMadChain =
3028 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3029 -> Carry {
3030    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3031           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3032
3033 Carry CarryOut;
3034 unsigned j0 = 0;
3035
3036 // Use plain 32-bit multiplication for the most significant part of the
3037 // result by default.
3038 if (LocalAccum.size() == 1 &&
3039 (!UsePartialMad64_32 || !CarryIn.empty())) {
3040 do {
3041 // Skip multiplication if one of the operands is 0
3042 unsigned j1 = DstIndex - j0;
3043 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3044 ++j0;
3045 continue;
3046 }
3047 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3048 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3049 LocalAccum[0] = Mul.getReg(0);
3050 } else {
3051 if (CarryIn.empty()) {
3052 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3053 } else {
3054 LocalAccum[0] =
3055 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3056 .getReg(0);
3057 CarryIn.pop_back();
3058 }
3059 }
3060 ++j0;
3061 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3062 }
3063
3064 // Build full 64-bit multiplies.
3065 if (j0 <= DstIndex) {
3066 bool HaveSmallAccum = false;
3067 Register Tmp;
3068
3069 if (LocalAccum[0]) {
3070 if (LocalAccum.size() == 1) {
3071 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3072 HaveSmallAccum = true;
3073 } else if (LocalAccum[1]) {
3074 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3075 HaveSmallAccum = false;
3076 } else {
3077 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3078 HaveSmallAccum = true;
3079 }
3080 } else {
3081        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3082 Tmp = getZero64();
3083 HaveSmallAccum = true;
3084 }
3085
3086 do {
3087 unsigned j1 = DstIndex - j0;
3088 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3089 ++j0;
3090 continue;
3091 }
3092 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3093 {Src0[j0], Src1[j1], Tmp});
3094 Tmp = Mad.getReg(0);
3095 if (!HaveSmallAccum)
3096 CarryOut.push_back(Mad.getReg(1));
3097 HaveSmallAccum = false;
3098
3099 ++j0;
3100 } while (j0 <= DstIndex);
3101
3102 auto Unmerge = B.buildUnmerge(S32, Tmp);
3103 LocalAccum[0] = Unmerge.getReg(0);
3104 if (LocalAccum.size() > 1)
3105 LocalAccum[1] = Unmerge.getReg(1);
3106 }
3107
3108 return CarryOut;
3109 };
3110
3111 // Outer multiply loop, iterating over destination parts from least
3112 // significant to most significant parts.
3113 //
3114 // The columns of the following diagram correspond to the destination parts
3115 // affected by one iteration of the outer loop (ignoring boundary
3116 // conditions).
3117 //
3118 // Dest index relative to 2 * i: 1 0 -1
3119 // ------
3120 // Carries from previous iteration: e o
3121 // Even-aligned partial product sum: E E .
3122 // Odd-aligned partial product sum: O O
3123 //
3124 // 'o' is OddCarry, 'e' is EvenCarry.
3125 // EE and OO are computed from partial products via buildMadChain and use
3126 // accumulation where possible and appropriate.
3127 //
3128 Register SeparateOddCarry;
3129 Carry EvenCarry;
3130 Carry OddCarry;
3131
3132 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3133 Carry OddCarryIn = std::move(OddCarry);
3134 Carry EvenCarryIn = std::move(EvenCarry);
3135 OddCarry.clear();
3136 EvenCarry.clear();
3137
3138 // Partial products at offset 2 * i.
3139 if (2 * i < Accum.size()) {
3140 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3141 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3142 }
3143
3144 // Partial products at offset 2 * i - 1.
3145 if (i > 0) {
3146 if (!SeparateOddAlignedProducts) {
3147 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3148 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3149 } else {
3150 bool IsHighest = 2 * i >= Accum.size();
3151 Register SeparateOddOut[2];
3152 auto LocalAccum = MutableArrayRef(SeparateOddOut)
3153 .take_front(IsHighest ? 1 : 2);
3154 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3155
3156 MachineInstr *Lo;
3157
3158 if (i == 1) {
3159 if (!IsHighest)
3160 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3161 else
3162 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3163 } else {
3164 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3165 SeparateOddCarry);
3166 }
3167 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3168
3169 if (!IsHighest) {
3170 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3171 Lo->getOperand(1).getReg());
3172 Accum[2 * i] = Hi.getReg(0);
3173 SeparateOddCarry = Hi.getReg(1);
3174 }
3175 }
3176 }
3177
3178 // Add in the carries from the previous iteration
3179 if (i > 0) {
3180 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3181 EvenCarryIn.push_back(CarryOut);
3182
3183 if (2 * i < Accum.size()) {
3184 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3185 OddCarry.push_back(CarryOut);
3186 }
3187 }
3188 }
3189}
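
For reference, the schoolbook limb multiplication that the MAD chains above implement can be sketched in plain C++. This is a hypothetical helper that accumulates with 64-bit partial products instead of MAD_64_32 chains; accum uses 32-bit limbs, least significant first, and should be pre-zeroed for a pure multiply.

  #include <cstdint>
  #include <vector>

  // accum[k] += sum over i+j==k of src0[i] * src1[j], truncated to accum.size()
  // limbs, with carries propagated upward.
  void mulAccumulate(std::vector<uint32_t> &accum,
                     const std::vector<uint32_t> &src0,
                     const std::vector<uint32_t> &src1) {
    for (size_t i = 0; i < src0.size(); ++i) {
      uint64_t carry = 0;
      for (size_t j = 0; i + j < accum.size(); ++j) {
        uint64_t prod = (j < src1.size()) ? (uint64_t)src0[i] * src1[j] : 0;
        uint64_t sum = (uint64_t)accum[i + j] + (prod & 0xffffffff) + carry;
        accum[i + j] = (uint32_t)sum;
        carry = (sum >> 32) + (prod >> 32);
      }
    }
  }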
3190
3191// Custom narrowing of wide multiplies using wide multiply-add instructions.
3192//
3193// TODO: If the multiply is followed by an addition, we should attempt to
3194// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3195bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3196 MachineInstr &MI) const {
3197  assert(ST.hasMad64_32());
3198  assert(MI.getOpcode() == TargetOpcode::G_MUL);
3199
3200 MachineIRBuilder &B = Helper.MIRBuilder;
3201 MachineRegisterInfo &MRI = *B.getMRI();
3202
3203 Register DstReg = MI.getOperand(0).getReg();
3204 Register Src0 = MI.getOperand(1).getReg();
3205 Register Src1 = MI.getOperand(2).getReg();
3206
3207 LLT Ty = MRI.getType(DstReg);
3208  assert(Ty.isScalar());
3209
3210 unsigned Size = Ty.getSizeInBits();
3211 unsigned NumParts = Size / 32;
3212  assert((Size % 32) == 0);
3213  assert(NumParts >= 2);
3214
3215 // Whether to use MAD_64_32 for partial products whose high half is
3216 // discarded. This avoids some ADD instructions but risks false dependency
3217 // stalls on some subtargets in some cases.
3218 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3219
3220 // Whether to compute odd-aligned partial products separately. This is
3221 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3222 // in an even-aligned VGPR.
3223 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3224
3225 LLT S32 = LLT::scalar(32);
3226 SmallVector<Register, 2> Src0Parts, Src1Parts;
3227 for (unsigned i = 0; i < NumParts; ++i) {
3228 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3229 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3230 }
3231 B.buildUnmerge(Src0Parts, Src0);
3232 B.buildUnmerge(Src1Parts, Src1);
3233
3234 SmallVector<Register, 2> AccumRegs(NumParts);
3235 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3236 SeparateOddAlignedProducts);
3237
3238 B.buildMergeLikeInstr(DstReg, AccumRegs);
3239 MI.eraseFromParent();
3240 return true;
3241}
3242
3243// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3244// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3245// case with a single min instruction instead of a compare+select.
3246bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3247 MachineRegisterInfo &MRI,
3248 MachineIRBuilder &B) const {
3249 Register Dst = MI.getOperand(0).getReg();
3250 Register Src = MI.getOperand(1).getReg();
3251 LLT DstTy = MRI.getType(Dst);
3252 LLT SrcTy = MRI.getType(Src);
3253
3254 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3255 ? AMDGPU::G_AMDGPU_FFBH_U32
3256 : AMDGPU::G_AMDGPU_FFBL_B32;
3257 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3258 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3259
3260 MI.eraseFromParent();
3261 return true;
3262}
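
A scalar sketch of the zero-input fixup described above, assuming the hardware FFBH/FFBL convention of returning all-ones when no bit is set (so a single min clamps it to the bit width):

  #include <algorithm>
  #include <bit>
  #include <cstdint>

  uint32_t ctlzViaFfbh(uint32_t x) {
    // Stand-in for the hardware FFBH result: count from the MSB, or all-ones
    // when x == 0 (the assumed hardware convention).
    uint32_t ffbh = (x == 0) ? 0xffffffffu : (uint32_t)std::countl_zero(x);
    return std::min<uint32_t>(ffbh, 32u);  // the single min fixes up the zero case
  }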
3263
3264// Check that this is a G_XOR x, -1
3265static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3266 if (MI.getOpcode() != TargetOpcode::G_XOR)
3267 return false;
3268 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3269 return ConstVal && *ConstVal == -1;
3270}
3271
3272// Return the use branch instruction, otherwise null if the usage is invalid.
3273static MachineInstr *
3274verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3275 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3276 Register CondDef = MI.getOperand(0).getReg();
3277 if (!MRI.hasOneNonDBGUse(CondDef))
3278 return nullptr;
3279
3280 MachineBasicBlock *Parent = MI.getParent();
3281 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3282
3283 if (isNot(MRI, *UseMI)) {
3284 Register NegatedCond = UseMI->getOperand(0).getReg();
3285 if (!MRI.hasOneNonDBGUse(NegatedCond))
3286 return nullptr;
3287
3288 // We're deleting the def of this value, so we need to remove it.
3289 eraseInstr(*UseMI, MRI);
3290
3291 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3292 Negated = true;
3293 }
3294
3295 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3296 return nullptr;
3297
3298 // Make sure the cond br is followed by a G_BR, or is the last instruction.
3299 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3300 if (Next == Parent->end()) {
3301 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3302 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3303 return nullptr;
3304 UncondBrTarget = &*NextMBB;
3305 } else {
3306 if (Next->getOpcode() != AMDGPU::G_BR)
3307 return nullptr;
3308 Br = &*Next;
3309 UncondBrTarget = Br->getOperand(0).getMBB();
3310 }
3311
3312 return UseMI;
3313}
3314
3315bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3316 const ArgDescriptor *Arg,
3317 const TargetRegisterClass *ArgRC,
3318 LLT ArgTy) const {
3319 MCRegister SrcReg = Arg->getRegister();
3320  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
5
'?' condition is true
3321  assert(DstReg.isVirtual() && "Virtual register expected");
6
Assuming the condition is true
7
'?' condition is true
3322
3323 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3324 *ArgRC, B.getDebugLoc(), ArgTy);
3325 if (Arg->isMasked()) {
8
Taking true branch
3326 // TODO: Should we try to emit this once in the entry block?
3327 const LLT S32 = LLT::scalar(32);
3328 const unsigned Mask = Arg->getMask();
3329 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
9
Calling 'countr_zero<unsigned int>'
16
Returning from 'countr_zero<unsigned int>'
17
'Shift' initialized to 32
3330
3331 Register AndMaskSrc = LiveIn;
3332
3333 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3334 // 0.
3335    if (Shift != 0) {
17.1
'Shift' is not equal to 0
18
Taking true branch
3336 auto ShiftAmt = B.buildConstant(S32, Shift);
3337 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3338 }
3339
3340 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
19
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
3341 } else {
3342 B.buildCopy(DstReg, LiveIn);
3343 }
3344
3345 return true;
3346}
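
On the path the analyzer reports (step 19 above), the argument mask is assumed to be zero, so countr_zero returns 32 and the C++ expression Mask >> Shift on source line 3340 shifts an unsigned int by its full width, which is undefined behavior. A hypothetical standalone reduction of that pattern, with one possible guard -- this guard is an illustration, not the upstream fix:

  #include <bit>
  #include <cstdint>

  // Hypothetical reduction of the flagged pattern: when mask == 0,
  // std::countr_zero (like llvm::countr_zero) returns 32, and both shifts
  // below would be undefined for a 32-bit unsigned int.
  uint32_t extractMaskedField(uint32_t value, uint32_t mask) {
    unsigned shift = std::countr_zero(mask);  // 32 when mask == 0
    if (shift >= 32)                          // one possible guard (an assumption,
      return 0;                               // not the upstream fix)
    return (value >> shift) & (mask >> shift);
  }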
3347
3348bool AMDGPULegalizerInfo::loadInputValue(
3349 Register DstReg, MachineIRBuilder &B,
3350 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3351 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3352 const ArgDescriptor *Arg;
3353 const TargetRegisterClass *ArgRC;
3354 LLT ArgTy;
3355 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3356
3357 if (!Arg) {
2
Assuming 'Arg' is non-null
3358 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3359 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3360 // case the pointer argument may be missing and we use null.
3361 B.buildConstant(DstReg, 0);
3362 return true;
3363 }
3364
3365 // It's undefined behavior if a function marked with the amdgpu-no-*
3366 // attributes uses the corresponding intrinsic.
3367 B.buildUndef(DstReg);
3368 return true;
3369 }
3370
3371 if (!Arg->isRegister() || !Arg->getRegister().isValid())
3
Taking false branch
3372 return false; // TODO: Handle these
3373 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4
Calling 'AMDGPULegalizerInfo::loadInputValue'
3374}
3375
3376bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3377 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3378 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3379 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3380 return false;
3381
3382 MI.eraseFromParent();
3383 return true;
3384}
3385
3386static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3387 int64_t C) {
3388 B.buildConstant(MI.getOperand(0).getReg(), C);
3389 MI.eraseFromParent();
3390 return true;
3391}
3392
3393bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3394 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3395 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3396 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3397 if (MaxID == 0)
3398 return replaceWithConstant(B, MI, 0);
3399
3400 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3401 const ArgDescriptor *Arg;
3402 const TargetRegisterClass *ArgRC;
3403 LLT ArgTy;
3404 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3405
3406 Register DstReg = MI.getOperand(0).getReg();
3407 if (!Arg) {
3408 // It's undefined behavior if a function marked with the amdgpu-no-*
3409 // attributes uses the corresponding intrinsic.
3410 B.buildUndef(DstReg);
3411 MI.eraseFromParent();
3412 return true;
3413 }
3414
3415 if (Arg->isMasked()) {
3416 // Don't bother inserting AssertZext for packed IDs since we're emitting the
3417 // masking operations anyway.
3418 //
3419 // TODO: We could assert the top bit is 0 for the source copy.
3420 if (!loadInputValue(DstReg, B, ArgType))
3421 return false;
3422 } else {
3423 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3424 if (!loadInputValue(TmpReg, B, ArgType))
3425 return false;
3426 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
3427 }
3428
3429 MI.eraseFromParent();
3430 return true;
3431}
3432
3433Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3434 int64_t Offset) const {
3435 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3436 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3437
3438 // TODO: If we passed in the base kernel offset we could have a better
3439 // alignment than 4, but we don't really need it.
3440 if (!loadInputValue(KernArgReg, B,
1
Calling 'AMDGPULegalizerInfo::loadInputValue'
3441 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3442     llvm_unreachable("failed to find kernarg segment ptr");
3443
3444 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3445 // TODO: Should get nuw
3446 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3447}
3448
3449/// Legalize a value that's loaded from kernel arguments. This is only used by
3450/// legacy intrinsics.
3451bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3452 MachineIRBuilder &B,
3453 uint64_t Offset,
3454 Align Alignment) const {
3455 Register DstReg = MI.getOperand(0).getReg();
3456
3457   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3458          "unexpected kernarg parameter type");
3459
3460 Register Ptr = getKernargParameterPtr(B, Offset);
3461 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3462 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3463 MachineMemOperand::MODereferenceable |
3464 MachineMemOperand::MOInvariant);
3465 MI.eraseFromParent();
3466 return true;
3467}
3468
3469bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3470 MachineRegisterInfo &MRI,
3471 MachineIRBuilder &B) const {
3472 Register Dst = MI.getOperand(0).getReg();
3473 LLT DstTy = MRI.getType(Dst);
3474 LLT S16 = LLT::scalar(16);
3475 LLT S32 = LLT::scalar(32);
3476 LLT S64 = LLT::scalar(64);
3477
3478 if (DstTy == S16)
3479 return legalizeFDIV16(MI, MRI, B);
3480 if (DstTy == S32)
3481 return legalizeFDIV32(MI, MRI, B);
3482 if (DstTy == S64)
3483 return legalizeFDIV64(MI, MRI, B);
3484
3485 return false;
3486}
3487
3488void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3489 Register DstDivReg,
3490 Register DstRemReg,
3491 Register X,
3492 Register Y) const {
3493 const LLT S1 = LLT::scalar(1);
3494 const LLT S32 = LLT::scalar(32);
3495
3496 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3497 // algorithm used here.
3498
3499 // Initial estimate of inv(y).
3500 auto FloatY = B.buildUITOFP(S32, Y);
3501 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3502 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
3503 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3504 auto Z = B.buildFPTOUI(S32, ScaledY);
3505
3506 // One round of UNR.
3507 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3508 auto NegYZ = B.buildMul(S32, NegY, Z);
3509 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3510
3511 // Quotient/remainder estimate.
3512 auto Q = B.buildUMulH(S32, X, Z);
3513 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3514
3515 // First quotient/remainder refinement.
3516 auto One = B.buildConstant(S32, 1);
3517 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3518 if (DstDivReg)
3519 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3520 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3521
3522 // Second quotient/remainder refinement.
3523 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3524 if (DstDivReg)
3525 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3526
3527 if (DstRemReg)
3528 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3529}
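
A rough scalar sketch of the refinement structure above, not a verified drop-in: the hardware V_RCP_IFLAG_F32 estimate is approximated here with plain float division, followed by one Newton-Raphson round and the two quotient/remainder corrections. C++20 std::bit_cast is assumed for the scale constant.

  #include <bit>
  #include <cstdint>

  static uint32_t umulh32(uint32_t a, uint32_t b) {
    return (uint32_t)(((uint64_t)a * b) >> 32);
  }

  // y must be nonzero (division by zero is undefined here, as in the IR).
  void udivrem32(uint32_t x, uint32_t y, uint32_t &q, uint32_t &r) {
    float scale = std::bit_cast<float>(0x4f7ffffeu);     // just below 2^32
    uint32_t z = (uint32_t)(scale * (1.0f / (float)y));  // estimate of 2^32 / y
    z += umulh32(z, (uint32_t)(0u - y) * z);             // one round of Newton-Raphson
    q = umulh32(x, z);                                   // quotient estimate
    r = x - q * y;                                       // remainder estimate
    if (r >= y) { ++q; r -= y; }                         // first refinement
    if (r >= y) { ++q; r -= y; }                         // second refinement
  }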
3530
3531// Build integer reciprocal sequence around V_RCP_IFLAG_F32
3532//
3533// Return lo, hi of result
3534//
3535// %cvt.lo = G_UITOFP Val.lo
3536// %cvt.hi = G_UITOFP Val.hi
3537// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3538// %rcp = G_AMDGPU_RCP_IFLAG %mad
3539// %mul1 = G_FMUL %rcp, 0x5f7ffffc
3540// %mul2 = G_FMUL %mul1, 2**(-32)
3541// %trunc = G_INTRINSIC_TRUNC %mul2
3542// %mad2 = G_FMAD %trunc, -(2**32), %mul1
3543// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3544static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3545 Register Val) {
3546 const LLT S32 = LLT::scalar(32);
3547 auto Unmerge = B.buildUnmerge(S32, Val);
3548
3549 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3550 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3551
3552 auto Mad = B.buildFMAD(
3553 S32, CvtHi, // 2**32
3554 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
3555
3556 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3557 auto Mul1 = B.buildFMul(
3558 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
3559
3560 // 2**(-32)
3561 auto Mul2 = B.buildFMul(
3562 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
3563 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3564
3565 // -(2**32)
3566 auto Mad2 = B.buildFMAD(
3567 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
3568 Mul1);
3569
3570 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3571 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3572
3573 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3574}
3575
3576void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3577 Register DstDivReg,
3578 Register DstRemReg,
3579 Register Numer,
3580 Register Denom) const {
3581 const LLT S32 = LLT::scalar(32);
3582 const LLT S64 = LLT::scalar(64);
3583 const LLT S1 = LLT::scalar(1);
3584 Register RcpLo, RcpHi;
3585
3586 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3587
3588 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
3589
3590 auto Zero64 = B.buildConstant(S64, 0);
3591 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3592
3593 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3594 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3595
3596 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3597 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3598 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3599
3600 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3601 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3602 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
3603
3604 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3605 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3606 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3607 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3608 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3609
3610 auto Zero32 = B.buildConstant(S32, 0);
3611 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3612 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3613 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
3614
3615 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3616 Register NumerLo = UnmergeNumer.getReg(0);
3617 Register NumerHi = UnmergeNumer.getReg(1);
3618
3619 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3620 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3621 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3622 Register Mul3_Lo = UnmergeMul3.getReg(0);
3623 Register Mul3_Hi = UnmergeMul3.getReg(1);
3624 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3625 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3626 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3627 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
3628
3629 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3630 Register DenomLo = UnmergeDenom.getReg(0);
3631 Register DenomHi = UnmergeDenom.getReg(1);
3632
3633 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3634 auto C1 = B.buildSExt(S32, CmpHi);
3635
3636 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3637 auto C2 = B.buildSExt(S32, CmpLo);
3638
3639 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3640 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3641
3642 // TODO: Here and below portions of the code can be enclosed into if/endif.
3643 // Currently control flow is unconditional and we have 4 selects after
3644 // potential endif to substitute PHIs.
3645
3646 // if C3 != 0 ...
3647 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3648 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3649 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3650 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
3651
3652 auto One64 = B.buildConstant(S64, 1);
3653 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3654
3655 auto C4 =
3656 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3657 auto C5 =
3658 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3659 auto C6 = B.buildSelect(
3660 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3661
3662 // if (C6 != 0)
3663 auto Add4 = B.buildAdd(S64, Add3, One64);
3664 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3665
3666 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3667 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3668 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
3669
3670 // endif C6
3671 // endif C3
3672
3673 if (DstDivReg) {
3674 auto Sel1 = B.buildSelect(
3675 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3676 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3677 Sel1, MulHi3);
3678 }
3679
3680 if (DstRemReg) {
3681 auto Sel2 = B.buildSelect(
3682 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3683 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3684 Sel2, Sub1);
3685 }
3686}
3687
3688bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3689 MachineRegisterInfo &MRI,
3690 MachineIRBuilder &B) const {
3691 Register DstDivReg, DstRemReg;
3692 switch (MI.getOpcode()) {
3693 default:
3694    llvm_unreachable("Unexpected opcode!");
3695 case AMDGPU::G_UDIV: {
3696 DstDivReg = MI.getOperand(0).getReg();
3697 break;
3698 }
3699 case AMDGPU::G_UREM: {
3700 DstRemReg = MI.getOperand(0).getReg();
3701 break;
3702 }
3703 case AMDGPU::G_UDIVREM: {
3704 DstDivReg = MI.getOperand(0).getReg();
3705 DstRemReg = MI.getOperand(1).getReg();
3706 break;
3707 }
3708 }
3709
3710 const LLT S64 = LLT::scalar(64);
3711 const LLT S32 = LLT::scalar(32);
3712 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3713 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3714 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3715 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3716
3717 if (Ty == S32)
3718 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3719 else if (Ty == S64)
3720 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3721 else
3722 return false;
3723
3724 MI.eraseFromParent();
3725 return true;
3726}
3727
3728bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3729 MachineRegisterInfo &MRI,
3730 MachineIRBuilder &B) const {
3731 const LLT S64 = LLT::scalar(64);
3732 const LLT S32 = LLT::scalar(32);
3733
3734 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3735 if (Ty != S32 && Ty != S64)
3736 return false;
3737
3738 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3739 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3740 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3741
3742 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3743 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3744 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3745
3746 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3747 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3748
3749 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3750 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3751
3752 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3753 switch (MI.getOpcode()) {
3754 default:
3755    llvm_unreachable("Unexpected opcode!");
3756 case AMDGPU::G_SDIV: {
3757 DstDivReg = MI.getOperand(0).getReg();
3758 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3759 break;
3760 }
3761 case AMDGPU::G_SREM: {
3762 DstRemReg = MI.getOperand(0).getReg();
3763 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3764 break;
3765 }
3766 case AMDGPU::G_SDIVREM: {
3767 DstDivReg = MI.getOperand(0).getReg();
3768 DstRemReg = MI.getOperand(1).getReg();
3769 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3770 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3771 break;
3772 }
3773 }
3774
3775 if (Ty == S32)
3776 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3777 else
3778 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3779
3780 if (DstDivReg) {
3781 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3782 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3783 B.buildSub(DstDivReg, SignXor, Sign);
3784 }
3785
3786 if (DstRemReg) {
3787 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3788 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3789 B.buildSub(DstRemReg, SignXor, Sign);
3790 }
3791
3792 MI.eraseFromParent();
3793 return true;
3794}
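
The sign handling above can be sketched as scalar C++, quotient case only. Arithmetic right shift of negative values is assumed, and plain unsigned division stands in for the unsigned expansion.

  #include <cstdint>

  // Branchless sign handling: |v| = (v + s) ^ s, and the result sign is
  // restored with (t ^ s) - s. rhs must be nonzero.
  int32_t sdiv32ViaUdiv(int32_t lhs, int32_t rhs) {
    uint32_t ls = (uint32_t)(lhs >> 31);        // all-ones if lhs < 0
    uint32_t rs = (uint32_t)(rhs >> 31);
    uint32_t ulhs = ((uint32_t)lhs + ls) ^ ls;  // |lhs|, with wrap
    uint32_t urhs = ((uint32_t)rhs + rs) ^ rs;  // |rhs|
    uint32_t uq = ulhs / urhs;                  // stands in for the expansion above
    uint32_t sign = ls ^ rs;                    // sign of the quotient
    return (int32_t)((uq ^ sign) - sign);
  }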
3795
3796bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3797 MachineRegisterInfo &MRI,
3798 MachineIRBuilder &B) const {
3799 Register Res = MI.getOperand(0).getReg();
3800 Register LHS = MI.getOperand(1).getReg();
3801 Register RHS = MI.getOperand(2).getReg();
3802 uint16_t Flags = MI.getFlags();
3803 LLT ResTy = MRI.getType(Res);
3804
3805 const MachineFunction &MF = B.getMF();
3806 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3807 MI.getFlag(MachineInstr::FmAfn);
3808
3809 if (!AllowInaccurateRcp)
3810 return false;
3811
3812 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3813 // 1 / x -> RCP(x)
3814 if (CLHS->isExactlyValue(1.0)) {
3815 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3816 .addUse(RHS)
3817 .setMIFlags(Flags);
3818
3819 MI.eraseFromParent();
3820 return true;
3821 }
3822
3823 // -1 / x -> RCP( FNEG(x) )
3824 if (CLHS->isExactlyValue(-1.0)) {
3825 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3826 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3827 .addUse(FNeg.getReg(0))
3828 .setMIFlags(Flags);
3829
3830 MI.eraseFromParent();
3831 return true;
3832 }
3833 }
3834
3835 // x / y -> x * (1.0 / y)
3836 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3837 .addUse(RHS)
3838 .setMIFlags(Flags);
3839 B.buildFMul(Res, LHS, RCP, Flags);
3840
3841 MI.eraseFromParent();
3842 return true;
3843}
3844
3845bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3846 MachineRegisterInfo &MRI,
3847 MachineIRBuilder &B) const {
3848 Register Res = MI.getOperand(0).getReg();
3849 Register X = MI.getOperand(1).getReg();
3850 Register Y = MI.getOperand(2).getReg();
3851 uint16_t Flags = MI.getFlags();
3852 LLT ResTy = MRI.getType(Res);
3853
3854 const MachineFunction &MF = B.getMF();
3855 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3856 MI.getFlag(MachineInstr::FmAfn);
3857
3858 if (!AllowInaccurateRcp)
3859 return false;
3860
3861 auto NegY = B.buildFNeg(ResTy, Y);
3862 auto One = B.buildFConstant(ResTy, 1.0);
3863
3864 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3865 .addUse(Y)
3866 .setMIFlags(Flags);
3867
3868 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3869 R = B.buildFMA(ResTy, Tmp0, R, R);
3870
3871 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3872 R = B.buildFMA(ResTy, Tmp1, R, R);
3873
3874 auto Ret = B.buildFMul(ResTy, X, R);
3875 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3876
3877 B.buildFMA(Res, Tmp2, R, Ret);
3878 MI.eraseFromParent();
3879 return true;
3880}
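
A scalar sketch of the FMA-based refinement sequence above, with 1.0/y standing in for the hardware rcp estimate:

  #include <cmath>

  double fastFDiv64(double x, double y) {
    double r = 1.0 / y;                         // initial reciprocal estimate
    r = std::fma(std::fma(-y, r, 1.0), r, r);   // first Newton-Raphson step
    r = std::fma(std::fma(-y, r, 1.0), r, r);   // second step
    double q = x * r;                           // quotient estimate
    return std::fma(std::fma(-y, q, x), r, q);  // residual correction
  }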
3881
3882bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3883 MachineRegisterInfo &MRI,
3884 MachineIRBuilder &B) const {
3885 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3886 return true;
3887
3888 Register Res = MI.getOperand(0).getReg();
3889 Register LHS = MI.getOperand(1).getReg();
3890 Register RHS = MI.getOperand(2).getReg();
3891
3892 uint16_t Flags = MI.getFlags();
3893
3894 LLT S16 = LLT::scalar(16);
3895 LLT S32 = LLT::scalar(32);
3896
3897 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3898 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3899
3900 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3901 .addUse(RHSExt.getReg(0))
3902 .setMIFlags(Flags);
3903
3904 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3905 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3906
3907 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3908 .addUse(RDst.getReg(0))
3909 .addUse(RHS)
3910 .addUse(LHS)
3911 .setMIFlags(Flags);
3912
3913 MI.eraseFromParent();
3914 return true;
3915}
3916
3917// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3918// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3919static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
3920 const GCNSubtarget &ST,
3921 SIModeRegisterDefaults Mode) {
3922 // Set SP denorm mode to this value.
3923 unsigned SPDenormMode =
3924      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3925
3926 if (ST.hasDenormModeInst()) {
3927 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3928 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3929
3930 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3931 B.buildInstr(AMDGPU::S_DENORM_MODE)
3932 .addImm(NewDenormModeValue);
3933
3934 } else {
3935 // Select FP32 bit field in mode register.
3936 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3937 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3938 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3939
3940 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3941 .addImm(SPDenormMode)
3942 .addImm(SPDenormModeBitField);
3943 }
3944}
3945
3946bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3947 MachineRegisterInfo &MRI,
3948 MachineIRBuilder &B) const {
3949 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3950 return true;
3951
3952 Register Res = MI.getOperand(0).getReg();
3953 Register LHS = MI.getOperand(1).getReg();
3954 Register RHS = MI.getOperand(2).getReg();
3955 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3956 SIModeRegisterDefaults Mode = MFI->getMode();
3957
3958 uint16_t Flags = MI.getFlags();
3959
3960 LLT S32 = LLT::scalar(32);
3961 LLT S1 = LLT::scalar(1);
3962
3963 auto One = B.buildFConstant(S32, 1.0f);
3964
3965 auto DenominatorScaled =
3966 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3967 .addUse(LHS)
3968 .addUse(RHS)
3969 .addImm(0)
3970 .setMIFlags(Flags);
3971 auto NumeratorScaled =
3972 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3973 .addUse(LHS)
3974 .addUse(RHS)
3975 .addImm(1)
3976 .setMIFlags(Flags);
3977
3978 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3979 .addUse(DenominatorScaled.getReg(0))
3980 .setMIFlags(Flags);
3981 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3982
3983 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3984 // aren't modeled as reading it.
3985 if (!Mode.allFP32Denormals())
3986 toggleSPDenormMode(true, B, ST, Mode);
3987
3988 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3989 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3990 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3991 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3992 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3993 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3994
3995 if (!Mode.allFP32Denormals())
3996 toggleSPDenormMode(false, B, ST, Mode);
3997
3998 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3999 .addUse(Fma4.getReg(0))
4000 .addUse(Fma1.getReg(0))
4001 .addUse(Fma3.getReg(0))
4002 .addUse(NumeratorScaled.getReg(1))
4003 .setMIFlags(Flags);
4004
4005 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
4006 .addUse(Fmas.getReg(0))
4007 .addUse(RHS)
4008 .addUse(LHS)
4009 .setMIFlags(Flags);
4010
4011 MI.eraseFromParent();
4012 return true;
4013}
4014
4015bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4016 MachineRegisterInfo &MRI,
4017 MachineIRBuilder &B) const {
4018 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4019 return true;
4020
4021 Register Res = MI.getOperand(0).getReg();
4022 Register LHS = MI.getOperand(1).getReg();
4023 Register RHS = MI.getOperand(2).getReg();
4024
4025 uint16_t Flags = MI.getFlags();
4026
4027 LLT S64 = LLT::scalar(64);
4028 LLT S1 = LLT::scalar(1);
4029
4030 auto One = B.buildFConstant(S64, 1.0);
4031
4032 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4033 .addUse(LHS)
4034 .addUse(RHS)
4035 .addImm(0)
4036 .setMIFlags(Flags);
4037
4038 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4039
4040 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
4041 .addUse(DivScale0.getReg(0))
4042 .setMIFlags(Flags);
4043
4044 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4045 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4046 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4047
4048 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4049 .addUse(LHS)
4050 .addUse(RHS)
4051 .addImm(1)
4052 .setMIFlags(Flags);
4053
4054 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4055 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4056 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4057
4058 Register Scale;
4059 if (!ST.hasUsableDivScaleConditionOutput()) {
4060 // Workaround a hardware bug on SI where the condition output from div_scale
4061 // is not usable.
4062
4063 LLT S32 = LLT::scalar(32);
4064
4065 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4066 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4067 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4068 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4069
4070 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4071 Scale1Unmerge.getReg(1));
4072 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4073 Scale0Unmerge.getReg(1));
4074 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4075 } else {
4076 Scale = DivScale1.getReg(1);
4077 }
4078
4079 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4080 .addUse(Fma4.getReg(0))
4081 .addUse(Fma3.getReg(0))
4082 .addUse(Mul.getReg(0))
4083 .addUse(Scale)
4084 .setMIFlags(Flags);
4085
4086 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4087 .addUse(Fmas.getReg(0))
4088 .addUse(RHS)
4089 .addUse(LHS)
4090 .setMIFlags(Flags);
4091
4092 MI.eraseFromParent();
4093 return true;
4094}
4095
4096bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4097 MachineRegisterInfo &MRI,
4098 MachineIRBuilder &B) const {
4099 Register Res = MI.getOperand(0).getReg();
4100 Register LHS = MI.getOperand(2).getReg();
4101 Register RHS = MI.getOperand(3).getReg();
4102 uint16_t Flags = MI.getFlags();
4103
4104 LLT S32 = LLT::scalar(32);
4105 LLT S1 = LLT::scalar(1);
4106
4107 auto Abs = B.buildFAbs(S32, RHS, Flags);
4108 const APFloat C0Val(1.0f);
4109
4110 auto C0 = B.buildConstant(S32, 0x6f800000);
4111 auto C1 = B.buildConstant(S32, 0x2f800000);
4112 auto C2 = B.buildConstant(S32, llvm::bit_cast<uint32_t>(1.0f));
4113
4114 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4115 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4116
4117 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4118
4119 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4120 .addUse(Mul0.getReg(0))
4121 .setMIFlags(Flags);
4122
4123 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4124
4125 B.buildFMul(Res, Sel, Mul1, Flags);
4126
4127 MI.eraseFromParent();
4128 return true;
4129}
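
A scalar sketch of the range-scaling trick above: if |rhs| is very large, pre-scale it by 2^-32 so the reciprocal stays in range, then undo the scale on the result. The constants are carried over as bit patterns, 1.0f/x stands in for the hardware rcp, and C++20 std::bit_cast is assumed.

  #include <bit>
  #include <cmath>
  #include <cstdint>

  float fdivFast(float lhs, float rhs) {
    const float Big   = std::bit_cast<float>(0x6f800000u);  // 2^96
    const float Small = std::bit_cast<float>(0x2f800000u);  // 2^-32
    float sel = (std::fabs(rhs) > Big) ? Small : 1.0f;
    float rcp = 1.0f / (rhs * sel);   // reciprocal of the scaled denominator
    return sel * (lhs * rcp);         // sel cancels: sel * lhs / (rhs * sel) = lhs / rhs
  }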
4130
4131// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4132// FIXME: Why do we handle this one but not other removed instructions?
4133//
4134// Reciprocal square root. The clamp prevents infinite results, clamping
4135// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4136// +-max_float.
4137bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4138 MachineRegisterInfo &MRI,
4139 MachineIRBuilder &B) const {
4140 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4141 return true;
4142
4143 Register Dst = MI.getOperand(0).getReg();
4144 Register Src = MI.getOperand(2).getReg();
4145 auto Flags = MI.getFlags();
4146
4147 LLT Ty = MRI.getType(Dst);
4148
4149 const fltSemantics *FltSemantics;
4150 if (Ty == LLT::scalar(32))
4151 FltSemantics = &APFloat::IEEEsingle();
4152 else if (Ty == LLT::scalar(64))
4153 FltSemantics = &APFloat::IEEEdouble();
4154 else
4155 return false;
4156
4157 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4158 .addUse(Src)
4159 .setMIFlags(Flags);
4160
4161   // We don't need to concern ourselves with the snan handling difference,
4162   // since the rsq has already quieted (or not); use the form that will directly select.
4163 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4164 const bool UseIEEE = MFI->getMode().IEEE;
4165
4166 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4167 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4168 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4169
4170 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4171
4172 if (UseIEEE)
4173 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4174 else
4175 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4176 MI.eraseFromParent();
4177 return true;
4178}
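
A scalar sketch of the clamp above, f32 case only, with 1/sqrt standing in for the hardware rsq:

  #include <cfloat>
  #include <cmath>

  // Clamp the rsq result to [-max_float, +max_float] so an infinite result
  // (e.g. from a zero input) becomes finite.
  float rsqClamp(float x) {
    float rsq = 1.0f / std::sqrt(x);  // stand-in for the hardware rsq
    return std::fmax(std::fmin(rsq, FLT_MAX), -FLT_MAX);
  }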
4179
4180static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4181 switch (IID) {
4182 case Intrinsic::amdgcn_ds_fadd:
4183 return AMDGPU::G_ATOMICRMW_FADD;
4184 case Intrinsic::amdgcn_ds_fmin:
4185 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4186 case Intrinsic::amdgcn_ds_fmax:
4187 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4188 default:
4189    llvm_unreachable("not a DS FP intrinsic");
4190 }
4191}
4192
4193bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4194 MachineInstr &MI,
4195 Intrinsic::ID IID) const {
4196 GISelChangeObserver &Observer = Helper.Observer;
4197 Observer.changingInstr(MI);
4198
4199 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4200
4201 // The remaining operands were used to set fields in the MemOperand on
4202 // construction.
4203 for (int I = 6; I > 3; --I)
4204 MI.removeOperand(I);
4205
4206 MI.removeOperand(1); // Remove the intrinsic ID.
4207 Observer.changedInstr(MI);
4208 return true;
4209}
4210
4211bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4212 MachineRegisterInfo &MRI,
4213 MachineIRBuilder &B) const {
4214 uint64_t Offset =
4215 ST.getTargetLowering()->getImplicitParameterOffset(
4216 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4217 LLT DstTy = MRI.getType(DstReg);
4218 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4219
4220 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4221 if (!loadInputValue(KernargPtrReg, B,
4222 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4223 return false;
4224
4225 // FIXME: This should be nuw
4226 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4227 return true;
4228}
4229
4230bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4231 MachineRegisterInfo &MRI,
4232 MachineIRBuilder &B) const {
4233 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4234 if (!MFI->isEntryFunction()) {
4235 return legalizePreloadedArgIntrin(MI, MRI, B,
4236 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4237 }
4238
4239 Register DstReg = MI.getOperand(0).getReg();
4240 if (!getImplicitArgPtr(DstReg, MRI, B))
4241 return false;
4242
4243 MI.eraseFromParent();
4244 return true;
4245}
4246
4247bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4248 MachineRegisterInfo &MRI,
4249 MachineIRBuilder &B) const {
4250 Function &F = B.getMF().getFunction();
4251 std::optional<uint32_t> KnownSize =
4252 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4253 if (KnownSize.has_value())
4254 B.buildConstant(DstReg, *KnownSize);
4255 return false;
4256}
4257
4258bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4259 MachineRegisterInfo &MRI,
4260 MachineIRBuilder &B) const {
4261
4262 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4263 if (!MFI->isEntryFunction()) {
4264 return legalizePreloadedArgIntrin(MI, MRI, B,
4265 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4266 }
4267
4268 Register DstReg = MI.getOperand(0).getReg();
4269 if (!getLDSKernelId(DstReg, MRI, B))
4270 return false;
4271
4272 MI.eraseFromParent();
4273 return true;
4274}
4275
4276bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4277 MachineRegisterInfo &MRI,
4278 MachineIRBuilder &B,
4279 unsigned AddrSpace) const {
4280 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4281 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4282 Register Hi32 = Unmerge.getReg(1);
4283
4284 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4285 MI.eraseFromParent();
4286 return true;
4287}
4288
4289// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4290// offset (the offset that is included in bounds checking and swizzling, to be
4291// split between the instruction's voffset and immoffset fields) and soffset
4292// (the offset that is excluded from bounds checking and swizzling, to go in
4293// the instruction's soffset field). This function takes the first kind of
4294// offset and figures out how to split it between voffset and immoffset.
4295std::pair<Register, unsigned>
4296AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4297 Register OrigOffset) const {
4298 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
4299 Register BaseReg;
4300 unsigned ImmOffset;
4301 const LLT S32 = LLT::scalar(32);
4302 MachineRegisterInfo &MRI = *B.getMRI();
4303
4304 std::tie(BaseReg, ImmOffset) =
4305 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4306
4307 // If BaseReg is a pointer, convert it to int.
4308 if (MRI.getType(BaseReg).isPointer())
4309 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4310
4311 // If the immediate value is too big for the immoffset field, put only bits
4312 // that would normally fit in the immoffset field. The remaining value that
4313 // is copied/added for the voffset field is a large power of 2, and it
4314 // stands more chance of being CSEd with the copy/add for another similar
4315 // load/store.
4316 // However, do not do that rounding down if that is a negative
4317 // number, as it appears to be illegal to have a negative offset in the
4318 // vgpr, even if adding the immediate offset makes it positive.
4319 unsigned Overflow = ImmOffset & ~MaxImm;
4320 ImmOffset -= Overflow;
4321 if ((int32_t)Overflow < 0) {
4322 Overflow += ImmOffset;
4323 ImmOffset = 0;
4324 }
4325
4326 if (Overflow != 0) {
4327 if (!BaseReg) {
4328 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4329 } else {
4330 auto OverflowVal = B.buildConstant(S32, Overflow);
4331 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4332 }
4333 }
4334
4335 if (!BaseReg)
4336 BaseReg = B.buildConstant(S32, 0).getReg(0);
4337
4338 return std::pair(BaseReg, ImmOffset);
4339}
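
The voffset/immoffset split above can be sketched as plain integer arithmetic. maxImm is assumed to be a power-of-two-minus-one mask (e.g. 4095), and the returned pair is {voffset contribution, immoffset}.

  #include <cstdint>
  #include <utility>

  std::pair<uint32_t, uint32_t> splitOffset(uint32_t offset, uint32_t maxImm) {
    uint32_t imm = offset & maxImm;
    uint32_t overflow = offset & ~maxImm;  // large power-of-two part for the voffset
    if ((int32_t)overflow < 0) {           // don't leave a negative voffset
      overflow += imm;
      imm = 0;
    }
    return {overflow, imm};
  }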
4340
4341/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4342void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4343 Register VOffset, Register SOffset,
4344 unsigned ImmOffset, Register VIndex,
4345 MachineRegisterInfo &MRI) const {
4346 std::optional<ValueAndVReg> MaybeVOffsetVal =
4347 getIConstantVRegValWithLookThrough(VOffset, MRI);
4348 std::optional<ValueAndVReg> MaybeSOffsetVal =
4349 getIConstantVRegValWithLookThrough(SOffset, MRI);
4350 std::optional<ValueAndVReg> MaybeVIndexVal =
4351 getIConstantVRegValWithLookThrough(VIndex, MRI);
4352 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4353 // update the MMO with that offset. The stride is unknown so we can only do
4354 // this if VIndex is constant 0.
4355 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4356 MaybeVIndexVal->Value == 0) {
4357 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4358 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4359 MMO->setOffset(TotalOffset);
4360 } else {
4361 // We don't have a constant combined offset to use in the MMO. Give up.
4362 MMO->setValue((Value *)nullptr);
4363 }
4364}
4365
4366/// Handle register layout difference for f16 images for some subtargets.
4367Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4368 MachineRegisterInfo &MRI,
4369 Register Reg,
4370 bool ImageStore) const {
4371 const LLT S16 = LLT::scalar(16);
4372 const LLT S32 = LLT::scalar(32);
4373 LLT StoreVT = MRI.getType(Reg);
4374  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4375
4376 if (ST.hasUnpackedD16VMem()) {
4377 auto Unmerge = B.buildUnmerge(S16, Reg);
4378
4379 SmallVector<Register, 4> WideRegs;
4380 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4381 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4382
4383 int NumElts = StoreVT.getNumElements();
4384
4385 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4386 .getReg(0);
4387 }
4388
4389 if (ImageStore && ST.hasImageStoreD16Bug()) {
4390 if (StoreVT.getNumElements() == 2) {
4391 SmallVector<Register, 4> PackedRegs;
4392 Reg = B.buildBitcast(S32, Reg).getReg(0);
4393 PackedRegs.push_back(Reg);
4394 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4395 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4396 .getReg(0);
4397 }
4398
4399 if (StoreVT.getNumElements() == 3) {
4400 SmallVector<Register, 4> PackedRegs;
4401 auto Unmerge = B.buildUnmerge(S16, Reg);
4402 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4403 PackedRegs.push_back(Unmerge.getReg(I));
4404 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4405 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4406 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4407 }
4408
4409 if (StoreVT.getNumElements() == 4) {
4410 SmallVector<Register, 4> PackedRegs;
4411 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4412 auto Unmerge = B.buildUnmerge(S32, Reg);
4413 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4414 PackedRegs.push_back(Unmerge.getReg(I));
4415 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4416 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4417 .getReg(0);
4418 }
4419
4420    llvm_unreachable("invalid data type");
4421 }
4422
4423 if (StoreVT == LLT::fixed_vector(3, S16)) {
4424 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4425 .getReg(0);
4426 }
4427 return Reg;
4428}
4429
4430Register AMDGPULegalizerInfo::fixStoreSourceType(
4431 MachineIRBuilder &B, Register VData, bool IsFormat) const {
4432 MachineRegisterInfo *MRI = B.getMRI();
4433 LLT Ty = MRI->getType(VData);
4434
4435 const LLT S16 = LLT::scalar(16);
4436
4437 // Fixup illegal register types for i8 stores.
4438 if (Ty == LLT::scalar(8) || Ty == S16) {
4439 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4440 return AnyExt;
4441 }
4442
4443 if (Ty.isVector()) {
4444 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4445 if (IsFormat)
4446 return handleD16VData(B, *MRI, VData);
4447 }
4448 }
4449
4450 return VData;
4451}
4452
4453bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4454 MachineRegisterInfo &MRI,
4455 MachineIRBuilder &B,
4456 bool IsTyped,
4457 bool IsFormat) const {
4458 Register VData = MI.getOperand(1).getReg();
4459 LLT Ty = MRI.getType(VData);
4460 LLT EltTy = Ty.getScalarType();
4461 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4462 const LLT S32 = LLT::scalar(32);
4463
4464 VData = fixStoreSourceType(B, VData, IsFormat);
4465 Register RSrc = MI.getOperand(2).getReg();
4466
4467 MachineMemOperand *MMO = *MI.memoperands_begin();
4468 const int MemSize = MMO->getSize();
4469
4470 unsigned ImmOffset;
4471
4472 // The typed intrinsics add an immediate after the registers.
4473 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4474
4475 // The struct intrinsic variants add one additional operand over raw.
4476 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4477 Register VIndex;
4478 int OpOffset = 0;
4479 if (HasVIndex) {
4480 VIndex = MI.getOperand(3).getReg();
4481 OpOffset = 1;
4482 } else {
4483 VIndex = B.buildConstant(S32, 0).getReg(0);
4484 }
4485
4486 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4487 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4488
4489 unsigned Format = 0;
4490 if (IsTyped) {
4491 Format = MI.getOperand(5 + OpOffset).getImm();
4492 ++OpOffset;
4493 }
4494
4495 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4496
4497 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4498 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4499
4500 unsigned Opc;
4501 if (IsTyped) {
4502 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4503 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4504 } else if (IsFormat) {
4505 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4506 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4507 } else {
4508 switch (MemSize) {
4509 case 1:
4510 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4511 break;
4512 case 2:
4513 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4514 break;
4515 default:
4516 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4517 break;
4518 }
4519 }
4520
4521 auto MIB = B.buildInstr(Opc)
4522 .addUse(VData) // vdata
4523 .addUse(RSrc) // rsrc
4524 .addUse(VIndex) // vindex
4525 .addUse(VOffset) // voffset
4526 .addUse(SOffset) // soffset
4527 .addImm(ImmOffset); // offset(imm)
4528
4529 if (IsTyped)
4530 MIB.addImm(Format);
4531
4532 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4533 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4534 .addMemOperand(MMO);
4535
4536 MI.eraseFromParent();
4537 return true;
4538}
4539
4540static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
4541 Register VIndex, Register VOffset, Register SOffset,
4542 unsigned ImmOffset, unsigned Format,
4543 unsigned AuxiliaryData, MachineMemOperand *MMO,
4544 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
4545 auto MIB = B.buildInstr(Opc)
4546 .addDef(LoadDstReg) // vdata
4547 .addUse(RSrc) // rsrc
4548 .addUse(VIndex) // vindex
4549 .addUse(VOffset) // voffset
4550 .addUse(SOffset) // soffset
4551 .addImm(ImmOffset); // offset(imm)
4552
4553 if (IsTyped)
4554 MIB.addImm(Format);
4555
4556 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4557 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4558 .addMemOperand(MMO);
4559}
4560
4561bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4562 MachineRegisterInfo &MRI,
4563 MachineIRBuilder &B,
4564 bool IsFormat,
4565 bool IsTyped) const {
4566 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4567 MachineMemOperand *MMO = *MI.memoperands_begin();
4568 const LLT MemTy = MMO->getMemoryType();
4569 const LLT S32 = LLT::scalar(32);
4570
4571 Register Dst = MI.getOperand(0).getReg();
4572
4573 Register StatusDst;
4574 int OpOffset = 0;
4575 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
4576 bool IsTFE = MI.getNumExplicitDefs() == 2;
4577 if (IsTFE) {
4578 StatusDst = MI.getOperand(1).getReg();
4579 ++OpOffset;
4580 }
4581
4582 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
4583
4584 // The typed intrinsics add an immediate after the registers.
4585 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4586
4587 // The struct intrinsic variants add one additional operand over raw.
4588 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
4589 Register VIndex;
4590 if (HasVIndex) {
4591 VIndex = MI.getOperand(3 + OpOffset).getReg();
4592 ++OpOffset;
4593 } else {
4594 VIndex = B.buildConstant(S32, 0).getReg(0);
4595 }
4596
4597 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4598 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4599
4600 unsigned Format = 0;
4601 if (IsTyped) {
4602 Format = MI.getOperand(5 + OpOffset).getImm();
4603 ++OpOffset;
4604 }
4605
4606 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4607 unsigned ImmOffset;
4608
4609 LLT Ty = MRI.getType(Dst);
4610 LLT EltTy = Ty.getScalarType();
4611 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4612 const bool Unpacked = ST.hasUnpackedD16VMem();
4613
4614 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4615 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4616
4617 unsigned Opc;
4618
4619 // TODO: Support TFE for typed and narrow loads.
4620 if (IsTyped) {
4621 if (IsTFE)
4622 return false;
4623 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4624 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4625 } else if (IsFormat) {
4626 if (IsD16) {
4627 if (IsTFE)
4628 return false;
4629 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
4630 } else {
4631 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4632 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4633 }
4634 } else {
4635 if (IsTFE)
4636 return false;
4637 switch (MemTy.getSizeInBits()) {
4638 case 8:
4639 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4640 break;
4641 case 16:
4642 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4643 break;
4644 default:
4645 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4646 break;
4647 }
4648 }
4649
4650 if (IsTFE) {
4651 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
4652 unsigned NumLoadDWords = NumValueDWords + 1;
4653 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
4654 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
4655 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4656 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4657 if (NumValueDWords == 1) {
4658 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
4659 } else {
4660 SmallVector<Register, 5> LoadElts;
4661 for (unsigned I = 0; I != NumValueDWords; ++I)
4662 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
4663 LoadElts.push_back(StatusDst);
4664 B.buildUnmerge(LoadElts, LoadDstReg);
4665 LoadElts.truncate(NumValueDWords);
4666 B.buildMergeLikeInstr(Dst, LoadElts);
4667 }
4668 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
4669 (IsD16 && !Ty.isVector())) {
4670 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4671 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4672 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4673 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4674 B.buildTrunc(Dst, LoadDstReg);
4675 } else if (Unpacked && IsD16 && Ty.isVector()) {
4676 LLT UnpackedTy = Ty.changeElementSize(32);
4677 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4678 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4679 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4680 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4681 // FIXME: G_TRUNC should work, but legalization currently fails
4682 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4683 SmallVector<Register, 4> Repack;
4684 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4685 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4686 B.buildMergeLikeInstr(Dst, Repack);
4687 } else {
4688 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
4689 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4690 }
4691
4692 MI.eraseFromParent();
4693 return true;
4694}
4695
4696bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4697 MachineIRBuilder &B,
4698 bool IsInc) const {
4699 unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
4700 AMDGPU::G_ATOMICRMW_UDEC_WRAP;
4701 B.buildInstr(Opc)
4702 .addDef(MI.getOperand(0).getReg())
4703 .addUse(MI.getOperand(2).getReg())
4704 .addUse(MI.getOperand(3).getReg())
4705 .cloneMemRefs(MI);
4706 MI.eraseFromParent();
4707 return true;
4708}
4709
4710static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4711 switch (IntrID) {
4712 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4713 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4714 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4715 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4716 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4717 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4718 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4719 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4720 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4721 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4722 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4723 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4724 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4725 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4726 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4727 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4728 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4729 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4730 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4731 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4732 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4733 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4734 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4735 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4736 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4737 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4738 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4739 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4740 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4741 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4742 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4743 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4744 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4745 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4746 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4747 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4748 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4749 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4750 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4751 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4752 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4753 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4754 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4755 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4756 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4757 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4758 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4759 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4760 default:
4761 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 4761)
;
4762 }
4763}
4764
4765bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4766 MachineIRBuilder &B,
4767 Intrinsic::ID IID) const {
4768 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4769 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4770 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4771
4772 Register Dst;
4773
4774 int OpOffset = 0;
4775 if (HasReturn) {
4776 // A few FP atomics do not support return values.
4777 Dst = MI.getOperand(0).getReg();
4778 } else {
4779 OpOffset = -1;
4780 }
4781
4782 Register VData = MI.getOperand(2 + OpOffset).getReg();
4783 Register CmpVal;
4784
4785 if (IsCmpSwap) {
4786 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4787 ++OpOffset;
4788 }
4789
4790 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4791 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4792
4793 // The struct intrinsic variants add one additional operand over raw.
4794 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4795 Register VIndex;
4796 if (HasVIndex) {
4797 VIndex = MI.getOperand(4 + OpOffset).getReg();
4798 ++OpOffset;
4799 } else {
4800 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4801 }
4802
4803 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4804 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4805 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4806
4807 MachineMemOperand *MMO = *MI.memoperands_begin();
4808
4809 unsigned ImmOffset;
4810 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4811 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4812
4813 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4814
4815 if (HasReturn)
4816 MIB.addDef(Dst);
4817
4818 MIB.addUse(VData); // vdata
4819
4820 if (IsCmpSwap)
4821 MIB.addReg(CmpVal);
4822
4823 MIB.addUse(RSrc) // rsrc
4824 .addUse(VIndex) // vindex
4825 .addUse(VOffset) // voffset
4826 .addUse(SOffset) // soffset
4827 .addImm(ImmOffset) // offset(imm)
4828 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4829 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4830 .addMemOperand(MMO);
4831
4832 MI.eraseFromParent();
4833 return true;
4834}
4835
4836/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4837/// vector with s16 typed elements.
4838static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4839 SmallVectorImpl<Register> &PackedAddrs,
4840 unsigned ArgOffset,
4841 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4842 bool IsA16, bool IsG16) {
4843 const LLT S16 = LLT::scalar(16);
4844 const LLT V2S16 = LLT::fixed_vector(2, 16);
4845 auto EndIdx = Intr->VAddrEnd;
4846
4847 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4848 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4849 if (!SrcOp.isReg())
4850 continue; // _L to _LZ may have eliminated this.
4851
4852 Register AddrReg = SrcOp.getReg();
4853
4854 if ((I < Intr->GradientStart) ||
4855 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4856 (I >= Intr->CoordStart && !IsA16)) {
4857 if ((I < Intr->GradientStart) && IsA16 &&
4858 (B.getMRI()->getType(AddrReg) == S16)) {
4859 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4860 // Special handling of bias when A16 is on. Bias is of type half but
4861 // occupies full 32-bit.
4862 PackedAddrs.push_back(
4863 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4864 .getReg(0));
4865 } else {
4866 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4867 "Bias needs to be converted to 16 bit in A16 mode");
4868 // Handle any gradient or coordinate operands that should not be packed
4869 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4870 PackedAddrs.push_back(AddrReg);
4871 }
4872 } else {
4873 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4874 // derivatives dx/dh and dx/dv are packed with undef.
4875 if (((I + 1) >= EndIdx) ||
4876 ((Intr->NumGradients / 2) % 2 == 1 &&
4877 (I == static_cast<unsigned>(Intr->GradientStart +
4878 (Intr->NumGradients / 2) - 1) ||
4879 I == static_cast<unsigned>(Intr->GradientStart +
4880 Intr->NumGradients - 1))) ||
4881 // Check for _L to _LZ optimization
4882 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4883 PackedAddrs.push_back(
4884 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4885 .getReg(0));
4886 } else {
4887 PackedAddrs.push_back(
4888 B.buildBuildVector(
4889 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4890 .getReg(0));
4891 ++I;
4892 }
4893 }
4894 }
4895}
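// Illustrative sketch, assuming an A16 image operation with three s16
// coordinates u, v and r: the loop above emits build_vector(<2 x s16>, u, v)
// followed by build_vector(<2 x s16>, r, undef), so the odd trailing
// coordinate is padded with undef rather than left as a lone s16 operand.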
4896
4897/// Convert from separate vaddr components to a single vector address register,
4898/// and replace the remaining operands with $noreg.
4899static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4900 int DimIdx, int NumVAddrs) {
4901 const LLT S32 = LLT::scalar(32);
4902 (void)S32;
4903 SmallVector<Register, 8> AddrRegs;
4904 for (int I = 0; I != NumVAddrs; ++I) {
4905 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4906 if (SrcOp.isReg()) {
4907 AddrRegs.push_back(SrcOp.getReg());
4908 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4909 }
4910 }
4911
4912 int NumAddrRegs = AddrRegs.size();
4913 if (NumAddrRegs != 1) {
4914 auto VAddr =
4915 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4916 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4917 }
4918
4919 for (int I = 1; I != NumVAddrs; ++I) {
4920 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4921 if (SrcOp.isReg())
4922 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4923 }
4924}
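// Illustrative sketch, assuming three s32 address components a0, a1 and a2:
// the helper above rewrites the first vaddr operand to
// build_vector(<3 x s32>, a0, a1, a2) and replaces the remaining vaddr
// operands with $noreg, matching the non-NSA (packed) address encoding.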
4925
4926/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4927///
4928/// Depending on the subtarget, load/store with 16-bit element data need to be
4929/// rewritten to use the low half of 32-bit registers, or directly use a packed
4930/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4931/// registers.
4932///
4933/// We don't want to directly select image instructions just yet, but also want
4934 /// to expose all register repacking to the legalizer/combiners. We also don't
4935/// want a selected instruction entering RegBankSelect. In order to avoid
4936/// defining a multitude of intermediate image instructions, directly hack on
4937/// the intrinsic's arguments. In cases like a16 addresses, this requires
4938/// padding now unnecessary arguments with $noreg.
4939bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4940 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4941 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4942
4943 const MachineFunction &MF = *MI.getMF();
4944 const unsigned NumDefs = MI.getNumExplicitDefs();
4945 const unsigned ArgOffset = NumDefs + 1;
4946 bool IsTFE = NumDefs == 2;
4947 // We are only processing the operands of d16 image operations on subtargets
4948 // that use the unpacked register layout, or need to repack the TFE result.
4949
4950 // TODO: Do we need to guard against already legalized intrinsics?
4951 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4952 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4953
4954 MachineRegisterInfo *MRI = B.getMRI();
4955 const LLT S32 = LLT::scalar(32);
4956 const LLT S16 = LLT::scalar(16);
4957 const LLT V2S16 = LLT::fixed_vector(2, 16);
4958
4959 unsigned DMask = 0;
4960 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4961 LLT Ty = MRI->getType(VData);
4962
4963 // Check for 16 bit addresses and pack if true.
4964 LLT GradTy =
4965 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4966 LLT AddrTy =
4967 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4968 const bool IsG16 =
4969 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
4970 const bool IsA16 = AddrTy == S16;
4971 const bool IsD16 = Ty.getScalarType() == S16;
4972
4973 int DMaskLanes = 0;
4974 if (!BaseOpcode->Atomic) {
4975 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4976 if (BaseOpcode->Gather4) {
4977 DMaskLanes = 4;
4978 } else if (DMask != 0) {
4979 DMaskLanes = llvm::popcount(DMask);
4980 } else if (!IsTFE && !BaseOpcode->Store) {
4981 // If dmask is 0, this is a no-op load. This can be eliminated.
4982 B.buildUndef(MI.getOperand(0));
4983 MI.eraseFromParent();
4984 return true;
4985 }
4986 }
4987
4988 Observer.changingInstr(MI);
4989 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4990
4991 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4992 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4993 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4994 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4995 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4996
4997 // Track that we legalized this
4998 MI.setDesc(B.getTII().get(NewOpcode));
4999
5000 // Expecting to get an error flag since TFC is on and dmask is 0. Force
5001 // dmask to be at least 1, otherwise the instruction will fail.
5002 if (IsTFE && DMask == 0) {
5003 DMask = 0x1;
5004 DMaskLanes = 1;
5005 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
5006 }
5007
5008 if (BaseOpcode->Atomic) {
5009 Register VData0 = MI.getOperand(2).getReg();
5010 LLT Ty = MRI->getType(VData0);
5011
5012 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
5013 if (Ty.isVector())
5014 return false;
5015
5016 if (BaseOpcode->AtomicX2) {
5017 Register VData1 = MI.getOperand(3).getReg();
5018 // The two values are packed in one register.
5019 LLT PackedTy = LLT::fixed_vector(2, Ty);
5020 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
5021 MI.getOperand(2).setReg(Concat.getReg(0));
5022 MI.getOperand(3).setReg(AMDGPU::NoRegister);
5023 }
5024 }
5025
5026 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
5027
5028 // Rewrite the addressing register layout before doing anything else.
5029 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
5030 // 16 bit gradients are supported, but are tied to the A16 control,
5031 // so both gradients and addresses must be 16 bit.
5032 return false;
5033 }
5034
5035 if (IsA16 && !ST.hasA16()) {
5036 // A16 not supported
5037 return false;
5038 }
5039
5040 const unsigned NSAMaxSize = ST.getNSAMaxSize();
5041 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
5042
5043 if (IsA16 || IsG16) {
5044 if (Intr->NumVAddrs > 1) {
5045 SmallVector<Register, 4> PackedRegs;
5046
5047 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
5048 IsG16);
5049
5050 // See also below in the non-a16 branch
5051 const bool UseNSA = ST.hasNSAEncoding() &&
5052 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
5053 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
5054 const bool UsePartialNSA =
5055 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
5056
5057 if (UsePartialNSA) {
5058 // Pack registers that would go over NSAMaxSize into last VAddr register
5059 LLT PackedAddrTy =
5060 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
5061 auto Concat = B.buildConcatVectors(
5062 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
5063 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
5064 PackedRegs.resize(NSAMaxSize);
5065 } else if (!UseNSA && PackedRegs.size() > 1) {
5066 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
5067 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
5068 PackedRegs[0] = Concat.getReg(0);
5069 PackedRegs.resize(1);
5070 }
5071
5072 const unsigned NumPacked = PackedRegs.size();
5073 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
5074 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5075 if (!SrcOp.isReg()) {
5076 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
5077 continue;
5078 }
5079
5080 assert(SrcOp.getReg() != AMDGPU::NoRegister);
5081
5082 if (I - Intr->VAddrStart < NumPacked)
5083 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
5084 else
5085 SrcOp.setReg(AMDGPU::NoRegister);
5086 }
5087 }
5088 } else {
5089 // If the register allocator cannot place the address registers contiguously
5090 // without introducing moves, then using the non-sequential address encoding
5091 // is always preferable, since it saves VALU instructions and is usually a
5092 // wash in terms of code size or even better.
5093 //
5094 // However, we currently have no way of hinting to the register allocator
5095 // that MIMG addresses should be placed contiguously when it is possible to
5096 // do so, so force non-NSA for the common 2-address case as a heuristic.
5097 //
5098 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5099 // allocation when possible.
5100 //
5101 // Partial NSA is allowed on GFX11 where the final register is a contiguous
5102 // set of the remaining addresses.
5103 const bool UseNSA = ST.hasNSAEncoding() &&
5104 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5105 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
5106 const bool UsePartialNSA =
5107 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
5108
5109 if (UsePartialNSA) {
5110 convertImageAddrToPacked(B, MI,
5111 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
5112 Intr->NumVAddrs - NSAMaxSize + 1);
5113 } else if (!UseNSA && Intr->NumVAddrs > 1) {
5114 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5115 Intr->NumVAddrs);
5116 }
5117 }
5118
5119 int Flags = 0;
5120 if (IsA16)
5121 Flags |= 1;
5122 if (IsG16)
5123 Flags |= 2;
5124 MI.addOperand(MachineOperand::CreateImm(Flags));
5125
5126 if (BaseOpcode->Store) { // No TFE for stores?
5127 // TODO: Handle dmask trim
5128 if (!Ty.isVector() || !IsD16)
5129 return true;
5130
5131 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5132 if (RepackedReg != VData) {
5133 MI.getOperand(1).setReg(RepackedReg);
5134 }
5135
5136 return true;
5137 }
5138
5139 Register DstReg = MI.getOperand(0).getReg();
5140 const LLT EltTy = Ty.getScalarType();
5141 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5142
5143 // Confirm that the return type is large enough for the dmask specified
5144 if (NumElts < DMaskLanes)
5145 return false;
5146
5147 if (NumElts > 4 || DMaskLanes > 4)
5148 return false;
5149
5150 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5151 const LLT AdjustedTy =
5152 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5153
5154 // The raw dword aligned data component of the load. The only legal cases
5155 // where this matters should be when using the packed D16 format, for
5156 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
5157 LLT RoundedTy;
5158
5159 // S32 vector to cover all data, plus TFE result element.
5160 LLT TFETy;
5161
5162 // Register type to use for each loaded component. Will be S32 or V2S16.
5163 LLT RegTy;
5164
5165 if (IsD16 && ST.hasUnpackedD16VMem()) {
5166 RoundedTy =
5167 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5168 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5169 RegTy = S32;
5170 } else {
5171 unsigned EltSize = EltTy.getSizeInBits();
5172 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5173 unsigned RoundedSize = 32 * RoundedElts;
5174 RoundedTy = LLT::scalarOrVector(
5175 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5176 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5177 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5178 }
5179
5180 // The return type does not need adjustment.
5181 // TODO: Should we change s16 case to s32 or <2 x s16>?
5182 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5183 return true;
5184
5185 Register Dst1Reg;
5186
5187 // Insert after the instruction.
5188 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5189
5190 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5191 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5192 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5193 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5194
5195 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5196
5197 MI.getOperand(0).setReg(NewResultReg);
5198
5199 // In the IR, TFE is supposed to be used with a 2 element struct return
5200 // type. The instruction really returns these two values in one contiguous
5201 // register, with one additional dword beyond the loaded data. Rewrite the
5202 // return type to use a single register result.
5203
5204 if (IsTFE) {
5205 Dst1Reg = MI.getOperand(1).getReg();
5206 if (MRI->getType(Dst1Reg) != S32)
5207 return false;
5208
5209 // TODO: Make sure the TFE operand bit is set.
5210 MI.removeOperand(1);
5211
5212 // Handle the easy case that requires no repack instructions.
5213 if (Ty == S32) {
5214 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5215 return true;
5216 }
5217 }
5218
5219 // Now figure out how to copy the new result register back into the old
5220 // result.
5221 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5222
5223 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5224
5225 if (ResultNumRegs == 1) {
5226 assert(!IsTFE);
5227 ResultRegs[0] = NewResultReg;
5228 } else {
5229 // We have to repack into a new vector of some kind.
5230 for (int I = 0; I != NumDataRegs; ++I)
5231 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5232 B.buildUnmerge(ResultRegs, NewResultReg);
5233
5234 // Drop the final TFE element to get the data part. The TFE result is
5235 // directly written to the right place already.
5236 if (IsTFE)
5237 ResultRegs.resize(NumDataRegs);
5238 }
5239
5240 // For an s16 scalar result, we form an s32 result with a truncate regardless
5241 // of packed vs. unpacked.
5242 if (IsD16 && !Ty.isVector()) {
5243 B.buildTrunc(DstReg, ResultRegs[0]);
5244 return true;
5245 }
5246
5247 // Avoid a build/concat_vector of 1 entry.
5248 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5249 B.buildBitcast(DstReg, ResultRegs[0]);
5250 return true;
5251 }
5252
5253 assert(Ty.isVector());
5254
5255 if (IsD16) {
5256 // For packed D16 results with TFE enabled, all the data components are
5257 // S32. Cast back to the expected type.
5258 //
5259 // TODO: We don't really need to use load s32 elements. We would only need one
5260 // cast for the TFE result if a multiple of v2s16 was used.
5261 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5262 for (Register &Reg : ResultRegs)
5263 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5264 } else if (ST.hasUnpackedD16VMem()) {
5265 for (Register &Reg : ResultRegs)
5266 Reg = B.buildTrunc(S16, Reg).getReg(0);
5267 }
5268 }
5269
5270 auto padWithUndef = [&](LLT Ty, int NumElts) {
5271 if (NumElts == 0)
5272 return;
5273 Register Undef = B.buildUndef(Ty).getReg(0);
5274 for (int I = 0; I != NumElts; ++I)
5275 ResultRegs.push_back(Undef);
5276 };
5277
5278 // Pad out any elements eliminated due to the dmask.
5279 LLT ResTy = MRI->getType(ResultRegs[0]);
5280 if (!ResTy.isVector()) {
5281 padWithUndef(ResTy, NumElts - ResultRegs.size());
5282 B.buildBuildVector(DstReg, ResultRegs);
5283 return true;
5284 }
5285
5286 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5287 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5288
5289 // Deal with the one annoying legal case.
5290 const LLT V3S16 = LLT::fixed_vector(3, 16);
5291 if (Ty == V3S16) {
5292 if (IsTFE) {
5293 if (ResultRegs.size() == 1) {
5294 NewResultReg = ResultRegs[0];
5295 } else if (ResultRegs.size() == 2) {
5296 LLT V4S16 = LLT::fixed_vector(4, 16);
5297 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5298 } else {
5299 return false;
5300 }
5301 }
5302
5303 if (MRI->getType(DstReg).getNumElements() <
5304 MRI->getType(NewResultReg).getNumElements()) {
5305 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5306 } else {
5307 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5308 }
5309 return true;
5310 }
5311
5312 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5313 B.buildConcatVectors(DstReg, ResultRegs);
5314 return true;
5315}
5316
5317bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5318 LegalizerHelper &Helper, MachineInstr &MI) const {
5319 MachineIRBuilder &B = Helper.MIRBuilder;
5320 GISelChangeObserver &Observer = Helper.Observer;
5321
5322 Register Dst = MI.getOperand(0).getReg();
5323 LLT Ty = B.getMRI()->getType(Dst);
5324 unsigned Size = Ty.getSizeInBits();
5325 MachineFunction &MF = B.getMF();
5326
5327 Observer.changingInstr(MI);
5328
5329 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5330 Ty = getBitcastRegisterType(Ty);
5331 Helper.bitcastDst(MI, Ty, 0);
5332 Dst = MI.getOperand(0).getReg();
5333 B.setInsertPt(B.getMBB(), MI);
5334 }
5335
5336 // FIXME: We don't really need this intermediate instruction. The intrinsic
5337 // should be fixed to have a memory operand. Since it's readnone, we're not
5338 // allowed to add one.
5339 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5340 MI.removeOperand(1); // Remove intrinsic ID
5341
5342 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5343 // TODO: Should this use datalayout alignment?
5344 const unsigned MemSize = (Size + 7) / 8;
5345 const Align MemAlign(4);
5346 MachineMemOperand *MMO = MF.getMachineMemOperand(
5347 MachinePointerInfo(),
5348 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5349 MachineMemOperand::MOInvariant,
5350 MemSize, MemAlign);
5351 MI.addMemOperand(MF, MMO);
5352
5353 // There are no 96-bit result scalar loads, but widening to 128-bit should
5354 // always be legal. We may need to restore this to a 96-bit result if it turns
5355 // out this needs to be converted to a vector load during RegBankSelect.
5356 if (!isPowerOf2_32(Size)) {
5357 if (Ty.isVector())
5358 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5359 else
5360 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5361 }
5362
5363 Observer.changedInstr(MI);
5364 return true;
5365}
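// Illustrative sketch, assuming a hypothetical 96-bit result such as
// <3 x s32>: since 96 is not a power of two, the widening step above grows
// the destination to <4 x s32> (128 bits), relying on the 128-bit scalar
// buffer load always being legal.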
5366
5367// TODO: Move to selection
5368bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5369 MachineRegisterInfo &MRI,
5370 MachineIRBuilder &B) const {
5371 if (!ST.isTrapHandlerEnabled() ||
5372 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5373 return legalizeTrapEndpgm(MI, MRI, B);
5374
5375 const Module *M = B.getMF().getFunction().getParent();
5376 unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
5377 if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
5378 return legalizeTrapHsaQueuePtr(MI, MRI, B);
5379
5380 return ST.supportsGetDoorbellID() ?
5381 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
5382}
5383
5384bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5385 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5386 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5387 MI.eraseFromParent();
5388 return true;
5389}
5390
5391bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5392 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5393 MachineFunction &MF = B.getMF();
5394 const LLT S64 = LLT::scalar(64);
5395
5396 Register SGPR01(AMDGPU::SGPR0_SGPR1);
5397 // For code object version 5, queue_ptr is passed through implicit kernarg.
5398 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
5399 AMDGPU::AMDHSA_COV5) {
5400 AMDGPUTargetLowering::ImplicitParameter Param =
5401 AMDGPUTargetLowering::QUEUE_PTR;
5402 uint64_t Offset =
5403 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5404
5405 Register KernargPtrReg = MRI.createGenericVirtualRegister(
5406 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5407
5408 if (!loadInputValue(KernargPtrReg, B,
5409 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5410 return false;
5411
5412 // TODO: can we be smarter about machine pointer info?
5413 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5414 MachineMemOperand *MMO = MF.getMachineMemOperand(
5415 PtrInfo,
5416 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5417 MachineMemOperand::MOInvariant,
5418 LLT::scalar(64), commonAlignment(Align(64), Offset));
5419
5420 // Pointer address
5421 Register LoadAddr = MRI.createGenericVirtualRegister(
5422 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5423 B.buildPtrAdd(LoadAddr, KernargPtrReg,
5424 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5425 // Load address
5426 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5427 B.buildCopy(SGPR01, Temp);
5428 B.buildInstr(AMDGPU::S_TRAP)
5429 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5430 .addReg(SGPR01, RegState::Implicit);
5431 MI.eraseFromParent();
5432 return true;
5433 }
5434
5435 // Pass queue pointer to trap handler as input, and insert trap instruction
5436 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5437 Register LiveIn =
5438 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5439 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5440 return false;
5441
5442 B.buildCopy(SGPR01, LiveIn);
5443 B.buildInstr(AMDGPU::S_TRAP)
5444 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5445 .addReg(SGPR01, RegState::Implicit);
5446
5447 MI.eraseFromParent();
5448 return true;
5449}
5450
5451bool AMDGPULegalizerInfo::legalizeTrapHsa(
5452 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5453 B.buildInstr(AMDGPU::S_TRAP)
5454 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5455 MI.eraseFromParent();
5456 return true;
5457}
5458
5459bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5460 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5461 // If this is a non-HSA path or the trap handler is disabled, report a
5462 // warning accordingly.
5463 if (!ST.isTrapHandlerEnabled() ||
5464 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5465 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5466 "debugtrap handler not supported",
5467 MI.getDebugLoc(), DS_Warning);
5468 LLVMContext &Ctx = B.getMF().getFunction().getContext();
5469 Ctx.diagnose(NoTrap);
5470 } else {
5471 // Insert debug-trap instruction
5472 B.buildInstr(AMDGPU::S_TRAP)
5473 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5474 }
5475
5476 MI.eraseFromParent();
5477 return true;
5478}
5479
5480bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5481 MachineIRBuilder &B) const {
5482 MachineRegisterInfo &MRI = *B.getMRI();
5483 const LLT S16 = LLT::scalar(16);
5484 const LLT S32 = LLT::scalar(32);
5485 const LLT V2S16 = LLT::fixed_vector(2, 16);
5486 const LLT V3S32 = LLT::fixed_vector(3, 32);
5487
5488 Register DstReg = MI.getOperand(0).getReg();
5489 Register NodePtr = MI.getOperand(2).getReg();
5490 Register RayExtent = MI.getOperand(3).getReg();
5491 Register RayOrigin = MI.getOperand(4).getReg();
5492 Register RayDir = MI.getOperand(5).getReg();
5493 Register RayInvDir = MI.getOperand(6).getReg();
5494 Register TDescr = MI.getOperand(7).getReg();
5495
5496 if (!ST.hasGFX10_AEncoding()) {
5497 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5498 "intrinsic not supported on subtarget",
5499 MI.getDebugLoc());
5500 B.getMF().getFunction().getContext().diagnose(BadIntrin);
5501 return false;
5502 }
5503
5504 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5505 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5506 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5507 const unsigned NumVDataDwords = 4;
5508 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5509 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5510 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5511 const unsigned BaseOpcodes[2][2] = {
5512 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5513 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5514 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5515 int Opcode;
5516 if (UseNSA) {
5517 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5518 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5519 : AMDGPU::MIMGEncGfx10NSA,
5520 NumVDataDwords, NumVAddrDwords);
5521 } else {
5522 Opcode = AMDGPU::getMIMGOpcode(
5523 BaseOpcodes[Is64][IsA16],
5524 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5525 NumVDataDwords, NumVAddrDwords);
5526 }
5527 assert(Opcode != -1);
5528
5529 SmallVector<Register, 12> Ops;
5530 if (UseNSA && IsGFX11Plus) {
5531 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5532 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5533 auto Merged = B.buildMergeLikeInstr(
5534 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5535 Ops.push_back(Merged.getReg(0));
5536 };
5537
5538 Ops.push_back(NodePtr);
5539 Ops.push_back(RayExtent);
5540 packLanes(RayOrigin);
5541
5542 if (IsA16) {
5543 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5544 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5545 auto MergedDir = B.buildMergeLikeInstr(
5546 V3S32,
5547 {B.buildBitcast(
5548 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
5549 UnmergeRayDir.getReg(0)}))
5550 .getReg(0),
5551 B.buildBitcast(
5552 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
5553 UnmergeRayDir.getReg(1)}))
5554 .getReg(0),
5555 B.buildBitcast(
5556 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
5557 UnmergeRayDir.getReg(2)}))
5558 .getReg(0)});
5559 Ops.push_back(MergedDir.getReg(0));
5560 } else {
5561 packLanes(RayDir);
5562 packLanes(RayInvDir);
5563 }
5564 } else {
5565 if (Is64) {
5566 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5567 Ops.push_back(Unmerge.getReg(0));
5568 Ops.push_back(Unmerge.getReg(1));
5569 } else {
5570 Ops.push_back(NodePtr);
5571 }
5572 Ops.push_back(RayExtent);
5573
5574 auto packLanes = [&Ops, &S32, &B](Register Src) {
5575 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5576 Ops.push_back(Unmerge.getReg(0));
5577 Ops.push_back(Unmerge.getReg(1));
5578 Ops.push_back(Unmerge.getReg(2));
5579 };
5580
5581 packLanes(RayOrigin);
5582 if (IsA16) {
5583 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5584 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5585 Register R1 = MRI.createGenericVirtualRegister(S32);
5586 Register R2 = MRI.createGenericVirtualRegister(S32);
5587 Register R3 = MRI.createGenericVirtualRegister(S32);
5588 B.buildMergeLikeInstr(R1,
5589 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5590 B.buildMergeLikeInstr(
5591 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5592 B.buildMergeLikeInstr(
5593 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5594 Ops.push_back(R1);
5595 Ops.push_back(R2);
5596 Ops.push_back(R3);
5597 } else {
5598 packLanes(RayDir);
5599 packLanes(RayInvDir);
5600 }
5601 }
5602
5603 if (!UseNSA) {
5604 // Build a single vector containing all the operands so far prepared.
5605 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5606 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
5607 Ops.clear();
5608 Ops.push_back(MergedOps);
5609 }
5610
5611 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5612 .addDef(DstReg)
5613 .addImm(Opcode);
5614
5615 for (Register R : Ops) {
5616 MIB.addUse(R);
5617 }
5618
5619 MIB.addUse(TDescr)
5620 .addImm(IsA16 ? 1 : 0)
5621 .cloneMemRefs(MI);
5622
5623 MI.eraseFromParent();
5624 return true;
5625}
5626
5627bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5628 MachineIRBuilder &B) const {
5629 unsigned Opc;
5630 int RoundMode = MI.getOperand(2).getImm();
5631
5632 if (RoundMode == (int)RoundingMode::TowardPositive)
5633 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5634 else if (RoundMode == (int)RoundingMode::TowardNegative)
5635 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5636 else
5637 return false;
5638
5639 B.buildInstr(Opc)
5640 .addDef(MI.getOperand(0).getReg())
5641 .addUse(MI.getOperand(1).getReg());
5642
5643 MI.eraseFromParent();
5644
5645 return true;
5646}
5647
5648bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5649 MachineInstr &MI) const {
5650 MachineIRBuilder &B = Helper.MIRBuilder;
5651 MachineRegisterInfo &MRI = *B.getMRI();
5652
5653 // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
5654 auto IntrID = MI.getIntrinsicID();
5655 switch (IntrID) {
5656 case Intrinsic::amdgcn_if:
5657 case Intrinsic::amdgcn_else: {
5658 MachineInstr *Br = nullptr;
5659 MachineBasicBlock *UncondBrTarget = nullptr;
5660 bool Negated = false;
5661 if (MachineInstr *BrCond =
5662 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5663 const SIRegisterInfo *TRI
5664 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5665
5666 Register Def = MI.getOperand(1).getReg();
5667 Register Use = MI.getOperand(3).getReg();
5668
5669 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5670
5671 if (Negated)
5672 std::swap(CondBrTarget, UncondBrTarget);
5673
5674 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5675 if (IntrID == Intrinsic::amdgcn_if) {
5676 B.buildInstr(AMDGPU::SI_IF)
5677 .addDef(Def)
5678 .addUse(Use)
5679 .addMBB(UncondBrTarget);
5680 } else {
5681 B.buildInstr(AMDGPU::SI_ELSE)
5682 .addDef(Def)
5683 .addUse(Use)
5684 .addMBB(UncondBrTarget);
5685 }
5686
5687 if (Br) {
5688 Br->getOperand(0).setMBB(CondBrTarget);
5689 } else {
5690 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5691 // since we're swapping branch targets it needs to be reinserted.
5692 // FIXME: IRTranslator should probably not do this
5693 B.buildBr(*CondBrTarget);
5694 }
5695
5696 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5697 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5698 MI.eraseFromParent();
5699 BrCond->eraseFromParent();
5700 return true;
5701 }
5702
5703 return false;
5704 }
5705 case Intrinsic::amdgcn_loop: {
5706 MachineInstr *Br = nullptr;
5707 MachineBasicBlock *UncondBrTarget = nullptr;
5708 bool Negated = false;
5709 if (MachineInstr *BrCond =
5710 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5711 const SIRegisterInfo *TRI
5712 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5713
5714 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5715 Register Reg = MI.getOperand(2).getReg();
5716
5717 if (Negated)
5718 std::swap(CondBrTarget, UncondBrTarget);
5719
5720 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5721 B.buildInstr(AMDGPU::SI_LOOP)
5722 .addUse(Reg)
5723 .addMBB(UncondBrTarget);
5724
5725 if (Br)
5726 Br->getOperand(0).setMBB(CondBrTarget);
5727 else
5728 B.buildBr(*CondBrTarget);
5729
5730 MI.eraseFromParent();
5731 BrCond->eraseFromParent();
5732 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5733 return true;
5734 }
5735
5736 return false;
5737 }
5738 case Intrinsic::amdgcn_kernarg_segment_ptr:
5739 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5740 // This only makes sense to call in a kernel, so just lower to null.
5741 B.buildConstant(MI.getOperand(0).getReg(), 0);
5742 MI.eraseFromParent();
5743 return true;
5744 }
5745
5746 return legalizePreloadedArgIntrin(
5747 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5748 case Intrinsic::amdgcn_implicitarg_ptr:
5749 return legalizeImplicitArgPtr(MI, MRI, B);
5750 case Intrinsic::amdgcn_workitem_id_x:
5751 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5752 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5753 case Intrinsic::amdgcn_workitem_id_y:
5754 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5755 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5756 case Intrinsic::amdgcn_workitem_id_z:
5757 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5758 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5759 case Intrinsic::amdgcn_workgroup_id_x:
5760 return legalizePreloadedArgIntrin(MI, MRI, B,
5761 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5762 case Intrinsic::amdgcn_workgroup_id_y:
5763 return legalizePreloadedArgIntrin(MI, MRI, B,
5764 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5765 case Intrinsic::amdgcn_workgroup_id_z:
5766 return legalizePreloadedArgIntrin(MI, MRI, B,
5767 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5768 case Intrinsic::amdgcn_lds_kernel_id:
5769 return legalizePreloadedArgIntrin(MI, MRI, B,
5770 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5771 case Intrinsic::amdgcn_dispatch_ptr:
5772 return legalizePreloadedArgIntrin(MI, MRI, B,
5773 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5774 case Intrinsic::amdgcn_queue_ptr:
5775 return legalizePreloadedArgIntrin(MI, MRI, B,
5776 AMDGPUFunctionArgInfo::QUEUE_PTR);
5777 case Intrinsic::amdgcn_implicit_buffer_ptr:
5778 return legalizePreloadedArgIntrin(
5779 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5780 case Intrinsic::amdgcn_dispatch_id:
5781 return legalizePreloadedArgIntrin(MI, MRI, B,
5782 AMDGPUFunctionArgInfo::DISPATCH_ID);
5783 case Intrinsic::r600_read_ngroups_x:
5784 // TODO: Emit error for hsa
5785 return legalizeKernargMemParameter(MI, B,
5786 SI::KernelInputOffsets::NGROUPS_X);
5787 case Intrinsic::r600_read_ngroups_y:
5788 return legalizeKernargMemParameter(MI, B,
5789 SI::KernelInputOffsets::NGROUPS_Y);
5790 case Intrinsic::r600_read_ngroups_z:
5791 return legalizeKernargMemParameter(MI, B,
5792 SI::KernelInputOffsets::NGROUPS_Z);
5793 case Intrinsic::r600_read_local_size_x:
5794 // TODO: Could insert G_ASSERT_ZEXT from s16
5795 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5796 case Intrinsic::r600_read_local_size_y:
5797 // TODO: Could insert G_ASSERT_ZEXT from s16
5798 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5799 // TODO: Could insert G_ASSERT_ZEXT from s16
5800 case Intrinsic::r600_read_local_size_z:
5801 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5802 case Intrinsic::r600_read_global_size_x:
5803 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5804 case Intrinsic::r600_read_global_size_y:
5805 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5806 case Intrinsic::r600_read_global_size_z:
5807 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5808 case Intrinsic::amdgcn_fdiv_fast:
5809 return legalizeFDIVFastIntrin(MI, MRI, B);
5810 case Intrinsic::amdgcn_is_shared:
5811 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5812 case Intrinsic::amdgcn_is_private:
5813 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5814 case Intrinsic::amdgcn_wavefrontsize: {
5815 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5816 MI.eraseFromParent();
5817 return true;
5818 }
5819 case Intrinsic::amdgcn_s_buffer_load:
5820 return legalizeSBufferLoad(Helper, MI);
5821 case Intrinsic::amdgcn_raw_buffer_store:
5822 case Intrinsic::amdgcn_struct_buffer_store:
5823 return legalizeBufferStore(MI, MRI, B, false, false);
5824 case Intrinsic::amdgcn_raw_buffer_store_format:
5825 case Intrinsic::amdgcn_struct_buffer_store_format:
5826 return legalizeBufferStore(MI, MRI, B, false, true);
5827 case Intrinsic::amdgcn_raw_tbuffer_store:
5828 case Intrinsic::amdgcn_struct_tbuffer_store:
5829 return legalizeBufferStore(MI, MRI, B, true, true);
5830 case Intrinsic::amdgcn_raw_buffer_load:
5831 case Intrinsic::amdgcn_struct_buffer_load:
5832 return legalizeBufferLoad(MI, MRI, B, false, false);
5833 case Intrinsic::amdgcn_raw_buffer_load_format:
5834 case Intrinsic::amdgcn_struct_buffer_load_format:
5835 return legalizeBufferLoad(MI, MRI, B, true, false);
5836 case Intrinsic::amdgcn_raw_tbuffer_load:
5837 case Intrinsic::amdgcn_struct_tbuffer_load:
5838 return legalizeBufferLoad(MI, MRI, B, true, true);
5839 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5840 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5841 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5842 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5843 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5844 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5845 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5846 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5847 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5848 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5849 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5850 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5851 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5852 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5853 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5854 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5855 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5856 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5857 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5858 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5859 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5860 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5861 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5862 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5863 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5864 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5865 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5866 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5867 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5868 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5869 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5870 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5871 return legalizeBufferAtomic(MI, B, IntrID);
5872 case Intrinsic::amdgcn_atomic_inc:
5873 return legalizeAtomicIncDec(MI, B, true);
5874 case Intrinsic::amdgcn_atomic_dec:
5875 return legalizeAtomicIncDec(MI, B, false);
5876 case Intrinsic::trap:
5877 return legalizeTrapIntrinsic(MI, MRI, B);
5878 case Intrinsic::debugtrap:
5879 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5880 case Intrinsic::amdgcn_rsq_clamp:
5881 return legalizeRsqClampIntrinsic(MI, MRI, B);
5882 case Intrinsic::amdgcn_ds_fadd:
5883 case Intrinsic::amdgcn_ds_fmin:
5884 case Intrinsic::amdgcn_ds_fmax:
5885 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5886 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5887 return legalizeBVHIntrinsic(MI, B);
5888 default: {
5889 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5890 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5891 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5892 return true;
5893 }
5894 }
5895
5896 return true;
5897}

/build/source/llvm/include/llvm/ADT/bit.h

1//===-- llvm/ADT/bit.h - C++20 <bit> ----------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements the C++20 <bit> header.
11///
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_ADT_BIT_H
15#define LLVM_ADT_BIT_H
16
17#include "llvm/Support/Compiler.h"
18#include <cstdint>
19#include <limits>
20#include <type_traits>
21
22#if !__has_builtin(__builtin_bit_cast)
23#include <cstring>
24#endif
25
26#if defined(_MSC_VER) && !defined(_DEBUG)
27#include <cstdlib> // for _byteswap_{ushort,ulong,uint64}
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and bit.h is popular via MathExtras.h.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44// This implementation of bit_cast is different from the C++20 one in two ways:
45// - It isn't constexpr because that requires compiler support.
46// - It requires trivially-constructible To, to avoid UB in the implementation.
47template <
48 typename To, typename From,
49 typename = std::enable_if_t<sizeof(To) == sizeof(From)>,
50 typename = std::enable_if_t<std::is_trivially_constructible<To>::value>,
51 typename = std::enable_if_t<std::is_trivially_copyable<To>::value>,
52 typename = std::enable_if_t<std::is_trivially_copyable<From>::value>>
53[[nodiscard]] inline To bit_cast(const From &from) noexcept {
54#if __has_builtin(__builtin_bit_cast)
55 return __builtin_bit_cast(To, from);
56#else
57 To to;
58 std::memcpy(&to, &from, sizeof(To));
59 return to;
60#endif
61}
62
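A minimal usage sketch of bit_cast (illustrative only, not part of the header), assuming a translation unit that can include llvm/ADT/bit.h:

// Sketch: reinterpret a float's object representation as an integer
// without the strict-aliasing pitfalls of a pointer cast.
#include "llvm/ADT/bit.h"
#include <cstdint>

uint32_t floatBits(float F) {
  return llvm::bit_cast<uint32_t>(F); // sizes match; both trivially copyable
}
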
63/// Reverses the bytes in the given integer value V.
64template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
65[[nodiscard]] constexpr T byteswap(T V) noexcept {
66 if constexpr (sizeof(T) == 1) {
67 return V;
68 } else if constexpr (sizeof(T) == 2) {
69 uint16_t UV = V;
70#if defined(_MSC_VER) && !defined(_DEBUG)
71 // The DLL version of the runtime lacks these functions (bug!?), but in a
72 // release build they're replaced with BSWAP instructions anyway.
73 return _byteswap_ushort(UV);
74#else
75 uint16_t Hi = UV << 8;
76 uint16_t Lo = UV >> 8;
77 return Hi | Lo;
78#endif
79 } else if constexpr (sizeof(T) == 4) {
80 uint32_t UV = V;
81#if __has_builtin(__builtin_bswap32)
82 return __builtin_bswap32(UV);
83#elif defined(_MSC_VER) && !defined(_DEBUG)
84 return _byteswap_ulong(UV);
85#else
86 uint32_t Byte0 = UV & 0x000000FF;
87 uint32_t Byte1 = UV & 0x0000FF00;
88 uint32_t Byte2 = UV & 0x00FF0000;
89 uint32_t Byte3 = UV & 0xFF000000;
90 return (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24);
91#endif
92 } else if constexpr (sizeof(T) == 8) {
93 uint64_t UV = V;
94#if __has_builtin(__builtin_bswap64)
95 return __builtin_bswap64(UV);
96#elif defined(_MSC_VER) && !defined(_DEBUG)
97 return _byteswap_uint64(UV);
98#else
99 uint64_t Hi = llvm::byteswap<uint32_t>(UV);
100 uint32_t Lo = llvm::byteswap<uint32_t>(UV >> 32);
101 return (Hi << 32) | Lo;
102#endif
103 } else {
104 static_assert(!sizeof(T *), "Don't know how to handle the given type.");
105 return 0;
106 }
107}
108
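As a hedged illustration of the branches above (values chosen only for the example):

// Sketch: byteswap reverses byte order for 2-, 4-, and 8-byte integers.
#include "llvm/ADT/bit.h"
#include <cassert>
#include <cstdint>

void byteswapExamples() {
  assert(llvm::byteswap<uint16_t>(uint16_t(0xABCD)) == uint16_t(0xCDAB));
  assert(llvm::byteswap<uint32_t>(0x12345678u) == 0x78563412u);
  assert(llvm::byteswap<uint64_t>(0x0102030405060708ull) == 0x0807060504030201ull);
}
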
109template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
110[[nodiscard]] constexpr inline bool has_single_bit(T Value) noexcept {
111 return (Value != 0) && ((Value & (Value - 1)) == 0);
112}
113
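The (Value & (Value - 1)) expression clears the lowest set bit, so has_single_bit is the classic power-of-two test; a quick illustration:

// Sketch: has_single_bit(x) is true exactly when x is a power of two.
#include "llvm/ADT/bit.h"
#include <cassert>

void singleBitExamples() {
  assert(llvm::has_single_bit(64u));
  assert(!llvm::has_single_bit(0u));  // zero has no set bit
  assert(!llvm::has_single_bit(96u)); // 0b1100000 has two set bits
}
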
114namespace detail {
115template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
116 static unsigned count(T Val) {
117 if (!Val)
118 return std::numeric_limits<T>::digits;
119 if (Val & 0x1)
120 return 0;
121
122 // Bisection method.
123 unsigned ZeroBits = 0;
124 T Shift = std::numeric_limits<T>::digits >> 1;
125 T Mask = std::numeric_limits<T>::max() >> Shift;
126 while (Shift) {
127 if ((Val & Mask) == 0) {
128 Val >>= Shift;
129 ZeroBits |= Shift;
130 }
131 Shift >>= 1;
132 Mask >>= Shift;
133 }
134 return ZeroBits;
135 }
136};
137
138#if defined(__GNUC__) || defined(_MSC_VER)
139template <typename T> struct TrailingZerosCounter<T, 4> {
140 static unsigned count(T Val) {
141 if (Val == 0)
11. Assuming 'Val' is equal to 0
12. Taking true branch
142 return 32;
13. Returning the value 32
143
144#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
145 return __builtin_ctz(Val);
146#elif defined(_MSC_VER)
147 unsigned long Index;
148 _BitScanForward(&Index, Val);
149 return Index;
150#endif
151 }
152};
153
154#if !defined(_MSC_VER) || defined(_M_X64)
155template <typename T> struct TrailingZerosCounter<T, 8> {
156 static unsigned count(T Val) {
157 if (Val == 0)
158 return 64;
159
160#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
161 return __builtin_ctzll(Val);
162#elif defined(_MSC_VER)
163 unsigned long Index;
164 _BitScanForward64(&Index, Val);
165 return Index;
166#endif
167 }
168};
169#endif
170#endif
171} // namespace detail
172
173/// Count number of 0's from the least significant bit to the most
174/// stopping at the first 1.
175///
176/// Only unsigned integral types are allowed.
177///
178/// Returns std::numeric_limits<T>::digits on an input of 0.
179template <typename T> [[nodiscard]] int countr_zero(T Val) {
180 static_assert(std::is_unsigned_v<T>,
181 "Only unsigned integral types are allowed.");
182 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val);
10. Calling 'TrailingZerosCounter::count'
14. Returning from 'TrailingZerosCounter::count'
15. Returning the value 32
183}
184
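The path notes above are the crux of this report: on a zero input, countr_zero<uint32_t> returns 32 (the type's digit count), and using that value directly as the shift amount on a 32-bit operand is undefined behavior, which is what the warning at AMDGPULegalizerInfo.cpp line 3340 flags. A minimal sketch of the hazard and one illustrative guard (not the fix applied in the LLVM source; the names below are hypothetical):

#include "llvm/ADT/bit.h"
#include <cstdint>

// Sketch: 'Mask' and 'Value' are placeholder names for this example only.
uint32_t shiftByTrailingZeros(uint32_t Mask, uint32_t Value) {
  int TZ = llvm::countr_zero(Mask); // yields 32 when Mask == 0
  if (TZ >= 32)                     // guard the degenerate case explicitly
    return 0;
  return Value >> TZ;               // unguarded, TZ == 32 would be UB here
}
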
185namespace detail {
186template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
187 static unsigned count(T Val) {
188 if (!Val)
189 return std::numeric_limits<T>::digits;
190
191 // Bisection method.
192 unsigned ZeroBits = 0;
193 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
194 T Tmp = Val >> Shift;
195 if (Tmp)
196 Val = Tmp;
197 else
198 ZeroBits |= Shift;
199 }
200 return ZeroBits;
201 }
202};
203
204#if defined(__GNUC__) || defined(_MSC_VER)
205template <typename T> struct LeadingZerosCounter<T, 4> {
206 static unsigned count(T Val) {
207 if (Val == 0)
208 return 32;
209
210#if __has_builtin(__builtin_clz) || defined(__GNUC__)
211 return __builtin_clz(Val);
212#elif defined(_MSC_VER)
213 unsigned long Index;
214 _BitScanReverse(&Index, Val);
215 return Index ^ 31;
216#endif
217 }
218};
219
220#if !defined(_MSC_VER) || defined(_M_X64)
221template <typename T> struct LeadingZerosCounter<T, 8> {
222 static unsigned count(T Val) {
223 if (Val == 0)
224 return 64;
225
226#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
227 return __builtin_clzll(Val);
228#elif defined(_MSC_VER)
229 unsigned long Index;
230 _BitScanReverse64(&Index, Val);
231 return Index ^ 63;
232#endif
233 }
234};
235#endif
236#endif
237} // namespace detail
238
239/// Count number of 0's from the most significant bit to the least
240/// stopping at the first 1.
241///
242/// Only unsigned integral types are allowed.
243///
244/// Returns std::numeric_limits<T>::digits on an input of 0.
245template <typename T> [[nodiscard]] int countl_zero(T Val) {
246 static_assert(std::is_unsigned_v<T>,
247 "Only unsigned integral types are allowed.");
248 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val);
249}
250
251/// Count the number of ones from the most significant bit to the first
252/// zero bit.
253///
254/// Ex. countl_one(0xFF0FFF00) == 8.
255/// Only unsigned integral types are allowed.
256///
257/// Returns std::numeric_limits<T>::digits on an input of all ones.
258template <typename T> [[nodiscard]] int countl_one(T Value) {
259 static_assert(std::is_unsigned_v<T>,
260 "Only unsigned integral types are allowed.");
261 return llvm::countl_zero<T>(~Value);
262}
263
264/// Count the number of ones from the least significant bit to the first
265/// zero bit.
266///
267/// Ex. countr_one(0x00FF00FF) == 8.
268/// Only unsigned integral types are allowed.
269///
270/// Returns std::numeric_limits<T>::digits on an input of all ones.
271template <typename T> [[nodiscard]] int countr_one(T Value) {
272 static_assert(std::is_unsigned_v<T>,
273 "Only unsigned integral types are allowed.");
274 return llvm::countr_zero<T>(~Value);
275}
276
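A few concrete values for the counting helpers, restating the documented examples:

// Sketch: documented edge cases of the leading/trailing counters.
#include "llvm/ADT/bit.h"
#include <cassert>
#include <cstdint>

void countExamples() {
  assert(llvm::countl_zero<uint32_t>(0x00FF0000u) == 8);
  assert(llvm::countl_zero<uint32_t>(0u) == 32); // digits of uint32_t
  assert(llvm::countl_one<uint32_t>(0xFF0FFF00u) == 8);
  assert(llvm::countr_one<uint32_t>(0x00FF00FFu) == 8);
}
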
277/// Returns the number of bits needed to represent Value if Value is nonzero.
278/// Returns 0 otherwise.
279///
280/// Ex. bit_width(5) == 3.
281template <typename T> [[nodiscard]] int bit_width(T Value) {
282 static_assert(std::is_unsigned_v<T>,
283 "Only unsigned integral types are allowed.");
284 return std::numeric_limits<T>::digits - llvm::countl_zero(Value);
285}
286
287/// Returns the largest integral power of two no greater than Value if Value is
288/// nonzero. Returns 0 otherwise.
289///
290/// Ex. bit_floor(5) == 4.
291template <typename T> [[nodiscard]] T bit_floor(T Value) {
292 static_assert(std::is_unsigned_v<T>,
293 "Only unsigned integral types are allowed.");
294 if (!Value)
295 return 0;
296 return T(1) << (llvm::bit_width(Value) - 1);
297}
298
299/// Returns the smallest integral power of two no smaller than Value if Value is
300/// nonzero. Returns 1 otherwise.
301///
302/// Ex. bit_ceil(5) == 8.
303///
304/// The return value is undefined if the input is larger than the largest power
305/// of two representable in T.
306template <typename T> [[nodiscard]] T bit_ceil(T Value) {
307 static_assert(std::is_unsigned_v<T>,
308 "Only unsigned integral types are allowed.");
309 if (Value < 2)
310 return 1;
311 return T(1) << llvm::bit_width<T>(Value - 1u);
312}
313
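And the rounding helpers, again just restating the examples in the comments above:

// Sketch: bit_width/bit_floor/bit_ceil on the documented value 5.
#include "llvm/ADT/bit.h"
#include <cassert>

void widthExamples() {
  assert(llvm::bit_width(5u) == 3);
  assert(llvm::bit_floor(5u) == 4u);
  assert(llvm::bit_ceil(5u) == 8u);
  assert(llvm::bit_floor(0u) == 0u && llvm::bit_ceil(0u) == 1u);
}
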
314namespace detail {
315template <typename T, std::size_t SizeOfT> struct PopulationCounter {
316 static int count(T Value) {
317 // Generic version, forward to 32 bits.
318 static_assert(SizeOfT <= 4, "Not implemented!");
319#if defined(__GNUC__)
320 return (int)__builtin_popcount(Value);
321#else
322 uint32_t v = Value;
323 v = v - ((v >> 1) & 0x55555555);
324 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
325 return int(((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24);
326#endif
327 }
328};
329
330template <typename T> struct PopulationCounter<T, 8> {
331 static int count(T Value) {
332#if defined(__GNUC__)
333 return (int)__builtin_popcountll(Value);
334#else
335 uint64_t v = Value;
336 v = v - ((v >> 1) & 0x5555555555555555ULL);
337 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
338 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
339 return int((uint64_t)(v * 0x0101010101010101ULL) >> 56);
340#endif
341 }
342};
343} // namespace detail
344
345/// Count the number of set bits in a value.
346/// Ex. popcount(0xF000F000) = 8
347/// Returns 0 if the word is zero.
348template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
349[[nodiscard]] inline int popcount(T Value) noexcept {
350 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
351}
352
353// Forward-declare rotr so that rotl can use it.
354template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
355[[nodiscard]] constexpr T rotr(T V, int R);
356
357template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
358[[nodiscard]] constexpr T rotl(T V, int R) {
359 unsigned N = std::numeric_limits<T>::digits;
360
361 R = R % N;
362 if (!R)
363 return V;
364
365 if (R < 0)
366 return llvm::rotr(V, -R);
367
368 return (V << R) | (V >> (N - R));
369}
370
371template <typename T, typename> [[nodiscard]] constexpr T rotr(T V, int R) {
372 unsigned N = std::numeric_limits<T>::digits;
373
374 R = R % N;
375 if (!R)
376 return V;
377
378 if (R < 0)
379 return llvm::rotl(V, -R);
380
381 return (V >> R) | (V << (N - R));
382}
383
384} // namespace llvm
385
386#endif
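
The rotate helpers take a signed count, and rotating by a negative count is equivalent to rotating the opposite way; a short sketch together with popcount (illustrative only):

// Sketch: rotation preserves the population count, and a negative count
// rotates in the opposite direction.
#include "llvm/ADT/bit.h"
#include <cassert>
#include <cstdint>

void rotateExamples() {
  uint32_t V = 0xF000F000u;
  assert(llvm::popcount(V) == 8);                 // from the comment above
  assert(llvm::rotl(V, 4) == 0x000F000Fu);
  assert(llvm::rotl(V, -4) == llvm::rotr(V, 4));
  assert(llvm::popcount(llvm::rotr(V, 13)) == 8); // rotation preserves set bits
}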