Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 4419, column 49
The result of the '/' expression is undefined
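
(For context: this message is typically produced by the analyzer's undefined-binary-operator-result checking, which fires when, on some path it explores, an operand of the division may hold an uninitialized value, making the quotient undefined. Line 4419 itself is beyond the excerpt reproduced below; the short C++ sketch that follows is purely illustrative, with hypothetical names that do not come from AMDGPULegalizerInfo.cpp, and only shows the kind of pattern that triggers this diagnostic.)

    // Illustrative only -- hypothetical helper, not code from this file.
    unsigned splitIntoPieces(unsigned Bits, unsigned Kind) {
      unsigned Divisor; // not assigned on every path
      switch (Kind) {
      case 0:
        Divisor = 32;
        break;
      case 1:
        Divisor = 64;
        break;
      }
      // If Kind is neither 0 nor 1, Divisor is still uninitialized here, and the
      // analyzer reports: "The result of the '/' expression is undefined".
      return Bits / Divisor;
    }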

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-06-13-111025-38230-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use "
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeNumElements(Pow2NElts);
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
61
62/// \returns true if this is an odd sized vector which should be widened by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99 };
100}
101
102static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 unsigned Size = Ty.getSizeInBits();
107 unsigned Pieces = (Size + 63) / 64;
108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110 };
111}
112
113// Increase the number of vector elements to reach the next multiple of 32-bit
114// type.
115static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116 return [=](const LegalityQuery &Query) {
117 const LLT Ty = Query.Types[TypeIdx];
118
119 const LLT EltTy = Ty.getElementType();
120 const int Size = Ty.getSizeInBits();
121 const int EltSize = EltTy.getSizeInBits();
122 const int NextMul32 = (Size + 31) / 32;
123
124 assert(EltSize < 32);
125
126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128 };
129}
130
131static LLT getBitcastRegisterType(const LLT Ty) {
132 const unsigned Size = Ty.getSizeInBits();
133
134 LLT CoercedTy;
135 if (Size <= 32) {
136 // <2 x s8> -> s16
137 // <4 x s8> -> s32
138 return LLT::scalar(Size);
139 }
140
141 return LLT::scalarOrVector(Size / 32, 32);
142}
143
144static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 const LLT Ty = Query.Types[TypeIdx];
147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148 };
149}
150
151static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152 return [=](const LegalityQuery &Query) {
153 const LLT Ty = Query.Types[TypeIdx];
154 unsigned Size = Ty.getSizeInBits();
155 assert(Size % 32 == 0);
156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157 };
158}
159
160static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161 return [=](const LegalityQuery &Query) {
162 const LLT QueryTy = Query.Types[TypeIdx];
163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164 };
165}
166
167static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168 return [=](const LegalityQuery &Query) {
169 const LLT QueryTy = Query.Types[TypeIdx];
170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171 };
172}
173
174static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175 return [=](const LegalityQuery &Query) {
176 const LLT QueryTy = Query.Types[TypeIdx];
177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178 };
179}
180
181static bool isRegisterSize(unsigned Size) {
182 return Size % 32 == 0 && Size <= MaxRegisterSize;
183}
184
185static bool isRegisterVectorElementType(LLT EltTy) {
186 const int EltSize = EltTy.getSizeInBits();
187 return EltSize == 16 || EltSize % 32 == 0;
188}
189
190static bool isRegisterVectorType(LLT Ty) {
191 const int EltSize = Ty.getElementType().getSizeInBits();
192 return EltSize == 32 || EltSize == 64 ||
193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194 EltSize == 128 || EltSize == 256;
195}
196
197static bool isRegisterType(LLT Ty) {
198 if (!isRegisterSize(Ty.getSizeInBits()))
199 return false;
200
201 if (Ty.isVector())
202 return isRegisterVectorType(Ty);
203
204 return true;
205}
206
207// Any combination of 32 or 64-bit elements up to the maximum register size, and
208// multiples of v2s16.
209static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210 return [=](const LegalityQuery &Query) {
211 return isRegisterType(Query.Types[TypeIdx]);
212 };
213}
214
215static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216 return [=](const LegalityQuery &Query) {
217 const LLT QueryTy = Query.Types[TypeIdx];
218 if (!QueryTy.isVector())
219 return false;
220 const LLT EltTy = QueryTy.getElementType();
221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222 };
223}
224
225// If we have a truncating store or an extending load with a data size larger
226// than 32-bits, we need to reduce to a 32-bit type.
227static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228 return [=](const LegalityQuery &Query) {
229 const LLT Ty = Query.Types[TypeIdx];
230 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232 };
233}
234
235// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236// handle some operations by just promoting the register during
237// selection. There are also d16 loads on GFX9+ which preserve the high bits.
238static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239 bool IsLoad) {
240 switch (AS) {
241 case AMDGPUAS::PRIVATE_ADDRESS:
242 // FIXME: Private element size.
243 return ST.enableFlatScratch() ? 128 : 32;
244 case AMDGPUAS::LOCAL_ADDRESS:
245 return ST.useDS128() ? 128 : 64;
246 case AMDGPUAS::GLOBAL_ADDRESS:
247 case AMDGPUAS::CONSTANT_ADDRESS:
248 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249 // Treat constant and global as identical. SMRD loads are sometimes usable for
250 // global loads (ideally constant address space should be eliminated)
251 // depending on the context. Legality cannot be context dependent, but
252 // RegBankSelect can split the load as necessary depending on the pointer
253 // register bank/uniformity and if the memory is invariant or not written in a
254 // kernel.
255 return IsLoad ? 512 : 128;
256 default:
257 // Flat addresses may contextually need to be split to 32-bit parts if they
258 // may alias scratch depending on the subtarget.
259 return 128;
260 }
261}
262
263static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264 const LegalityQuery &Query) {
265 const LLT Ty = Query.Types[0];
266
267 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269
270 unsigned RegSize = Ty.getSizeInBits();
271 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273 unsigned AS = Query.Types[1].getAddressSpace();
274
275 // All of these need to be custom lowered to cast the pointer operand.
276 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277 return false;
278
279 // Do not handle extending vector loads.
280 if (Ty.isVector() && MemSize != RegSize)
281 return false;
282
283 // TODO: We should be able to widen loads if the alignment is high enough, but
284 // we also need to modify the memory access size.
285#if 0
286 // Accept widening loads based on alignment.
287 if (IsLoad && MemSize < Size)
288 MemSize = std::max(MemSize, Align);
289#endif
290
291 // Only 1-byte and 2-byte to 32-bit extloads are valid.
292 if (MemSize != RegSize && RegSize != 32)
293 return false;
294
295 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296 return false;
297
298 switch (MemSize) {
299 case 8:
300 case 16:
301 case 32:
302 case 64:
303 case 128:
304 break;
305 case 96:
306 if (!ST.hasDwordx3LoadStores())
307 return false;
308 break;
309 case 256:
310 case 512:
311 // These may contextually need to be broken down.
312 break;
313 default:
314 return false;
315 }
316
317 assert(RegSize >= MemSize);
318
319 if (AlignBits < MemSize) {
320 const SITargetLowering *TLI = ST.getTargetLowering();
321 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322 Align(AlignBits / 8)))
323 return false;
324 }
325
326 return true;
327}
328
329// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
330// work around this. Eventually it should ignore the type for loads and only care
331// about the size. Return true in cases where we will work around this for now by
332// bitcasting.
333static bool loadStoreBitcastWorkaround(const LLT Ty) {
334 if (EnableNewLegality)
335 return false;
336
337 const unsigned Size = Ty.getSizeInBits();
338 if (Size <= 64)
339 return false;
340 if (!Ty.isVector())
341 return true;
342
343 LLT EltTy = Ty.getElementType();
344 if (EltTy.isPointer())
345 return true;
346
347 unsigned EltSize = EltTy.getSizeInBits();
348 return EltSize != 32 && EltSize != 64;
349}
350
351static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
352 const LLT Ty = Query.Types[0];
353 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354 !loadStoreBitcastWorkaround(Ty);
355}
356
357/// Return true if a load or store of the type should be lowered with a bitcast
358/// to a different type.
359static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360 const unsigned MemSizeInBits) {
361 const unsigned Size = Ty.getSizeInBits();
362 if (Size != MemSizeInBits)
363 return Size <= 32 && Ty.isVector();
364
365 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366 return true;
367 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368 !isRegisterVectorElementType(Ty.getElementType());
369}
370
371/// Return true if we should legalize a load by widening an odd sized memory
372/// access up to the alignment. Note that in this case the memory access itself
373/// changes, not the size of the result register.
374static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375 unsigned AlignInBits, unsigned AddrSpace,
376 unsigned Opcode) {
377 // We don't want to widen cases that are naturally legal.
378 if (isPowerOf2_32(SizeInBits))
379 return false;
380
381 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382 // end up widening these for a scalar load during RegBankSelect, since there
383 // aren't 96-bit scalar loads.
384 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385 return false;
386
387 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388 return false;
389
390 // A load is known dereferenceable up to the alignment, so it's legal to widen
391 // to it.
392 //
393 // TODO: Could check dereferenceable for less aligned cases.
394 unsigned RoundedSize = NextPowerOf2(SizeInBits);
395 if (AlignInBits < RoundedSize)
396 return false;
397
398 // Do not widen if it would introduce a slow unaligned load.
399 const SITargetLowering *TLI = ST.getTargetLowering();
400 bool Fast = false;
401 return TLI->allowsMisalignedMemoryAccessesImpl(
402 RoundedSize, AddrSpace, Align(AlignInBits / 8),
403 MachineMemOperand::MOLoad, &Fast) &&
404 Fast;
405}
406
407static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408 unsigned Opcode) {
409 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410 return false;
411
412 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413 Query.MMODescrs[0].AlignInBits,
414 Query.Types[1].getAddressSpace(), Opcode);
415}
416
417AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418 const GCNTargetMachine &TM)
419 : ST(ST_) {
420 using namespace TargetOpcode;
421
422 auto GetAddrSpacePtr = [&TM](unsigned AS) {
423 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424 };
425
426 const LLT S1 = LLT::scalar(1);
427 const LLT S8 = LLT::scalar(8);
428 const LLT S16 = LLT::scalar(16);
429 const LLT S32 = LLT::scalar(32);
430 const LLT S64 = LLT::scalar(64);
431 const LLT S128 = LLT::scalar(128);
432 const LLT S256 = LLT::scalar(256);
433 const LLT S512 = LLT::scalar(512);
434 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435
436 const LLT V2S8 = LLT::vector(2, 8);
437 const LLT V2S16 = LLT::vector(2, 16);
438 const LLT V4S16 = LLT::vector(4, 16);
439
440 const LLT V2S32 = LLT::vector(2, 32);
441 const LLT V3S32 = LLT::vector(3, 32);
442 const LLT V4S32 = LLT::vector(4, 32);
443 const LLT V5S32 = LLT::vector(5, 32);
444 const LLT V6S32 = LLT::vector(6, 32);
445 const LLT V7S32 = LLT::vector(7, 32);
446 const LLT V8S32 = LLT::vector(8, 32);
447 const LLT V9S32 = LLT::vector(9, 32);
448 const LLT V10S32 = LLT::vector(10, 32);
449 const LLT V11S32 = LLT::vector(11, 32);
450 const LLT V12S32 = LLT::vector(12, 32);
451 const LLT V13S32 = LLT::vector(13, 32);
452 const LLT V14S32 = LLT::vector(14, 32);
453 const LLT V15S32 = LLT::vector(15, 32);
454 const LLT V16S32 = LLT::vector(16, 32);
455 const LLT V32S32 = LLT::vector(32, 32);
456
457 const LLT V2S64 = LLT::vector(2, 64);
458 const LLT V3S64 = LLT::vector(3, 64);
459 const LLT V4S64 = LLT::vector(4, 64);
460 const LLT V5S64 = LLT::vector(5, 64);
461 const LLT V6S64 = LLT::vector(6, 64);
462 const LLT V7S64 = LLT::vector(7, 64);
463 const LLT V8S64 = LLT::vector(8, 64);
464 const LLT V16S64 = LLT::vector(16, 64);
465
466 std::initializer_list<LLT> AllS32Vectors =
467 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469 std::initializer_list<LLT> AllS64Vectors =
470 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471
472 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479
480 const LLT CodePtr = FlatPtr;
481
482 const std::initializer_list<LLT> AddrSpaces64 = {
483 GlobalPtr, ConstantPtr, FlatPtr
484 };
485
486 const std::initializer_list<LLT> AddrSpaces32 = {
487 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488 };
489
490 const std::initializer_list<LLT> FPTypesBase = {
491 S32, S64
492 };
493
494 const std::initializer_list<LLT> FPTypes16 = {
495 S32, S64, S16
496 };
497
498 const std::initializer_list<LLT> FPTypesPK16 = {
499 S32, S64, S16, V2S16
500 };
501
502 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503
504 auto &LegacyInfo = getLegacyLegalizerInfo();
505 LegacyInfo.setAction({G_BRCOND, S1},
506 LegacyLegalizeActions::Legal); // VCC branches
507 LegacyInfo.setAction({G_BRCOND, S32},
508 LegacyLegalizeActions::Legal); // SCC branches
509
510 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
511 // elements for v3s16
512 getActionDefinitionsBuilder(G_PHI)
513 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
514 .legalFor(AllS32Vectors)
515 .legalFor(AllS64Vectors)
516 .legalFor(AddrSpaces64)
517 .legalFor(AddrSpaces32)
518 .legalIf(isPointer(0))
519 .clampScalar(0, S16, S256)
520 .widenScalarToNextPow2(0, 32)
521 .clampMaxNumElements(0, S32, 16)
522 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
523 .scalarize(0);
524
525 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
526 // Full set of gfx9 features.
527 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
528 .legalFor({S32, S16, V2S16})
529 .clampScalar(0, S16, S32)
530 .clampMaxNumElements(0, S16, 2)
531 .scalarize(0)
532 .widenScalarToNextPow2(0, 32);
533
534 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
535 .legalFor({S32, S16, V2S16}) // Clamp modifier
536 .minScalarOrElt(0, S16)
537 .clampMaxNumElements(0, S16, 2)
538 .scalarize(0)
539 .widenScalarToNextPow2(0, 32)
540 .lower();
541 } else if (ST.has16BitInsts()) {
542 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
543 .legalFor({S32, S16})
544 .clampScalar(0, S16, S32)
545 .scalarize(0)
546 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
547
548 // Technically the saturating operations require clamp bit support, but this
549 // was introduced at the same time as 16-bit operations.
550 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
551 .legalFor({S32, S16}) // Clamp modifier
552 .minScalar(0, S16)
553 .scalarize(0)
554 .widenScalarToNextPow2(0, 16)
555 .lower();
556
557 // We're just lowering this, but it helps get a better result to try to
558 // coerce to the desired type first.
559 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
560 .minScalar(0, S16)
561 .scalarize(0)
562 .lower();
563 } else {
564 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
565 .legalFor({S32})
566 .clampScalar(0, S32, S32)
567 .scalarize(0);
568
569 if (ST.hasIntClamp()) {
570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571 .legalFor({S32}) // Clamp modifier.
572 .scalarize(0)
573 .minScalarOrElt(0, S32)
574 .lower();
575 } else {
576 // Clamp bit support was added in VI, along with 16-bit operations.
577 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578 .minScalar(0, S32)
579 .scalarize(0)
580 .lower();
581 }
582
583 // FIXME: DAG expansion gets better results. The widening uses the smaller
584 // range values and goes for the min/max lowering directly.
585 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
586 .minScalar(0, S32)
587 .scalarize(0)
588 .lower();
589 }
590
591 getActionDefinitionsBuilder(
592 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
593 .customFor({S32, S64})
594 .clampScalar(0, S32, S64)
595 .widenScalarToNextPow2(0, 32)
596 .scalarize(0);
597
598 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
599 .legalFor({S32})
600 .maxScalarOrElt(0, S32);
601
602 if (ST.hasVOP3PInsts()) {
603 Mulh
604 .clampMaxNumElements(0, S8, 2)
605 .lowerFor({V2S8});
606 }
607
608 Mulh
609 .scalarize(0)
610 .lower();
611
612 // Report legal for any types we can handle anywhere. For the cases only legal
613 // on the SALU, RegBankSelect will be able to re-legalize.
614 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
615 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
616 .clampScalar(0, S32, S64)
617 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
618 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
619 .widenScalarToNextPow2(0)
620 .scalarize(0);
621
622 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
623 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
624 .legalFor({{S32, S1}, {S32, S32}})
625 .minScalar(0, S32)
626 // TODO: .scalarize(0)
627 .lower();
628
629 getActionDefinitionsBuilder(G_BITCAST)
630 // Don't worry about the size constraint.
631 .legalIf(all(isRegisterType(0), isRegisterType(1)))
632 .lower();
633
634
635 getActionDefinitionsBuilder(G_CONSTANT)
636 .legalFor({S1, S32, S64, S16, GlobalPtr,
637 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
638 .legalIf(isPointer(0))
639 .clampScalar(0, S32, S64)
640 .widenScalarToNextPow2(0);
641
642 getActionDefinitionsBuilder(G_FCONSTANT)
643 .legalFor({S32, S64, S16})
644 .clampScalar(0, S16, S64);
645
646 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
647 .legalIf(isRegisterType(0))
648 // s1 and s16 are special cases because they have legal operations on
649 // them, but don't really occupy registers in the normal way.
650 .legalFor({S1, S16})
651 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
652 .clampScalarOrElt(0, S32, MaxScalar)
653 .widenScalarToNextPow2(0, 32)
654 .clampMaxNumElements(0, S32, 16);
655
656 LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
657 LegacyLegalizeActions::Legal);
658
659 // If the amount is divergent, we have to do a wave reduction to get the
660 // maximum value, so this is expanded during RegBankSelect.
661 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662 .legalFor({{PrivatePtr, S32}});
663
664 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665 .customIf(typeIsNot(0, PrivatePtr));
666
667 LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);
668
669 auto &FPOpActions = getActionDefinitionsBuilder(
670 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671 .legalFor({S32, S64});
672 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673 .customFor({S32, S64});
674 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675 .customFor({S32, S64});
676
677 if (ST.has16BitInsts()) {
678 if (ST.hasVOP3PInsts())
679 FPOpActions.legalFor({S16, V2S16});
680 else
681 FPOpActions.legalFor({S16});
682
683 TrigActions.customFor({S16});
684 FDIVActions.customFor({S16});
685 }
686
687 auto &MinNumMaxNum = getActionDefinitionsBuilder({
688 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689
690 if (ST.hasVOP3PInsts()) {
691 MinNumMaxNum.customFor(FPTypesPK16)
692 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693 .clampMaxNumElements(0, S16, 2)
694 .clampScalar(0, S16, S64)
695 .scalarize(0);
696 } else if (ST.has16BitInsts()) {
697 MinNumMaxNum.customFor(FPTypes16)
698 .clampScalar(0, S16, S64)
699 .scalarize(0);
700 } else {
701 MinNumMaxNum.customFor(FPTypesBase)
702 .clampScalar(0, S32, S64)
703 .scalarize(0);
704 }
705
706 if (ST.hasVOP3PInsts())
707 FPOpActions.clampMaxNumElements(0, S16, 2);
708
709 FPOpActions
710 .scalarize(0)
711 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712
713 TrigActions
714 .scalarize(0)
715 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716
717 FDIVActions
718 .scalarize(0)
719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720
721 getActionDefinitionsBuilder({G_FNEG, G_FABS})
722 .legalFor(FPTypesPK16)
723 .clampMaxNumElements(0, S16, 2)
724 .scalarize(0)
725 .clampScalar(0, S16, S64);
726
727 if (ST.has16BitInsts()) {
728 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729 .legalFor({S32, S64, S16})
730 .scalarize(0)
731 .clampScalar(0, S16, S64);
732 } else {
733 getActionDefinitionsBuilder(G_FSQRT)
734 .legalFor({S32, S64})
735 .scalarize(0)
736 .clampScalar(0, S32, S64);
737
738 if (ST.hasFractBug()) {
739 getActionDefinitionsBuilder(G_FFLOOR)
740 .customFor({S64})
741 .legalFor({S32, S64})
742 .scalarize(0)
743 .clampScalar(0, S32, S64);
744 } else {
745 getActionDefinitionsBuilder(G_FFLOOR)
746 .legalFor({S32, S64})
747 .scalarize(0)
748 .clampScalar(0, S32, S64);
749 }
750 }
751
752 getActionDefinitionsBuilder(G_FPTRUNC)
753 .legalFor({{S32, S64}, {S16, S32}})
754 .scalarize(0)
755 .lower();
756
757 getActionDefinitionsBuilder(G_FPEXT)
758 .legalFor({{S64, S32}, {S32, S16}})
759 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760 .scalarize(0);
761
762 getActionDefinitionsBuilder(G_FSUB)
763 // Use actual fsub instruction
764 .legalFor({S32})
765 // Must use fadd + fneg
766 .lowerFor({S64, S16, V2S16})
767 .scalarize(0)
768 .clampScalar(0, S32, S64);
769
770 // Whether this is legal depends on the floating point mode for the function.
771 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773 FMad.customFor({S32, S16});
774 else if (ST.hasMadMacF32Insts())
775 FMad.customFor({S32});
776 else if (ST.hasMadF16())
777 FMad.customFor({S16});
778 FMad.scalarize(0)
779 .lower();
780
781 auto &FRem = getActionDefinitionsBuilder(G_FREM);
782 if (ST.has16BitInsts()) {
783 FRem.customFor({S16, S32, S64});
784 } else {
785 FRem.minScalar(0, S32)
786 .customFor({S32, S64});
787 }
788 FRem.scalarize(0);
789
790 // TODO: Do we need to clamp maximum bitwidth?
791 getActionDefinitionsBuilder(G_TRUNC)
792 .legalIf(isScalar(0))
793 .legalFor({{V2S16, V2S32}})
794 .clampMaxNumElements(0, S16, 2)
795 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
796 // situations (like an invalid implicit use), we don't want to infinite loop
797 // in the legalizer.
798 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799 .alwaysLegal();
800
801 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803 {S32, S1}, {S64, S1}, {S16, S1}})
804 .scalarize(0)
805 .clampScalar(0, S32, S64)
806 .widenScalarToNextPow2(1, 32);
807
808 // TODO: Split s1->s64 during regbankselect for VALU.
809 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811 .lowerFor({{S32, S64}})
812 .lowerIf(typeIs(1, S1))
813 .customFor({{S64, S64}});
814 if (ST.has16BitInsts())
815 IToFP.legalFor({{S16, S16}});
816 IToFP.clampScalar(1, S32, S64)
817 .minScalar(0, S32)
818 .scalarize(0)
819 .widenScalarToNextPow2(1);
820
821 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823 .customFor({{S64, S64}})
824 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825 if (ST.has16BitInsts())
826 FPToI.legalFor({{S16, S16}});
827 else
828 FPToI.minScalar(1, S32);
829
830 FPToI.minScalar(0, S32)
831 .widenScalarToNextPow2(0, 32)
832 .scalarize(0)
833 .lower();
834
835 // Lower roundeven into G_FRINT
836 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837 .scalarize(0)
838 .lower();
839
840 if (ST.has16BitInsts()) {
841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842 .legalFor({S16, S32, S64})
843 .clampScalar(0, S16, S64)
844 .scalarize(0);
845 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847 .legalFor({S32, S64})
848 .clampScalar(0, S32, S64)
849 .scalarize(0);
850 } else {
851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852 .legalFor({S32})
853 .customFor({S64})
854 .clampScalar(0, S32, S64)
855 .scalarize(0);
856 }
857
858 getActionDefinitionsBuilder(G_PTR_ADD)
859 .legalIf(all(isPointer(0), sameSize(0, 1)))
860 .scalarize(0)
861 .scalarSameSizeAs(1, 0);
862
863 getActionDefinitionsBuilder(G_PTRMASK)
864 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865 .scalarSameSizeAs(1, 0)
866 .scalarize(0);
867
868 auto &CmpBuilder =
869 getActionDefinitionsBuilder(G_ICMP)
870 // The compare output type differs based on the register bank of the output,
871 // so make both s1 and s32 legal.
872 //
873 // Scalar compares producing output in scc will be promoted to s32, as that
874 // is the allocatable register type that will be needed for the copy from
875 // scc. This will be promoted during RegBankSelect, and we assume something
876 // before that won't try to use s32 result types.
877 //
878 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879 // bank.
880 .legalForCartesianProduct(
881 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882 .legalForCartesianProduct(
883 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884 if (ST.has16BitInsts()) {
885 CmpBuilder.legalFor({{S1, S16}});
886 }
887
888 CmpBuilder
889 .widenScalarToNextPow2(1)
890 .clampScalar(1, S32, S64)
891 .scalarize(0)
892 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893
894 getActionDefinitionsBuilder(G_FCMP)
895 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896 .widenScalarToNextPow2(1)
897 .clampScalar(1, S32, S64)
898 .scalarize(0);
899
900 // FIXME: fpow has a selection pattern that should move to custom lowering.
901 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902 if (ST.has16BitInsts())
903 Exp2Ops.legalFor({S32, S16});
904 else
905 Exp2Ops.legalFor({S32});
906 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907 Exp2Ops.scalarize(0);
908
909 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910 if (ST.has16BitInsts())
911 ExpOps.customFor({{S32}, {S16}});
912 else
913 ExpOps.customFor({S32});
914 ExpOps.clampScalar(0, MinScalarFPTy, S32)
915 .scalarize(0);
916
917 getActionDefinitionsBuilder(G_FPOWI)
918 .clampScalar(0, MinScalarFPTy, S32)
919 .lower();
920
921 // The 64-bit versions produce 32-bit results, but only on the SALU.
922 getActionDefinitionsBuilder(G_CTPOP)
923 .legalFor({{S32, S32}, {S32, S64}})
924 .clampScalar(0, S32, S32)
925 .clampScalar(1, S32, S64)
926 .scalarize(0)
927 .widenScalarToNextPow2(0, 32)
928 .widenScalarToNextPow2(1, 32);
929
930 // The hardware instructions return a different result on 0 than the generic
931 // instructions expect. The hardware produces -1, but these produce the
932 // bitwidth.
933 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934 .scalarize(0)
935 .clampScalar(0, S32, S32)
936 .clampScalar(1, S32, S64)
937 .widenScalarToNextPow2(0, 32)
938 .widenScalarToNextPow2(1, 32)
939 .lower();
940
941 // The 64-bit versions produce 32-bit results, but only on the SALU.
942 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943 .legalFor({{S32, S32}, {S32, S64}})
944 .clampScalar(0, S32, S32)
945 .clampScalar(1, S32, S64)
946 .scalarize(0)
947 .widenScalarToNextPow2(0, 32)
948 .widenScalarToNextPow2(1, 32);
949
950 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951 // RegBankSelect.
952 getActionDefinitionsBuilder(G_BITREVERSE)
953 .legalFor({S32, S64})
954 .clampScalar(0, S32, S64)
955 .scalarize(0)
956 .widenScalarToNextPow2(0);
957
958 if (ST.has16BitInsts()) {
959 getActionDefinitionsBuilder(G_BSWAP)
960 .legalFor({S16, S32, V2S16})
961 .clampMaxNumElements(0, S16, 2)
962 // FIXME: Fixing non-power-of-2 before clamp is workaround for
963 // narrowScalar limitation.
964 .widenScalarToNextPow2(0)
965 .clampScalar(0, S16, S32)
966 .scalarize(0);
967
968 if (ST.hasVOP3PInsts()) {
969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970 .legalFor({S32, S16, V2S16})
971 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972 .clampMaxNumElements(0, S16, 2)
973 .minScalar(0, S16)
974 .widenScalarToNextPow2(0)
975 .scalarize(0)
976 .lower();
977 } else {
978 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979 .legalFor({S32, S16})
980 .widenScalarToNextPow2(0)
981 .minScalar(0, S16)
982 .scalarize(0)
983 .lower();
984 }
985 } else {
986 // TODO: Should have same legality without v_perm_b32
987 getActionDefinitionsBuilder(G_BSWAP)
988 .legalFor({S32})
989 .lowerIf(scalarNarrowerThan(0, 32))
990 // FIXME: Fixing non-power-of-2 before clamp is workaround for
991 // narrowScalar limitation.
992 .widenScalarToNextPow2(0)
993 .maxScalar(0, S32)
994 .scalarize(0)
995 .lower();
996
997 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998 .legalFor({S32})
999 .minScalar(0, S32)
1000 .widenScalarToNextPow2(0)
1001 .scalarize(0)
1002 .lower();
1003 }
1004
1005 getActionDefinitionsBuilder(G_INTTOPTR)
1006 // List the common cases
1007 .legalForCartesianProduct(AddrSpaces64, {S64})
1008 .legalForCartesianProduct(AddrSpaces32, {S32})
1009 .scalarize(0)
1010 // Accept any address space as long as the size matches
1011 .legalIf(sameSize(0, 1))
1012 .widenScalarIf(smallerThan(1, 0),
1013 [](const LegalityQuery &Query) {
1014 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015 })
1016 .narrowScalarIf(largerThan(1, 0),
1017 [](const LegalityQuery &Query) {
1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019 });
1020
1021 getActionDefinitionsBuilder(G_PTRTOINT)
1022 // List the common cases
1023 .legalForCartesianProduct(AddrSpaces64, {S64})
1024 .legalForCartesianProduct(AddrSpaces32, {S32})
1025 .scalarize(0)
1026 // Accept any address space as long as the size matches
1027 .legalIf(sameSize(0, 1))
1028 .widenScalarIf(smallerThan(0, 1),
1029 [](const LegalityQuery &Query) {
1030 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031 })
1032 .narrowScalarIf(
1033 largerThan(0, 1),
1034 [](const LegalityQuery &Query) {
1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036 });
1037
1038 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039 .scalarize(0)
1040 .custom();
1041
1042 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043 bool IsLoad) -> bool {
1044 const LLT DstTy = Query.Types[0];
1045
1046 // Split vector extloads.
1047 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049
1050 if (MemSize < DstTy.getSizeInBits())
1051 MemSize = std::max(MemSize, AlignBits);
1052
1053 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054 return true;
1055
1056 const LLT PtrTy = Query.Types[1];
1057 unsigned AS = PtrTy.getAddressSpace();
1058 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059 return true;
1060
1061 // Catch weird sized loads that don't evenly divide into the access sizes
1062 // TODO: May be able to widen depending on alignment etc.
1063 unsigned NumRegs = (MemSize + 31) / 32;
1064 if (NumRegs == 3) {
1065 if (!ST.hasDwordx3LoadStores())
1066 return true;
1067 } else {
1068 // If the alignment allows, these should have been widened.
1069 if (!isPowerOf2_32(NumRegs))
1070 return true;
1071 }
1072
1073 if (AlignBits < MemSize) {
1074 const SITargetLowering *TLI = ST.getTargetLowering();
1075 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076 Align(AlignBits / 8));
1077 }
1078
1079 return false;
1080 };
1081
1082 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1085
1086 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087 // LDS
1088 // TODO: Unsupported flat for SI.
1089
1090 for (unsigned Op : {G_LOAD, G_STORE}) {
1091 const bool IsStore = Op == G_STORE;
1092
1093 auto &Actions = getActionDefinitionsBuilder(Op);
1094 // Explicitly list some common cases.
1095 // TODO: Does this help compile time at all?
1096 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097 {V2S32, GlobalPtr, 64, GlobalAlign32},
1098 {V4S32, GlobalPtr, 128, GlobalAlign32},
1099 {S64, GlobalPtr, 64, GlobalAlign32},
1100 {V2S64, GlobalPtr, 128, GlobalAlign32},
1101 {V2S16, GlobalPtr, 32, GlobalAlign32},
1102 {S32, GlobalPtr, 8, GlobalAlign8},
1103 {S32, GlobalPtr, 16, GlobalAlign16},
1104
1105 {S32, LocalPtr, 32, 32},
1106 {S64, LocalPtr, 64, 32},
1107 {V2S32, LocalPtr, 64, 32},
1108 {S32, LocalPtr, 8, 8},
1109 {S32, LocalPtr, 16, 16},
1110 {V2S16, LocalPtr, 32, 32},
1111
1112 {S32, PrivatePtr, 32, 32},
1113 {S32, PrivatePtr, 8, 8},
1114 {S32, PrivatePtr, 16, 16},
1115 {V2S16, PrivatePtr, 32, 32},
1116
1117 {S32, ConstantPtr, 32, GlobalAlign32},
1118 {V2S32, ConstantPtr, 64, GlobalAlign32},
1119 {V4S32, ConstantPtr, 128, GlobalAlign32},
1120 {S64, ConstantPtr, 64, GlobalAlign32},
1121 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122 Actions.legalIf(
1123 [=](const LegalityQuery &Query) -> bool {
1124 return isLoadStoreLegal(ST, Query);
1125 });
1126
1127 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1128 // 64-bits.
1129 //
1130 // TODO: Should generalize bitcast action into coerce, which will also cover
1131 // inserting addrspacecasts.
1132 Actions.customIf(typeIs(1, Constant32Ptr));
1133
1134 // Turn any illegal element vectors into something easier to deal
1135 // with. These will ultimately produce 32-bit scalar shifts to extract the
1136 // parts anyway.
1137 //
1138 // For odd 16-bit element vectors, prefer to split those into pieces with
1139 // 16-bit vector parts.
1140 Actions.bitcastIf(
1141 [=](const LegalityQuery &Query) -> bool {
1142 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143 Query.MMODescrs[0].SizeInBits);
1144 }, bitcastToRegisterType(0));
1145
1146 if (!IsStore) {
1147 // Widen suitably aligned loads by loading extra bytes. The standard
1148 // legalization actions can't properly express widening memory operands.
1149 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150 return shouldWidenLoad(ST, Query, G_LOAD);
1151 });
1152 }
1153
1154 // FIXME: load/store narrowing should be moved to lower action
1155 Actions
1156 .narrowScalarIf(
1157 [=](const LegalityQuery &Query) -> bool {
1158 return !Query.Types[0].isVector() &&
1159 needToSplitMemOp(Query, Op == G_LOAD);
1160 },
1161 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162 const LLT DstTy = Query.Types[0];
1163 const LLT PtrTy = Query.Types[1];
1164
1165 const unsigned DstSize = DstTy.getSizeInBits();
1166 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167
1168 // Split extloads.
1169 if (DstSize > MemSize)
1170 return std::make_pair(0, LLT::scalar(MemSize));
1171
1172 if (!isPowerOf2_32(DstSize)) {
1173 // We're probably decomposing an odd sized store. Try to split
1174 // to the widest type. TODO: Account for alignment. As-is it
1175 // should be OK, since the new parts will be further legalized.
1176 unsigned FloorSize = PowerOf2Floor(DstSize);
1177 return std::make_pair(0, LLT::scalar(FloorSize));
1178 }
1179
1180 if (DstSize > 32 && (DstSize % 32 != 0)) {
1181 // FIXME: Need a way to specify non-extload of larger size if
1182 // suitably aligned.
1183 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184 }
1185
1186 unsigned MaxSize = maxSizeForAddrSpace(ST,
1187 PtrTy.getAddressSpace(),
1188 Op == G_LOAD);
1189 if (MemSize > MaxSize)
1190 return std::make_pair(0, LLT::scalar(MaxSize));
1191
1192 unsigned Align = Query.MMODescrs[0].AlignInBits;
1193 return std::make_pair(0, LLT::scalar(Align));
1194 })
1195 .fewerElementsIf(
1196 [=](const LegalityQuery &Query) -> bool {
1197 return Query.Types[0].isVector() &&
1198 needToSplitMemOp(Query, Op == G_LOAD);
1199 },
1200 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201 const LLT DstTy = Query.Types[0];
1202 const LLT PtrTy = Query.Types[1];
1203
1204 LLT EltTy = DstTy.getElementType();
1205 unsigned MaxSize = maxSizeForAddrSpace(ST,
1206 PtrTy.getAddressSpace(),
1207 Op == G_LOAD);
1208
1209 // FIXME: Handle widened to power of 2 results better. This ends
1210 // up scalarizing.
1211 // FIXME: 3 element stores scalarized on SI
1212
1213 // Split if it's too large for the address space.
1214 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215 unsigned NumElts = DstTy.getNumElements();
1216 unsigned EltSize = EltTy.getSizeInBits();
1217
1218 if (MaxSize % EltSize == 0) {
1219 return std::make_pair(
1220 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1221 }
1222
1223 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1224
1225 // FIXME: Refine when odd breakdowns handled
1226 // The scalars will need to be re-legalized.
1227 if (NumPieces == 1 || NumPieces >= NumElts ||
1228 NumElts % NumPieces != 0)
1229 return std::make_pair(0, EltTy);
1230
1231 return std::make_pair(0,
1232 LLT::vector(NumElts / NumPieces, EltTy));
1233 }
1234
1235 // FIXME: We could probably handle weird extending loads better.
1236 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1237 if (DstTy.getSizeInBits() > MemSize)
1238 return std::make_pair(0, EltTy);
1239
1240 unsigned EltSize = EltTy.getSizeInBits();
1241 unsigned DstSize = DstTy.getSizeInBits();
1242 if (!isPowerOf2_32(DstSize)) {
1243 // We're probably decomposing an odd sized store. Try to split
1244 // to the widest type. TODO: Account for alignment. As-is it
1245 // should be OK, since the new parts will be further legalized.
1246 unsigned FloorSize = PowerOf2Floor(DstSize);
1247 return std::make_pair(
1248 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1249 }
1250
1251 // Need to split because of alignment.
1252 unsigned Align = Query.MMODescrs[0].AlignInBits;
1253 if (EltSize > Align &&
1254 (EltSize / Align < DstTy.getNumElements())) {
1255 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1256 }
1257
1258 // May need relegalization for the scalars.
1259 return std::make_pair(0, EltTy);
1260 })
1261 .lowerIfMemSizeNotPow2()
1262 .minScalar(0, S32)
1263 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1264 .widenScalarToNextPow2(0)
1265 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1266 .lower();
1267 }
1268
1269 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1270 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1271 {S32, GlobalPtr, 16, 2 * 8},
1272 {S32, LocalPtr, 8, 8},
1273 {S32, LocalPtr, 16, 16},
1274 {S32, PrivatePtr, 8, 8},
1275 {S32, PrivatePtr, 16, 16},
1276 {S32, ConstantPtr, 8, 8},
1277 {S32, ConstantPtr, 16, 2 * 8}})
1278 .legalIf(
1279 [=](const LegalityQuery &Query) -> bool {
1280 return isLoadStoreLegal(ST, Query);
1281 });
1282
1283 if (ST.hasFlatAddressSpace()) {
1284 ExtLoads.legalForTypesWithMemDesc(
1285 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1286 }
1287
1288 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1289 // 64-bits.
1290 //
1291 // TODO: Should generalize bitcast action into coerce, which will also cover
1292 // inserting addrspacecasts.
1293 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1294
1295 ExtLoads.clampScalar(0, S32, S32)
1296 .widenScalarToNextPow2(0)
1297 .unsupportedIfMemSizeNotPow2()
1298 .lower();
1299
1300 auto &Atomics = getActionDefinitionsBuilder(
1301 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1302 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1303 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1304 G_ATOMICRMW_UMIN})
1305 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1306 {S64, GlobalPtr}, {S64, LocalPtr},
1307 {S32, RegionPtr}, {S64, RegionPtr}});
1308 if (ST.hasFlatAddressSpace()) {
1309 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1310 }
1311
1312 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1313 if (ST.hasLDSFPAtomics()) {
1314 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1315 if (ST.hasGFX90AInsts())
1316 Atomic.legalFor({{S64, LocalPtr}});
1317 }
1318 if (ST.hasAtomicFaddInsts())
1319 Atomic.legalFor({{S32, GlobalPtr}});
1320
1321 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1322 // demarshalling
1323 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1324 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1325 {S32, FlatPtr}, {S64, FlatPtr}})
1326 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1327 {S32, RegionPtr}, {S64, RegionPtr}});
1328 // TODO: Pointer types, any 32-bit or 64-bit vector
1329
1330 // Condition should be s32 for scalar, s1 for vector.
1331 getActionDefinitionsBuilder(G_SELECT)
1332 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1333 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1334 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1335 .clampScalar(0, S16, S64)
1336 .scalarize(1)
1337 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1338 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1339 .clampMaxNumElements(0, S32, 2)
1340 .clampMaxNumElements(0, LocalPtr, 2)
1341 .clampMaxNumElements(0, PrivatePtr, 2)
1342 .scalarize(0)
1343 .widenScalarToNextPow2(0)
1344 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345
1346 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1347 // be more flexible with the shift amount type.
1348 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1349 .legalFor({{S32, S32}, {S64, S32}});
1350 if (ST.has16BitInsts()) {
1351 if (ST.hasVOP3PInsts()) {
1352 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1353 .clampMaxNumElements(0, S16, 2);
1354 } else
1355 Shifts.legalFor({{S16, S16}});
1356
1357 // TODO: Support 16-bit shift amounts for all types
1358 Shifts.widenScalarIf(
1359 [=](const LegalityQuery &Query) {
1360 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1361 // 32-bit amount.
1362 const LLT ValTy = Query.Types[0];
1363 const LLT AmountTy = Query.Types[1];
1364 return ValTy.getSizeInBits() <= 16 &&
1365 AmountTy.getSizeInBits() < 16;
1366 }, changeTo(1, S16));
1367 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1368 Shifts.clampScalar(1, S32, S32);
1369 Shifts.clampScalar(0, S16, S64);
1370 Shifts.widenScalarToNextPow2(0, 16);
1371
1372 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1373 .minScalar(0, S16)
1374 .scalarize(0)
1375 .lower();
1376 } else {
1377 // Make sure we legalize the shift amount type first, as the general
1378 // expansion for the shifted type will produce much worse code if it hasn't
1379 // been truncated already.
1380 Shifts.clampScalar(1, S32, S32);
1381 Shifts.clampScalar(0, S32, S64);
1382 Shifts.widenScalarToNextPow2(0, 32);
1383
1384 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1385 .minScalar(0, S32)
1386 .scalarize(0)
1387 .lower();
1388 }
1389 Shifts.scalarize(0);
1390
1391 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1392 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1393 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1394 unsigned IdxTypeIdx = 2;
1395
1396 getActionDefinitionsBuilder(Op)
1397 .customIf([=](const LegalityQuery &Query) {
1398 const LLT EltTy = Query.Types[EltTypeIdx];
1399 const LLT VecTy = Query.Types[VecTypeIdx];
1400 const LLT IdxTy = Query.Types[IdxTypeIdx];
1401 const unsigned EltSize = EltTy.getSizeInBits();
1402 return (EltSize == 32 || EltSize == 64) &&
1403 VecTy.getSizeInBits() % 32 == 0 &&
1404 VecTy.getSizeInBits() <= MaxRegisterSize &&
1405 IdxTy.getSizeInBits() == 32;
1406 })
1407 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1408 bitcastToVectorElement32(VecTypeIdx))
1409 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1410 .bitcastIf(
1411 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1412 [=](const LegalityQuery &Query) {
1413 // For > 64-bit element types, try to turn this into a 64-bit
1414 // element vector since we may be able to do better indexing
1415 // if this is scalar. If not, fall back to 32.
1416 const LLT EltTy = Query.Types[EltTypeIdx];
1417 const LLT VecTy = Query.Types[VecTypeIdx];
1418 const unsigned DstEltSize = EltTy.getSizeInBits();
1419 const unsigned VecSize = VecTy.getSizeInBits();
1420
1421 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1422 return std::make_pair(
1423 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1424 })
1425 .clampScalar(EltTypeIdx, S32, S64)
1426 .clampScalar(VecTypeIdx, S32, S64)
1427 .clampScalar(IdxTypeIdx, S32, S32)
1428 .clampMaxNumElements(VecTypeIdx, S32, 32)
1429 // TODO: Clamp elements for 64-bit vectors?
1430 // It should only be necessary with variable indexes.
1431 // As a last resort, lower to the stack
1432 .lower();
1433 }
1434
1435 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436 .unsupportedIf([=](const LegalityQuery &Query) {
1437 const LLT &EltTy = Query.Types[1].getElementType();
1438 return Query.Types[0] != EltTy;
1439 });
1440
1441 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444
1445 // FIXME: Doesn't handle extract of illegal sizes.
1446 getActionDefinitionsBuilder(Op)
1447 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448 // FIXME: Multiples of 16 should not be legal.
1449 .legalIf([=](const LegalityQuery &Query) {
1450 const LLT BigTy = Query.Types[BigTyIdx];
1451 const LLT LitTy = Query.Types[LitTyIdx];
1452 return (BigTy.getSizeInBits() % 32 == 0) &&
1453 (LitTy.getSizeInBits() % 16 == 0);
1454 })
1455 .widenScalarIf(
1456 [=](const LegalityQuery &Query) {
1457 const LLT BigTy = Query.Types[BigTyIdx];
1458 return (BigTy.getScalarSizeInBits() < 16);
1459 },
1460 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461 .widenScalarIf(
1462 [=](const LegalityQuery &Query) {
1463 const LLT LitTy = Query.Types[LitTyIdx];
1464 return (LitTy.getScalarSizeInBits() < 16);
1465 },
1466 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468 .widenScalarToNextPow2(BigTyIdx, 32);
1469
1470 }
1471
1472 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473 .legalForCartesianProduct(AllS32Vectors, {S32})
1474 .legalForCartesianProduct(AllS64Vectors, {S64})
1475 .clampNumElements(0, V16S32, V32S32)
1476 .clampNumElements(0, V2S64, V16S64)
1477 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478
1479 if (ST.hasScalarPackInsts()) {
1480 BuildVector
1481 // FIXME: Should probably widen s1 vectors straight to s32
1482 .minScalarOrElt(0, S16)
1483 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484 .minScalar(1, S32);
1485
1486 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487 .legalFor({V2S16, S32})
1488 .lower();
1489 BuildVector.minScalarOrElt(0, S32);
1490 } else {
1491 BuildVector.customFor({V2S16, S16});
1492 BuildVector.minScalarOrElt(0, S32);
1493
1494 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495 .customFor({V2S16, S32})
1496 .lower();
1497 }
1498
1499 BuildVector.legalIf(isRegisterType(0));
1500
1501 // FIXME: Clamp maximum size
1502 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504 .clampMaxNumElements(0, S32, 32)
1505 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506 .clampMaxNumElements(0, S16, 64);
1507
1508 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509 // pre-legalize.
1510 if (ST.hasVOP3PInsts()) {
1511 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512 .customFor({V2S16, V2S16})
1513 .lower();
1514 } else
1515 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516
1517 // Merge/Unmerge
1518 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521
1522 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523 const LLT Ty = Query.Types[TypeIdx];
1524 if (Ty.isVector()) {
1525 const LLT &EltTy = Ty.getElementType();
1526 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527 return true;
1528 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529 return true;
1530 }
1531 return false;
1532 };
1533
1534 auto &Builder = getActionDefinitionsBuilder(Op)
1535 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536 .lowerFor({{S16, V2S16}})
1537 .lowerIf([=](const LegalityQuery &Query) {
1538 const LLT BigTy = Query.Types[BigTyIdx];
1539 return BigTy.getSizeInBits() == 32;
1540 })
1541 // Try to widen to s16 first for small types.
1542 // TODO: Only do this on targets with legal s16 shifts
1543 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547 elementTypeIs(1, S16)),
1548 changeTo(1, V2S16))
1549 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1550 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1551 // valid.
1552 .clampScalar(LitTyIdx, S32, S512)
1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554 // Break up vectors with weird elements into scalars
1555 .fewerElementsIf(
1556 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557 scalarize(0))
1558 .fewerElementsIf(
1559 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560 scalarize(1))
1561 .clampScalar(BigTyIdx, S32, MaxScalar);
1562
1563 if (Op == G_MERGE_VALUES) {
1564 Builder.widenScalarIf(
1565 // TODO: Use 16-bit shifts if legal for 8-bit values?
1566 [=](const LegalityQuery &Query) {
1567 const LLT Ty = Query.Types[LitTyIdx];
1568 return Ty.getSizeInBits() < 32;
1569 },
1570 changeTo(LitTyIdx, S32));
1571 }
1572
1573 Builder.widenScalarIf(
1574 [=](const LegalityQuery &Query) {
1575 const LLT Ty = Query.Types[BigTyIdx];
1576 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577 Ty.getSizeInBits() % 16 != 0;
1578 },
1579 [=](const LegalityQuery &Query) {
1580 // Pick the next power of 2, or a multiple of 64 over 128.
1581 // Whichever is smaller.
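// (Illustration: a 65-bit big type widens to 128 bits, while a 257-bit one
// widens to 320 bits, since alignTo<64>(258) = 320 is smaller than the next
// power of 2, 512.)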
1582 const LLT &Ty = Query.Types[BigTyIdx];
1583 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584 if (NewSizeInBits >= 256) {
1585 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586 if (RoundedTo < NewSizeInBits)
1587 NewSizeInBits = RoundedTo;
1588 }
1589 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590 })
1591 // Any vectors left are the wrong size. Scalarize them.
1592 .scalarize(0)
1593 .scalarize(1);
1594 }
1595
1596 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597 // RegBankSelect.
1598 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599 .legalFor({{S32}, {S64}});
1600
1601 if (ST.hasVOP3PInsts()) {
1602 SextInReg.lowerFor({{V2S16}})
1603 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1604 // get more vector shift opportunities, since we'll get those when
1605 // expanded.
1606 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1607 } else if (ST.has16BitInsts()) {
1608 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609 } else {
1610 // Prefer to promote to s32 before lowering if we don't have 16-bit
1611 // shifts. This avoids a lot of intermediate truncate and extend operations.
1612 SextInReg.lowerFor({{S32}, {S64}});
1613 }
1614
1615 SextInReg
1616 .scalarize(0)
1617 .clampScalar(0, S32, S64)
1618 .lower();
1619
1620 // TODO: Only try to form v2s16 with legal packed instructions.
1621 getActionDefinitionsBuilder(G_FSHR)
1622 .legalFor({{S32, S32}})
1623 .lowerFor({{V2S16, V2S16}})
1624 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1625 .scalarize(0)
1626 .lower();
1627
1628 if (ST.hasVOP3PInsts()) {
1629 getActionDefinitionsBuilder(G_FSHL)
1630 .lowerFor({{V2S16, V2S16}})
1631 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1632 .scalarize(0)
1633 .lower();
1634 } else {
1635 getActionDefinitionsBuilder(G_FSHL)
1636 .scalarize(0)
1637 .lower();
1638 }
1639
1640 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1641 .legalFor({S64});
1642
1643 getActionDefinitionsBuilder(G_FENCE)
1644 .alwaysLegal();
1645
1646 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1647 .scalarize(0)
1648 .minScalar(0, S32)
1649 .lower();
1650
1651 getActionDefinitionsBuilder({
1652 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1653 G_FCOPYSIGN,
1654
1655 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1656 G_ATOMICRMW_NAND,
1657 G_ATOMICRMW_FSUB,
1658 G_READ_REGISTER,
1659 G_WRITE_REGISTER,
1660
1661 G_SADDO, G_SSUBO,
1662
1663 // TODO: Implement
1664 G_FMINIMUM, G_FMAXIMUM}).lower();
1665
1666 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1667 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1668 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1669 .unsupported();
1670
1671 getLegacyLegalizerInfo().computeTables();
1672 verify(*ST.getInstrInfo());
1673}
1674
1675bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1676 MachineInstr &MI) const {
1677 MachineIRBuilder &B = Helper.MIRBuilder;
1678 MachineRegisterInfo &MRI = *B.getMRI();
1679
1680 switch (MI.getOpcode()) {
1681 case TargetOpcode::G_ADDRSPACE_CAST:
1682 return legalizeAddrSpaceCast(MI, MRI, B);
1683 case TargetOpcode::G_FRINT:
1684 return legalizeFrint(MI, MRI, B);
1685 case TargetOpcode::G_FCEIL:
1686 return legalizeFceil(MI, MRI, B);
1687 case TargetOpcode::G_FREM:
1688 return legalizeFrem(MI, MRI, B);
1689 case TargetOpcode::G_INTRINSIC_TRUNC:
1690 return legalizeIntrinsicTrunc(MI, MRI, B);
1691 case TargetOpcode::G_SITOFP:
1692 return legalizeITOFP(MI, MRI, B, true);
1693 case TargetOpcode::G_UITOFP:
1694 return legalizeITOFP(MI, MRI, B, false);
1695 case TargetOpcode::G_FPTOSI:
1696 return legalizeFPTOI(MI, MRI, B, true);
1697 case TargetOpcode::G_FPTOUI:
1698 return legalizeFPTOI(MI, MRI, B, false);
1699 case TargetOpcode::G_FMINNUM:
1700 case TargetOpcode::G_FMAXNUM:
1701 case TargetOpcode::G_FMINNUM_IEEE:
1702 case TargetOpcode::G_FMAXNUM_IEEE:
1703 return legalizeMinNumMaxNum(Helper, MI);
1704 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1705 return legalizeExtractVectorElt(MI, MRI, B);
1706 case TargetOpcode::G_INSERT_VECTOR_ELT:
1707 return legalizeInsertVectorElt(MI, MRI, B);
1708 case TargetOpcode::G_SHUFFLE_VECTOR:
1709 return legalizeShuffleVector(MI, MRI, B);
1710 case TargetOpcode::G_FSIN:
1711 case TargetOpcode::G_FCOS:
1712 return legalizeSinCos(MI, MRI, B);
1713 case TargetOpcode::G_GLOBAL_VALUE:
1714 return legalizeGlobalValue(MI, MRI, B);
1715 case TargetOpcode::G_LOAD:
1716 case TargetOpcode::G_SEXTLOAD:
1717 case TargetOpcode::G_ZEXTLOAD:
1718 return legalizeLoad(Helper, MI);
1719 case TargetOpcode::G_FMAD:
1720 return legalizeFMad(MI, MRI, B);
1721 case TargetOpcode::G_FDIV:
1722 return legalizeFDIV(MI, MRI, B);
1723 case TargetOpcode::G_UDIV:
1724 case TargetOpcode::G_UREM:
1725 case TargetOpcode::G_UDIVREM:
1726 return legalizeUnsignedDIV_REM(MI, MRI, B);
1727 case TargetOpcode::G_SDIV:
1728 case TargetOpcode::G_SREM:
1729 case TargetOpcode::G_SDIVREM:
1730 return legalizeSignedDIV_REM(MI, MRI, B);
1731 case TargetOpcode::G_ATOMIC_CMPXCHG:
1732 return legalizeAtomicCmpXChg(MI, MRI, B);
1733 case TargetOpcode::G_FLOG:
1734 return legalizeFlog(MI, B, numbers::ln2f);
1735 case TargetOpcode::G_FLOG10:
1736 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1737 case TargetOpcode::G_FEXP:
1738 return legalizeFExp(MI, B);
1739 case TargetOpcode::G_FPOW:
1740 return legalizeFPow(MI, B);
1741 case TargetOpcode::G_FFLOOR:
1742 return legalizeFFloor(MI, MRI, B);
1743 case TargetOpcode::G_BUILD_VECTOR:
1744 return legalizeBuildVector(MI, MRI, B);
1745 default:
1746 return false;
1747 }
1748
1749 llvm_unreachable("expected switch to return");
1750}
1751
1752Register AMDGPULegalizerInfo::getSegmentAperture(
1753 unsigned AS,
1754 MachineRegisterInfo &MRI,
1755 MachineIRBuilder &B) const {
1756 MachineFunction &MF = B.getMF();
1757 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1758 const LLT S32 = LLT::scalar(32);
1759
1760 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1761
1762 if (ST.hasApertureRegs()) {
1763 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1764 // getreg.
1765 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1766 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1767 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1768 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1769 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1770 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1771 unsigned Encoding =
1772 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1773 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1774 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1775
1776 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1777
1778 B.buildInstr(AMDGPU::S_GETREG_B32)
1779 .addDef(GetReg)
1780 .addImm(Encoding);
1781 MRI.setType(GetReg, S32);
1782
1783 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1784 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1785 }
1786
1787 Register QueuePtr = MRI.createGenericVirtualRegister(
1788 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1789
1790 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1791 return Register();
1792
1793 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1794 // private_segment_aperture_base_hi.
1795 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1796
1797 // TODO: can we be smarter about machine pointer info?
1798 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1799 MachineMemOperand *MMO = MF.getMachineMemOperand(
1800 PtrInfo,
1801 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1802 MachineMemOperand::MOInvariant,
1803 4, commonAlignment(Align(64), StructOffset));
1804
1805 Register LoadAddr;
1806
1807 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1808 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1809}
1810
1811bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1812 MachineInstr &MI, MachineRegisterInfo &MRI,
1813 MachineIRBuilder &B) const {
1814 MachineFunction &MF = B.getMF();
1815
1816 const LLT S32 = LLT::scalar(32);
1817 Register Dst = MI.getOperand(0).getReg();
1818 Register Src = MI.getOperand(1).getReg();
1819
1820 LLT DstTy = MRI.getType(Dst);
1821 LLT SrcTy = MRI.getType(Src);
1822 unsigned DestAS = DstTy.getAddressSpace();
1823 unsigned SrcAS = SrcTy.getAddressSpace();
1824
1825 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1826 // vector element.
1827 assert(!DstTy.isVector());
1828
1829 const AMDGPUTargetMachine &TM
1830 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1831
1832 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1833 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1834 return true;
1835 }
1836
1837 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1838 // Truncate.
1839 B.buildExtract(Dst, Src, 0);
1840 MI.eraseFromParent();
1841 return true;
1842 }
1843
1844 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1845 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1846 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1847
1848 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1849 // another. Merge operands are required to be the same type, but creating an
1850 // extra ptrtoint would be kind of pointless.
1851 auto HighAddr = B.buildConstant(
1852 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1853 B.buildMerge(Dst, {Src, HighAddr});
1854 MI.eraseFromParent();
1855 return true;
1856 }
1857
1858 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1859 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1860 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1861 unsigned NullVal = TM.getNullPointerValue(DestAS);
1862
1863 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1864 auto FlatNull = B.buildConstant(SrcTy, 0);
1865
1866 // Extract low 32-bits of the pointer.
1867 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1868
1869 auto CmpRes =
1870 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1871 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1872
1873 MI.eraseFromParent();
1874 return true;
1875 }
1876
1877 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1878 return false;
1879
1880 if (!ST.hasFlatAddressSpace())
1881 return false;
1882
1883 auto SegmentNull =
1884 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1885 auto FlatNull =
1886 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1887
1888 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1889 if (!ApertureReg.isValid())
1890 return false;
1891
1892 auto CmpRes =
1893 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1894
1895 // Coerce the type of the low half of the result so we can use merge_values.
1896 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1897
1898 // TODO: Should we allow mismatched types but matching sizes in merges to
1899 // avoid the ptrtoint?
1900 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1901 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1902
1903 MI.eraseFromParent();
1904 return true;
1905}
1906
1907bool AMDGPULegalizerInfo::legalizeFrint(
1908 MachineInstr &MI, MachineRegisterInfo &MRI,
1909 MachineIRBuilder &B) const {
1910 Register Src = MI.getOperand(1).getReg();
1911 LLT Ty = MRI.getType(Src);
1912 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1913
1914 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1915 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
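// C1 is 2^52: adding copysign(2^52, src) and then subtracting it again leaves
// src rounded to an integer, since doubles of that magnitude carry no fraction
// bits. C2 sits just below 2^52; sources larger in magnitude are already
// integral and are passed through unchanged by the final select.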
1916
1917 auto C1 = B.buildFConstant(Ty, C1Val);
1918 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1919
1920 // TODO: Should this propagate fast-math-flags?
1921 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1922 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1923
1924 auto C2 = B.buildFConstant(Ty, C2Val);
1925 auto Fabs = B.buildFAbs(Ty, Src);
1926
1927 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1928 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1929 MI.eraseFromParent();
1930 return true;
1931}
1932
1933bool AMDGPULegalizerInfo::legalizeFceil(
1934 MachineInstr &MI, MachineRegisterInfo &MRI,
1935 MachineIRBuilder &B) const {
1936
1937 const LLT S1 = LLT::scalar(1);
1938 const LLT S64 = LLT::scalar(64);
1939
1940 Register Src = MI.getOperand(1).getReg();
1941 assert(MRI.getType(Src) == S64);
1942
1943 // result = trunc(src)
1944 // if (src > 0.0 && src != result)
1945 // result += 1.0
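// For example, ceil(2.3): trunc = 2.0, and 2.3 > 0.0 with 2.3 != 2.0, so the
// select adds 1.0, giving 3.0. For ceil(-1.5): trunc = -1.0 and the source is
// not positive, so 0.0 is added and -1.0 is returned.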
1946
1947 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1948
1949 const auto Zero = B.buildFConstant(S64, 0.0);
1950 const auto One = B.buildFConstant(S64, 1.0);
1951 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1952 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1953 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1954 auto Add = B.buildSelect(S64, And, One, Zero);
1955
1956 // TODO: Should this propagate fast-math-flags?
1957 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1958 return true;
1959}
1960
1961bool AMDGPULegalizerInfo::legalizeFrem(
1962 MachineInstr &MI, MachineRegisterInfo &MRI,
1963 MachineIRBuilder &B) const {
1964 Register DstReg = MI.getOperand(0).getReg();
1965 Register Src0Reg = MI.getOperand(1).getReg();
1966 Register Src1Reg = MI.getOperand(2).getReg();
1967 auto Flags = MI.getFlags();
1968 LLT Ty = MRI.getType(DstReg);
1969
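// frem(x, y) = x - trunc(x / y) * y, formed below as fma(-trunc(x / y), y, x).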
1970 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1971 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1972 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1973 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1974 MI.eraseFromParent();
1975 return true;
1976}
1977
1978static MachineInstrBuilder extractF64Exponent(Register Hi,
1979 MachineIRBuilder &B) {
1980 const unsigned FractBits = 52;
1981 const unsigned ExpBits = 11;
1982 LLT S32 = LLT::scalar(32);
1983
1984 auto Const0 = B.buildConstant(S32, FractBits - 32);
1985 auto Const1 = B.buildConstant(S32, ExpBits);
1986
1987 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1988 .addUse(Hi)
1989 .addUse(Const0.getReg(0))
1990 .addUse(Const1.getReg(0));
1991
1992 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1993}
1994
1995bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1996 MachineInstr &MI, MachineRegisterInfo &MRI,
1997 MachineIRBuilder &B) const {
1998 const LLT S1 = LLT::scalar(1);
1999 const LLT S32 = LLT::scalar(32);
2000 const LLT S64 = LLT::scalar(64);
2001
2002 Register Src = MI.getOperand(1).getReg();
2003 assert(MRI.getType(Src) == S64);
2004
2005 // TODO: Should this use extract since the low half is unused?
2006 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2007 Register Hi = Unmerge.getReg(1);
2008
2009 // Extract the upper half, since this is where we will find the sign and
2010 // exponent.
2011 auto Exp = extractF64Exponent(Hi, B);
2012
2013 const unsigned FractBits = 52;
2014
2015 // Extract the sign bit.
2016 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2017 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2018
2019 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2020
2021 const auto Zero32 = B.buildConstant(S32, 0);
2022
2023 // Extend back to 64-bits.
2024 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2025
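// FractMask >> Exp leaves set only the mantissa bits below the binary point
// for this exponent; and'ing the source with the complement clears them,
// truncating toward zero for exponents in [0, 51]. Negative exponents produce
// a signed zero, and exponents above 51 return the already-integral source.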
2026 auto Shr = B.buildAShr(S64, FractMask, Exp);
2027 auto Not = B.buildNot(S64, Shr);
2028 auto Tmp0 = B.buildAnd(S64, Src, Not);
2029 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2030
2031 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2032 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2033
2034 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2035 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2036 MI.eraseFromParent();
2037 return true;
2038}
2039
2040bool AMDGPULegalizerInfo::legalizeITOFP(
2041 MachineInstr &MI, MachineRegisterInfo &MRI,
2042 MachineIRBuilder &B, bool Signed) const {
2043
2044 Register Dst = MI.getOperand(0).getReg();
2045 Register Src = MI.getOperand(1).getReg();
2046
2047 const LLT S64 = LLT::scalar(64);
2048 const LLT S32 = LLT::scalar(32);
2049
2050 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2051
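// Convert each 32-bit half separately and recombine:
// result = ldexp(convert(hi), 32) + uitofp(lo), with hi converted signed or
// unsigned depending on the opcode.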
2052 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2053
2054 auto CvtHi = Signed ?
2055 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2056 B.buildUITOFP(S64, Unmerge.getReg(1));
2057
2058 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2059
2060 auto ThirtyTwo = B.buildConstant(S32, 32);
2061 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2062 .addUse(CvtHi.getReg(0))
2063 .addUse(ThirtyTwo.getReg(0));
2064
2065 // TODO: Should this propagate fast-math-flags?
2066 B.buildFAdd(Dst, LdExp, CvtLo);
2067 MI.eraseFromParent();
2068 return true;
2069}
2070
2071// TODO: Copied from DAG implementation. Verify logic and document how this
2072// actually works.
2073bool AMDGPULegalizerInfo::legalizeFPTOI(
2074 MachineInstr &MI, MachineRegisterInfo &MRI,
2075 MachineIRBuilder &B, bool Signed) const {
2076
2077 Register Dst = MI.getOperand(0).getReg();
2078 Register Src = MI.getOperand(1).getReg();
2079
2080 const LLT S64 = LLT::scalar(64);
2081 const LLT S32 = LLT::scalar(32);
2082
2083 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2084
2085 unsigned Flags = MI.getFlags();
2086
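// K0 is 2^-32 and K1 is -(2^32). FloorMul = floor(trunc * 2^-32) yields the
// high 32 bits of the result, and fma(FloorMul, -(2^32), trunc) recovers the
// low 32 bits, which are then converted and merged.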
2087 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2088 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2089 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2090
2091 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2092 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2093 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2094
2095 auto Hi = Signed ?
2096 B.buildFPTOSI(S32, FloorMul) :
2097 B.buildFPTOUI(S32, FloorMul);
2098 auto Lo = B.buildFPTOUI(S32, Fma);
2099
2100 B.buildMerge(Dst, { Lo, Hi });
2101 MI.eraseFromParent();
2102
2103 return true;
2104}
2105
2106bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2107 MachineInstr &MI) const {
2108 MachineFunction &MF = Helper.MIRBuilder.getMF();
2109 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2110
2111 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2112 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2113
2114 // With ieee_mode disabled, the instructions have the correct behavior
2115 // already for G_FMINNUM/G_FMAXNUM
2116 if (!MFI->getMode().IEEE)
2117 return !IsIEEEOp;
2118
2119 if (IsIEEEOp)
2120 return true;
2121
2122 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2123}
2124
2125bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2126 MachineInstr &MI, MachineRegisterInfo &MRI,
2127 MachineIRBuilder &B) const {
2128 // TODO: Should move some of this into LegalizerHelper.
2129
2130 // TODO: Promote dynamic indexing of s16 to s32
2131
2132 // FIXME: Artifact combiner probably should have replaced the truncated
2133 // constant before this, so we shouldn't need
2134 // getConstantVRegValWithLookThrough.
2135 Optional<ValueAndVReg> MaybeIdxVal =
2136 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2137 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2138 return true;
2139 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2140
2141 Register Dst = MI.getOperand(0).getReg();
2142 Register Vec = MI.getOperand(1).getReg();
2143
2144 LLT VecTy = MRI.getType(Vec);
2145 LLT EltTy = VecTy.getElementType();
2146 assert(EltTy == MRI.getType(Dst));
2147
2148 if (IdxVal < VecTy.getNumElements())
2149 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2150 else
2151 B.buildUndef(Dst);
2152
2153 MI.eraseFromParent();
2154 return true;
2155}
2156
2157bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2158 MachineInstr &MI, MachineRegisterInfo &MRI,
2159 MachineIRBuilder &B) const {
2160 // TODO: Should move some of this into LegalizerHelper.
2161
2162 // TODO: Promote dynamic indexing of s16 to s32
2163
2164 // FIXME: Artifact combiner probably should have replaced the truncated
2165 // constant before this, so we shouldn't need
2166 // getConstantVRegValWithLookThrough.
2167 Optional<ValueAndVReg> MaybeIdxVal =
2168 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2169 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2170 return true;
2171
2172 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2173 Register Dst = MI.getOperand(0).getReg();
2174 Register Vec = MI.getOperand(1).getReg();
2175 Register Ins = MI.getOperand(2).getReg();
2176
2177 LLT VecTy = MRI.getType(Vec);
2178 LLT EltTy = VecTy.getElementType();
2179 assert(EltTy == MRI.getType(Ins));
2180
2181 if (IdxVal < VecTy.getNumElements())
2182 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2183 else
2184 B.buildUndef(Dst);
2185
2186 MI.eraseFromParent();
2187 return true;
2188}
2189
2190bool AMDGPULegalizerInfo::legalizeShuffleVector(
2191 MachineInstr &MI, MachineRegisterInfo &MRI,
2192 MachineIRBuilder &B) const {
2193 const LLT V2S16 = LLT::vector(2, 16);
2194
2195 Register Dst = MI.getOperand(0).getReg();
2196 Register Src0 = MI.getOperand(1).getReg();
2197 LLT DstTy = MRI.getType(Dst);
2198 LLT SrcTy = MRI.getType(Src0);
2199
2200 if (SrcTy == V2S16 && DstTy == V2S16 &&
2201 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2202 return true;
2203
2204 MachineIRBuilder HelperBuilder(MI);
2205 GISelObserverWrapper DummyObserver;
2206 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2207 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2208}
2209
2210bool AMDGPULegalizerInfo::legalizeSinCos(
2211 MachineInstr &MI, MachineRegisterInfo &MRI,
2212 MachineIRBuilder &B) const {
2213
2214 Register DstReg = MI.getOperand(0).getReg();
2215 Register SrcReg = MI.getOperand(1).getReg();
2216 LLT Ty = MRI.getType(DstReg);
2217 unsigned Flags = MI.getFlags();
2218
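// The amdgcn sin/cos intrinsics take their input scaled by 1/(2*pi); on
// subtargets with a reduced trig range, the scaled value is additionally
// wrapped into [0, 1) via amdgcn_fract.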
2219 Register TrigVal;
2220 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2221 if (ST.hasTrigReducedRange()) {
2222 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2223 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2224 .addUse(MulVal.getReg(0))
2225 .setMIFlags(Flags).getReg(0);
2226 } else
2227 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2228
2229 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2230 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2231 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2232 .addUse(TrigVal)
2233 .setMIFlags(Flags);
2234 MI.eraseFromParent();
2235 return true;
2236}
2237
2238bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2239 MachineIRBuilder &B,
2240 const GlobalValue *GV,
2241 int64_t Offset,
2242 unsigned GAFlags) const {
2243 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2244 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2245 // to the following code sequence:
2246 //
2247 // For constant address space:
2248 // s_getpc_b64 s[0:1]
2249 // s_add_u32 s0, s0, $symbol
2250 // s_addc_u32 s1, s1, 0
2251 //
2252 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2253 // a fixup or relocation is emitted to replace $symbol with a literal
2254 // constant, which is a pc-relative offset from the encoding of the $symbol
2255 // operand to the global variable.
2256 //
2257 // For global address space:
2258 // s_getpc_b64 s[0:1]
2259 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2260 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2261 //
2262 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2263 // fixups or relocations are emitted to replace $symbol@*@lo and
2264 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2265 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2266 // operand to the global variable.
2267 //
2268 // What we want here is an offset from the value returned by s_getpc
2269 // (which is the address of the s_add_u32 instruction) to the global
2270 // variable, but since the encoding of $symbol starts 4 bytes after the start
2271 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2272 // small. This requires us to add 4 to the global variable offset in order to
2273 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2274 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2275 // instruction.
2276
2277 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2278
2279 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2280 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2281
2282 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2283 .addDef(PCReg);
2284
2285 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2286 if (GAFlags == SIInstrInfo::MO_NONE)
2287 MIB.addImm(0);
2288 else
2289 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2290
2291 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2292
2293 if (PtrTy.getSizeInBits() == 32)
2294 B.buildExtract(DstReg, PCReg, 0);
2295 return true;
2296 }
2297
2298bool AMDGPULegalizerInfo::legalizeGlobalValue(
2299 MachineInstr &MI, MachineRegisterInfo &MRI,
2300 MachineIRBuilder &B) const {
2301 Register DstReg = MI.getOperand(0).getReg();
2302 LLT Ty = MRI.getType(DstReg);
2303 unsigned AS = Ty.getAddressSpace();
2304
2305 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2306 MachineFunction &MF = B.getMF();
2307 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2308
2309 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2310 if (!MFI->isModuleEntryFunction() &&
2311 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2312 const Function &Fn = MF.getFunction();
2313 DiagnosticInfoUnsupported BadLDSDecl(
2314 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2315 DS_Warning);
2316 Fn.getContext().diagnose(BadLDSDecl);
2317
2318 // We currently don't have a way to correctly allocate LDS objects that
2319 // aren't directly associated with a kernel. We do force inlining of
2320 // functions that use local objects. However, if these dead functions are
2321 // not eliminated, we don't want a compile time error. Just emit a warning
2322 // and a trap, since there should be no callable path here.
2323 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2324 B.buildUndef(DstReg);
2325 MI.eraseFromParent();
2326 return true;
2327 }
2328
2329 // TODO: We could emit code to handle the initialization somewhere.
2330 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2331 const SITargetLowering *TLI = ST.getTargetLowering();
2332 if (!TLI->shouldUseLDSConstAddress(GV)) {
2333 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2334 return true; // Leave in place;
2335 }
2336
2337 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2338 Type *Ty = GV->getValueType();
2339 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2340 // zero-sized type in other languages to declare the dynamic shared
2341 // memory whose size is not known at compile time. They will be
2342 // allocated by the runtime and placed directly after the statically
2343 // allocated ones. They all share the same offset.
2344 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2345 // Adjust alignment for that dynamic shared memory array.
2346 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2347 LLT S32 = LLT::scalar(32);
2348 auto Sz =
2349 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2350 B.buildIntToPtr(DstReg, Sz);
2351 MI.eraseFromParent();
2352 return true;
2353 }
2354 }
2355
2356 B.buildConstant(
2357 DstReg,
2358 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2359 MI.eraseFromParent();
2360 return true;
2361 }
2362
2363 const Function &Fn = MF.getFunction();
2364 DiagnosticInfoUnsupported BadInit(
2365 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2366 Fn.getContext().diagnose(BadInit);
2367 return true;
2368 }
2369
2370 const SITargetLowering *TLI = ST.getTargetLowering();
2371
2372 if (TLI->shouldEmitFixup(GV)) {
2373 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2374 MI.eraseFromParent();
2375 return true;
2376 }
2377
2378 if (TLI->shouldEmitPCReloc(GV)) {
2379 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2380 MI.eraseFromParent();
2381 return true;
2382 }
2383
2384 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2385 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2386
2387 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2388 MachinePointerInfo::getGOT(MF),
2389 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2390 MachineMemOperand::MOInvariant,
2391 8 /*Size*/, Align(8));
2392
2393 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2394
2395 if (Ty.getSizeInBits() == 32) {
2396 // Truncate if this is a 32-bit constant address.
2397 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2398 B.buildExtract(DstReg, Load, 0);
2399 } else
2400 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2401
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2406static LLT widenToNextPowerOf2(LLT Ty) {
2407 if (Ty.isVector())
2408 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2409 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2410}
2411
2412bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2413 MachineInstr &MI) const {
2414 MachineIRBuilder &B = Helper.MIRBuilder;
2415 MachineRegisterInfo &MRI = *B.getMRI();
2416 GISelChangeObserver &Observer = Helper.Observer;
2417
2418 Register PtrReg = MI.getOperand(1).getReg();
2419 LLT PtrTy = MRI.getType(PtrReg);
2420 unsigned AddrSpace = PtrTy.getAddressSpace();
2421
2422 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2423 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2424 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2425 Observer.changingInstr(MI);
2426 MI.getOperand(1).setReg(Cast.getReg(0));
2427 Observer.changedInstr(MI);
2428 return true;
2429 }
2430
2431 if (MI.getOpcode() != AMDGPU::G_LOAD)
2432 return false;
2433
2434 Register ValReg = MI.getOperand(0).getReg();
2435 LLT ValTy = MRI.getType(ValReg);
2436
2437 MachineMemOperand *MMO = *MI.memoperands_begin();
2438 const unsigned ValSize = ValTy.getSizeInBits();
2439 const unsigned MemSize = 8 * MMO->getSize();
2440 const Align MemAlign = MMO->getAlign();
2441 const unsigned AlignInBits = 8 * MemAlign.value();
2442
2443 // Widen non-power-of-2 loads to the alignment if needed
2444 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2445 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2446
2447 // This was already the correct extending load result type, so just adjust
2448 // the memory type.
2449 if (WideMemSize == ValSize) {
2450 MachineFunction &MF = B.getMF();
2451
2452 MachineMemOperand *WideMMO =
2453 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2454 Observer.changingInstr(MI);
2455 MI.setMemRefs(MF, {WideMMO});
2456 Observer.changedInstr(MI);
2457 return true;
2458 }
2459
2460 // Don't bother handling an edge case that should probably never be produced.
2461 if (ValSize > WideMemSize)
2462 return false;
2463
2464 LLT WideTy = widenToNextPowerOf2(ValTy);
2465
2466 Register WideLoad;
2467 if (!WideTy.isVector()) {
2468 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2469 B.buildTrunc(ValReg, WideLoad).getReg(0);
2470 } else {
2471 // Extract the subvector.
2472
2473 if (isRegisterType(ValTy)) {
2474 // If this a case where G_EXTRACT is legal, use it.
2475 // (e.g. <3 x s32> -> <4 x s32>)
2476 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2477 B.buildExtract(ValReg, WideLoad, 0);
2478 } else {
2479 // For cases where the widened type isn't a nice register value, unmerge
2480 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2481 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2482 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2483 B.setInsertPt(B.getMBB(), MI.getIterator());
2484 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2485 }
2486 }
2487
2488 MI.eraseFromParent();
2489 return true;
2490 }
2491
2492 return false;
2493}
2494
2495bool AMDGPULegalizerInfo::legalizeFMad(
2496 MachineInstr &MI, MachineRegisterInfo &MRI,
2497 MachineIRBuilder &B) const {
2498 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2499 assert(Ty.isScalar());
2500
2501 MachineFunction &MF = B.getMF();
2502 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2503
2504 // TODO: Always legal with future ftz flag.
2505 // FIXME: Do we need just output?
2506 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2507 return true;
2508 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2509 return true;
2510
2511 MachineIRBuilder HelperBuilder(MI);
2512 GISelObserverWrapper DummyObserver;
2513 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2514 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2515}
2516
2517bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2518 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2519 Register DstReg = MI.getOperand(0).getReg();
2520 Register PtrReg = MI.getOperand(1).getReg();
2521 Register CmpVal = MI.getOperand(2).getReg();
2522 Register NewVal = MI.getOperand(3).getReg();
2523
2524 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2525 "this should not have been custom lowered");
2526
2527 LLT ValTy = MRI.getType(CmpVal);
2528 LLT VecTy = LLT::vector(2, ValTy);
2529
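// The target cmpxchg pseudo expects the new value and the compare value packed
// together into a single vector operand.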
2530 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2531
2532 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2533 .addDef(DstReg)
2534 .addUse(PtrReg)
2535 .addUse(PackedVal)
2536 .setMemRefs(MI.memoperands());
2537
2538 MI.eraseFromParent();
2539 return true;
2540}
2541
2542bool AMDGPULegalizerInfo::legalizeFlog(
2543 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2544 Register Dst = MI.getOperand(0).getReg();
2545 Register Src = MI.getOperand(1).getReg();
2546 LLT Ty = B.getMRI()->getType(Dst);
2547 unsigned Flags = MI.getFlags();
2548
2549 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2550 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2551
2552 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2553 MI.eraseFromParent();
2554 return true;
2555}
2556
2557bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2558 MachineIRBuilder &B) const {
2559 Register Dst = MI.getOperand(0).getReg();
2560 Register Src = MI.getOperand(1).getReg();
2561 unsigned Flags = MI.getFlags();
2562 LLT Ty = B.getMRI()->getType(Dst);
2563
2564 auto K = B.buildFConstant(Ty, numbers::log2e);
2565 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2566 B.buildFExp2(Dst, Mul, Flags);
2567 MI.eraseFromParent();
2568 return true;
2569}
2570
2571bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2572 MachineIRBuilder &B) const {
2573 Register Dst = MI.getOperand(0).getReg();
2574 Register Src0 = MI.getOperand(1).getReg();
2575 Register Src1 = MI.getOperand(2).getReg();
2576 unsigned Flags = MI.getFlags();
2577 LLT Ty = B.getMRI()->getType(Dst);
2578 const LLT S16 = LLT::scalar(16);
2579 const LLT S32 = LLT::scalar(32);
2580
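// pow(x, y) is expanded as exp2(log2(x) * y), with the multiply performed by
// the fmul_legacy intrinsic; f16 has no fmul_legacy, so that case extends to
// f32 around the multiply.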
2581 if (Ty == S32) {
2582 auto Log = B.buildFLog2(S32, Src0, Flags);
2583 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2584 .addUse(Log.getReg(0))
2585 .addUse(Src1)
2586 .setMIFlags(Flags);
2587 B.buildFExp2(Dst, Mul, Flags);
2588 } else if (Ty == S16) {
2589 // There's no f16 fmul_legacy, so we need to convert for it.
2590 auto Log = B.buildFLog2(S16, Src0, Flags);
2591 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2592 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2593 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2594 .addUse(Ext0.getReg(0))
2595 .addUse(Ext1.getReg(0))
2596 .setMIFlags(Flags);
2597
2598 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2599 } else
2600 return false;
2601
2602 MI.eraseFromParent();
2603 return true;
2604}
2605
2606// Find a source register, ignoring any possible source modifiers.
2607static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2608 Register ModSrc = OrigSrc;
2609 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2610 ModSrc = SrcFNeg->getOperand(1).getReg();
2611 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2612 ModSrc = SrcFAbs->getOperand(1).getReg();
2613 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2614 ModSrc = SrcFAbs->getOperand(1).getReg();
2615 return ModSrc;
2616}
2617
2618bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2619 MachineRegisterInfo &MRI,
2620 MachineIRBuilder &B) const {
2621
2622 const LLT S1 = LLT::scalar(1);
2623 const LLT S64 = LLT::scalar(64);
2624 Register Dst = MI.getOperand(0).getReg();
2625 Register OrigSrc = MI.getOperand(1).getReg();
2626 unsigned Flags = MI.getFlags();
2627 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2628 "this should not have been custom lowered");
2629
2630 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2631 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2632 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2633 // V_FRACT bug is:
2634 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2635 //
2636 // Convert floor(x) to (x - fract(x))
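// For example, floor(-1.25): fract(-1.25) = 0.75, so -1.25 - 0.75 = -2.0.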
2637
2638 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2639 .addUse(OrigSrc)
2640 .setMIFlags(Flags);
2641
2642 // Give source modifier matching some assistance before obscuring a foldable
2643 // pattern.
2644
2645 // TODO: We can avoid the neg on the fract? The input sign to fract
2646 // shouldn't matter?
2647 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2648
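// 0x3fefffffffffffff is the largest double below 1.0, i.e. the
// 0.99999999999999999 clamp from the workaround described above.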
2649 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2650
2651 Register Min = MRI.createGenericVirtualRegister(S64);
2652
2653 // We don't need to concern ourselves with the snan handling difference, so
2654 // use the one which will directly select.
2655 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2656 if (MFI->getMode().IEEE)
2657 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2658 else
2659 B.buildFMinNum(Min, Fract, Const, Flags);
2660
2661 Register CorrectedFract = Min;
2662 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2663 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2664 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2665 }
2666
2667 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2668 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2669
2670 MI.eraseFromParent();
2671 return true;
2672}
2673
2674// Turn an illegal packed v2s16 build vector into bit operations.
2675// TODO: This should probably be a bitcast action in LegalizerHelper.
2676bool AMDGPULegalizerInfo::legalizeBuildVector(
2677 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2678 Register Dst = MI.getOperand(0).getReg();
2679 const LLT S32 = LLT::scalar(32);
2680 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2681
2682 Register Src0 = MI.getOperand(1).getReg();
2683 Register Src1 = MI.getOperand(2).getReg();
2684 assert(MRI.getType(Src0) == LLT::scalar(16));
2685
2686 auto Merge = B.buildMerge(S32, {Src0, Src1});
2687 B.buildBitcast(Dst, Merge);
2688
2689 MI.eraseFromParent();
2690 return true;
2691}
2692
2693// Check that this is a G_XOR x, -1
2694static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2695 if (MI.getOpcode() != TargetOpcode::G_XOR)
2696 return false;
2697 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2698 return ConstVal && *ConstVal == -1;
2699}
2700
2701// Return the use branch instruction, otherwise null if the usage is invalid.
2702static MachineInstr *
2703verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2704 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2705 Register CondDef = MI.getOperand(0).getReg();
2706 if (!MRI.hasOneNonDBGUse(CondDef))
2707 return nullptr;
2708
2709 MachineBasicBlock *Parent = MI.getParent();
2710 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2711
2712 if (isNot(MRI, *UseMI)) {
2713 Register NegatedCond = UseMI->getOperand(0).getReg();
2714 if (!MRI.hasOneNonDBGUse(NegatedCond))
2715 return nullptr;
2716
2717 // We're deleting the def of this value, so we need to remove it.
2718 UseMI->eraseFromParent();
2719
2720 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2721 Negated = true;
2722 }
2723
2724 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2725 return nullptr;
2726
2727 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2728 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2729 if (Next == Parent->end()) {
2730 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2731 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2732 return nullptr;
2733 UncondBrTarget = &*NextMBB;
2734 } else {
2735 if (Next->getOpcode() != AMDGPU::G_BR)
2736 return nullptr;
2737 Br = &*Next;
2738 UncondBrTarget = Br->getOperand(0).getMBB();
2739 }
2740
2741 return UseMI;
2742}
2743
2744bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2745 const ArgDescriptor *Arg,
2746 const TargetRegisterClass *ArgRC,
2747 LLT ArgTy) const {
2748 MCRegister SrcReg = Arg->getRegister();
2749 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2750 assert(DstReg.isVirtual() && "Virtual register expected");
2751
2752 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2753 ArgTy);
2754 if (Arg->isMasked()) {
2755 // TODO: Should we try to emit this once in the entry block?
2756 const LLT S32 = LLT::scalar(32);
2757 const unsigned Mask = Arg->getMask();
2758 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2759
2760 Register AndMaskSrc = LiveIn;
2761
2762 if (Shift != 0) {
2763 auto ShiftAmt = B.buildConstant(S32, Shift);
2764 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2765 }
2766
2767 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2768 } else {
2769 B.buildCopy(DstReg, LiveIn);
2770 }
2771
2772 return true;
2773}
2774
2775bool AMDGPULegalizerInfo::loadInputValue(
2776 Register DstReg, MachineIRBuilder &B,
2777 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2778 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2779 const ArgDescriptor *Arg;
2780 const TargetRegisterClass *ArgRC;
2781 LLT ArgTy;
2782 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2783
2784 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2785 return false; // TODO: Handle these
2786 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2787}
2788
2789bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2790 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2791 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2792 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2793 return false;
2794
2795 MI.eraseFromParent();
2796 return true;
2797}
2798
2799bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2800 MachineRegisterInfo &MRI,
2801 MachineIRBuilder &B) const {
2802 Register Dst = MI.getOperand(0).getReg();
2803 LLT DstTy = MRI.getType(Dst);
2804 LLT S16 = LLT::scalar(16);
2805 LLT S32 = LLT::scalar(32);
2806 LLT S64 = LLT::scalar(64);
2807
2808 if (DstTy == S16)
2809 return legalizeFDIV16(MI, MRI, B);
2810 if (DstTy == S32)
2811 return legalizeFDIV32(MI, MRI, B);
2812 if (DstTy == S64)
2813 return legalizeFDIV64(MI, MRI, B);
2814
2815 return false;
2816}
2817
2818void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2819 Register DstDivReg,
2820 Register DstRemReg,
2821 Register X,
2822 Register Y) const {
2823 const LLT S1 = LLT::scalar(1);
2824 const LLT S32 = LLT::scalar(32);
2825
2826 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2827 // algorithm used here.
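// Roughly: z ~= 2^32 / y from a scaled float reciprocal, refined by one
// Newton-Raphson step z += mulhi(z, -y * z); then q = mulhi(x, z) and
// r = x - q * y, with two conditional corrections of q and r below.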
2828
2829 // Initial estimate of inv(y).
2830 auto FloatY = B.buildUITOFP(S32, Y);
2831 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2832 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2833 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2834 auto Z = B.buildFPTOUI(S32, ScaledY);
2835
2836 // One round of UNR.
2837 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2838 auto NegYZ = B.buildMul(S32, NegY, Z);
2839 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2840
2841 // Quotient/remainder estimate.
2842 auto Q = B.buildUMulH(S32, X, Z);
2843 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2844
2845 // First quotient/remainder refinement.
2846 auto One = B.buildConstant(S32, 1);
2847 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2848 if (DstDivReg)
2849 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2850 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2851
2852 // Second quotient/remainder refinement.
2853 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2854 if (DstDivReg)
2855 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2856
2857 if (DstRemReg)
2858 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2859}
2860
2861 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2862//
2863// Return lo, hi of result
2864//
2865// %cvt.lo = G_UITOFP Val.lo
2866// %cvt.hi = G_UITOFP Val.hi
2867// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2868// %rcp = G_AMDGPU_RCP_IFLAG %mad
2869// %mul1 = G_FMUL %rcp, 0x5f7ffffc
2870// %mul2 = G_FMUL %mul1, 2**(-32)
2871// %trunc = G_INTRINSIC_TRUNC %mul2
2872// %mad2 = G_FMAD %trunc, -(2**32), %mul1
2873// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2874static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2875 Register Val) {
2876 const LLT S32 = LLT::scalar(32);
2877 auto Unmerge = B.buildUnmerge(S32, Val);
2878
2879 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2880 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2881
2882 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2883 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2884
2885 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2886 auto Mul1 =
2887 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2888
2889 // 2**(-32)
2890 auto Mul2 =
2891 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2892 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2893
2894 // -(2**32)
2895 auto Mad2 = B.buildFMAD(S32, Trunc,
2896 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2897
2898 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2899 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2900
2901 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2902}
2903
2904void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2905 Register DstDivReg,
2906 Register DstRemReg,
2907 Register Numer,
2908 Register Denom) const {
2909 const LLT S32 = LLT::scalar(32);
2910 const LLT S64 = LLT::scalar(64);
2911 const LLT S1 = LLT::scalar(1);
2912 Register RcpLo, RcpHi;
2913
2914 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2915
2916 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2917
2918 auto Zero64 = B.buildConstant(S64, 0);
2919 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2920
2921 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2922 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2923
2924 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2925 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2926 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2927
2928 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2929 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2930 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2931 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2932
2933 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2934 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2935 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2936 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2937 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2938
2939 auto Zero32 = B.buildConstant(S32, 0);
2940 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2941 auto Add2_HiC =
2942 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2943 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2944 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2945
2946 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2947 Register NumerLo = UnmergeNumer.getReg(0);
2948 Register NumerHi = UnmergeNumer.getReg(1);
2949
2950 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2951 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2952 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2953 Register Mul3_Lo = UnmergeMul3.getReg(0);
2954 Register Mul3_Hi = UnmergeMul3.getReg(1);
2955 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2956 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2957 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2958 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2959
2960 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2961 Register DenomLo = UnmergeDenom.getReg(0);
2962 Register DenomHi = UnmergeDenom.getReg(1);
2963
2964 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2965 auto C1 = B.buildSExt(S32, CmpHi);
2966
2967 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2968 auto C2 = B.buildSExt(S32, CmpLo);
2969
2970 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2971 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2972
2973 // TODO: Here and below portions of the code can be enclosed into if/endif.
2974 // Currently control flow is unconditional and we have 4 selects after
2975 // potential endif to substitute PHIs.
2976
2977 // if C3 != 0 ...
2978 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2979 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2980 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2981 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2982
2983 auto One64 = B.buildConstant(S64, 1);
2984 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2985
2986 auto C4 =
2987 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2988 auto C5 =
2989 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2990 auto C6 = B.buildSelect(
2991 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2992
2993 // if (C6 != 0)
2994 auto Add4 = B.buildAdd(S64, Add3, One64);
2995 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2996
2997 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2998 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2999 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3000
3001 // endif C6
3002 // endif C3
3003
3004 if (DstDivReg) {
3005 auto Sel1 = B.buildSelect(
3006 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3007 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3008 Sel1, MulHi3);
3009 }
3010
3011 if (DstRemReg) {
3012 auto Sel2 = B.buildSelect(
3013 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3014 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3015 Sel2, Sub1);
3016 }
3017}
3018
3019bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3020 MachineRegisterInfo &MRI,
3021 MachineIRBuilder &B) const {
3022 Register DstDivReg, DstRemReg;
3023 switch (MI.getOpcode()) {
3024 default:
3025 llvm_unreachable("Unexpected opcode!");
3026 case AMDGPU::G_UDIV: {
3027 DstDivReg = MI.getOperand(0).getReg();
3028 break;
3029 }
3030 case AMDGPU::G_UREM: {
3031 DstRemReg = MI.getOperand(0).getReg();
3032 break;
3033 }
3034 case AMDGPU::G_UDIVREM: {
3035 DstDivReg = MI.getOperand(0).getReg();
3036 DstRemReg = MI.getOperand(1).getReg();
3037 break;
3038 }
3039 }
3040
3041 const LLT S64 = LLT::scalar(64);
3042 const LLT S32 = LLT::scalar(32);
3043 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3044 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3045 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3046 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3047
3048 if (Ty == S32)
3049 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3050 else if (Ty == S64)
3051 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3052 else
3053 return false;
3054
3055 MI.eraseFromParent();
3056 return true;
3057}
3058
3059bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3060 MachineRegisterInfo &MRI,
3061 MachineIRBuilder &B) const {
3062 const LLT S64 = LLT::scalar(64);
3063 const LLT S32 = LLT::scalar(32);
3064
3065 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3066 if (Ty != S32 && Ty != S64)
3067 return false;
3068
3069 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3070 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3071 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3072
3073 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3074 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3075 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3076
3077 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3078 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3079
3080 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3081 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3082
3083 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3084 switch (MI.getOpcode()) {
3085 default:
3086 llvm_unreachable("Unexpected opcode!");
3087 case AMDGPU::G_SDIV: {
3088 DstDivReg = MI.getOperand(0).getReg();
3089 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3090 break;
3091 }
3092 case AMDGPU::G_SREM: {
3093 DstRemReg = MI.getOperand(0).getReg();
3094 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3095 break;
3096 }
3097 case AMDGPU::G_SDIVREM: {
3098 DstDivReg = MI.getOperand(0).getReg();
3099 DstRemReg = MI.getOperand(1).getReg();
3100 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3101 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3102 break;
3103 }
3104 }
3105
3106 if (Ty == S32)
3107 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3108 else
3109 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3110
3111 if (DstDivReg) {
3112 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3113 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3114 B.buildSub(DstDivReg, SignXor, Sign);
3115 }
3116
3117 if (DstRemReg) {
3118 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3119 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3120 B.buildSub(DstRemReg, SignXor, Sign);
3121 }
3122
3123 MI.eraseFromParent();
3124 return true;
3125}
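
The sign handling above is the usual mask trick: with s = v >> (bits-1), (v + s) ^ s gives |v|, and the same xor/sub pattern reapplies the sign afterwards. A scalar sketch of the 32-bit case (sdivrem32 is an invented name; it reuses the udivrem32 model sketched earlier):

#include <cstdint>

static void sdivrem32(int32_t lhs, int32_t rhs, int32_t &quot, int32_t &rem) {
  uint32_t ls = (uint32_t)(lhs >> 31);        // all-ones iff lhs < 0
  uint32_t rs = (uint32_t)(rhs >> 31);
  uint32_t ul = ((uint32_t)lhs + ls) ^ ls;    // |lhs|
  uint32_t ur = ((uint32_t)rhs + rs) ^ rs;    // |rhs|
  uint32_t q, r;
  udivrem32(ul, ur, q, r);                    // unsigned expansion from above
  uint32_t qs = ls ^ rs;                      // quotient is negative iff signs differ
  quot = (int32_t)((q ^ qs) - qs);
  rem  = (int32_t)((r ^ ls) - ls);            // remainder takes the sign of lhs
}
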
3126
3127bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3128 MachineRegisterInfo &MRI,
3129 MachineIRBuilder &B) const {
3130 Register Res = MI.getOperand(0).getReg();
3131 Register LHS = MI.getOperand(1).getReg();
3132 Register RHS = MI.getOperand(2).getReg();
3133 uint16_t Flags = MI.getFlags();
3134 LLT ResTy = MRI.getType(Res);
3135
3136 const MachineFunction &MF = B.getMF();
3137 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3138 MI.getFlag(MachineInstr::FmAfn);
3139
3140 if (!AllowInaccurateRcp)
3141 return false;
3142
3143 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3144 // 1 / x -> RCP(x)
3145 if (CLHS->isExactlyValue(1.0)) {
3146 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3147 .addUse(RHS)
3148 .setMIFlags(Flags);
3149
3150 MI.eraseFromParent();
3151 return true;
3152 }
3153
3154 // -1 / x -> RCP( FNEG(x) )
3155 if (CLHS->isExactlyValue(-1.0)) {
3156 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3157 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3158 .addUse(FNeg.getReg(0))
3159 .setMIFlags(Flags);
3160
3161 MI.eraseFromParent();
3162 return true;
3163 }
3164 }
3165
3166 // x / y -> x * (1.0 / y)
3167 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3168 .addUse(RHS)
3169 .setMIFlags(Flags);
3170 B.buildFMul(Res, LHS, RCP, Flags);
3171
3172 MI.eraseFromParent();
3173 return true;
3174}
3175
3176bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3177 MachineRegisterInfo &MRI,
3178 MachineIRBuilder &B) const {
3179 Register Res = MI.getOperand(0).getReg();
3180 Register X = MI.getOperand(1).getReg();
3181 Register Y = MI.getOperand(2).getReg();
3182 uint16_t Flags = MI.getFlags();
3183 LLT ResTy = MRI.getType(Res);
3184
3185 const MachineFunction &MF = B.getMF();
3186 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3187 MI.getFlag(MachineInstr::FmAfn);
3188
3189 if (!AllowInaccurateRcp)
3190 return false;
3191
3192 auto NegY = B.buildFNeg(ResTy, Y);
3193 auto One = B.buildFConstant(ResTy, 1.0);
3194
3195 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3196 .addUse(Y)
3197 .setMIFlags(Flags);
3198
3199 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3200 R = B.buildFMA(ResTy, Tmp0, R, R);
3201
3202 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3203 R = B.buildFMA(ResTy, Tmp1, R, R);
3204
3205 auto Ret = B.buildFMul(ResTy, X, R);
3206 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3207
3208 B.buildFMA(Res, Tmp2, R, Ret);
3209 MI.eraseFromParent();
3210 return true;
3211}
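
The FMA chain above is two Newton-Raphson refinements of the hardware reciprocal followed by one residual correction of the quotient. A double-precision model, assuming an approximate reciprocal is supplied (fast_fdiv64 and approx_rcp_y are illustrative names, not LLVM APIs):

#include <cmath>

static double fast_fdiv64(double x, double y, double approx_rcp_y) {
  double r = approx_rcp_y;                      // placeholder for amdgcn.rcp(y)
  r = std::fma(std::fma(-y, r, 1.0), r, r);     // r += r * (1 - y*r), first step
  r = std::fma(std::fma(-y, r, 1.0), r, r);     // second refinement step
  double q = x * r;                             // quotient estimate
  return std::fma(std::fma(-y, q, x), r, q);    // q += r * (x - y*q)
}
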
3212
3213bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3214 MachineRegisterInfo &MRI,
3215 MachineIRBuilder &B) const {
3216 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3217 return true;
3218
3219 Register Res = MI.getOperand(0).getReg();
3220 Register LHS = MI.getOperand(1).getReg();
3221 Register RHS = MI.getOperand(2).getReg();
3222
3223 uint16_t Flags = MI.getFlags();
3224
3225 LLT S16 = LLT::scalar(16);
3226 LLT S32 = LLT::scalar(32);
3227
3228 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3229 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3230
3231 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3232 .addUse(RHSExt.getReg(0))
3233 .setMIFlags(Flags);
3234
3235 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3236 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3237
3238 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3239 .addUse(RDst.getReg(0))
3240 .addUse(RHS)
3241 .addUse(LHS)
3242 .setMIFlags(Flags);
3243
3244 MI.eraseFromParent();
3245 return true;
3246}
3247
3248// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3249// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3250static void toggleSPDenormMode(bool Enable,
3251 MachineIRBuilder &B,
3252 const GCNSubtarget &ST,
3253 AMDGPU::SIModeRegisterDefaults Mode) {
3254 // Set SP denorm mode to this value.
3255 unsigned SPDenormMode =
3256 Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();
3257
3258 if (ST.hasDenormModeInst()) {
3259 // Preserve default FP64/FP16 denorm mode while updating FP32 mode.
3260 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3261
3262 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3263 B.buildInstr(AMDGPU::S_DENORM_MODE)
3264 .addImm(NewDenormModeValue);
3265
3266 } else {
3267 // Select FP32 bit field in mode register.
3268 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3269 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3270 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3271
3272 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3273 .addImm(SPDenormMode)
3274 .addImm(SPDenormModeBitField);
3275 }
3276}
3277
3278bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3279 MachineRegisterInfo &MRI,
3280 MachineIRBuilder &B) const {
3281 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3282 return true;
3283
3284 Register Res = MI.getOperand(0).getReg();
3285 Register LHS = MI.getOperand(1).getReg();
3286 Register RHS = MI.getOperand(2).getReg();
3287 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3288 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3289
3290 uint16_t Flags = MI.getFlags();
3291
3292 LLT S32 = LLT::scalar(32);
3293 LLT S1 = LLT::scalar(1);
3294
3295 auto One = B.buildFConstant(S32, 1.0f);
3296
3297 auto DenominatorScaled =
3298 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3299 .addUse(LHS)
3300 .addUse(RHS)
3301 .addImm(0)
3302 .setMIFlags(Flags);
3303 auto NumeratorScaled =
3304 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3305 .addUse(LHS)
3306 .addUse(RHS)
3307 .addImm(1)
3308 .setMIFlags(Flags);
3309
3310 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3311 .addUse(DenominatorScaled.getReg(0))
3312 .setMIFlags(Flags);
3313 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3314
3315 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3316 // aren't modeled as reading it.
3317 if (!Mode.allFP32Denormals())
3318 toggleSPDenormMode(true, B, ST, Mode);
3319
3320 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3321 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3322 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3323 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3324 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3325 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3326
3327 if (!Mode.allFP32Denormals())
3328 toggleSPDenormMode(false, B, ST, Mode);
3329
3330 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3331 .addUse(Fma4.getReg(0))
3332 .addUse(Fma1.getReg(0))
3333 .addUse(Fma3.getReg(0))
3334 .addUse(NumeratorScaled.getReg(1))
3335 .setMIFlags(Flags);
3336
3337 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3338 .addUse(Fmas.getReg(0))
3339 .addUse(RHS)
3340 .addUse(LHS)
3341 .setMIFlags(Flags);
3342
3343 MI.eraseFromParent();
3344 return true;
3345}
3346
3347bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3348 MachineRegisterInfo &MRI,
3349 MachineIRBuilder &B) const {
3350 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3351 return true;
3352
3353 Register Res = MI.getOperand(0).getReg();
3354 Register LHS = MI.getOperand(1).getReg();
3355 Register RHS = MI.getOperand(2).getReg();
3356
3357 uint16_t Flags = MI.getFlags();
3358
3359 LLT S64 = LLT::scalar(64);
3360 LLT S1 = LLT::scalar(1);
3361
3362 auto One = B.buildFConstant(S64, 1.0);
3363
3364 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3365 .addUse(LHS)
3366 .addUse(RHS)
3367 .addImm(0)
3368 .setMIFlags(Flags);
3369
3370 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3371
3372 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3373 .addUse(DivScale0.getReg(0))
3374 .setMIFlags(Flags);
3375
3376 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3377 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3378 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3379
3380 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3381 .addUse(LHS)
3382 .addUse(RHS)
3383 .addImm(1)
3384 .setMIFlags(Flags);
3385
3386 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3387 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3388 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3389
3390 Register Scale;
3391 if (!ST.hasUsableDivScaleConditionOutput()) {
3392 // Workaround a hardware bug on SI where the condition output from div_scale
3393 // is not usable.
3394
3395 LLT S32 = LLT::scalar(32);
3396
3397 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3398 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3399 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3400 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3401
3402 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3403 Scale1Unmerge.getReg(1));
3404 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3405 Scale0Unmerge.getReg(1));
3406 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3407 } else {
3408 Scale = DivScale1.getReg(1);
3409 }
3410
3411 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3412 .addUse(Fma4.getReg(0))
3413 .addUse(Fma3.getReg(0))
3414 .addUse(Mul.getReg(0))
3415 .addUse(Scale)
3416 .setMIFlags(Flags);
3417
3418 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3419 .addUse(Fmas.getReg(0))
3420 .addUse(RHS)
3421 .addUse(LHS)
3422 .setMIFlags(Flags);
3423
3424 MI.eraseFromParent();
3425 return true;
3426}
3427
3428bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3429 MachineRegisterInfo &MRI,
3430 MachineIRBuilder &B) const {
3431 Register Res = MI.getOperand(0).getReg();
3432 Register LHS = MI.getOperand(2).getReg();
3433 Register RHS = MI.getOperand(3).getReg();
3434 uint16_t Flags = MI.getFlags();
3435
3436 LLT S32 = LLT::scalar(32);
3437 LLT S1 = LLT::scalar(1);
3438
3439 auto Abs = B.buildFAbs(S32, RHS, Flags);
3440 const APFloat C0Val(1.0f);
3441
3442 auto C0 = B.buildConstant(S32, 0x6f800000);
3443 auto C1 = B.buildConstant(S32, 0x2f800000);
3444 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3445
3446 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3447 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3448
3449 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3450
3451 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3452 .addUse(Mul0.getReg(0))
3453 .setMIFlags(Flags);
3454
3455 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3456
3457 B.buildFMul(Res, Sel, Mul1, Flags);
3458
3459 MI.eraseFromParent();
3460 return true;
3461}
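
For what the constants above encode: 0x6f800000 is 2**96, 0x2f800000 is 2**(-32), and FloatToBits(1.0f) is 0x3f800000, so very large denominators are pre-scaled before the reciprocal and the same scale is reapplied to the product. A scalar sketch (fdivFast is an invented name; hex-float literals assume C++17):

#include <cmath>

static float fdivFast(float x, float y) {
  float sel = (std::fabs(y) > 0x1p+96f) ? 0x1p-32f : 1.0f;
  float rcp = 1.0f / (y * sel);                 // stands in for amdgcn.rcp
  return sel * (x * rcp);
}
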
3462
3463// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3464// FIXME: Why do we handle this one but not other removed instructions?
3465//
3466// Reciprocal square root. The clamp prevents infinite results, clamping
3467// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3468// +-max_float.
3469bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3470 MachineRegisterInfo &MRI,
3471 MachineIRBuilder &B) const {
3472 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3473 return true;
3474
3475 Register Dst = MI.getOperand(0).getReg();
3476 Register Src = MI.getOperand(2).getReg();
3477 auto Flags = MI.getFlags();
3478
3479 LLT Ty = MRI.getType(Dst);
3480
3481 const fltSemantics *FltSemantics;
3482 if (Ty == LLT::scalar(32))
3483 FltSemantics = &APFloat::IEEEsingle();
3484 else if (Ty == LLT::scalar(64))
3485 FltSemantics = &APFloat::IEEEdouble();
3486 else
3487 return false;
3488
3489 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3490 .addUse(Src)
3491 .setMIFlags(Flags);
3492
3493 // We don't need to concern ourselves with the snan handling difference, since
3494 // the rsq has already quieted the snan (or not), so use whichever min/max will directly select.
3495 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3496 const bool UseIEEE = MFI->getMode().IEEE;
3497
3498 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3499 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3500 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3501
3502 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3503
3504 if (UseIEEE)
3505 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3506 else
3507 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3508 MI.eraseFromParent();
3509 return true;
3510}
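
Numerically the expansion above is just rsq followed by a symmetric clamp to the largest finite value of the type; sketched here for f32 (rsqClampF32 is an invented name, 1/sqrt stands in for amdgcn.rsq, and the IEEE vs. non-IEEE min/max selection is ignored):

#include <cmath>
#include <limits>

static float rsqClampF32(float x) {
  float r = 1.0f / std::sqrt(x);                          // models amdgcn.rsq
  const float MaxFlt = std::numeric_limits<float>::max();
  r = std::fmin(r, MaxFlt);                               // clamp +inf to +max_float
  return std::fmax(r, -MaxFlt);                           // clamp -inf to -max_float
}
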
3511
3512static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3513 switch (IID) {
3514 case Intrinsic::amdgcn_ds_fadd:
3515 return AMDGPU::G_ATOMICRMW_FADD;
3516 case Intrinsic::amdgcn_ds_fmin:
3517 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3518 case Intrinsic::amdgcn_ds_fmax:
3519 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3520 default:
3521 llvm_unreachable("not a DS FP intrinsic");
3522 }
3523}
3524
3525bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3526 MachineInstr &MI,
3527 Intrinsic::ID IID) const {
3528 GISelChangeObserver &Observer = Helper.Observer;
3529 Observer.changingInstr(MI);
3530
3531 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3532
3533 // The remaining operands were used to set fields in the MemOperand on
3534 // construction.
3535 for (int I = 6; I > 3; --I)
3536 MI.RemoveOperand(I);
3537
3538 MI.RemoveOperand(1); // Remove the intrinsic ID.
3539 Observer.changedInstr(MI);
3540 return true;
3541}
3542
3543bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3544 MachineRegisterInfo &MRI,
3545 MachineIRBuilder &B) const {
3546 uint64_t Offset =
3547 ST.getTargetLowering()->getImplicitParameterOffset(
3548 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3549 LLT DstTy = MRI.getType(DstReg);
3550 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3551
3552 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3553 if (!loadInputValue(KernargPtrReg, B,
3554 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3555 return false;
3556
3557 // FIXME: This should be nuw
3558 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3559 return true;
3560}
3561
3562bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3563 MachineRegisterInfo &MRI,
3564 MachineIRBuilder &B) const {
3565 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3566 if (!MFI->isEntryFunction()) {
3567 return legalizePreloadedArgIntrin(MI, MRI, B,
3568 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3569 }
3570
3571 Register DstReg = MI.getOperand(0).getReg();
3572 if (!getImplicitArgPtr(DstReg, MRI, B))
3573 return false;
3574
3575 MI.eraseFromParent();
3576 return true;
3577}
3578
3579bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3580 MachineRegisterInfo &MRI,
3581 MachineIRBuilder &B,
3582 unsigned AddrSpace) const {
3583 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3584 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3585 Register Hi32 = Unmerge.getReg(1);
3586
3587 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3588 MI.eraseFromParent();
3589 return true;
3590}
3591
3592// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3593// offset (the offset that is included in bounds checking and swizzling, to be
3594// split between the instruction's voffset and immoffset fields) and soffset
3595// (the offset that is excluded from bounds checking and swizzling, to go in
3596// the instruction's soffset field). This function takes the first kind of
3597// offset and figures out how to split it between voffset and immoffset.
3598std::tuple<Register, unsigned, unsigned>
3599AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3600 Register OrigOffset) const {
3601 const unsigned MaxImm = 4095;
3602 Register BaseReg;
3603 unsigned TotalConstOffset;
3604 const LLT S32 = LLT::scalar(32);
3605 MachineRegisterInfo &MRI = *B.getMRI();
3606
3607 std::tie(BaseReg, TotalConstOffset) =
3608 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3609
3610 unsigned ImmOffset = TotalConstOffset;
3611
3612 // If BaseReg is a pointer, convert it to int.
3613 if (MRI.getType(BaseReg).isPointer())
3614 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3615
3616 // If the immediate value is too big for the immoffset field, put the value
3617 // and -4096 into the immoffset field so that the value that is copied/added
3618 // for the voffset field is a multiple of 4096, and it stands more chance
3619 // of being CSEd with the copy/add for another similar load/store.
3620 // However, do not do that rounding down to a multiple of 4096 if that is a
3621 // negative number, as it appears to be illegal to have a negative offset
3622 // in the vgpr, even if adding the immediate offset makes it positive.
3623 unsigned Overflow = ImmOffset & ~MaxImm;
3624 ImmOffset -= Overflow;
3625 if ((int32_t)Overflow < 0) {
3626 Overflow += ImmOffset;
3627 ImmOffset = 0;
3628 }
3629
3630 if (Overflow != 0) {
3631 if (!BaseReg) {
3632 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3633 } else {
3634 auto OverflowVal = B.buildConstant(S32, Overflow);
3635 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3636 }
3637 }
3638
3639 if (!BaseReg)
3640 BaseReg = B.buildConstant(S32, 0).getReg(0);
3641
3642 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3643}
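
The constant part of the split can be modeled on its own: a constant offset of 5000, for example, becomes an extra 4096 folded into the voffset and 904 left for the 12-bit immoffset field. A sketch under those assumptions (splitConstOffset is a hypothetical helper, not the LLVM API):

#include <cstdint>
#include <utility>

// Returns {amount to add to voffset, value for the immoffset field}.
static std::pair<unsigned, unsigned> splitConstOffset(unsigned totalConst) {
  const unsigned MaxImm = 4095;
  unsigned imm = totalConst;
  unsigned overflow = imm & ~MaxImm;   // multiple-of-4096 part goes to the voffset
  imm -= overflow;
  if ((int32_t)overflow < 0) {         // never leave a negative value for the vgpr
    overflow += imm;
    imm = 0;
  }
  return {overflow, imm};
}
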
3644
3645/// Handle register layout difference for f16 images for some subtargets.
3646Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3647 MachineRegisterInfo &MRI,
3648 Register Reg,
3649 bool ImageStore) const {
3650 const LLT S16 = LLT::scalar(16);
3651 const LLT S32 = LLT::scalar(32);
3652 LLT StoreVT = MRI.getType(Reg);
3653 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3654
3655 if (ST.hasUnpackedD16VMem()) {
3656 auto Unmerge = B.buildUnmerge(S16, Reg);
3657
3658 SmallVector<Register, 4> WideRegs;
3659 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3660 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3661
3662 int NumElts = StoreVT.getNumElements();
3663
3664 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3665 }
3666
3667 if (ImageStore && ST.hasImageStoreD16Bug()) {
3668 if (StoreVT.getNumElements() == 2) {
3669 SmallVector<Register, 4> PackedRegs;
3670 Reg = B.buildBitcast(S32, Reg).getReg(0);
3671 PackedRegs.push_back(Reg);
3672 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3673 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3674 }
3675
3676 if (StoreVT.getNumElements() == 3) {
3677 SmallVector<Register, 4> PackedRegs;
3678 auto Unmerge = B.buildUnmerge(S16, Reg);
3679 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3680 PackedRegs.push_back(Unmerge.getReg(I));
3681 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3682 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3683 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3684 }
3685
3686 if (StoreVT.getNumElements() == 4) {
3687 SmallVector<Register, 4> PackedRegs;
3688 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3689 auto Unmerge = B.buildUnmerge(S32, Reg);
3690 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3691 PackedRegs.push_back(Unmerge.getReg(I));
3692 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3693 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3694 }
3695
3696 llvm_unreachable("invalid data type");
3697 }
3698
3699 return Reg;
3700}
3701
3702Register AMDGPULegalizerInfo::fixStoreSourceType(
3703 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3704 MachineRegisterInfo *MRI = B.getMRI();
3705 LLT Ty = MRI->getType(VData);
3706
3707 const LLT S16 = LLT::scalar(16);
3708
3709 // Fixup illegal register types for i8 stores.
3710 if (Ty == LLT::scalar(8) || Ty == S16) {
3711 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3712 return AnyExt;
3713 }
3714
3715 if (Ty.isVector()) {
3716 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3717 if (IsFormat)
3718 return handleD16VData(B, *MRI, VData);
3719 }
3720 }
3721
3722 return VData;
3723}
3724
3725bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3726 MachineRegisterInfo &MRI,
3727 MachineIRBuilder &B,
3728 bool IsTyped,
3729 bool IsFormat) const {
3730 Register VData = MI.getOperand(1).getReg();
3731 LLT Ty = MRI.getType(VData);
3732 LLT EltTy = Ty.getScalarType();
3733 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3734 const LLT S32 = LLT::scalar(32);
3735
3736 VData = fixStoreSourceType(B, VData, IsFormat);
3737 Register RSrc = MI.getOperand(2).getReg();
3738
3739 MachineMemOperand *MMO = *MI.memoperands_begin();
3740 const int MemSize = MMO->getSize();
3741
3742 unsigned ImmOffset;
3743 unsigned TotalOffset;
3744
3745 // The typed intrinsics add an immediate after the registers.
3746 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3747
3748 // The struct intrinsic variants add one additional operand over raw.
3749 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3750 Register VIndex;
3751 int OpOffset = 0;
3752 if (HasVIndex) {
3753 VIndex = MI.getOperand(3).getReg();
3754 OpOffset = 1;
3755 }
3756
3757 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3758 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3759
3760 unsigned Format = 0;
3761 if (IsTyped) {
3762 Format = MI.getOperand(5 + OpOffset).getImm();
3763 ++OpOffset;
3764 }
3765
3766 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3767
3768 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3769 if (TotalOffset != 0)
3770 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3771
3772 unsigned Opc;
3773 if (IsTyped) {
3774 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3775 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3776 } else if (IsFormat) {
3777 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3778 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3779 } else {
3780 switch (MemSize) {
3781 case 1:
3782 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3783 break;
3784 case 2:
3785 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3786 break;
3787 default:
3788 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3789 break;
3790 }
3791 }
3792
3793 if (!VIndex)
3794 VIndex = B.buildConstant(S32, 0).getReg(0);
3795
3796 auto MIB = B.buildInstr(Opc)
3797 .addUse(VData) // vdata
3798 .addUse(RSrc) // rsrc
3799 .addUse(VIndex) // vindex
3800 .addUse(VOffset) // voffset
3801 .addUse(SOffset) // soffset
3802 .addImm(ImmOffset); // offset(imm)
3803
3804 if (IsTyped)
3805 MIB.addImm(Format);
3806
3807 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3808 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3809 .addMemOperand(MMO);
3810
3811 MI.eraseFromParent();
3812 return true;
3813}
3814
3815bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3816 MachineRegisterInfo &MRI,
3817 MachineIRBuilder &B,
3818 bool IsFormat,
3819 bool IsTyped) const {
3820 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3821 MachineMemOperand *MMO = *MI.memoperands_begin();
3822 const int MemSize = MMO->getSize();
3823 const LLT S32 = LLT::scalar(32);
3824
3825 Register Dst = MI.getOperand(0).getReg();
3826 Register RSrc = MI.getOperand(2).getReg();
3827
3828 // The typed intrinsics add an immediate after the registers.
3829 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3830
3831 // The struct intrinsic variants add one additional operand over raw.
3832 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3833 Register VIndex;
3834 int OpOffset = 0;
3835 if (HasVIndex) {
3836 VIndex = MI.getOperand(3).getReg();
3837 OpOffset = 1;
3838 }
3839
3840 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3841 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3842
3843 unsigned Format = 0;
3844 if (IsTyped) {
3845 Format = MI.getOperand(5 + OpOffset).getImm();
3846 ++OpOffset;
3847 }
3848
3849 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3850 unsigned ImmOffset;
3851 unsigned TotalOffset;
3852
3853 LLT Ty = MRI.getType(Dst);
3854 LLT EltTy = Ty.getScalarType();
3855 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3856 const bool Unpacked = ST.hasUnpackedD16VMem();
3857
3858 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3859 if (TotalOffset != 0)
3860 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3861
3862 unsigned Opc;
3863
3864 if (IsTyped) {
3865 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3866 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3867 } else if (IsFormat) {
3868 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3869 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3870 } else {
3871 switch (MemSize) {
3872 case 1:
3873 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3874 break;
3875 case 2:
3876 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3877 break;
3878 default:
3879 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3880 break;
3881 }
3882 }
3883
3884 Register LoadDstReg;
3885
3886 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3887 LLT UnpackedTy = Ty.changeElementSize(32);
3888
3889 if (IsExtLoad)
3890 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3891 else if (Unpacked && IsD16 && Ty.isVector())
3892 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3893 else
3894 LoadDstReg = Dst;
3895
3896 if (!VIndex)
3897 VIndex = B.buildConstant(S32, 0).getReg(0);
3898
3899 auto MIB = B.buildInstr(Opc)
3900 .addDef(LoadDstReg) // vdata
3901 .addUse(RSrc) // rsrc
3902 .addUse(VIndex) // vindex
3903 .addUse(VOffset) // voffset
3904 .addUse(SOffset) // soffset
3905 .addImm(ImmOffset); // offset(imm)
3906
3907 if (IsTyped)
3908 MIB.addImm(Format);
3909
3910 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3911 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3912 .addMemOperand(MMO);
3913
3914 if (LoadDstReg != Dst) {
3915 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3916
3917 // The result for extending loads was widened; truncate back to the original type.
3918 if (IsExtLoad)
3919 B.buildTrunc(Dst, LoadDstReg);
3920 else {
3921 // Repack to original 16-bit vector result
3922 // FIXME: G_TRUNC should work, but legalization currently fails
3923 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3924 SmallVector<Register, 4> Repack;
3925 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3926 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3927 B.buildMerge(Dst, Repack);
3928 }
3929 }
3930
3931 MI.eraseFromParent();
3932 return true;
3933}
3934
3935bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3936 MachineIRBuilder &B,
3937 bool IsInc) const {
3938 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3939 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3940 B.buildInstr(Opc)
3941 .addDef(MI.getOperand(0).getReg())
3942 .addUse(MI.getOperand(2).getReg())
3943 .addUse(MI.getOperand(3).getReg())
3944 .cloneMemRefs(MI);
3945 MI.eraseFromParent();
3946 return true;
3947}
3948
3949static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3950 switch (IntrID) {
3951 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3952 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3953 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3954 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3955 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3956 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3957 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3958 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3959 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3960 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3961 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3962 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3963 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3964 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3966 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3967 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3968 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3969 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3970 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3971 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3972 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3973 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3974 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3975 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3976 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3977 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3978 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3979 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3980 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3981 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3982 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3984 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3985 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3986 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3987 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3988 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3990 case Intrinsic::amdgcn_buffer_atomic_fadd:
3991 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3992 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3993 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3994 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3995 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3996 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3997 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3998 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3999 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4000 default:
4001 llvm_unreachable("unhandled atomic opcode");
4002 }
4003}
4004
4005bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4006 MachineIRBuilder &B,
4007 Intrinsic::ID IID) const {
4008 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4009 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4010 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4011
4012 Register Dst;
4013
4014 int OpOffset = 0;
4015 if (HasReturn) {
4016 // A few FP atomics do not support return values.
4017 Dst = MI.getOperand(0).getReg();
4018 } else {
4019 OpOffset = -1;
4020 }
4021
4022 Register VData = MI.getOperand(2 + OpOffset).getReg();
4023 Register CmpVal;
4024
4025 if (IsCmpSwap) {
4026 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4027 ++OpOffset;
4028 }
4029
4030 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4031 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4032
4033 // The struct intrinsic variants add one additional operand over raw.
4034 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4035 Register VIndex;
4036 if (HasVIndex) {
4037 VIndex = MI.getOperand(4 + OpOffset).getReg();
4038 ++OpOffset;
4039 }
4040
4041 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4042 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4043 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4044
4045 MachineMemOperand *MMO = *MI.memoperands_begin();
4046
4047 unsigned ImmOffset;
4048 unsigned TotalOffset;
4049 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4050 if (TotalOffset != 0)
4051 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4052
4053 if (!VIndex)
4054 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4055
4056 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4057
4058 if (HasReturn)
4059 MIB.addDef(Dst);
4060
4061 MIB.addUse(VData); // vdata
4062
4063 if (IsCmpSwap)
4064 MIB.addReg(CmpVal);
4065
4066 MIB.addUse(RSrc) // rsrc
4067 .addUse(VIndex) // vindex
4068 .addUse(VOffset) // voffset
4069 .addUse(SOffset) // soffset
4070 .addImm(ImmOffset) // offset(imm)
4071 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4072 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4073 .addMemOperand(MMO);
4074
4075 MI.eraseFromParent();
4076 return true;
4077}
4078
4079/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4080/// vector with s16 typed elements.
4081static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4082 SmallVectorImpl<Register> &PackedAddrs,
4083 unsigned ArgOffset,
4084 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4085 bool IsA16, bool IsG16) {
4086 const LLT S16 = LLT::scalar(16);
4087 const LLT V2S16 = LLT::vector(2, 16);
4088 auto EndIdx = Intr->VAddrEnd;
4089
4090 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4091 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4092 if (!SrcOp.isReg())
4093 continue; // _L to _LZ may have eliminated this.
4094
4095 Register AddrReg = SrcOp.getReg();
4096
4097 if (I < Intr->GradientStart) {
4098 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4099 PackedAddrs.push_back(AddrReg);
4100 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4101 (I >= Intr->CoordStart && !IsA16)) {
4102 // Handle any gradient or coordinate operands that should not be packed
4103 PackedAddrs.push_back(AddrReg);
4104 } else {
4105 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4106 // derivatives dx/dh and dx/dv are packed with undef.
4107 if (((I + 1) >= EndIdx) ||
4108 ((Intr->NumGradients / 2) % 2 == 1 &&
4109 (I == static_cast<unsigned>(Intr->GradientStart +
4110 (Intr->NumGradients / 2) - 1) ||
4111 I == static_cast<unsigned>(Intr->GradientStart +
4112 Intr->NumGradients - 1))) ||
4113 // Check for _L to _LZ optimization
4114 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4115 PackedAddrs.push_back(
4116 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4117 .getReg(0));
4118 } else {
4119 PackedAddrs.push_back(
4120 B.buildBuildVector(
4121 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4122 .getReg(0));
4123 ++I;
4124 }
4125 }
4126 }
4127}
4128
4129/// Convert from separate vaddr components to a single vector address register,
4130/// and replace the remaining operands with $noreg.
4131static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4132 int DimIdx, int NumVAddrs) {
4133 const LLT S32 = LLT::scalar(32);
4134
4135 SmallVector<Register, 8> AddrRegs;
4136 for (int I = 0; I != NumVAddrs; ++I) {
4137 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4138 if (SrcOp.isReg()) {
4139 AddrRegs.push_back(SrcOp.getReg());
4140 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4141 }
4142 }
4143
4144 int NumAddrRegs = AddrRegs.size();
4145 if (NumAddrRegs != 1) {
4146 // Round up to 8 elements for v5-v7
4147 // FIXME: Missing intermediate sized register classes and instructions.
4148 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4149 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4150 auto Undef = B.buildUndef(S32);
4151 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4152 NumAddrRegs = RoundedNumRegs;
4153 }
4154
4155 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4156 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4157 }
4158
4159 for (int I = 1; I != NumVAddrs; ++I) {
4160 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4161 if (SrcOp.isReg())
4162 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4163 }
4164}
4165
4166/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4167///
4168/// Depending on the subtarget, load/store with 16-bit element data need to be
4169/// rewritten to use the low half of 32-bit registers, or directly use a packed
4170/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4171/// registers.
4172///
4173/// We don't want to directly select image instructions just yet, but also want
4174/// to expose all register repacking to the legalizer/combiners. We also don't
4175/// want a selected instruction entering RegBankSelect. In order to avoid
4176/// defining a multitude of intermediate image instructions, directly hack on
4177/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4178/// now unnecessary arguments with $noreg.
4179bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4180 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4181 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4182
4183 const unsigned NumDefs = MI.getNumExplicitDefs();
4184 const unsigned ArgOffset = NumDefs + 1;
4185 bool IsTFE = NumDefs == 2;
    1: Assuming 'NumDefs' is not equal to 2
4186 // We are only processing the operands of d16 image operations on subtargets
4187 // that use the unpacked register layout, or need to repack the TFE result.
4188
4189 // TODO: Do we need to guard against already legalized intrinsics?
4190 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4191 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4192
4193 MachineRegisterInfo *MRI = B.getMRI();
4194 const LLT S32 = LLT::scalar(32);
4195 const LLT S16 = LLT::scalar(16);
4196 const LLT V2S16 = LLT::vector(2, 16);
4197
4198 unsigned DMask = 0;
4199
4200 // Check for 16 bit addresses and pack if true.
4201 LLT GradTy =
4202 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4203 LLT AddrTy =
4204 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4205 const bool IsG16 = GradTy == S16;
    2: Calling 'LLT::operator=='
    5: Returning from 'LLT::operator=='
4206 const bool IsA16 = AddrTy == S16;
    6: Calling 'LLT::operator=='
    8: Returning from 'LLT::operator=='
4207
4208 int DMaskLanes = 0;
4209 if (!BaseOpcode->Atomic) {
    9: Assuming field 'Atomic' is true
    10: Taking false branch
4210 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4211 if (BaseOpcode->Gather4) {
4212 DMaskLanes = 4;
4213 } else if (DMask != 0) {
4214 DMaskLanes = countPopulation(DMask);
4215 } else if (!IsTFE && !BaseOpcode->Store) {
4216 // If dmask is 0, this is a no-op load. This can be eliminated.
4217 B.buildUndef(MI.getOperand(0));
4218 MI.eraseFromParent();
4219 return true;
4220 }
4221 }
4222
4223 Observer.changingInstr(MI);
4224 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4225
4226 unsigned NewOpcode = NumDefs == 0 ?
    11: Assuming 'NumDefs' is not equal to 0
    12: '?' condition is false
4227 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4228
4229 // Track that we legalized this
4230 MI.setDesc(B.getTII().get(NewOpcode));
4231
4232 // Expecting to get an error flag since TFC is on - and dmask is 0. Force
4233 // dmask to be at least 1, otherwise the instruction will fail.
4234 if (IsTFE && DMask == 0) {
    12.1: 'IsTFE' is false
4235 DMask = 0x1;
4236 DMaskLanes = 1;
4237 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4238 }
4239
4240 if (BaseOpcode->Atomic) {
    12.2: Field 'Atomic' is true
    13: Taking true branch
4241 Register VData0 = MI.getOperand(2).getReg();
4242 LLT Ty = MRI->getType(VData0);
4243
4244 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4245 if (Ty.isVector())
    14: Calling 'LLT::isVector'
    16: Returning from 'LLT::isVector'
    17: Taking false branch
4246 return false;
4247
4248 if (BaseOpcode->AtomicX2) {
    18: Assuming field 'AtomicX2' is false
    19: Taking false branch
4249 Register VData1 = MI.getOperand(3).getReg();
4250 // The two values are packed in one register.
4251 LLT PackedTy = LLT::vector(2, Ty);
4252 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4253 MI.getOperand(2).setReg(Concat.getReg(0));
4254 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4255 }
4256 }
4257
4258 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4259
4260 // Optimize _L to _LZ when _L is zero
4261 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
    20: Assuming 'LZMappingInfo' is null
    21: Taking false branch
4262 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4263 const ConstantFP *ConstantLod;
4264
4265 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4266 m_GFCst(ConstantLod))) {
4267 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4268 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4269 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4270 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4271 Intr->Dim);
4272
4273 // The starting indexes should remain in the same place.
4274 --CorrectedNumVAddrs;
4275
4276 MI.getOperand(MI.getNumExplicitDefs())
4277 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4278 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4279 Intr = NewImageDimIntr;
4280 }
4281 }
4282 }
4283
4284 // Optimize _mip away, when 'lod' is zero
4285 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    22: Assuming the condition is false
    23: Taking false branch
4286 int64_t ConstantLod;
4287 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4288 m_ICst(ConstantLod))) {
4289 if (ConstantLod == 0) {
4290 // TODO: Change intrinsic opcode and remove operand instead of replacing
4291 // it with 0, as the _L to _LZ handling is done above.
4292 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4293 --CorrectedNumVAddrs;
4294 }
4295 }
4296 }
4297
4298 // Rewrite the addressing register layout before doing anything else.
4299 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    24: Assuming field 'Gradients' is false
4300 // 16 bit gradients are supported, but are tied to the A16 control
4301 // so both gradients and addresses must be 16 bit
4302 return false;
4303 }
4304
4305 if (IsA16 && !ST.hasA16()) {
    24.1: 'IsA16' is false
4306 // A16 not supported
4307 return false;
4308 }
4309
4310 if (IsA16 || IsG16) {
    24.2: 'IsA16' is false
    24.3: 'IsG16' is false
    25: Taking false branch
4311 if (Intr->NumVAddrs > 1) {
4312 SmallVector<Register, 4> PackedRegs;
4313
4314 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4315 IsG16);
4316
4317 // See also below in the non-a16 branch
4318 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4319
4320 if (!UseNSA && PackedRegs.size() > 1) {
4321 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4322 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4323 PackedRegs[0] = Concat.getReg(0);
4324 PackedRegs.resize(1);
4325 }
4326
4327 const unsigned NumPacked = PackedRegs.size();
4328 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4329 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4330 if (!SrcOp.isReg()) {
4331 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4332 continue;
4333 }
4334
4335 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4336
4337 if (I - Intr->VAddrStart < NumPacked)
4338 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4339 else
4340 SrcOp.setReg(AMDGPU::NoRegister);
4341 }
4342 }
4343 } else {
4344 // If the register allocator cannot place the address registers contiguously
4345 // without introducing moves, then using the non-sequential address encoding
4346 // is always preferable, since it saves VALU instructions and is usually a
4347 // wash in terms of code size or even better.
4348 //
4349 // However, we currently have no way of hinting to the register allocator
4350 // that MIMG addresses should be placed contiguously when it is possible to
4351 // do so, so force non-NSA for the common 2-address case as a heuristic.
4352 //
4353 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4354 // allocation when possible.
4355 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
    26: Assuming 'CorrectedNumVAddrs' is < 3
4356
4357 if (!UseNSA && Intr->NumVAddrs > 1)
    26.1: 'UseNSA' is false
    27: Assuming field 'NumVAddrs' is <= 1
    28: Taking false branch
4358 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4359 Intr->NumVAddrs);
4360 }
4361
4362 int Flags = 0;
4363 if (IsA16)
    28.1: 'IsA16' is false
    29: Taking false branch
4364 Flags |= 1;
4365 if (IsG16)
    29.1: 'IsG16' is false
    30: Taking false branch
4366 Flags |= 2;
4367 MI.addOperand(MachineOperand::CreateImm(Flags));
4368
4369 if (BaseOpcode->Store) { // No TFE for stores?
    31: Assuming field 'Store' is false
    32: Taking false branch
4370 // TODO: Handle dmask trim
4371 Register VData = MI.getOperand(1).getReg();
4372 LLT Ty = MRI->getType(VData);
4373 if (!Ty.isVector() || Ty.getElementType() != S16)
4374 return true;
4375
4376 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4377 if (RepackedReg != VData) {
4378 MI.getOperand(1).setReg(RepackedReg);
4379 }
4380
4381 return true;
4382 }
4383
4384 Register DstReg = MI.getOperand(0).getReg();
4385 LLT Ty = MRI->getType(DstReg);
4386 const LLT EltTy = Ty.getScalarType();
4387 const bool IsD16 = Ty.getScalarType() == S16;
    33: Calling 'LLT::operator=='
    36: Returning from 'LLT::operator=='
4388 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
    37: '?' condition is false
4389
4390 // Confirm that the return type is large enough for the dmask specified
4391 if (NumElts < DMaskLanes)
    37.1: 'NumElts' is >= 'DMaskLanes'
    38: Taking false branch
4392 return false;
4393
4394 if (NumElts > 4 || DMaskLanes > 4)
    38.1: 'NumElts' is <= 4
    38.2: 'DMaskLanes' is <= 4
    39: Taking false branch
4395 return false;
4396
4397 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
    39.1: 'DMaskLanes' is equal to 0
    40: '?' condition is true
4398 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4399
4400 // The raw dword aligned data component of the load. The only legal cases
4401 // where this matters should be when using the packed D16 format, for
4402 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4403 LLT RoundedTy;
4404
4405 // S32 vector to cover all data, plus TFE result element.
4406 LLT TFETy;
4407
4408 // Register type to use for each loaded component. Will be S32 or V2S16.
4409 LLT RegTy;
4410
4411 if (IsD16 && ST.hasUnpackedD16VMem()) {
    40.1: 'IsD16' is false
4412 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4413 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4414 RegTy = S32;
4415 } else {
4416 unsigned EltSize = EltTy.getSizeInBits();
4417 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4418 unsigned RoundedSize = 32 * RoundedElts;
4419 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    41: The result of the '/' expression is undefined
4420 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4421 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4422 }
4423
4424 // The return type does not need adjustment.
4425 // TODO: Should we change s16 case to s32 or <2 x s16>?
4426 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4427 return true;
4428
4429 Register Dst1Reg;
4430
4431 // Insert after the instruction.
4432 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4433
4434 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4435 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4436 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4437 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4438
4439 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4440
4441 MI.getOperand(0).setReg(NewResultReg);
4442
4443 // In the IR, TFE is supposed to be used with a 2 element struct return
4444  // type. The instruction really returns these two values in one contiguous
4445 // register, with one additional dword beyond the loaded data. Rewrite the
4446 // return type to use a single register result.
4447
4448 if (IsTFE) {
4449 Dst1Reg = MI.getOperand(1).getReg();
4450 if (MRI->getType(Dst1Reg) != S32)
4451 return false;
4452
4453 // TODO: Make sure the TFE operand bit is set.
4454 MI.RemoveOperand(1);
4455
4456 // Handle the easy case that requires no repack instructions.
4457 if (Ty == S32) {
4458 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4459 return true;
4460 }
4461 }
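
The comment at source lines 4443-4446 is the key to this block: in the IR the TFE form returns a two-element struct, but the machine instruction really produces the loaded data plus one extra status dword in a single contiguous run of 32-bit registers, so the return type is rewritten and split afterwards. The plain C++ model below only illustrates that layout under the assumption of N data dwords followed by one TFE dword; splitTfeResult is an invented name and this is not LLVM code.

#include <array>
#include <cstdint>
#include <cstdio>
#include <utility>

// Model "N data dwords followed by one TFE status dword" returned as one
// contiguous block, then split back into {data, tfe}.
template <std::size_t N>
static std::pair<std::array<std::uint32_t, N>, std::uint32_t>
splitTfeResult(const std::array<std::uint32_t, N + 1> &Raw) {
  std::array<std::uint32_t, N> Data{};
  for (std::size_t I = 0; I != N; ++I)
    Data[I] = Raw[I];      // the loaded data part
  return {Data, Raw[N]};   // the trailing dword is the TFE status
}

int main() {
  const std::array<std::uint32_t, 3> Raw = {0x11u, 0x22u, 0x0u}; // 2 data + TFE
  const auto Split = splitTfeResult<2>(Raw);
  std::printf("data0=%x data1=%x tfe=%x\n", Split.first[0], Split.first[1],
              Split.second);
  return 0;
}
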
4462
4463 // Now figure out how to copy the new result register back into the old
4464 // result.
4465 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4466
4467 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4468
4469 if (ResultNumRegs == 1) {
4470    assert(!IsTFE);
4471 ResultRegs[0] = NewResultReg;
4472 } else {
4473 // We have to repack into a new vector of some kind.
4474 for (int I = 0; I != NumDataRegs; ++I)
4475 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4476 B.buildUnmerge(ResultRegs, NewResultReg);
4477
4478 // Drop the final TFE element to get the data part. The TFE result is
4479 // directly written to the right place already.
4480 if (IsTFE)
4481 ResultRegs.resize(NumDataRegs);
4482 }
4483
4484 // For an s16 scalar result, we form an s32 result with a truncate regardless
4485 // of packed vs. unpacked.
4486 if (IsD16 && !Ty.isVector()) {
4487 B.buildTrunc(DstReg, ResultRegs[0]);
4488 return true;
4489 }
4490
4491 // Avoid a build/concat_vector of 1 entry.
4492 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4493 B.buildBitcast(DstReg, ResultRegs[0]);
4494 return true;
4495 }
4496
4497  assert(Ty.isVector());
4498
4499 if (IsD16) {
4500 // For packed D16 results with TFE enabled, all the data components are
4501 // S32. Cast back to the expected type.
4502 //
4503 // TODO: We don't really need to use load s32 elements. We would only need one
4504 // cast for the TFE result if a multiple of v2s16 was used.
4505 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4506 for (Register &Reg : ResultRegs)
4507 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4508 } else if (ST.hasUnpackedD16VMem()) {
4509 for (Register &Reg : ResultRegs)
4510 Reg = B.buildTrunc(S16, Reg).getReg(0);
4511 }
4512 }
4513
4514 auto padWithUndef = [&](LLT Ty, int NumElts) {
4515 if (NumElts == 0)
4516 return;
4517 Register Undef = B.buildUndef(Ty).getReg(0);
4518 for (int I = 0; I != NumElts; ++I)
4519 ResultRegs.push_back(Undef);
4520 };
4521
4522 // Pad out any elements eliminated due to the dmask.
4523 LLT ResTy = MRI->getType(ResultRegs[0]);
4524 if (!ResTy.isVector()) {
4525 padWithUndef(ResTy, NumElts - ResultRegs.size());
4526 B.buildBuildVector(DstReg, ResultRegs);
4527 return true;
4528 }
4529
4530  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4531 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4532
4533 // Deal with the one annoying legal case.
4534 const LLT V3S16 = LLT::vector(3, 16);
4535 if (Ty == V3S16) {
4536 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4537 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4538 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4539 return true;
4540 }
4541
4542 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4543 B.buildConcatVectors(DstReg, ResultRegs);
4544 return true;
4545}
4546
4547bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4548 LegalizerHelper &Helper, MachineInstr &MI) const {
4549 MachineIRBuilder &B = Helper.MIRBuilder;
4550 GISelChangeObserver &Observer = Helper.Observer;
4551
4552 Register Dst = MI.getOperand(0).getReg();
4553 LLT Ty = B.getMRI()->getType(Dst);
4554 unsigned Size = Ty.getSizeInBits();
4555 MachineFunction &MF = B.getMF();
4556
4557 Observer.changingInstr(MI);
4558
4559 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4560 Ty = getBitcastRegisterType(Ty);
4561 Helper.bitcastDst(MI, Ty, 0);
4562 Dst = MI.getOperand(0).getReg();
4563 B.setInsertPt(B.getMBB(), MI);
4564 }
4565
4566 // FIXME: We don't really need this intermediate instruction. The intrinsic
4567 // should be fixed to have a memory operand. Since it's readnone, we're not
4568 // allowed to add one.
4569 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4570 MI.RemoveOperand(1); // Remove intrinsic ID
4571
4572 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4573 // TODO: Should this use datalayout alignment?
4574 const unsigned MemSize = (Size + 7) / 8;
4575 const Align MemAlign(4);
4576 MachineMemOperand *MMO = MF.getMachineMemOperand(
4577 MachinePointerInfo(),
4578 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4579 MachineMemOperand::MOInvariant,
4580 MemSize, MemAlign);
4581 MI.addMemOperand(MF, MMO);
4582
4583 // There are no 96-bit result scalar loads, but widening to 128-bit should
4584 // always be legal. We may need to restore this to a 96-bit result if it turns
4585 // out this needs to be converted to a vector load during RegBankSelect.
4586 if (!isPowerOf2_32(Size)) {
4587 if (Ty.isVector())
4588 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4589 else
4590 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4591 }
4592
4593 Observer.changedInstr(MI);
4594 return true;
4595}
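
The comment at source lines 4583-4585 describes the widening rule used just above: a non-power-of-two result width such as 96 bits is widened to the next power of two, 128 bits. The standalone sketch below only illustrates that rounding; nextPowerOf2 is an invented helper, not the LLVM getPow2ScalarType implementation, though it is intended to match its effect on the bit width.

#include <cstdint>
#include <cstdio>

// Smallest power of two that is >= Bits (Bits assumed non-zero here).
static std::uint32_t nextPowerOf2(std::uint32_t Bits) {
  std::uint32_t P = 1;
  while (P < Bits)
    P <<= 1;
  return P;
}

int main() {
  std::printf("96 -> %u\n", nextPowerOf2(96));   // widened to 128
  std::printf("128 -> %u\n", nextPowerOf2(128)); // already a power of two
  return 0;
}
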
4596
4597// TODO: Move to selection
4598bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4599 MachineRegisterInfo &MRI,
4600 MachineIRBuilder &B) const {
4601 if (!ST.isTrapHandlerEnabled() ||
4602 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4603 return legalizeTrapEndpgm(MI, MRI, B);
4604
4605 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4606 switch (*HsaAbiVer) {
4607 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4608 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4609 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4610 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4611 return ST.supportsGetDoorbellID() ?
4612 legalizeTrapHsa(MI, MRI, B) :
4613 legalizeTrapHsaQueuePtr(MI, MRI, B);
4614 }
4615 }
4616
4617  llvm_unreachable("Unknown trap handler");
4618}
4619
4620bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4621 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4622 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4623 MI.eraseFromParent();
4624 return true;
4625}
4626
4627bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4628 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4629 // Pass queue pointer to trap handler as input, and insert trap instruction
4630 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4631 Register LiveIn =
4632 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4633 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4634 return false;
4635
4636 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4637 B.buildCopy(SGPR01, LiveIn);
4638 B.buildInstr(AMDGPU::S_TRAP)
4639 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4640 .addReg(SGPR01, RegState::Implicit);
4641
4642 MI.eraseFromParent();
4643 return true;
4644}
4645
4646bool AMDGPULegalizerInfo::legalizeTrapHsa(
4647 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4648 B.buildInstr(AMDGPU::S_TRAP)
4649 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4650 MI.eraseFromParent();
4651 return true;
4652}
4653
4654bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4655 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4656  // If this is a non-HSA path or the trap handler is disabled, report a
4657  // warning accordingly.
4658 if (!ST.isTrapHandlerEnabled() ||
4659 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4660 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4661 "debugtrap handler not supported",
4662 MI.getDebugLoc(), DS_Warning);
4663 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4664 Ctx.diagnose(NoTrap);
4665 } else {
4666 // Insert debug-trap instruction
4667 B.buildInstr(AMDGPU::S_TRAP)
4668 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4669 }
4670
4671 MI.eraseFromParent();
4672 return true;
4673}
4674
4675bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4676 MachineIRBuilder &B) const {
4677 MachineRegisterInfo &MRI = *B.getMRI();
4678 const LLT S16 = LLT::scalar(16);
4679 const LLT S32 = LLT::scalar(32);
4680
4681 Register DstReg = MI.getOperand(0).getReg();
4682 Register NodePtr = MI.getOperand(2).getReg();
4683 Register RayExtent = MI.getOperand(3).getReg();
4684 Register RayOrigin = MI.getOperand(4).getReg();
4685 Register RayDir = MI.getOperand(5).getReg();
4686 Register RayInvDir = MI.getOperand(6).getReg();
4687 Register TDescr = MI.getOperand(7).getReg();
4688
4689 if (!ST.hasGFX10_AEncoding()) {
4690 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4691 "intrinsic not supported on subtarget",
4692 MI.getDebugLoc());
4693 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4694 return false;
4695 }
4696
4697 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4698 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4699 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4700 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4701 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4702 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4703
4704 SmallVector<Register, 12> Ops;
4705 if (Is64) {
4706 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4707 Ops.push_back(Unmerge.getReg(0));
4708 Ops.push_back(Unmerge.getReg(1));
4709 } else {
4710 Ops.push_back(NodePtr);
4711 }
4712 Ops.push_back(RayExtent);
4713
4714 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4715 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4716 Ops.push_back(Unmerge.getReg(0));
4717 Ops.push_back(Unmerge.getReg(1));
4718 Ops.push_back(Unmerge.getReg(2));
4719 };
4720
4721 packLanes(RayOrigin);
4722 if (IsA16) {
4723 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4724 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4725 Register R1 = MRI.createGenericVirtualRegister(S32);
4726 Register R2 = MRI.createGenericVirtualRegister(S32);
4727 Register R3 = MRI.createGenericVirtualRegister(S32);
4728 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4729 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4730 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4731 Ops.push_back(R1);
4732 Ops.push_back(R2);
4733 Ops.push_back(R3);
4734 } else {
4735 packLanes(RayDir);
4736 packLanes(RayInvDir);
4737 }
4738
4739 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4740 .addDef(DstReg)
4741 .addImm(Opcode);
4742
4743 for (Register R : Ops) {
4744 MIB.addUse(R);
4745 }
4746
4747 MIB.addUse(TDescr)
4748 .addImm(IsA16 ? 1 : 0)
4749 .cloneMemRefs(MI);
4750
4751 MI.eraseFromParent();
4752 return true;
4753}
4754
4755bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4756 MachineInstr &MI) const {
4757 MachineIRBuilder &B = Helper.MIRBuilder;
4758 MachineRegisterInfo &MRI = *B.getMRI();
4759
4760 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4761 auto IntrID = MI.getIntrinsicID();
4762 switch (IntrID) {
4763 case Intrinsic::amdgcn_if:
4764 case Intrinsic::amdgcn_else: {
4765 MachineInstr *Br = nullptr;
4766 MachineBasicBlock *UncondBrTarget = nullptr;
4767 bool Negated = false;
4768 if (MachineInstr *BrCond =
4769 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4770 const SIRegisterInfo *TRI
4771 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4772
4773 Register Def = MI.getOperand(1).getReg();
4774 Register Use = MI.getOperand(3).getReg();
4775
4776 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4777
4778 if (Negated)
4779 std::swap(CondBrTarget, UncondBrTarget);
4780
4781 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4782 if (IntrID == Intrinsic::amdgcn_if) {
4783 B.buildInstr(AMDGPU::SI_IF)
4784 .addDef(Def)
4785 .addUse(Use)
4786 .addMBB(UncondBrTarget);
4787 } else {
4788 B.buildInstr(AMDGPU::SI_ELSE)
4789 .addDef(Def)
4790 .addUse(Use)
4791 .addMBB(UncondBrTarget);
4792 }
4793
4794 if (Br) {
4795 Br->getOperand(0).setMBB(CondBrTarget);
4796 } else {
4797 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4798 // since we're swapping branch targets it needs to be reinserted.
4799 // FIXME: IRTranslator should probably not do this
4800 B.buildBr(*CondBrTarget);
4801 }
4802
4803 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4804 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4805 MI.eraseFromParent();
4806 BrCond->eraseFromParent();
4807 return true;
4808 }
4809
4810 return false;
4811 }
4812 case Intrinsic::amdgcn_loop: {
4813 MachineInstr *Br = nullptr;
4814 MachineBasicBlock *UncondBrTarget = nullptr;
4815 bool Negated = false;
4816 if (MachineInstr *BrCond =
4817 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4818 const SIRegisterInfo *TRI
4819 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4820
4821 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4822 Register Reg = MI.getOperand(2).getReg();
4823
4824 if (Negated)
4825 std::swap(CondBrTarget, UncondBrTarget);
4826
4827 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4828 B.buildInstr(AMDGPU::SI_LOOP)
4829 .addUse(Reg)
4830 .addMBB(UncondBrTarget);
4831
4832 if (Br)
4833 Br->getOperand(0).setMBB(CondBrTarget);
4834 else
4835 B.buildBr(*CondBrTarget);
4836
4837 MI.eraseFromParent();
4838 BrCond->eraseFromParent();
4839 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4840 return true;
4841 }
4842
4843 return false;
4844 }
4845 case Intrinsic::amdgcn_kernarg_segment_ptr:
4846 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4847 // This only makes sense to call in a kernel, so just lower to null.
4848 B.buildConstant(MI.getOperand(0).getReg(), 0);
4849 MI.eraseFromParent();
4850 return true;
4851 }
4852
4853 return legalizePreloadedArgIntrin(
4854 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4855 case Intrinsic::amdgcn_implicitarg_ptr:
4856 return legalizeImplicitArgPtr(MI, MRI, B);
4857 case Intrinsic::amdgcn_workitem_id_x:
4858 return legalizePreloadedArgIntrin(MI, MRI, B,
4859 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4860 case Intrinsic::amdgcn_workitem_id_y:
4861 return legalizePreloadedArgIntrin(MI, MRI, B,
4862 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4863 case Intrinsic::amdgcn_workitem_id_z:
4864 return legalizePreloadedArgIntrin(MI, MRI, B,
4865 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4866 case Intrinsic::amdgcn_workgroup_id_x:
4867 return legalizePreloadedArgIntrin(MI, MRI, B,
4868 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4869 case Intrinsic::amdgcn_workgroup_id_y:
4870 return legalizePreloadedArgIntrin(MI, MRI, B,
4871 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4872 case Intrinsic::amdgcn_workgroup_id_z:
4873 return legalizePreloadedArgIntrin(MI, MRI, B,
4874 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4875 case Intrinsic::amdgcn_dispatch_ptr:
4876 return legalizePreloadedArgIntrin(MI, MRI, B,
4877 AMDGPUFunctionArgInfo::DISPATCH_PTR);
4878 case Intrinsic::amdgcn_queue_ptr:
4879 return legalizePreloadedArgIntrin(MI, MRI, B,
4880 AMDGPUFunctionArgInfo::QUEUE_PTR);
4881 case Intrinsic::amdgcn_implicit_buffer_ptr:
4882 return legalizePreloadedArgIntrin(
4883 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4884 case Intrinsic::amdgcn_dispatch_id:
4885 return legalizePreloadedArgIntrin(MI, MRI, B,
4886 AMDGPUFunctionArgInfo::DISPATCH_ID);
4887 case Intrinsic::amdgcn_fdiv_fast:
4888 return legalizeFDIVFastIntrin(MI, MRI, B);
4889 case Intrinsic::amdgcn_is_shared:
4890 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4891 case Intrinsic::amdgcn_is_private:
4892 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4893 case Intrinsic::amdgcn_wavefrontsize: {
4894 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4895 MI.eraseFromParent();
4896 return true;
4897 }
4898 case Intrinsic::amdgcn_s_buffer_load:
4899 return legalizeSBufferLoad(Helper, MI);
4900 case Intrinsic::amdgcn_raw_buffer_store:
4901 case Intrinsic::amdgcn_struct_buffer_store:
4902 return legalizeBufferStore(MI, MRI, B, false, false);
4903 case Intrinsic::amdgcn_raw_buffer_store_format:
4904 case Intrinsic::amdgcn_struct_buffer_store_format:
4905 return legalizeBufferStore(MI, MRI, B, false, true);
4906 case Intrinsic::amdgcn_raw_tbuffer_store:
4907 case Intrinsic::amdgcn_struct_tbuffer_store:
4908 return legalizeBufferStore(MI, MRI, B, true, true);
4909 case Intrinsic::amdgcn_raw_buffer_load:
4910 case Intrinsic::amdgcn_struct_buffer_load:
4911 return legalizeBufferLoad(MI, MRI, B, false, false);
4912 case Intrinsic::amdgcn_raw_buffer_load_format:
4913 case Intrinsic::amdgcn_struct_buffer_load_format:
4914 return legalizeBufferLoad(MI, MRI, B, true, false);
4915 case Intrinsic::amdgcn_raw_tbuffer_load:
4916 case Intrinsic::amdgcn_struct_tbuffer_load:
4917 return legalizeBufferLoad(MI, MRI, B, true, true);
4918 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4919 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4920 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4921 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4922 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4923 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4924 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4925 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4926 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4927 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4928 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4929 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4930 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4931 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4932 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4933 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4934 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4935 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4936 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4937 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4938 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4939 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4940 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4941 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4942 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4943 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4944 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4945 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4946 case Intrinsic::amdgcn_buffer_atomic_fadd:
4947 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4948 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4949 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4950 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4951 return legalizeBufferAtomic(MI, B, IntrID);
4952 case Intrinsic::amdgcn_atomic_inc:
4953 return legalizeAtomicIncDec(MI, B, true);
4954 case Intrinsic::amdgcn_atomic_dec:
4955 return legalizeAtomicIncDec(MI, B, false);
4956 case Intrinsic::trap:
4957 return legalizeTrapIntrinsic(MI, MRI, B);
4958 case Intrinsic::debugtrap:
4959 return legalizeDebugTrapIntrinsic(MI, MRI, B);
4960 case Intrinsic::amdgcn_rsq_clamp:
4961 return legalizeRsqClampIntrinsic(MI, MRI, B);
4962 case Intrinsic::amdgcn_ds_fadd:
4963 case Intrinsic::amdgcn_ds_fmin:
4964 case Intrinsic::amdgcn_ds_fmax:
4965 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4966 case Intrinsic::amdgcn_image_bvh_intersect_ray:
4967 return legalizeBVHIntrinsic(MI, B);
4968 default: {
4969 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4970 AMDGPU::getImageDimIntrinsicInfo(IntrID))
4971 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4972 return true;
4973 }
4974 }
4975
4976 return true;
4977}

/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/include/llvm/Support/LowLevelTypeImpl.h

1//== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// Implement a low-level type suitable for MachineInstr level instruction
10/// selection.
11///
12/// For a type attached to a MachineInstr, we only care about 2 details: total
13/// size and the number of vector lanes (if any). Accordingly, there are 4
14/// possible valid type-kinds:
15///
16/// * `sN` for scalars and aggregates
17/// * `<N x sM>` for vectors, which must have at least 2 elements.
18/// * `pN` for pointers
19///
20/// Other information required for correct selection is expected to be carried
21/// by the opcode, or non-type flags. For example the distinction between G_ADD
22/// and G_FADD for int/float or fast-math flags.
23///
24//===----------------------------------------------------------------------===//
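
The type-kinds listed in the comment above map directly onto the factory functions declared later in this header. The short usage sketch below assumes the LLVM headers are available on the include path; the concrete sizes and the address space are illustrative only.

#include "llvm/Support/LowLevelTypeImpl.h"
#include <cstdio>

int main() {
  const llvm::LLT S32 = llvm::LLT::scalar(32);       // sN
  const llvm::LLT V4S16 = llvm::LLT::vector(4, 16);  // <N x sM>, N must be >= 2
  const llvm::LLT P0 = llvm::LLT::pointer(0, 64);    // pN in address space 0
  std::printf("%u %u %u\n", S32.getSizeInBits(), V4S16.getSizeInBits(),
              P0.getSizeInBits()); // 32 64 64
  return 0;
}
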
25
26#ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
27#define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
28
29#include "llvm/ADT/DenseMapInfo.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/MachineValueType.h"
32#include <cassert>
33
34namespace llvm {
35
36class DataLayout;
37class Type;
38class raw_ostream;
39
40class LLT {
41public:
42 /// Get a low-level scalar or aggregate "bag of bits".
43 static LLT scalar(unsigned SizeInBits) {
44    assert(SizeInBits > 0 && "invalid scalar size");
45 return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0,
46 SizeInBits, /*AddressSpace=*/0};
47 }
48
49 /// Get a low-level pointer in the given address space.
50 static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
51    assert(SizeInBits > 0 && "invalid pointer size");
52 return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0,
53 SizeInBits, AddressSpace};
54 }
55
56 /// Get a low-level vector of some number of elements and element width.
57 /// \p NumElements must be at least 2.
58 static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) {
59    assert(NumElements > 1 && "invalid number of vector elements");
60    assert(ScalarSizeInBits > 0 && "invalid vector element size");
61 return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements,
62 ScalarSizeInBits, /*AddressSpace=*/0};
63 }
64
65 /// Get a low-level vector of some number of elements and element type.
66 static LLT vector(uint16_t NumElements, LLT ScalarTy) {
67    assert(NumElements > 1 && "invalid number of vector elements");
68    assert(!ScalarTy.isVector() && "invalid vector element type");
69 return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements,
70 ScalarTy.getSizeInBits(),
71 ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0};
72 }
73
74 static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) {
75 return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy);
76 }
77
78 static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) {
79 return scalarOrVector(NumElements, LLT::scalar(ScalarSize));
80 }
81
82 explicit LLT(bool isPointer, bool isVector, uint16_t NumElements,
83 unsigned SizeInBits, unsigned AddressSpace) {
84 init(isPointer, isVector, NumElements, SizeInBits, AddressSpace);
85 }
86 explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {}
87
88 explicit LLT(MVT VT);
89
90 bool isValid() const { return RawData != 0; }
91
92 bool isScalar() const { return isValid() && !IsPointer && !IsVector; }
93
94 bool isPointer() const { return isValid() && IsPointer && !IsVector; }
95
96 bool isVector() const { return isValid() && IsVector; }
15
Returning zero, which participates in a condition later
97
98 /// Returns the number of elements in a vector LLT. Must only be called on
99 /// vector types.
100 uint16_t getNumElements() const {
101     assert(IsVector && "cannot get number of elements on scalar/aggregate");
102 if (!IsPointer)
103 return getFieldValue(VectorElementsFieldInfo);
104 else
105 return getFieldValue(PointerVectorElementsFieldInfo);
106 }
107
108 /// Returns the total size of the type. Must only be called on sized types.
109 unsigned getSizeInBits() const {
110 if (isPointer() || isScalar())
111 return getScalarSizeInBits();
112 return getScalarSizeInBits() * getNumElements();
113 }
114
115 /// Returns the total size of the type in bytes, i.e. number of whole bytes
116 /// needed to represent the size in bits. Must only be called on sized types.
117 unsigned getSizeInBytes() const {
118 return (getSizeInBits() + 7) / 8;
119 }
120
121 LLT getScalarType() const {
122 return isVector() ? getElementType() : *this;
123 }
124
125 /// If this type is a vector, return a vector with the same number of elements
126 /// but the new element type. Otherwise, return the new element type.
127 LLT changeElementType(LLT NewEltTy) const {
128 return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy;
129 }
130
131 /// If this type is a vector, return a vector with the same number of elements
132 /// but the new element size. Otherwise, return the new element type. Invalid
133 /// for pointer types. For pointer types, use changeElementType.
134 LLT changeElementSize(unsigned NewEltSize) const {
135     assert(!getScalarType().isPointer() &&
136            "invalid to directly change element size for pointers");
137 return isVector() ? LLT::vector(getNumElements(), NewEltSize)
138 : LLT::scalar(NewEltSize);
139 }
140
141 /// Return a vector or scalar with the same element type and the new number of
142 /// elements.
143 LLT changeNumElements(unsigned NewNumElts) const {
144 return LLT::scalarOrVector(NewNumElts, getScalarType());
145 }
146
147 /// Return a type that is \p Factor times smaller. Reduces the number of
148 /// elements if this is a vector, or the bitwidth for scalar/pointers. Does
149 /// not attempt to handle cases that aren't evenly divisible.
150 LLT divide(int Factor) const {
151     assert(Factor != 1);
152 if (isVector()) {
153       assert(getNumElements() % Factor == 0);
154 return scalarOrVector(getNumElements() / Factor, getElementType());
155 }
156
157     assert(getSizeInBits() % Factor == 0);
158 return scalar(getSizeInBits() / Factor);
159 }
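
As a usage note for divide() above: on a vector it divides the element count, on a scalar it divides the bit width. A minimal hedged sketch, assuming the LLVM headers are on the include path:

#include "llvm/Support/LowLevelTypeImpl.h"
#include <cstdio>

int main() {
  const llvm::LLT V4S32 = llvm::LLT::vector(4, 32);
  const llvm::LLT HalfVec = V4S32.divide(2);                    // <2 x s32>
  const llvm::LLT HalfScalar = llvm::LLT::scalar(64).divide(2); // s32
  std::printf("%u %u\n", (unsigned)HalfVec.getNumElements(),
              HalfScalar.getSizeInBits()); // 2 32
  return 0;
}
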
160
161 bool isByteSized() const { return (getSizeInBits() & 7) == 0; }
162
163 unsigned getScalarSizeInBits() const {
164     assert(RawData != 0 && "Invalid Type");
165 if (!IsVector) {
166 if (!IsPointer)
167 return getFieldValue(ScalarSizeFieldInfo);
168 else
169 return getFieldValue(PointerSizeFieldInfo);
170 } else {
171 if (!IsPointer)
172 return getFieldValue(VectorSizeFieldInfo);
173 else
174 return getFieldValue(PointerVectorSizeFieldInfo);
175 }
176 }
177
178 unsigned getAddressSpace() const {
179     assert(RawData != 0 && "Invalid Type");
180     assert(IsPointer && "cannot get address space of non-pointer type");
181 if (!IsVector)
182 return getFieldValue(PointerAddressSpaceFieldInfo);
183 else
184 return getFieldValue(PointerVectorAddressSpaceFieldInfo);
185 }
186
187 /// Returns the vector's element type. Only valid for vector types.
188 LLT getElementType() const {
189     assert(isVector() && "cannot get element type of scalar/aggregate");
190 if (IsPointer)
191 return pointer(getAddressSpace(), getScalarSizeInBits());
192 else
193 return scalar(getScalarSizeInBits());
194 }
195
196 void print(raw_ostream &OS) const;
197
198#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
199   LLVM_DUMP_METHOD void dump() const {
200 print(dbgs());
201 dbgs() << '\n';
202 }
203#endif
204
205 bool operator==(const LLT &RHS) const {
206     return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector &&
3
Assuming 'IsPointer' is not equal to 'RHS.IsPointer'
4
Returning zero, which participates in a condition later
6.1
'IsPointer' is not equal to 'RHS.IsPointer'
7
Returning zero, which participates in a condition later
34
Assuming 'IsPointer' is not equal to 'RHS.IsPointer'
35
Returning zero, which participates in a condition later
207            RHS.RawData == RawData;
208 }
209
210 bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
211
212 friend struct DenseMapInfo<LLT>;
213 friend class GISelInstProfileBuilder;
214
215private:
216 /// LLT is packed into 64 bits as follows:
217 /// isPointer : 1
218 /// isVector : 1
219 /// with 62 bits remaining for Kind-specific data, packed in bitfields
220 /// as described below. As there isn't a simple portable way to pack bits
221 /// into bitfields, here the different fields in the packed structure are
222 /// described in static const *Field variables. Each of these variables
223 /// is a 2-element array, with the first element describing the bitfield size
224 /// and the second element describing the bitfield offset.
225 typedef int BitFieldInfo[2];
226 ///
227 /// This is how the bitfields are packed per Kind:
228 /// * Invalid:
229 /// gets encoded as RawData == 0, as that is an invalid encoding, since for
230 /// valid encodings, SizeInBits/SizeOfElement must be larger than 0.
231 /// * Non-pointer scalar (isPointer == 0 && isVector == 0):
232 /// SizeInBits: 32;
233 static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0};
234 /// * Pointer (isPointer == 1 && isVector == 0):
235 /// SizeInBits: 16;
236 /// AddressSpace: 24;
237 static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0};
238 static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{
239 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]};
240 /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1):
241 /// NumElements: 16;
242 /// SizeOfElement: 32;
243 static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0};
244 static const constexpr BitFieldInfo VectorSizeFieldInfo{
245 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]};
246 /// * Vector-of-pointer (isPointer == 1 && isVector == 1):
247 /// NumElements: 16;
248 /// SizeOfElement: 16;
249 /// AddressSpace: 24;
250 static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0};
251 static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{
252 16,
253 PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]};
254 static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{
255 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]};
256
257 uint64_t IsPointer : 1;
258 uint64_t IsVector : 1;
259 uint64_t RawData : 62;
260
261 static uint64_t getMask(const BitFieldInfo FieldInfo) {
262 const int FieldSizeInBits = FieldInfo[0];
263 return (((uint64_t)1) << FieldSizeInBits) - 1;
264 }
265 static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) {
266     assert(Val <= Mask && "Value too large for field");
267 return (Val & Mask) << Shift;
268 }
269 static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) {
270 return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]);
271 }
272 uint64_t getFieldValue(const BitFieldInfo FieldInfo) const {
273 return getMask(FieldInfo) & (RawData >> FieldInfo[1]);
274 }
275
276 void init(bool IsPointer, bool IsVector, uint16_t NumElements,
277 unsigned SizeInBits, unsigned AddressSpace) {
278 this->IsPointer = IsPointer;
279 this->IsVector = IsVector;
280 if (!IsVector) {
281 if (!IsPointer)
282 RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo);
283 else
284 RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) |
285 maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo);
286 } else {
287       assert(NumElements > 1 && "invalid number of vector elements");
288 if (!IsPointer)
289 RawData = maskAndShift(NumElements, VectorElementsFieldInfo) |
290 maskAndShift(SizeInBits, VectorSizeFieldInfo);
291 else
292 RawData =
293 maskAndShift(NumElements, PointerVectorElementsFieldInfo) |
294 maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) |
295 maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo);
296 }
297 }
298
299 uint64_t getUniqueRAWLLTData() const {
300 return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 |
301 ((uint64_t)IsVector);
302 }
303};
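
The packing scheme documented inside the class (BitFieldInfo {size, offset} pairs decoded by getMask, maskAndShift and getFieldValue) is a generic mask-and-shift bitfield encoding. The standalone sketch below reproduces that technique using the pointer layout described above (SizeInBits: 16 bits at offset 0, AddressSpace: 24 bits at offset 16); it is an illustration, not the LLT implementation.

#include <cassert>
#include <cstdint>
#include <cstdio>

typedef int BitFieldInfo[2]; // {size in bits, offset in bits}

static constexpr BitFieldInfo PointerSizeField{16, 0};
static constexpr BitFieldInfo PointerAddressSpaceField{24, 16};

static std::uint64_t getMask(const BitFieldInfo Field) {
  return (std::uint64_t(1) << Field[0]) - 1;
}

static std::uint64_t maskAndShift(std::uint64_t Val, const BitFieldInfo Field) {
  assert(Val <= getMask(Field) && "value too large for field");
  return (Val & getMask(Field)) << Field[1];
}

static std::uint64_t getFieldValue(std::uint64_t Raw, const BitFieldInfo Field) {
  return getMask(Field) & (Raw >> Field[1]);
}

int main() {
  // Pack a 64-bit pointer in address space 3, then read both fields back.
  const std::uint64_t Raw = maskAndShift(64, PointerSizeField) |
                            maskAndShift(3, PointerAddressSpaceField);
  std::printf("size=%llu as=%llu\n",
              (unsigned long long)getFieldValue(Raw, PointerSizeField),
              (unsigned long long)getFieldValue(Raw, PointerAddressSpaceField));
  return 0;
}
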
304
305inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
306 Ty.print(OS);
307 return OS;
308}
309
310template<> struct DenseMapInfo<LLT> {
311 static inline LLT getEmptyKey() {
312 LLT Invalid;
313 Invalid.IsPointer = true;
314 return Invalid;
315 }
316 static inline LLT getTombstoneKey() {
317 LLT Invalid;
318 Invalid.IsVector = true;
319 return Invalid;
320 }
321 static inline unsigned getHashValue(const LLT &Ty) {
322 uint64_t Val = Ty.getUniqueRAWLLTData();
323 return DenseMapInfo<uint64_t>::getHashValue(Val);
324 }
325 static bool isEqual(const LLT &LHS, const LLT &RHS) {
326 return LHS == RHS;
327 }
328};
329
330}
331
332#endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H