Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 4453, column 49
Division by zero
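
The flagged division at line 4453 lies beyond the lines reproduced below; reports of this kind typically point at a divisor derived from a type or memory-size query that the analyzer cannot prove non-zero on every path. A minimal, hypothetical sketch of the usual guard pattern — the names are invented and this is not the code at line 4453:

// Hypothetical illustration of guarding a divisor the analyzer cannot
// prove non-zero; none of these names appear at line 4453.
#include <cassert>

static unsigned piecesFor(unsigned TotalSizeInBits, unsigned EltSizeInBits) {
  // The analyzer flags divisions whose divisor may be zero on some path.
  assert(EltSizeInBits != 0 && "expected a sized element type");
  if (EltSizeInBits == 0) // defensive: avoids UB if asserts are compiled out
    return 0;
  return (TotalSizeInBits + EltSizeInBits - 1) / EltSizeInBits;
}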

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-06-21-164211-33944-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
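
The line above is the low-level clang -cc1 invocation the analyzer recorded for this translation unit. A report of this form is normally produced by running the scan-build driver over the build rather than invoking -cc1 by hand; a minimal sketch, assuming a make-based build tree and a hypothetical output directory:

scan-build -o /tmp/analyzer-reports make -C build-llvm -j8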

/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use "
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeNumElements(Pow2NElts);
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
61
62/// \returns true if this is an odd sized vector which should widen by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99 };
100}
101
102static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 unsigned Size = Ty.getSizeInBits();
107 unsigned Pieces = (Size + 63) / 64;
108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110 };
111}
112
113// Increase the number of vector elements to reach the next multiple of 32-bit
114// type.
115static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116 return [=](const LegalityQuery &Query) {
117 const LLT Ty = Query.Types[TypeIdx];
118
119 const LLT EltTy = Ty.getElementType();
120 const int Size = Ty.getSizeInBits();
121 const int EltSize = EltTy.getSizeInBits();
122 const int NextMul32 = (Size + 31) / 32;
123
124 assert(EltSize < 32);
125
126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128 };
129}
130
131static LLT getBitcastRegisterType(const LLT Ty) {
132 const unsigned Size = Ty.getSizeInBits();
133
134 LLT CoercedTy;
135 if (Size <= 32) {
136 // <2 x s8> -> s16
137 // <4 x s8> -> s32
138 return LLT::scalar(Size);
139 }
140
141 return LLT::scalarOrVector(Size / 32, 32);
142}
143
144static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 const LLT Ty = Query.Types[TypeIdx];
147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148 };
149}
150
151static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152 return [=](const LegalityQuery &Query) {
153 const LLT Ty = Query.Types[TypeIdx];
154 unsigned Size = Ty.getSizeInBits();
155 assert(Size % 32 == 0);
156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157 };
158}
159
160static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161 return [=](const LegalityQuery &Query) {
162 const LLT QueryTy = Query.Types[TypeIdx];
163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164 };
165}
166
167static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168 return [=](const LegalityQuery &Query) {
169 const LLT QueryTy = Query.Types[TypeIdx];
170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171 };
172}
173
174static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175 return [=](const LegalityQuery &Query) {
176 const LLT QueryTy = Query.Types[TypeIdx];
177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178 };
179}
180
181static bool isRegisterSize(unsigned Size) {
182 return Size % 32 == 0 && Size <= MaxRegisterSize;
183}
184
185static bool isRegisterVectorElementType(LLT EltTy) {
186 const int EltSize = EltTy.getSizeInBits();
187 return EltSize == 16 || EltSize % 32 == 0;
188}
189
190static bool isRegisterVectorType(LLT Ty) {
191 const int EltSize = Ty.getElementType().getSizeInBits();
192 return EltSize == 32 || EltSize == 64 ||
193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194 EltSize == 128 || EltSize == 256;
195}
196
197static bool isRegisterType(LLT Ty) {
198 if (!isRegisterSize(Ty.getSizeInBits()))
199 return false;
200
201 if (Ty.isVector())
202 return isRegisterVectorType(Ty);
203
204 return true;
205}
206
207// Any combination of 32 or 64-bit elements up to the maximum register size, and
208// multiples of v2s16.
209static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210 return [=](const LegalityQuery &Query) {
211 return isRegisterType(Query.Types[TypeIdx]);
212 };
213}
214
215static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216 return [=](const LegalityQuery &Query) {
217 const LLT QueryTy = Query.Types[TypeIdx];
218 if (!QueryTy.isVector())
219 return false;
220 const LLT EltTy = QueryTy.getElementType();
221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222 };
223}
224
225// If we have a truncating store or an extending load with a data size larger
226// than 32-bits, we need to reduce to a 32-bit type.
227static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228 return [=](const LegalityQuery &Query) {
229 const LLT Ty = Query.Types[TypeIdx];
230 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232 };
233}
234
235// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236// handle some operations by just promoting the register during
237// selection. There are also d16 loads on GFX9+ which preserve the high bits.
238static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239 bool IsLoad) {
240 switch (AS) {
241 case AMDGPUAS::PRIVATE_ADDRESS:
242 // FIXME: Private element size.
243 return ST.enableFlatScratch() ? 128 : 32;
244 case AMDGPUAS::LOCAL_ADDRESS:
245 return ST.useDS128() ? 128 : 64;
246 case AMDGPUAS::GLOBAL_ADDRESS:
247 case AMDGPUAS::CONSTANT_ADDRESS:
248 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249 // Treat constant and global as identical. SMRD loads are sometimes usable for
250 // global loads (ideally constant address space should be eliminated)
251 // depending on the context. Legality cannot be context dependent, but
252 // RegBankSelect can split the load as necessary depending on the pointer
253 // register bank/uniformity and if the memory is invariant or not written in a
254 // kernel.
255 return IsLoad ? 512 : 128;
256 default:
257 // Flat addresses may contextually need to be split to 32-bit parts if they
258 // may alias scratch depending on the subtarget.
259 return 128;
260 }
261}
262
263static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264 const LegalityQuery &Query) {
265 const LLT Ty = Query.Types[0];
266
267 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269
270 unsigned RegSize = Ty.getSizeInBits();
271 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273 unsigned AS = Query.Types[1].getAddressSpace();
274
275 // All of these need to be custom lowered to cast the pointer operand.
276 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277 return false;
278
279 // Do not handle extending vector loads.
280 if (Ty.isVector() && MemSize != RegSize)
281 return false;
282
283 // TODO: We should be able to widen loads if the alignment is high enough, but
284 // we also need to modify the memory access size.
285#if 0
286 // Accept widening loads based on alignment.
287 if (IsLoad && MemSize < Size)
288 MemSize = std::max(MemSize, Align);
289#endif
290
291 // Only 1-byte and 2-byte to 32-bit extloads are valid.
292 if (MemSize != RegSize && RegSize != 32)
293 return false;
294
295 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296 return false;
297
298 switch (MemSize) {
299 case 8:
300 case 16:
301 case 32:
302 case 64:
303 case 128:
304 break;
305 case 96:
306 if (!ST.hasDwordx3LoadStores())
307 return false;
308 break;
309 case 256:
310 case 512:
311 // These may contextually need to be broken down.
312 break;
313 default:
314 return false;
315 }
316
317 assert(RegSize >= MemSize);
318
319 if (AlignBits < MemSize) {
320 const SITargetLowering *TLI = ST.getTargetLowering();
321 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322 Align(AlignBits / 8)))
323 return false;
324 }
325
326 return true;
327}
328
329// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
330// work around this. Eventually it should ignore the type for loads and only care
331// about the size. Return true in cases where we will work around this for now by
332// bitcasting.
333static bool loadStoreBitcastWorkaround(const LLT Ty) {
334 if (EnableNewLegality)
335 return false;
336
337 const unsigned Size = Ty.getSizeInBits();
338 if (Size <= 64)
339 return false;
340 if (!Ty.isVector())
341 return true;
342
343 LLT EltTy = Ty.getElementType();
344 if (EltTy.isPointer())
345 return true;
346
347 unsigned EltSize = EltTy.getSizeInBits();
348 return EltSize != 32 && EltSize != 64;
349}
350
351static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
352 const LLT Ty = Query.Types[0];
353 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354 !loadStoreBitcastWorkaround(Ty);
355}
356
357/// Return true if a load or store of the type should be lowered with a bitcast
358/// to a different type.
359static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360 const unsigned MemSizeInBits) {
361 const unsigned Size = Ty.getSizeInBits();
362 if (Size != MemSizeInBits)
363 return Size <= 32 && Ty.isVector();
364
365 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366 return true;
367 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368 !isRegisterVectorElementType(Ty.getElementType());
369}
370
371/// Return true if we should legalize a load by widening an odd sized memory
372/// access up to the alignment. Note that in this case the memory access itself
373/// changes, not the size of the result register.
374static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375 unsigned AlignInBits, unsigned AddrSpace,
376 unsigned Opcode) {
377 // We don't want to widen cases that are naturally legal.
378 if (isPowerOf2_32(SizeInBits))
379 return false;
380
381 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382 // end up widening these for a scalar load during RegBankSelect, since there
383 // aren't 96-bit scalar loads.
384 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385 return false;
386
387 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388 return false;
389
390 // A load is known dereferenceable up to the alignment, so it's legal to widen
391 // to it.
392 //
393 // TODO: Could check dereferenceable for less aligned cases.
394 unsigned RoundedSize = NextPowerOf2(SizeInBits);
395 if (AlignInBits < RoundedSize)
396 return false;
397
398 // Do not widen if it would introduce a slow unaligned load.
399 const SITargetLowering *TLI = ST.getTargetLowering();
400 bool Fast = false;
401 return TLI->allowsMisalignedMemoryAccessesImpl(
402 RoundedSize, AddrSpace, Align(AlignInBits / 8),
403 MachineMemOperand::MOLoad, &Fast) &&
404 Fast;
405}
406
407static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408 unsigned Opcode) {
409 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410 return false;
411
412 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413 Query.MMODescrs[0].AlignInBits,
414 Query.Types[1].getAddressSpace(), Opcode);
415}
416
417AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418 const GCNTargetMachine &TM)
419 : ST(ST_) {
420 using namespace TargetOpcode;
421
422 auto GetAddrSpacePtr = [&TM](unsigned AS) {
423 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424 };
425
426 const LLT S1 = LLT::scalar(1);
427 const LLT S8 = LLT::scalar(8);
428 const LLT S16 = LLT::scalar(16);
429 const LLT S32 = LLT::scalar(32);
430 const LLT S64 = LLT::scalar(64);
431 const LLT S128 = LLT::scalar(128);
432 const LLT S256 = LLT::scalar(256);
433 const LLT S512 = LLT::scalar(512);
434 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435
436 const LLT V2S8 = LLT::vector(2, 8);
437 const LLT V2S16 = LLT::vector(2, 16);
438 const LLT V4S16 = LLT::vector(4, 16);
439
440 const LLT V2S32 = LLT::vector(2, 32);
441 const LLT V3S32 = LLT::vector(3, 32);
442 const LLT V4S32 = LLT::vector(4, 32);
443 const LLT V5S32 = LLT::vector(5, 32);
444 const LLT V6S32 = LLT::vector(6, 32);
445 const LLT V7S32 = LLT::vector(7, 32);
446 const LLT V8S32 = LLT::vector(8, 32);
447 const LLT V9S32 = LLT::vector(9, 32);
448 const LLT V10S32 = LLT::vector(10, 32);
449 const LLT V11S32 = LLT::vector(11, 32);
450 const LLT V12S32 = LLT::vector(12, 32);
451 const LLT V13S32 = LLT::vector(13, 32);
452 const LLT V14S32 = LLT::vector(14, 32);
453 const LLT V15S32 = LLT::vector(15, 32);
454 const LLT V16S32 = LLT::vector(16, 32);
455 const LLT V32S32 = LLT::vector(32, 32);
456
457 const LLT V2S64 = LLT::vector(2, 64);
458 const LLT V3S64 = LLT::vector(3, 64);
459 const LLT V4S64 = LLT::vector(4, 64);
460 const LLT V5S64 = LLT::vector(5, 64);
461 const LLT V6S64 = LLT::vector(6, 64);
462 const LLT V7S64 = LLT::vector(7, 64);
463 const LLT V8S64 = LLT::vector(8, 64);
464 const LLT V16S64 = LLT::vector(16, 64);
465
466 std::initializer_list<LLT> AllS32Vectors =
467 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469 std::initializer_list<LLT> AllS64Vectors =
470 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471
472 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479
480 const LLT CodePtr = FlatPtr;
481
482 const std::initializer_list<LLT> AddrSpaces64 = {
483 GlobalPtr, ConstantPtr, FlatPtr
484 };
485
486 const std::initializer_list<LLT> AddrSpaces32 = {
487 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488 };
489
490 const std::initializer_list<LLT> FPTypesBase = {
491 S32, S64
492 };
493
494 const std::initializer_list<LLT> FPTypes16 = {
495 S32, S64, S16
496 };
497
498 const std::initializer_list<LLT> FPTypesPK16 = {
499 S32, S64, S16, V2S16
500 };
501
502 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503
504 auto &LegacyInfo = getLegacyLegalizerInfo();
505 LegacyInfo.setAction({G_BRCOND, S1},
506 LegacyLegalizeActions::Legal); // VCC branches
507 LegacyInfo.setAction({G_BRCOND, S32},
508 LegacyLegalizeActions::Legal); // SCC branches
509
510 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
511 // elements for v3s16
512 getActionDefinitionsBuilder(G_PHI)
513 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
514 .legalFor(AllS32Vectors)
515 .legalFor(AllS64Vectors)
516 .legalFor(AddrSpaces64)
517 .legalFor(AddrSpaces32)
518 .legalIf(isPointer(0))
519 .clampScalar(0, S16, S256)
520 .widenScalarToNextPow2(0, 32)
521 .clampMaxNumElements(0, S32, 16)
522 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
523 .scalarize(0);
524
525 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
526 // Full set of gfx9 features.
527 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
528 .legalFor({S32, S16, V2S16})
529 .clampScalar(0, S16, S32)
530 .clampMaxNumElements(0, S16, 2)
531 .scalarize(0)
532 .widenScalarToNextPow2(0, 32);
533
534 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
535 .legalFor({S32, S16, V2S16}) // Clamp modifier
536 .minScalarOrElt(0, S16)
537 .clampMaxNumElements(0, S16, 2)
538 .scalarize(0)
539 .widenScalarToNextPow2(0, 32)
540 .lower();
541 } else if (ST.has16BitInsts()) {
542 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
543 .legalFor({S32, S16})
544 .clampScalar(0, S16, S32)
545 .scalarize(0)
546 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
547
548 // Technically the saturating operations require clamp bit support, but this
549 // was introduced at the same time as 16-bit operations.
550 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
551 .legalFor({S32, S16}) // Clamp modifier
552 .minScalar(0, S16)
553 .scalarize(0)
554 .widenScalarToNextPow2(0, 16)
555 .lower();
556
557 // We're just lowering this, but it helps get a better result to try to
558 // coerce to the desired type first.
559 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
560 .minScalar(0, S16)
561 .scalarize(0)
562 .lower();
563 } else {
564 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
565 .legalFor({S32})
566 .clampScalar(0, S32, S32)
567 .scalarize(0);
568
569 if (ST.hasIntClamp()) {
570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571 .legalFor({S32}) // Clamp modifier.
572 .scalarize(0)
573 .minScalarOrElt(0, S32)
574 .lower();
575 } else {
576 // Clamp bit support was added in VI, along with 16-bit operations.
577 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578 .minScalar(0, S32)
579 .scalarize(0)
580 .lower();
581 }
582
583 // FIXME: DAG expansion gets better results. The widening uses the smaller
584 // range values and goes for the min/max lowering directly.
585 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
586 .minScalar(0, S32)
587 .scalarize(0)
588 .lower();
589 }
590
591 getActionDefinitionsBuilder(
592 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
593 .customFor({S32, S64})
594 .clampScalar(0, S32, S64)
595 .widenScalarToNextPow2(0, 32)
596 .scalarize(0);
597
598 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
599 .legalFor({S32})
600 .maxScalarOrElt(0, S32);
601
602 if (ST.hasVOP3PInsts()) {
603 Mulh
604 .clampMaxNumElements(0, S8, 2)
605 .lowerFor({V2S8});
606 }
607
608 Mulh
609 .scalarize(0)
610 .lower();
611
612 // Report legal for any types we can handle anywhere. For the cases only legal
613 // on the SALU, RegBankSelect will be able to re-legalize.
614 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
615 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
616 .clampScalar(0, S32, S64)
617 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
618 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
619 .widenScalarToNextPow2(0)
620 .scalarize(0);
621
622 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
623 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
624 .legalFor({{S32, S1}, {S32, S32}})
625 .minScalar(0, S32)
626 // TODO: .scalarize(0)
627 .lower();
628
629 getActionDefinitionsBuilder(G_BITCAST)
630 // Don't worry about the size constraint.
631 .legalIf(all(isRegisterType(0), isRegisterType(1)))
632 .lower();
633
634
635 getActionDefinitionsBuilder(G_CONSTANT)
636 .legalFor({S1, S32, S64, S16, GlobalPtr,
637 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
638 .legalIf(isPointer(0))
639 .clampScalar(0, S32, S64)
640 .widenScalarToNextPow2(0);
641
642 getActionDefinitionsBuilder(G_FCONSTANT)
643 .legalFor({S32, S64, S16})
644 .clampScalar(0, S16, S64);
645
646 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
647 .legalIf(isRegisterType(0))
648 // s1 and s16 are special cases because they have legal operations on
649 // them, but don't really occupy registers in the normal way.
650 .legalFor({S1, S16})
651 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
652 .clampScalarOrElt(0, S32, MaxScalar)
653 .widenScalarToNextPow2(0, 32)
654 .clampMaxNumElements(0, S32, 16);
655
656 LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
657 LegacyLegalizeActions::Legal);
658
659 // If the amount is divergent, we have to do a wave reduction to get the
660 // maximum value, so this is expanded during RegBankSelect.
661 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662 .legalFor({{PrivatePtr, S32}});
663
664 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665 .customIf(typeIsNot(0, PrivatePtr));
666
667 LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);
668
669 auto &FPOpActions = getActionDefinitionsBuilder(
670 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671 .legalFor({S32, S64});
672 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673 .customFor({S32, S64});
674 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675 .customFor({S32, S64});
676
677 if (ST.has16BitInsts()) {
678 if (ST.hasVOP3PInsts())
679 FPOpActions.legalFor({S16, V2S16});
680 else
681 FPOpActions.legalFor({S16});
682
683 TrigActions.customFor({S16});
684 FDIVActions.customFor({S16});
685 }
686
687 auto &MinNumMaxNum = getActionDefinitionsBuilder({
688 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689
690 if (ST.hasVOP3PInsts()) {
691 MinNumMaxNum.customFor(FPTypesPK16)
692 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693 .clampMaxNumElements(0, S16, 2)
694 .clampScalar(0, S16, S64)
695 .scalarize(0);
696 } else if (ST.has16BitInsts()) {
697 MinNumMaxNum.customFor(FPTypes16)
698 .clampScalar(0, S16, S64)
699 .scalarize(0);
700 } else {
701 MinNumMaxNum.customFor(FPTypesBase)
702 .clampScalar(0, S32, S64)
703 .scalarize(0);
704 }
705
706 if (ST.hasVOP3PInsts())
707 FPOpActions.clampMaxNumElements(0, S16, 2);
708
709 FPOpActions
710 .scalarize(0)
711 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712
713 TrigActions
714 .scalarize(0)
715 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716
717 FDIVActions
718 .scalarize(0)
719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720
721 getActionDefinitionsBuilder({G_FNEG, G_FABS})
722 .legalFor(FPTypesPK16)
723 .clampMaxNumElements(0, S16, 2)
724 .scalarize(0)
725 .clampScalar(0, S16, S64);
726
727 if (ST.has16BitInsts()) {
728 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729 .legalFor({S32, S64, S16})
730 .scalarize(0)
731 .clampScalar(0, S16, S64);
732 } else {
733 getActionDefinitionsBuilder(G_FSQRT)
734 .legalFor({S32, S64})
735 .scalarize(0)
736 .clampScalar(0, S32, S64);
737
738 if (ST.hasFractBug()) {
739 getActionDefinitionsBuilder(G_FFLOOR)
740 .customFor({S64})
741 .legalFor({S32, S64})
742 .scalarize(0)
743 .clampScalar(0, S32, S64);
744 } else {
745 getActionDefinitionsBuilder(G_FFLOOR)
746 .legalFor({S32, S64})
747 .scalarize(0)
748 .clampScalar(0, S32, S64);
749 }
750 }
751
752 getActionDefinitionsBuilder(G_FPTRUNC)
753 .legalFor({{S32, S64}, {S16, S32}})
754 .scalarize(0)
755 .lower();
756
757 getActionDefinitionsBuilder(G_FPEXT)
758 .legalFor({{S64, S32}, {S32, S16}})
759 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760 .scalarize(0);
761
762 getActionDefinitionsBuilder(G_FSUB)
763 // Use actual fsub instruction
764 .legalFor({S32})
765 // Must use fadd + fneg
766 .lowerFor({S64, S16, V2S16})
767 .scalarize(0)
768 .clampScalar(0, S32, S64);
769
770 // Whether this is legal depends on the floating point mode for the function.
771 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773 FMad.customFor({S32, S16});
774 else if (ST.hasMadMacF32Insts())
775 FMad.customFor({S32});
776 else if (ST.hasMadF16())
777 FMad.customFor({S16});
778 FMad.scalarize(0)
779 .lower();
780
781 auto &FRem = getActionDefinitionsBuilder(G_FREM);
782 if (ST.has16BitInsts()) {
783 FRem.customFor({S16, S32, S64});
784 } else {
785 FRem.minScalar(0, S32)
786 .customFor({S32, S64});
787 }
788 FRem.scalarize(0);
789
790 // TODO: Do we need to clamp maximum bitwidth?
791 getActionDefinitionsBuilder(G_TRUNC)
792 .legalIf(isScalar(0))
793 .legalFor({{V2S16, V2S32}})
794 .clampMaxNumElements(0, S16, 2)
795 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
796 // situations (like an invalid implicit use), we don't want to infinite loop
797 // in the legalizer.
798 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799 .alwaysLegal();
800
801 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803 {S32, S1}, {S64, S1}, {S16, S1}})
804 .scalarize(0)
805 .clampScalar(0, S32, S64)
806 .widenScalarToNextPow2(1, 32);
807
808 // TODO: Split s1->s64 during regbankselect for VALU.
809 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811 .lowerFor({{S32, S64}})
812 .lowerIf(typeIs(1, S1))
813 .customFor({{S64, S64}});
814 if (ST.has16BitInsts())
815 IToFP.legalFor({{S16, S16}});
816 IToFP.clampScalar(1, S32, S64)
817 .minScalar(0, S32)
818 .scalarize(0)
819 .widenScalarToNextPow2(1);
820
821 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823 .customFor({{S64, S32}, {S64, S64}})
824 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825 if (ST.has16BitInsts())
826 FPToI.legalFor({{S16, S16}});
827 else
828 FPToI.minScalar(1, S32);
829
830 FPToI.minScalar(0, S32)
831 .widenScalarToNextPow2(0, 32)
832 .scalarize(0)
833 .lower();
834
835 // Lower roundeven into G_FRINT
836 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837 .scalarize(0)
838 .lower();
839
840 if (ST.has16BitInsts()) {
841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842 .legalFor({S16, S32, S64})
843 .clampScalar(0, S16, S64)
844 .scalarize(0);
845 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847 .legalFor({S32, S64})
848 .clampScalar(0, S32, S64)
849 .scalarize(0);
850 } else {
851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852 .legalFor({S32})
853 .customFor({S64})
854 .clampScalar(0, S32, S64)
855 .scalarize(0);
856 }
857
858 getActionDefinitionsBuilder(G_PTR_ADD)
859 .legalIf(all(isPointer(0), sameSize(0, 1)))
860 .scalarize(0)
861 .scalarSameSizeAs(1, 0);
862
863 getActionDefinitionsBuilder(G_PTRMASK)
864 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865 .scalarSameSizeAs(1, 0)
866 .scalarize(0);
867
868 auto &CmpBuilder =
869 getActionDefinitionsBuilder(G_ICMP)
870 // The compare output type differs based on the register bank of the output,
871 // so make both s1 and s32 legal.
872 //
873 // Scalar compares producing output in scc will be promoted to s32, as that
874 // is the allocatable register type that will be needed for the copy from
875 // scc. This will be promoted during RegBankSelect, and we assume something
876 // before that won't try to use s32 result types.
877 //
878 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879 // bank.
880 .legalForCartesianProduct(
881 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882 .legalForCartesianProduct(
883 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884 if (ST.has16BitInsts()) {
885 CmpBuilder.legalFor({{S1, S16}});
886 }
887
888 CmpBuilder
889 .widenScalarToNextPow2(1)
890 .clampScalar(1, S32, S64)
891 .scalarize(0)
892 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893
894 getActionDefinitionsBuilder(G_FCMP)
895 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896 .widenScalarToNextPow2(1)
897 .clampScalar(1, S32, S64)
898 .scalarize(0);
899
900 // FIXME: fpow has a selection pattern that should move to custom lowering.
901 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902 if (ST.has16BitInsts())
903 Exp2Ops.legalFor({S32, S16});
904 else
905 Exp2Ops.legalFor({S32});
906 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907 Exp2Ops.scalarize(0);
908
909 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910 if (ST.has16BitInsts())
911 ExpOps.customFor({{S32}, {S16}});
912 else
913 ExpOps.customFor({S32});
914 ExpOps.clampScalar(0, MinScalarFPTy, S32)
915 .scalarize(0);
916
917 getActionDefinitionsBuilder(G_FPOWI)
918 .clampScalar(0, MinScalarFPTy, S32)
919 .lower();
920
921 // The 64-bit versions produce 32-bit results, but only on the SALU.
922 getActionDefinitionsBuilder(G_CTPOP)
923 .legalFor({{S32, S32}, {S32, S64}})
924 .clampScalar(0, S32, S32)
925 .clampScalar(1, S32, S64)
926 .scalarize(0)
927 .widenScalarToNextPow2(0, 32)
928 .widenScalarToNextPow2(1, 32);
929
930 // The hardware instructions return a different result on 0 than the generic
931 // instructions expect. The hardware produces -1, but these produce the
932 // bitwidth.
933 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934 .scalarize(0)
935 .clampScalar(0, S32, S32)
936 .clampScalar(1, S32, S64)
937 .widenScalarToNextPow2(0, 32)
938 .widenScalarToNextPow2(1, 32)
939 .lower();
940
941 // The 64-bit versions produce 32-bit results, but only on the SALU.
942 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943 .legalFor({{S32, S32}, {S32, S64}})
944 .clampScalar(0, S32, S32)
945 .clampScalar(1, S32, S64)
946 .scalarize(0)
947 .widenScalarToNextPow2(0, 32)
948 .widenScalarToNextPow2(1, 32);
949
950 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951 // RegBankSelect.
952 getActionDefinitionsBuilder(G_BITREVERSE)
953 .legalFor({S32, S64})
954 .clampScalar(0, S32, S64)
955 .scalarize(0)
956 .widenScalarToNextPow2(0);
957
958 if (ST.has16BitInsts()) {
959 getActionDefinitionsBuilder(G_BSWAP)
960 .legalFor({S16, S32, V2S16})
961 .clampMaxNumElements(0, S16, 2)
962 // FIXME: Fixing non-power-of-2 before clamp is workaround for
963 // narrowScalar limitation.
964 .widenScalarToNextPow2(0)
965 .clampScalar(0, S16, S32)
966 .scalarize(0);
967
968 if (ST.hasVOP3PInsts()) {
969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970 .legalFor({S32, S16, V2S16})
971 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972 .clampMaxNumElements(0, S16, 2)
973 .minScalar(0, S16)
974 .widenScalarToNextPow2(0)
975 .scalarize(0)
976 .lower();
977 } else {
978 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979 .legalFor({S32, S16})
980 .widenScalarToNextPow2(0)
981 .minScalar(0, S16)
982 .scalarize(0)
983 .lower();
984 }
985 } else {
986 // TODO: Should have same legality without v_perm_b32
987 getActionDefinitionsBuilder(G_BSWAP)
988 .legalFor({S32})
989 .lowerIf(scalarNarrowerThan(0, 32))
990 // FIXME: Fixing non-power-of-2 before clamp is workaround for
991 // narrowScalar limitation.
992 .widenScalarToNextPow2(0)
993 .maxScalar(0, S32)
994 .scalarize(0)
995 .lower();
996
997 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998 .legalFor({S32})
999 .minScalar(0, S32)
1000 .widenScalarToNextPow2(0)
1001 .scalarize(0)
1002 .lower();
1003 }
1004
1005 getActionDefinitionsBuilder(G_INTTOPTR)
1006 // List the common cases
1007 .legalForCartesianProduct(AddrSpaces64, {S64})
1008 .legalForCartesianProduct(AddrSpaces32, {S32})
1009 .scalarize(0)
1010 // Accept any address space as long as the size matches
1011 .legalIf(sameSize(0, 1))
1012 .widenScalarIf(smallerThan(1, 0),
1013 [](const LegalityQuery &Query) {
1014 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015 })
1016 .narrowScalarIf(largerThan(1, 0),
1017 [](const LegalityQuery &Query) {
1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019 });
1020
1021 getActionDefinitionsBuilder(G_PTRTOINT)
1022 // List the common cases
1023 .legalForCartesianProduct(AddrSpaces64, {S64})
1024 .legalForCartesianProduct(AddrSpaces32, {S32})
1025 .scalarize(0)
1026 // Accept any address space as long as the size matches
1027 .legalIf(sameSize(0, 1))
1028 .widenScalarIf(smallerThan(0, 1),
1029 [](const LegalityQuery &Query) {
1030 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031 })
1032 .narrowScalarIf(
1033 largerThan(0, 1),
1034 [](const LegalityQuery &Query) {
1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036 });
1037
1038 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039 .scalarize(0)
1040 .custom();
1041
1042 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043 bool IsLoad) -> bool {
1044 const LLT DstTy = Query.Types[0];
1045
1046 // Split vector extloads.
1047 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049
1050 if (MemSize < DstTy.getSizeInBits())
1051 MemSize = std::max(MemSize, AlignBits);
1052
1053 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054 return true;
1055
1056 const LLT PtrTy = Query.Types[1];
1057 unsigned AS = PtrTy.getAddressSpace();
1058 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059 return true;
1060
1061 // Catch weird sized loads that don't evenly divide into the access sizes
1062 // TODO: May be able to widen depending on alignment etc.
1063 unsigned NumRegs = (MemSize + 31) / 32;
1064 if (NumRegs == 3) {
1065 if (!ST.hasDwordx3LoadStores())
1066 return true;
1067 } else {
1068 // If the alignment allows, these should have been widened.
1069 if (!isPowerOf2_32(NumRegs))
1070 return true;
1071 }
1072
1073 if (AlignBits < MemSize) {
1074 const SITargetLowering *TLI = ST.getTargetLowering();
1075 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076 Align(AlignBits / 8));
1077 }
1078
1079 return false;
1080 };
1081
1082 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1085
1086 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087 // LDS
1088 // TODO: Unsupported flat for SI.
1089
1090 for (unsigned Op : {G_LOAD, G_STORE}) {
1091 const bool IsStore = Op == G_STORE;
1092
1093 auto &Actions = getActionDefinitionsBuilder(Op);
1094 // Explicitly list some common cases.
1095 // TODO: Does this help compile time at all?
1096 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097 {V2S32, GlobalPtr, 64, GlobalAlign32},
1098 {V4S32, GlobalPtr, 128, GlobalAlign32},
1099 {S64, GlobalPtr, 64, GlobalAlign32},
1100 {V2S64, GlobalPtr, 128, GlobalAlign32},
1101 {V2S16, GlobalPtr, 32, GlobalAlign32},
1102 {S32, GlobalPtr, 8, GlobalAlign8},
1103 {S32, GlobalPtr, 16, GlobalAlign16},
1104
1105 {S32, LocalPtr, 32, 32},
1106 {S64, LocalPtr, 64, 32},
1107 {V2S32, LocalPtr, 64, 32},
1108 {S32, LocalPtr, 8, 8},
1109 {S32, LocalPtr, 16, 16},
1110 {V2S16, LocalPtr, 32, 32},
1111
1112 {S32, PrivatePtr, 32, 32},
1113 {S32, PrivatePtr, 8, 8},
1114 {S32, PrivatePtr, 16, 16},
1115 {V2S16, PrivatePtr, 32, 32},
1116
1117 {S32, ConstantPtr, 32, GlobalAlign32},
1118 {V2S32, ConstantPtr, 64, GlobalAlign32},
1119 {V4S32, ConstantPtr, 128, GlobalAlign32},
1120 {S64, ConstantPtr, 64, GlobalAlign32},
1121 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122 Actions.legalIf(
1123 [=](const LegalityQuery &Query) -> bool {
1124 return isLoadStoreLegal(ST, Query);
1125 });
1126
1127 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1128 // 64-bits.
1129 //
1130 // TODO: Should generalize bitcast action into coerce, which will also cover
1131 // inserting addrspacecasts.
1132 Actions.customIf(typeIs(1, Constant32Ptr));
1133
1134 // Turn any illegal element vectors into something easier to deal
1135 // with. These will ultimately produce 32-bit scalar shifts to extract the
1136 // parts anyway.
1137 //
1138 // For odd 16-bit element vectors, prefer to split those into pieces with
1139 // 16-bit vector parts.
1140 Actions.bitcastIf(
1141 [=](const LegalityQuery &Query) -> bool {
1142 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143 Query.MMODescrs[0].SizeInBits);
1144 }, bitcastToRegisterType(0));
1145
1146 if (!IsStore) {
1147 // Widen suitably aligned loads by loading extra bytes. The standard
1148 // legalization actions can't properly express widening memory operands.
1149 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150 return shouldWidenLoad(ST, Query, G_LOAD);
1151 });
1152 }
1153
1154 // FIXME: load/store narrowing should be moved to lower action
1155 Actions
1156 .narrowScalarIf(
1157 [=](const LegalityQuery &Query) -> bool {
1158 return !Query.Types[0].isVector() &&
1159 needToSplitMemOp(Query, Op == G_LOAD);
1160 },
1161 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162 const LLT DstTy = Query.Types[0];
1163 const LLT PtrTy = Query.Types[1];
1164
1165 const unsigned DstSize = DstTy.getSizeInBits();
1166 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167
1168 // Split extloads.
1169 if (DstSize > MemSize)
1170 return std::make_pair(0, LLT::scalar(MemSize));
1171
1172 if (!isPowerOf2_32(DstSize)) {
1173 // We're probably decomposing an odd sized store. Try to split
1174 // to the widest type. TODO: Account for alignment. As-is it
1175 // should be OK, since the new parts will be further legalized.
1176 unsigned FloorSize = PowerOf2Floor(DstSize);
1177 return std::make_pair(0, LLT::scalar(FloorSize));
1178 }
1179
1180 if (DstSize > 32 && (DstSize % 32 != 0)) {
1181 // FIXME: Need a way to specify non-extload of larger size if
1182 // suitably aligned.
1183 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184 }
1185
1186 unsigned MaxSize = maxSizeForAddrSpace(ST,
1187 PtrTy.getAddressSpace(),
1188 Op == G_LOAD);
1189 if (MemSize > MaxSize)
1190 return std::make_pair(0, LLT::scalar(MaxSize));
1191
1192 unsigned Align = Query.MMODescrs[0].AlignInBits;
1193 return std::make_pair(0, LLT::scalar(Align));
1194 })
1195 .fewerElementsIf(
1196 [=](const LegalityQuery &Query) -> bool {
1197 return Query.Types[0].isVector() &&
1198 needToSplitMemOp(Query, Op == G_LOAD);
1199 },
1200 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201 const LLT DstTy = Query.Types[0];
1202 const LLT PtrTy = Query.Types[1];
1203
1204 LLT EltTy = DstTy.getElementType();
1205 unsigned MaxSize = maxSizeForAddrSpace(ST,
1206 PtrTy.getAddressSpace(),
1207 Op == G_LOAD);
1208
1209 // FIXME: Handle widened to power of 2 results better. This ends
1210 // up scalarizing.
1211 // FIXME: 3 element stores scalarized on SI
1212
1213 // Split if it's too large for the address space.
1214 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215 unsigned NumElts = DstTy.getNumElements();
1216 unsigned EltSize = EltTy.getSizeInBits();
1217
1218 if (MaxSize % EltSize == 0) {
1219 return std::make_pair(
1220 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1221 }
1222
1223 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1224
1225 // FIXME: Refine when odd breakdowns handled
1226 // The scalars will need to be re-legalized.
1227 if (NumPieces == 1 || NumPieces >= NumElts ||
1228 NumElts % NumPieces != 0)
1229 return std::make_pair(0, EltTy);
1230
1231 return std::make_pair(0,
1232 LLT::vector(NumElts / NumPieces, EltTy));
1233 }
1234
1235 // FIXME: We could probably handle weird extending loads better.
1236 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1237 if (DstTy.getSizeInBits() > MemSize)
1238 return std::make_pair(0, EltTy);
1239
1240 unsigned EltSize = EltTy.getSizeInBits();
1241 unsigned DstSize = DstTy.getSizeInBits();
1242 if (!isPowerOf2_32(DstSize)) {
1243 // We're probably decomposing an odd sized store. Try to split
1244 // to the widest type. TODO: Account for alignment. As-is it
1245 // should be OK, since the new parts will be further legalized.
1246 unsigned FloorSize = PowerOf2Floor(DstSize);
1247 return std::make_pair(
1248 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1249 }
1250
1251 // Need to split because of alignment.
1252 unsigned Align = Query.MMODescrs[0].AlignInBits;
1253 if (EltSize > Align &&
1254 (EltSize / Align < DstTy.getNumElements())) {
1255 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1256 }
1257
1258 // May need relegalization for the scalars.
1259 return std::make_pair(0, EltTy);
1260 })
1261 .lowerIfMemSizeNotPow2()
1262 .minScalar(0, S32)
1263 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1264 .widenScalarToNextPow2(0)
1265 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1266 .lower();
1267 }
1268
1269 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1270 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1271 {S32, GlobalPtr, 16, 2 * 8},
1272 {S32, LocalPtr, 8, 8},
1273 {S32, LocalPtr, 16, 16},
1274 {S32, PrivatePtr, 8, 8},
1275 {S32, PrivatePtr, 16, 16},
1276 {S32, ConstantPtr, 8, 8},
1277 {S32, ConstantPtr, 16, 2 * 8}})
1278 .legalIf(
1279 [=](const LegalityQuery &Query) -> bool {
1280 return isLoadStoreLegal(ST, Query);
1281 });
1282
1283 if (ST.hasFlatAddressSpace()) {
1284 ExtLoads.legalForTypesWithMemDesc(
1285 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1286 }
1287
1288 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1289 // 64-bits.
1290 //
1291 // TODO: Should generalize bitcast action into coerce, which will also cover
1292 // inserting addrspacecasts.
1293 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1294
1295 ExtLoads.clampScalar(0, S32, S32)
1296 .widenScalarToNextPow2(0)
1297 .unsupportedIfMemSizeNotPow2()
1298 .lower();
1299
1300 auto &Atomics = getActionDefinitionsBuilder(
1301 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1302 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1303 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1304 G_ATOMICRMW_UMIN})
1305 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1306 {S64, GlobalPtr}, {S64, LocalPtr},
1307 {S32, RegionPtr}, {S64, RegionPtr}});
1308 if (ST.hasFlatAddressSpace()) {
1309 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1310 }
1311
1312 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1313 if (ST.hasLDSFPAtomics()) {
1314 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1315 if (ST.hasGFX90AInsts())
1316 Atomic.legalFor({{S64, LocalPtr}});
1317 }
1318 if (ST.hasAtomicFaddInsts())
1319 Atomic.legalFor({{S32, GlobalPtr}});
1320
1321 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1322 // demarshalling
1323 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1324 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1325 {S32, FlatPtr}, {S64, FlatPtr}})
1326 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1327 {S32, RegionPtr}, {S64, RegionPtr}});
1328 // TODO: Pointer types, any 32-bit or 64-bit vector
1329
1330 // Condition should be s32 for scalar, s1 for vector.
1331 getActionDefinitionsBuilder(G_SELECT)
1332 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1333 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1334 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1335 .clampScalar(0, S16, S64)
1336 .scalarize(1)
1337 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1338 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1339 .clampMaxNumElements(0, S32, 2)
1340 .clampMaxNumElements(0, LocalPtr, 2)
1341 .clampMaxNumElements(0, PrivatePtr, 2)
1342 .scalarize(0)
1343 .widenScalarToNextPow2(0)
1344 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345
1346 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1347 // be more flexible with the shift amount type.
1348 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1349 .legalFor({{S32, S32}, {S64, S32}});
1350 if (ST.has16BitInsts()) {
1351 if (ST.hasVOP3PInsts()) {
1352 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1353 .clampMaxNumElements(0, S16, 2);
1354 } else
1355 Shifts.legalFor({{S16, S16}});
1356
1357 // TODO: Support 16-bit shift amounts for all types
1358 Shifts.widenScalarIf(
1359 [=](const LegalityQuery &Query) {
1360 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1361 // 32-bit amount.
1362 const LLT ValTy = Query.Types[0];
1363 const LLT AmountTy = Query.Types[1];
1364 return ValTy.getSizeInBits() <= 16 &&
1365 AmountTy.getSizeInBits() < 16;
1366 }, changeTo(1, S16));
1367 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1368 Shifts.clampScalar(1, S32, S32);
1369 Shifts.clampScalar(0, S16, S64);
1370 Shifts.widenScalarToNextPow2(0, 16);
1371
1372 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1373 .minScalar(0, S16)
1374 .scalarize(0)
1375 .lower();
1376 } else {
1377 // Make sure we legalize the shift amount type first, as the general
1378 // expansion for the shifted type will produce much worse code if it hasn't
1379 // been truncated already.
1380 Shifts.clampScalar(1, S32, S32);
1381 Shifts.clampScalar(0, S32, S64);
1382 Shifts.widenScalarToNextPow2(0, 32);
1383
1384 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1385 .minScalar(0, S32)
1386 .scalarize(0)
1387 .lower();
1388 }
1389 Shifts.scalarize(0);
1390
1391 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1392 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1393 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1394 unsigned IdxTypeIdx = 2;
1395
1396 getActionDefinitionsBuilder(Op)
1397 .customIf([=](const LegalityQuery &Query) {
1398 const LLT EltTy = Query.Types[EltTypeIdx];
1399 const LLT VecTy = Query.Types[VecTypeIdx];
1400 const LLT IdxTy = Query.Types[IdxTypeIdx];
1401 const unsigned EltSize = EltTy.getSizeInBits();
1402 return (EltSize == 32 || EltSize == 64) &&
1403 VecTy.getSizeInBits() % 32 == 0 &&
1404 VecTy.getSizeInBits() <= MaxRegisterSize &&
1405 IdxTy.getSizeInBits() == 32;
1406 })
1407 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1408 bitcastToVectorElement32(VecTypeIdx))
1409 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1410 .bitcastIf(
1411 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1412 [=](const LegalityQuery &Query) {
1413 // For > 64-bit element types, try to turn this into a 64-bit
1414 // element vector since we may be able to do better indexing
1415 // if this is scalar. If not, fall back to 32.
1416 const LLT EltTy = Query.Types[EltTypeIdx];
1417 const LLT VecTy = Query.Types[VecTypeIdx];
1418 const unsigned DstEltSize = EltTy.getSizeInBits();
1419 const unsigned VecSize = VecTy.getSizeInBits();
1420
1421 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1422 return std::make_pair(
1423 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1424 })
1425 .clampScalar(EltTypeIdx, S32, S64)
1426 .clampScalar(VecTypeIdx, S32, S64)
1427 .clampScalar(IdxTypeIdx, S32, S32)
1428 .clampMaxNumElements(VecTypeIdx, S32, 32)
1429 // TODO: Clamp elements for 64-bit vectors?
1430 // It should only be necessary with variable indexes.
1431 // As a last resort, lower to the stack
1432 .lower();
1433 }
1434
1435 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436 .unsupportedIf([=](const LegalityQuery &Query) {
1437 const LLT &EltTy = Query.Types[1].getElementType();
1438 return Query.Types[0] != EltTy;
1439 });
1440
1441 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444
1445 // FIXME: Doesn't handle extract of illegal sizes.
1446 getActionDefinitionsBuilder(Op)
1447 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448 // FIXME: Multiples of 16 should not be legal.
1449 .legalIf([=](const LegalityQuery &Query) {
1450 const LLT BigTy = Query.Types[BigTyIdx];
1451 const LLT LitTy = Query.Types[LitTyIdx];
1452 return (BigTy.getSizeInBits() % 32 == 0) &&
1453 (LitTy.getSizeInBits() % 16 == 0);
1454 })
1455 .widenScalarIf(
1456 [=](const LegalityQuery &Query) {
1457 const LLT BigTy = Query.Types[BigTyIdx];
1458 return (BigTy.getScalarSizeInBits() < 16);
1459 },
1460 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461 .widenScalarIf(
1462 [=](const LegalityQuery &Query) {
1463 const LLT LitTy = Query.Types[LitTyIdx];
1464 return (LitTy.getScalarSizeInBits() < 16);
1465 },
1466 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468 .widenScalarToNextPow2(BigTyIdx, 32);
1469
1470 }
1471
1472 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473 .legalForCartesianProduct(AllS32Vectors, {S32})
1474 .legalForCartesianProduct(AllS64Vectors, {S64})
1475 .clampNumElements(0, V16S32, V32S32)
1476 .clampNumElements(0, V2S64, V16S64)
1477 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478
1479 if (ST.hasScalarPackInsts()) {
1480 BuildVector
1481 // FIXME: Should probably widen s1 vectors straight to s32
1482 .minScalarOrElt(0, S16)
1483 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484 .minScalar(1, S32);
1485
1486 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487 .legalFor({V2S16, S32})
1488 .lower();
1489 BuildVector.minScalarOrElt(0, S32);
1490 } else {
1491 BuildVector.customFor({V2S16, S16});
1492 BuildVector.minScalarOrElt(0, S32);
1493
1494 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495 .customFor({V2S16, S32})
1496 .lower();
1497 }
1498
1499 BuildVector.legalIf(isRegisterType(0));
1500
1501 // FIXME: Clamp maximum size
1502 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504 .clampMaxNumElements(0, S32, 32)
1505 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506 .clampMaxNumElements(0, S16, 64);
1507
1508 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509 // pre-legalize.
1510 if (ST.hasVOP3PInsts()) {
1511 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512 .customFor({V2S16, V2S16})
1513 .lower();
1514 } else
1515 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516
1517 // Merge/Unmerge
1518 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521
1522 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523 const LLT Ty = Query.Types[TypeIdx];
1524 if (Ty.isVector()) {
1525 const LLT &EltTy = Ty.getElementType();
1526 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527 return true;
1528 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529 return true;
1530 }
1531 return false;
1532 };
1533
1534 auto &Builder = getActionDefinitionsBuilder(Op)
1535 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536 .lowerFor({{S16, V2S16}})
1537 .lowerIf([=](const LegalityQuery &Query) {
1538 const LLT BigTy = Query.Types[BigTyIdx];
1539 return BigTy.getSizeInBits() == 32;
1540 })
1541 // Try to widen to s16 first for small types.
1542 // TODO: Only do this on targets with legal s16 shifts
1543 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547 elementTypeIs(1, S16)),
1548 changeTo(1, V2S16))
1549 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1550 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1551 // valid.
1552 .clampScalar(LitTyIdx, S32, S512)
1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554 // Break up vectors with weird elements into scalars
1555 .fewerElementsIf(
1556 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557 scalarize(0))
1558 .fewerElementsIf(
1559 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560 scalarize(1))
1561 .clampScalar(BigTyIdx, S32, MaxScalar);
1562
1563 if (Op == G_MERGE_VALUES) {
1564 Builder.widenScalarIf(
1565 // TODO: Use 16-bit shifts if legal for 8-bit values?
1566 [=](const LegalityQuery &Query) {
1567 const LLT Ty = Query.Types[LitTyIdx];
1568 return Ty.getSizeInBits() < 32;
1569 },
1570 changeTo(LitTyIdx, S32));
1571 }
1572
1573 Builder.widenScalarIf(
1574 [=](const LegalityQuery &Query) {
1575 const LLT Ty = Query.Types[BigTyIdx];
1576 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577 Ty.getSizeInBits() % 16 != 0;
1578 },
1579 [=](const LegalityQuery &Query) {
1580 // Pick the next power of 2, or a multiple of 64 over 128.
1581 // Whichever is smaller.
1582 const LLT &Ty = Query.Types[BigTyIdx];
1583 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584 if (NewSizeInBits >= 256) {
1585 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586 if (RoundedTo < NewSizeInBits)
1587 NewSizeInBits = RoundedTo;
1588 }
1589 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590 })
1591 // Any vectors left are the wrong size. Scalarize them.
1592 .scalarize(0)
1593 .scalarize(1);
1594 }
1595
1596 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597 // RegBankSelect.
1598 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599 .legalFor({{S32}, {S64}});
1600
1601 if (ST.hasVOP3PInsts()) {
1602 SextInReg.lowerFor({{V2S16}})
1603 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1604 // get more vector shift opportunities, since we'll get those when
1605 // expanded.
1606 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1607 } else if (ST.has16BitInsts()) {
1608 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609 } else {
1610 // Prefer to promote to s32 before lowering if we don't have 16-bit
1611 // shifts. This avoids a lot of intermediate truncate and extend operations.
1612 SextInReg.lowerFor({{S32}, {S64}});
1613 }
1614
1615 SextInReg
1616 .scalarize(0)
1617 .clampScalar(0, S32, S64)
1618 .lower();
1619
1620 // TODO: Only try to form v2s16 with legal packed instructions.
1621 getActionDefinitionsBuilder(G_FSHR)
1622 .legalFor({{S32, S32}})
1623 .lowerFor({{V2S16, V2S16}})
1624 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1625 .scalarize(0)
1626 .lower();
1627
1628 if (ST.hasVOP3PInsts()) {
1629 getActionDefinitionsBuilder(G_FSHL)
1630 .lowerFor({{V2S16, V2S16}})
1631 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1632 .scalarize(0)
1633 .lower();
1634 } else {
1635 getActionDefinitionsBuilder(G_FSHL)
1636 .scalarize(0)
1637 .lower();
1638 }
1639
1640 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1641 .legalFor({S64});
1642
1643 getActionDefinitionsBuilder(G_FENCE)
1644 .alwaysLegal();
1645
1646 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1647 .scalarize(0)
1648 .minScalar(0, S32)
1649 .lower();
1650
1651 getActionDefinitionsBuilder({
1652 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1653 G_FCOPYSIGN,
1654
1655 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1656 G_ATOMICRMW_NAND,
1657 G_ATOMICRMW_FSUB,
1658 G_READ_REGISTER,
1659 G_WRITE_REGISTER,
1660
1661 G_SADDO, G_SSUBO,
1662
1663 // TODO: Implement
1664 G_FMINIMUM, G_FMAXIMUM}).lower();
1665
1666 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1667 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1668 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1669 .unsupported();
1670
1671 getLegacyLegalizerInfo().computeTables();
1672 verify(*ST.getInstrInfo());
1673}
1674
1675bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1676 MachineInstr &MI) const {
1677 MachineIRBuilder &B = Helper.MIRBuilder;
1678 MachineRegisterInfo &MRI = *B.getMRI();
1679
1680 switch (MI.getOpcode()) {
1681 case TargetOpcode::G_ADDRSPACE_CAST:
1682 return legalizeAddrSpaceCast(MI, MRI, B);
1683 case TargetOpcode::G_FRINT:
1684 return legalizeFrint(MI, MRI, B);
1685 case TargetOpcode::G_FCEIL:
1686 return legalizeFceil(MI, MRI, B);
1687 case TargetOpcode::G_FREM:
1688 return legalizeFrem(MI, MRI, B);
1689 case TargetOpcode::G_INTRINSIC_TRUNC:
1690 return legalizeIntrinsicTrunc(MI, MRI, B);
1691 case TargetOpcode::G_SITOFP:
1692 return legalizeITOFP(MI, MRI, B, true);
1693 case TargetOpcode::G_UITOFP:
1694 return legalizeITOFP(MI, MRI, B, false);
1695 case TargetOpcode::G_FPTOSI:
1696 return legalizeFPTOI(MI, MRI, B, true);
1697 case TargetOpcode::G_FPTOUI:
1698 return legalizeFPTOI(MI, MRI, B, false);
1699 case TargetOpcode::G_FMINNUM:
1700 case TargetOpcode::G_FMAXNUM:
1701 case TargetOpcode::G_FMINNUM_IEEE:
1702 case TargetOpcode::G_FMAXNUM_IEEE:
1703 return legalizeMinNumMaxNum(Helper, MI);
1704 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1705 return legalizeExtractVectorElt(MI, MRI, B);
1706 case TargetOpcode::G_INSERT_VECTOR_ELT:
1707 return legalizeInsertVectorElt(MI, MRI, B);
1708 case TargetOpcode::G_SHUFFLE_VECTOR:
1709 return legalizeShuffleVector(MI, MRI, B);
1710 case TargetOpcode::G_FSIN:
1711 case TargetOpcode::G_FCOS:
1712 return legalizeSinCos(MI, MRI, B);
1713 case TargetOpcode::G_GLOBAL_VALUE:
1714 return legalizeGlobalValue(MI, MRI, B);
1715 case TargetOpcode::G_LOAD:
1716 case TargetOpcode::G_SEXTLOAD:
1717 case TargetOpcode::G_ZEXTLOAD:
1718 return legalizeLoad(Helper, MI);
1719 case TargetOpcode::G_FMAD:
1720 return legalizeFMad(MI, MRI, B);
1721 case TargetOpcode::G_FDIV:
1722 return legalizeFDIV(MI, MRI, B);
1723 case TargetOpcode::G_UDIV:
1724 case TargetOpcode::G_UREM:
1725 case TargetOpcode::G_UDIVREM:
1726 return legalizeUnsignedDIV_REM(MI, MRI, B);
1727 case TargetOpcode::G_SDIV:
1728 case TargetOpcode::G_SREM:
1729 case TargetOpcode::G_SDIVREM:
1730 return legalizeSignedDIV_REM(MI, MRI, B);
1731 case TargetOpcode::G_ATOMIC_CMPXCHG:
1732 return legalizeAtomicCmpXChg(MI, MRI, B);
1733 case TargetOpcode::G_FLOG:
1734 return legalizeFlog(MI, B, numbers::ln2f);
1735 case TargetOpcode::G_FLOG10:
1736 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1737 case TargetOpcode::G_FEXP:
1738 return legalizeFExp(MI, B);
1739 case TargetOpcode::G_FPOW:
1740 return legalizeFPow(MI, B);
1741 case TargetOpcode::G_FFLOOR:
1742 return legalizeFFloor(MI, MRI, B);
1743 case TargetOpcode::G_BUILD_VECTOR:
1744 return legalizeBuildVector(MI, MRI, B);
1745 default:
1746 return false;
1747 }
1748
1749 llvm_unreachable("expected switch to return");
1750}
1751
1752Register AMDGPULegalizerInfo::getSegmentAperture(
1753 unsigned AS,
1754 MachineRegisterInfo &MRI,
1755 MachineIRBuilder &B) const {
1756 MachineFunction &MF = B.getMF();
1757 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1758 const LLT S32 = LLT::scalar(32);
1759
1760 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1761
1762 if (ST.hasApertureRegs()) {
1763 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1764 // getreg.
1765 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1766 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1767 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1768 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1769 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1770 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1771 unsigned Encoding =
1772 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1773 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1774 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1775
1776 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1777
1778 B.buildInstr(AMDGPU::S_GETREG_B32)
1779 .addDef(GetReg)
1780 .addImm(Encoding);
1781 MRI.setType(GetReg, S32);
1782
1783 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1784 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1785 }
1786
1787 Register QueuePtr = MRI.createGenericVirtualRegister(
1788 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1789
1790 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1791 return Register();
1792
1793 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1794 // private_segment_aperture_base_hi.
1795 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1796
1797 // TODO: can we be smarter about machine pointer info?
1798 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1799 MachineMemOperand *MMO = MF.getMachineMemOperand(
1800 PtrInfo,
1801 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1802 MachineMemOperand::MOInvariant,
1803 4, commonAlignment(Align(64), StructOffset));
1804
1805 Register LoadAddr;
1806
1807 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1808 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1809}
1810
1811bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1812 MachineInstr &MI, MachineRegisterInfo &MRI,
1813 MachineIRBuilder &B) const {
1814 MachineFunction &MF = B.getMF();
1815
1816 const LLT S32 = LLT::scalar(32);
1817 Register Dst = MI.getOperand(0).getReg();
1818 Register Src = MI.getOperand(1).getReg();
1819
1820 LLT DstTy = MRI.getType(Dst);
1821 LLT SrcTy = MRI.getType(Src);
1822 unsigned DestAS = DstTy.getAddressSpace();
1823 unsigned SrcAS = SrcTy.getAddressSpace();
1824
1825 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1826 // vector element.
1827 assert(!DstTy.isVector());
1828
1829 const AMDGPUTargetMachine &TM
1830 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1831
1832 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1833 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1834 return true;
1835 }
1836
1837 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1838 // Truncate.
1839 B.buildExtract(Dst, Src, 0);
1840 MI.eraseFromParent();
1841 return true;
1842 }
1843
1844 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1845 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1846 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1847
1848 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1849 // another. Merge operands are required to be the same type, but creating an
1850 // extra ptrtoint would be kind of pointless.
1851 auto HighAddr = B.buildConstant(
1852 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1853 B.buildMerge(Dst, {Src, HighAddr});
1854 MI.eraseFromParent();
1855 return true;
1856 }
1857
1858 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1859 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1860 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1861 unsigned NullVal = TM.getNullPointerValue(DestAS);
1862
1863 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1864 auto FlatNull = B.buildConstant(SrcTy, 0);
1865
1866 // Extract low 32-bits of the pointer.
1867 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1868
1869 auto CmpRes =
1870 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1871 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1872
1873 MI.eraseFromParent();
1874 return true;
1875 }
1876
1877 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1878 return false;
1879
1880 if (!ST.hasFlatAddressSpace())
1881 return false;
1882
1883 auto SegmentNull =
1884 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1885 auto FlatNull =
1886 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1887
1888 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1889 if (!ApertureReg.isValid())
1890 return false;
1891
1892 auto CmpRes =
1893 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1894
1895 // Coerce the type of the low half of the result so we can use merge_values.
1896 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1897
1898 // TODO: Should we allow mismatched types but matching sizes in merges to
1899 // avoid the ptrtoint?
1900 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1901 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1902
1903 MI.eraseFromParent();
1904 return true;
1905}
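
The two non-trivial directions above reduce to a compare-and-select around the segment/flat null values, plus a merge with the aperture high half for the segment-to-flat case. A minimal scalar sketch of that logic (assumptions for illustration only: 64-bit flat pointers, 32-bit LDS/private pointers, and aperture_hi holding the upper 32 bits of the segment's flat window):

#include <cstdint>

// flat -> LDS/private: keep the low 32 bits unless the source was the flat null.
static uint32_t flat_to_segment(uint64_t flat, uint64_t flat_null,
                                uint32_t segment_null) {
  return flat != flat_null ? static_cast<uint32_t>(flat) : segment_null;
}

// LDS/private -> flat: glue the aperture high half on top unless the source
// was the segment null.
static uint64_t segment_to_flat(uint32_t seg, uint32_t segment_null,
                                uint64_t flat_null, uint32_t aperture_hi) {
  uint64_t merged = (static_cast<uint64_t>(aperture_hi) << 32) | seg;
  return seg != segment_null ? merged : flat_null;
}
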
1906
1907bool AMDGPULegalizerInfo::legalizeFrint(
1908 MachineInstr &MI, MachineRegisterInfo &MRI,
1909 MachineIRBuilder &B) const {
1910 Register Src = MI.getOperand(1).getReg();
1911 LLT Ty = MRI.getType(Src);
1912 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1913
1914 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1915 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1916
1917 auto C1 = B.buildFConstant(Ty, C1Val);
1918 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1919
1920 // TODO: Should this propagate fast-math-flags?
1921 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1922 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1923
1924 auto C2 = B.buildFConstant(Ty, C2Val);
1925 auto Fabs = B.buildFAbs(Ty, Src);
1926
1927 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1928 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1929 MI.eraseFromParent();
1930 return true;
1931}
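
The constants 0x1.0p+52 and 0x1.fffffffffffffp+51 above implement the usual add-and-subtract-2^52 rounding trick. A minimal scalar sketch of the same idea (assumption: the host rounds to nearest-even, which is what the trick relies on):

#include <cmath>

static double rint_via_2p52(double x) {
  const double C1 = 0x1.0p+52;              // 2^52: adding it discards the fraction
  const double C2 = 0x1.fffffffffffffp+51;  // 2^52 - 0.5: above this x is integral
  double c = std::copysign(C1, x);
  double rounded = (x + c) - c;             // rounds x to the nearest integer
  return std::fabs(x) > C2 ? x : rounded;
}
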
1932
1933bool AMDGPULegalizerInfo::legalizeFceil(
1934 MachineInstr &MI, MachineRegisterInfo &MRI,
1935 MachineIRBuilder &B) const {
1936
1937 const LLT S1 = LLT::scalar(1);
1938 const LLT S64 = LLT::scalar(64);
1939
1940 Register Src = MI.getOperand(1).getReg();
1941 assert(MRI.getType(Src) == S64);
1942
1943 // result = trunc(src)
1944 // if (src > 0.0 && src != result)
1945 // result += 1.0
1946
1947 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1948
1949 const auto Zero = B.buildFConstant(S64, 0.0);
1950 const auto One = B.buildFConstant(S64, 1.0);
1951 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1952 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1953 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1954 auto Add = B.buildSelect(S64, And, One, Zero);
1955
1956 // TODO: Should this propagate fast-math-flags?
1957 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1958 return true;
1959}
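
The pseudo-code in the comment above is the whole algorithm; as a scalar sketch:

#include <cmath>

static double ceil_via_trunc(double x) {
  double r = std::trunc(x);
  // Add 1.0 only when the source is positive and had a fractional part.
  double add = (x > 0.0 && x != r) ? 1.0 : 0.0;
  return r + add;
}
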
1960
1961bool AMDGPULegalizerInfo::legalizeFrem(
1962 MachineInstr &MI, MachineRegisterInfo &MRI,
1963 MachineIRBuilder &B) const {
1964 Register DstReg = MI.getOperand(0).getReg();
1965 Register Src0Reg = MI.getOperand(1).getReg();
1966 Register Src1Reg = MI.getOperand(2).getReg();
1967 auto Flags = MI.getFlags();
1968 LLT Ty = MRI.getType(DstReg);
1969
1970 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1971 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1972 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1973 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1974 MI.eraseFromParent();
1975 return true;
1976}
1977
1978static MachineInstrBuilder extractF64Exponent(Register Hi,
1979 MachineIRBuilder &B) {
1980 const unsigned FractBits = 52;
1981 const unsigned ExpBits = 11;
1982 LLT S32 = LLT::scalar(32);
1983
1984 auto Const0 = B.buildConstant(S32, FractBits - 32);
1985 auto Const1 = B.buildConstant(S32, ExpBits);
1986
1987 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1988 .addUse(Hi)
1989 .addUse(Const0.getReg(0))
1990 .addUse(Const1.getReg(0));
1991
1992 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1993}
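
amdgcn_ubfe above is an unsigned bitfield extract: from the high 32 bits of the double it pulls ExpBits bits starting at bit FractBits - 32, and the bias is then removed. A scalar sketch, assuming ordinary IEEE-754 layout:

#include <cstdint>

static int32_t extract_f64_exponent(uint32_t hi) {
  const unsigned FractBits = 52, ExpBits = 11;
  uint32_t field = (hi >> (FractBits - 32)) & ((1u << ExpBits) - 1); // ubfe(hi, 20, 11)
  return static_cast<int32_t>(field) - 1023;                         // remove the bias
}
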
1994
1995bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1996 MachineInstr &MI, MachineRegisterInfo &MRI,
1997 MachineIRBuilder &B) const {
1998 const LLT S1 = LLT::scalar(1);
1999 const LLT S32 = LLT::scalar(32);
2000 const LLT S64 = LLT::scalar(64);
2001
2002 Register Src = MI.getOperand(1).getReg();
2003 assert(MRI.getType(Src) == S64);
2004
2005 // TODO: Should this use extract since the low half is unused?
2006 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2007 Register Hi = Unmerge.getReg(1);
2008
2009 // Extract the upper half, since this is where we will find the sign and
2010 // exponent.
2011 auto Exp = extractF64Exponent(Hi, B);
2012
2013 const unsigned FractBits = 52;
2014
2015 // Extract the sign bit.
2016 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2017 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2018
2019 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2020
2021 const auto Zero32 = B.buildConstant(S32, 0);
2022
2023 // Extend back to 64-bits.
2024 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2025
2026 auto Shr = B.buildAShr(S64, FractMask, Exp);
2027 auto Not = B.buildNot(S64, Shr);
2028 auto Tmp0 = B.buildAnd(S64, Src, Not);
2029 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2030
2031 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2032 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2033
2034 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2035 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2036 MI.eraseFromParent();
2037 return true;
2038}
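
Putting the pieces above together: the fraction bits whose weight is below 1.0 for the extracted exponent are masked off, small magnitudes collapse to a signed zero, and large exponents mean the value is already integral. A self-contained scalar sketch (assumption: double and uint64_t share a byte order, as on common hosts):

#include <cstdint>
#include <cstring>

static double trunc_via_masking(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  int32_t exp = static_cast<int32_t>((bits >> 52) & 0x7ff) - 1023;
  uint64_t result;
  if (exp < 0)
    result = bits & (UINT64_C(1) << 63);                  // |x| < 1: keep only the sign
  else if (exp > 51)
    result = bits;                                        // already an integer
  else
    result = bits & ~(((UINT64_C(1) << 52) - 1) >> exp);  // drop sub-integer bits
  double r;
  std::memcpy(&r, &result, sizeof(r));
  return r;
}
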
2039
2040bool AMDGPULegalizerInfo::legalizeITOFP(
2041 MachineInstr &MI, MachineRegisterInfo &MRI,
2042 MachineIRBuilder &B, bool Signed) const {
2043
2044 Register Dst = MI.getOperand(0).getReg();
2045 Register Src = MI.getOperand(1).getReg();
2046
2047 const LLT S64 = LLT::scalar(64);
2048 const LLT S32 = LLT::scalar(32);
2049
2050 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2051
2052 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2053
2054 auto CvtHi = Signed ?
2055 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2056 B.buildUITOFP(S64, Unmerge.getReg(1));
2057
2058 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2059
2060 auto ThirtyTwo = B.buildConstant(S32, 32);
2061 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2062 .addUse(CvtHi.getReg(0))
2063 .addUse(ThirtyTwo.getReg(0));
2064
2065 // TODO: Should this propagate fast-math-flags?
2066 B.buildFAdd(Dst, LdExp, CvtLo);
2067 MI.eraseFromParent();
2068 return true;
2069}
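
The split above converts a 64-bit integer via its two 32-bit halves: the high half (signed or unsigned as appropriate) is scaled by 2^32 with ldexp and the always-unsigned low half is added. A scalar sketch:

#include <cmath>
#include <cstdint>

static double i64_to_f64_split(uint64_t v, bool is_signed) {
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  double cvt_hi = is_signed ? static_cast<double>(static_cast<int32_t>(hi))
                            : static_cast<double>(hi);
  double cvt_lo = static_cast<double>(lo);   // the low half is converted unsigned
  return std::ldexp(cvt_hi, 32) + cvt_lo;
}
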
2070
2071// TODO: Copied from DAG implementation. Verify logic and document how this
2072// actually works.
2073bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2074 MachineRegisterInfo &MRI,
2075 MachineIRBuilder &B,
2076 bool Signed) const {
2077
2078 Register Dst = MI.getOperand(0).getReg();
2079 Register Src = MI.getOperand(1).getReg();
2080
2081 const LLT S64 = LLT::scalar(64);
2082 const LLT S32 = LLT::scalar(32);
2083
2084 const LLT SrcLT = MRI.getType(Src);
2085 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2086
2087 unsigned Flags = MI.getFlags();
2088
2089 // The basic idea of converting a floating point number into a pair of 32-bit
2090 // integers is illustrated as follows:
2091 //
2092 // tf := trunc(val);
2093 // hif := floor(tf * 2^-32);
2094 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2095 // hi := fptoi(hif);
2096 // lo := fptoi(lof);
2097 //
2098 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2099 MachineInstrBuilder Sign;
2100 if (Signed && SrcLT == S32) {
2101 // However, a 32-bit floating point number has only a 23-bit mantissa,
2102 // which is not enough to hold all the significant bits of `lof` if val is
2103 // negative. To avoid the loss of precision, we need to take the absolute
2104 // value after truncating and flip the result back based on the original
2105 // signedness.
2106 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2107 Trunc = B.buildFAbs(S32, Trunc, Flags);
2108 }
2109 MachineInstrBuilder K0, K1;
2110 if (SrcLT == S64) {
2111 K0 = B.buildFConstant(S64,
2112 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2113 K1 = B.buildFConstant(S64,
2114 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2115 } else {
2116 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2117 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2118 }
2119
2120 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2121 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2122 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2123
2124 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2125 : B.buildFPTOUI(S32, FloorMul);
2126 auto Lo = B.buildFPTOUI(S32, Fma);
2127
2128 if (Signed && SrcLT == S32) {
2129 // Flip the result based on the signedness, which is either all 0s or 1s.
2130 Sign = B.buildMerge(S64, {Sign, Sign});
2131 // r := xor({lo, hi}, sign) - sign;
2132 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2133 } else
2134 B.buildMerge(Dst, {Lo, Hi});
2135 MI.eraseFromParent();
2136
2137 return true;
2138}
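
For the unsigned f64 case, the tf/hif/lof recipe in the comment maps directly onto scalar code; a sketch that mirrors it (the f32 signed fix-up path with the sign mask is left out):

#include <cmath>
#include <cstdint>

static uint64_t f64_to_u64_split(double val) {
  double tf  = std::trunc(val);
  double hif = std::floor(tf * 0x1.0p-32);      // tf * 2^-32
  double lof = std::fma(hif, -0x1.0p+32, tf);   // tf - hif * 2^32, never negative
  uint32_t hi = static_cast<uint32_t>(hif);
  uint32_t lo = static_cast<uint32_t>(lof);
  return (static_cast<uint64_t>(hi) << 32) | lo;
}
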
2139
2140bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2141 MachineInstr &MI) const {
2142 MachineFunction &MF = Helper.MIRBuilder.getMF();
2143 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2144
2145 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2146 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2147
2148 // With ieee_mode disabled, the instructions have the correct behavior
2149 // already for G_FMINNUM/G_FMAXNUM
2150 if (!MFI->getMode().IEEE)
2151 return !IsIEEEOp;
2152
2153 if (IsIEEEOp)
2154 return true;
2155
2156 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2157}
2158
2159bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2160 MachineInstr &MI, MachineRegisterInfo &MRI,
2161 MachineIRBuilder &B) const {
2162 // TODO: Should move some of this into LegalizerHelper.
2163
2164 // TODO: Promote dynamic indexing of s16 to s32
2165
2166 // FIXME: Artifact combiner probably should have replaced the truncated
2167 // constant before this, so we shouldn't need
2168 // getConstantVRegValWithLookThrough.
2169 Optional<ValueAndVReg> MaybeIdxVal =
2170 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2171 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2172 return true;
2173 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2174
2175 Register Dst = MI.getOperand(0).getReg();
2176 Register Vec = MI.getOperand(1).getReg();
2177
2178 LLT VecTy = MRI.getType(Vec);
2179 LLT EltTy = VecTy.getElementType();
2180 assert(EltTy == MRI.getType(Dst));
2181
2182 if (IdxVal < VecTy.getNumElements())
2183 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2184 else
2185 B.buildUndef(Dst);
2186
2187 MI.eraseFromParent();
2188 return true;
2189}
2190
2191bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2192 MachineInstr &MI, MachineRegisterInfo &MRI,
2193 MachineIRBuilder &B) const {
2194 // TODO: Should move some of this into LegalizerHelper.
2195
2196 // TODO: Promote dynamic indexing of s16 to s32
2197
2198 // FIXME: Artifact combiner probably should have replaced the truncated
2199 // constant before this, so we shouldn't need
2200 // getConstantVRegValWithLookThrough.
2201 Optional<ValueAndVReg> MaybeIdxVal =
2202 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2203 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2204 return true;
2205
2206 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2207 Register Dst = MI.getOperand(0).getReg();
2208 Register Vec = MI.getOperand(1).getReg();
2209 Register Ins = MI.getOperand(2).getReg();
2210
2211 LLT VecTy = MRI.getType(Vec);
2212 LLT EltTy = VecTy.getElementType();
2213 assert(EltTy == MRI.getType(Ins));
2214
2215 if (IdxVal < VecTy.getNumElements())
2216 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2217 else
2218 B.buildUndef(Dst);
2219
2220 MI.eraseFromParent();
2221 return true;
2222}
2223
2224bool AMDGPULegalizerInfo::legalizeShuffleVector(
2225 MachineInstr &MI, MachineRegisterInfo &MRI,
2226 MachineIRBuilder &B) const {
2227 const LLT V2S16 = LLT::vector(2, 16);
2228
2229 Register Dst = MI.getOperand(0).getReg();
2230 Register Src0 = MI.getOperand(1).getReg();
2231 LLT DstTy = MRI.getType(Dst);
2232 LLT SrcTy = MRI.getType(Src0);
2233
2234 if (SrcTy == V2S16 && DstTy == V2S16 &&
2235 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2236 return true;
2237
2238 MachineIRBuilder HelperBuilder(MI);
2239 GISelObserverWrapper DummyObserver;
2240 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2241 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2242}
2243
2244bool AMDGPULegalizerInfo::legalizeSinCos(
2245 MachineInstr &MI, MachineRegisterInfo &MRI,
2246 MachineIRBuilder &B) const {
2247
2248 Register DstReg = MI.getOperand(0).getReg();
2249 Register SrcReg = MI.getOperand(1).getReg();
2250 LLT Ty = MRI.getType(DstReg);
2251 unsigned Flags = MI.getFlags();
2252
2253 Register TrigVal;
2254 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2255 if (ST.hasTrigReducedRange()) {
2256 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2257 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2258 .addUse(MulVal.getReg(0))
2259 .setMIFlags(Flags).getReg(0);
2260 } else
2261 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2262
2263 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2264 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2265 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2266 .addUse(TrigVal)
2267 .setMIFlags(Flags);
2268 MI.eraseFromParent();
2269 return true;
2270}
2271
2272bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2273 MachineIRBuilder &B,
2274 const GlobalValue *GV,
2275 int64_t Offset,
2276 unsigned GAFlags) const {
2277 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2278 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2279 // to the following code sequence:
2280 //
2281 // For constant address space:
2282 // s_getpc_b64 s[0:1]
2283 // s_add_u32 s0, s0, $symbol
2284 // s_addc_u32 s1, s1, 0
2285 //
2286 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2287 // a fixup or relocation is emitted to replace $symbol with a literal
2288 // constant, which is a pc-relative offset from the encoding of the $symbol
2289 // operand to the global variable.
2290 //
2291 // For global address space:
2292 // s_getpc_b64 s[0:1]
2293 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2294 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2295 //
2296 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2297 // fixups or relocations are emitted to replace $symbol@*@lo and
2298 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2299 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2300 // operand to the global variable.
2301 //
2302 // What we want here is an offset from the value returned by s_getpc
2303 // (which is the address of the s_add_u32 instruction) to the global
2304 // variable, but since the encoding of $symbol starts 4 bytes after the start
2305 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2306 // small. This requires us to add 4 to the global variable offset in order to
2307 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2308 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2309 // instruction.
2310
2311 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2312
2313 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2314 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2315
2316 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2317 .addDef(PCReg);
2318
2319 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2320 if (GAFlags == SIInstrInfo::MO_NONE)
2321 MIB.addImm(0);
2322 else
2323 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2324
2325 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2326
2327 if (PtrTy.getSizeInBits() == 32)
2328 B.buildExtract(DstReg, PCReg, 0);
2329 return true;
2330 }
2331
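
To make the +4/+12 concrete (reading the fixups above as the usual S + A - P form, with P the address of the literal being patched): the lo literal sits 4 bytes after the s_add_u32 whose address s_getpc_b64 reports, so it resolves to GV + (Offset + 4) - (PC + 4) = GV + Offset - PC, which is exactly the displacement the add needs; the hi literal is encoded 12 bytes in, so Offset + 12 cancels the same way.
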
2332bool AMDGPULegalizerInfo::legalizeGlobalValue(
2333 MachineInstr &MI, MachineRegisterInfo &MRI,
2334 MachineIRBuilder &B) const {
2335 Register DstReg = MI.getOperand(0).getReg();
2336 LLT Ty = MRI.getType(DstReg);
2337 unsigned AS = Ty.getAddressSpace();
2338
2339 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2340 MachineFunction &MF = B.getMF();
2341 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2342
2343 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2344 if (!MFI->isModuleEntryFunction() &&
2345 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2346 const Function &Fn = MF.getFunction();
2347 DiagnosticInfoUnsupported BadLDSDecl(
2348 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2349 DS_Warning);
2350 Fn.getContext().diagnose(BadLDSDecl);
2351
2352 // We currently don't have a way to correctly allocate LDS objects that
2353 // aren't directly associated with a kernel. We do force inlining of
2354 // functions that use local objects. However, if these dead functions are
2355 // not eliminated, we don't want a compile time error. Just emit a warning
2356 // and a trap, since there should be no callable path here.
2357 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2358 B.buildUndef(DstReg);
2359 MI.eraseFromParent();
2360 return true;
2361 }
2362
2363 // TODO: We could emit code to handle the initialization somewhere.
2364 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2365 const SITargetLowering *TLI = ST.getTargetLowering();
2366 if (!TLI->shouldUseLDSConstAddress(GV)) {
2367 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2368 return true; // Leave in place;
2369 }
2370
2371 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2372 Type *Ty = GV->getValueType();
2373 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2374 // zero-sized type in other languages to declare the dynamic shared
2375 // memory whose size is not known at compile time. They will be
2376 // allocated by the runtime and placed directly after the static
2377 // allocated ones. They all share the same offset.
2378 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2379 // Adjust alignment for that dynamic shared memory array.
2380 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2381 LLT S32 = LLT::scalar(32);
2382 auto Sz =
2383 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2384 B.buildIntToPtr(DstReg, Sz);
2385 MI.eraseFromParent();
2386 return true;
2387 }
2388 }
2389
2390 B.buildConstant(
2391 DstReg,
2392 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2393 MI.eraseFromParent();
2394 return true;
2395 }
2396
2397 const Function &Fn = MF.getFunction();
2398 DiagnosticInfoUnsupported BadInit(
2399 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2400 Fn.getContext().diagnose(BadInit);
2401 return true;
2402 }
2403
2404 const SITargetLowering *TLI = ST.getTargetLowering();
2405
2406 if (TLI->shouldEmitFixup(GV)) {
2407 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2408 MI.eraseFromParent();
2409 return true;
2410 }
2411
2412 if (TLI->shouldEmitPCReloc(GV)) {
2413 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2414 MI.eraseFromParent();
2415 return true;
2416 }
2417
2418 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2419 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2420
2421 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2422 MachinePointerInfo::getGOT(MF),
2423 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2424 MachineMemOperand::MOInvariant,
2425 8 /*Size*/, Align(8));
2426
2427 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2428
2429 if (Ty.getSizeInBits() == 32) {
2430 // Truncate if this is a 32-bit constant address.
2431 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2432 B.buildExtract(DstReg, Load, 0);
2433 } else
2434 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2435
2436 MI.eraseFromParent();
2437 return true;
2438}
2439
2440static LLT widenToNextPowerOf2(LLT Ty) {
2441 if (Ty.isVector())
2442 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2443 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2444}
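
For reference, PowerOf2Ceil rounds up to the next power of two, so this helper turns e.g. <3 x s32> into <4 x s32> and s96 into s128. A small stand-in with the same behaviour for nonzero inputs:

#include <cstdint>

static uint64_t pow2_ceil(uint64_t n) {   // stand-in for llvm::PowerOf2Ceil, n != 0
  uint64_t p = 1;
  while (p < n)
    p <<= 1;
  return p;
}
// pow2_ceil(3) == 4, pow2_ceil(96) == 128, pow2_ceil(64) == 64
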
2445
2446bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2447 MachineInstr &MI) const {
2448 MachineIRBuilder &B = Helper.MIRBuilder;
2449 MachineRegisterInfo &MRI = *B.getMRI();
2450 GISelChangeObserver &Observer = Helper.Observer;
2451
2452 Register PtrReg = MI.getOperand(1).getReg();
2453 LLT PtrTy = MRI.getType(PtrReg);
2454 unsigned AddrSpace = PtrTy.getAddressSpace();
2455
2456 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2457 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2458 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2459 Observer.changingInstr(MI);
2460 MI.getOperand(1).setReg(Cast.getReg(0));
2461 Observer.changedInstr(MI);
2462 return true;
2463 }
2464
2465 if (MI.getOpcode() != AMDGPU::G_LOAD)
2466 return false;
2467
2468 Register ValReg = MI.getOperand(0).getReg();
2469 LLT ValTy = MRI.getType(ValReg);
2470
2471 MachineMemOperand *MMO = *MI.memoperands_begin();
2472 const unsigned ValSize = ValTy.getSizeInBits();
2473 const unsigned MemSize = 8 * MMO->getSize();
2474 const Align MemAlign = MMO->getAlign();
2475 const unsigned AlignInBits = 8 * MemAlign.value();
2476
2477 // Widen non-power-of-2 loads to the alignment if needed
2478 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2479 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2480
2481 // This was already the correct extending load result type, so just adjust
2482 // the memory type.
2483 if (WideMemSize == ValSize) {
2484 MachineFunction &MF = B.getMF();
2485
2486 MachineMemOperand *WideMMO =
2487 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2488 Observer.changingInstr(MI);
2489 MI.setMemRefs(MF, {WideMMO});
2490 Observer.changedInstr(MI);
2491 return true;
2492 }
2493
2494 // Don't bother handling an edge case that should probably never be produced.
2495 if (ValSize > WideMemSize)
2496 return false;
2497
2498 LLT WideTy = widenToNextPowerOf2(ValTy);
2499
2500 Register WideLoad;
2501 if (!WideTy.isVector()) {
2502 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2503 B.buildTrunc(ValReg, WideLoad).getReg(0);
2504 } else {
2505 // Extract the subvector.
2506
2507 if (isRegisterType(ValTy)) {
2508 // If this a case where G_EXTRACT is legal, use it.
2509 // (e.g. <3 x s32> -> <4 x s32>)
2510 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2511 B.buildExtract(ValReg, WideLoad, 0);
2512 } else {
2513 // For cases where the widened type isn't a nice register value, unmerge
2514 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2515 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2516 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2517 B.setInsertPt(B.getMBB(), MI.getIterator());
2518 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2519 }
2520 }
2521
2522 MI.eraseFromParent();
2523 return true;
2524 }
2525
2526 return false;
2527}
2528
2529bool AMDGPULegalizerInfo::legalizeFMad(
2530 MachineInstr &MI, MachineRegisterInfo &MRI,
2531 MachineIRBuilder &B) const {
2532 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2533 assert(Ty.isScalar());
2534
2535 MachineFunction &MF = B.getMF();
2536 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2537
2538 // TODO: Always legal with future ftz flag.
2539 // FIXME: Do we need just output?
2540 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2541 return true;
2542 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2543 return true;
2544
2545 MachineIRBuilder HelperBuilder(MI);
2546 GISelObserverWrapper DummyObserver;
2547 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2548 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2549}
2550
2551bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2552 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2553 Register DstReg = MI.getOperand(0).getReg();
2554 Register PtrReg = MI.getOperand(1).getReg();
2555 Register CmpVal = MI.getOperand(2).getReg();
2556 Register NewVal = MI.getOperand(3).getReg();
2557
2558 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2559 "this should not have been custom lowered");
2560
2561 LLT ValTy = MRI.getType(CmpVal);
2562 LLT VecTy = LLT::vector(2, ValTy);
2563
2564 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2565
2566 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2567 .addDef(DstReg)
2568 .addUse(PtrReg)
2569 .addUse(PackedVal)
2570 .setMemRefs(MI.memoperands());
2571
2572 MI.eraseFromParent();
2573 return true;
2574}
2575
2576bool AMDGPULegalizerInfo::legalizeFlog(
2577 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2578 Register Dst = MI.getOperand(0).getReg();
2579 Register Src = MI.getOperand(1).getReg();
2580 LLT Ty = B.getMRI()->getType(Dst);
2581 unsigned Flags = MI.getFlags();
2582
2583 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2584 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2585
2586 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2587 MI.eraseFromParent();
2588 return true;
2589}
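
This relies on the identity log_b(x) = log2(x) * log_b(2); the switch earlier passes numbers::ln2f for G_FLOG and ln2f / ln10f for G_FLOG10, which are ln(2) and log10(2) respectively. A scalar sketch:

#include <cmath>

static double log_via_log2(double x, double log2_base_inverted) {
  return std::log2(x) * log2_base_inverted;
}
// ln(x)    ~= log_via_log2(x, std::log(2.0));
// log10(x) ~= log_via_log2(x, std::log(2.0) / std::log(10.0));
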
2590
2591bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2592 MachineIRBuilder &B) const {
2593 Register Dst = MI.getOperand(0).getReg();
2594 Register Src = MI.getOperand(1).getReg();
2595 unsigned Flags = MI.getFlags();
2596 LLT Ty = B.getMRI()->getType(Dst);
2597
2598 auto K = B.buildFConstant(Ty, numbers::log2e);
2599 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2600 B.buildFExp2(Dst, Mul, Flags);
2601 MI.eraseFromParent();
2602 return true;
2603}
2604
2605bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2606 MachineIRBuilder &B) const {
2607 Register Dst = MI.getOperand(0).getReg();
2608 Register Src0 = MI.getOperand(1).getReg();
2609 Register Src1 = MI.getOperand(2).getReg();
2610 unsigned Flags = MI.getFlags();
2611 LLT Ty = B.getMRI()->getType(Dst);
2612 const LLT S16 = LLT::scalar(16);
2613 const LLT S32 = LLT::scalar(32);
2614
2615 if (Ty == S32) {
2616 auto Log = B.buildFLog2(S32, Src0, Flags);
2617 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2618 .addUse(Log.getReg(0))
2619 .addUse(Src1)
2620 .setMIFlags(Flags);
2621 B.buildFExp2(Dst, Mul, Flags);
2622 } else if (Ty == S16) {
2623 // There's no f16 fmul_legacy, so we need to convert for it.
2624 auto Log = B.buildFLog2(S16, Src0, Flags);
2625 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2626 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2627 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2628 .addUse(Ext0.getReg(0))
2629 .addUse(Ext1.getReg(0))
2630 .setMIFlags(Flags);
2631
2632 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2633 } else
2634 return false;
2635
2636 MI.eraseFromParent();
2637 return true;
2638}
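
Both branches implement x^y = exp2(y * log2(x)); the f16 path only differs in extending to f32 around the legacy multiply. A scalar sketch of the identity, with std::exp2/std::log2 standing in for the hardware ops and fmul_legacy:

#include <cmath>

static double pow_via_exp2_log2(double x, double y) {
  return std::exp2(y * std::log2(x));   // meaningful for x > 0, like the lowering
}
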
2639
2640// Find a source register, ignoring any possible source modifiers.
2641static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2642 Register ModSrc = OrigSrc;
2643 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2644 ModSrc = SrcFNeg->getOperand(1).getReg();
2645 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2646 ModSrc = SrcFAbs->getOperand(1).getReg();
2647 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2648 ModSrc = SrcFAbs->getOperand(1).getReg();
2649 return ModSrc;
2650}
2651
2652bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2653 MachineRegisterInfo &MRI,
2654 MachineIRBuilder &B) const {
2655
2656 const LLT S1 = LLT::scalar(1);
2657 const LLT S64 = LLT::scalar(64);
2658 Register Dst = MI.getOperand(0).getReg();
2659 Register OrigSrc = MI.getOperand(1).getReg();
2660 unsigned Flags = MI.getFlags();
2661 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2662 "this should not have been custom lowered");
2663
2664 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2665 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2666 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2667 // V_FRACT bug is:
2668 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2669 //
2670 // Convert floor(x) to (x - fract(x))
2671
2672 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2673 .addUse(OrigSrc)
2674 .setMIFlags(Flags);
2675
2676 // Give source modifier matching some assistance before obscuring a foldable
2677 // pattern.
2678
2679 // TODO: We can avoid the neg on the fract? The input sign to fract
2680 // shouldn't matter?
2681 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2682
2683 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2684
2685 Register Min = MRI.createGenericVirtualRegister(S64);
2686
2687 // We don't need to concern ourselves with the snan handling difference, so
2688 // use the one which will directly select.
2689 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2690 if (MFI->getMode().IEEE)
2691 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2692 else
2693 B.buildFMinNum(Min, Fract, Const, Flags);
2694
2695 Register CorrectedFract = Min;
2696 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2697 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2698 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2699 }
2700
2701 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2702 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2703
2704 MI.eraseFromParent();
2705 return true;
2706}
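
A scalar sketch of the workaround described in the comment above, with x - floor(x) standing in for V_FRACT_F64 and the clamp constant being the largest double below 1.0:

#include <cmath>

static double floor_via_fract(double x) {
  double fract = x - std::floor(x);                        // stand-in for V_FRACT_F64
  double clamped = std::fmin(fract, 0x1.fffffffffffffp-1); // min(fract, largest < 1.0)
  double corrected = std::isnan(x) ? x : clamped;          // NaN passes through
  return x - corrected;
}
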
2707
2708// Turn an illegal packed v2s16 build vector into bit operations.
2709// TODO: This should probably be a bitcast action in LegalizerHelper.
2710bool AMDGPULegalizerInfo::legalizeBuildVector(
2711 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2712 Register Dst = MI.getOperand(0).getReg();
2713 const LLT S32 = LLT::scalar(32);
2714 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2715
2716 Register Src0 = MI.getOperand(1).getReg();
2717 Register Src1 = MI.getOperand(2).getReg();
2718 assert(MRI.getType(Src0) == LLT::scalar(16));
2719
2720 auto Merge = B.buildMerge(S32, {Src0, Src1});
2721 B.buildBitcast(Dst, Merge);
2722
2723 MI.eraseFromParent();
2724 return true;
2725}
2726
2727// Check that this is a G_XOR x, -1
2728static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2729 if (MI.getOpcode() != TargetOpcode::G_XOR)
2730 return false;
2731 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2732 return ConstVal && *ConstVal == -1;
2733}
2734
2735// Return the use branch instruction, otherwise null if the usage is invalid.
2736static MachineInstr *
2737verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2738 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2739 Register CondDef = MI.getOperand(0).getReg();
2740 if (!MRI.hasOneNonDBGUse(CondDef))
2741 return nullptr;
2742
2743 MachineBasicBlock *Parent = MI.getParent();
2744 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2745
2746 if (isNot(MRI, *UseMI)) {
2747 Register NegatedCond = UseMI->getOperand(0).getReg();
2748 if (!MRI.hasOneNonDBGUse(NegatedCond))
2749 return nullptr;
2750
2751 // We're deleting the def of this value, so we need to remove it.
2752 UseMI->eraseFromParent();
2753
2754 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2755 Negated = true;
2756 }
2757
2758 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2759 return nullptr;
2760
2761 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2762 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2763 if (Next == Parent->end()) {
2764 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2765 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2766 return nullptr;
2767 UncondBrTarget = &*NextMBB;
2768 } else {
2769 if (Next->getOpcode() != AMDGPU::G_BR)
2770 return nullptr;
2771 Br = &*Next;
2772 UncondBrTarget = Br->getOperand(0).getMBB();
2773 }
2774
2775 return UseMI;
2776}
2777
2778bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2779 const ArgDescriptor *Arg,
2780 const TargetRegisterClass *ArgRC,
2781 LLT ArgTy) const {
2782 MCRegister SrcReg = Arg->getRegister();
2783 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2784 assert(DstReg.isVirtual() && "Virtual register expected");
2785
2786 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2787 ArgTy);
2788 if (Arg->isMasked()) {
2789 // TODO: Should we try to emit this once in the entry block?
2790 const LLT S32 = LLT::scalar(32);
2791 const unsigned Mask = Arg->getMask();
2792 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2793
2794 Register AndMaskSrc = LiveIn;
2795
2796 if (Shift != 0) {
2797 auto ShiftAmt = B.buildConstant(S32, Shift);
2798 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2799 }
2800
2801 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2802 } else {
2803 B.buildCopy(DstReg, LiveIn);
2804 }
2805
2806 return true;
2807}
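
When an argument is masked, several preloaded values share one 32-bit register and each is recovered with a right shift by the mask's trailing zero count followed by an AND. A scalar sketch (the mask is assumed non-zero, as the lowering assumes):

#include <cstdint>

static uint32_t decode_masked_arg(uint32_t packed, uint32_t mask) {
  unsigned shift = 0;                  // countTrailingZeros(mask)
  while (!((mask >> shift) & 1))
    ++shift;
  return (packed >> shift) & (mask >> shift);
}
// e.g. with mask = 0x3ffu << 10, this yields bits 10..19 of the packed register.
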
2808
2809bool AMDGPULegalizerInfo::loadInputValue(
2810 Register DstReg, MachineIRBuilder &B,
2811 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2812 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2813 const ArgDescriptor *Arg;
2814 const TargetRegisterClass *ArgRC;
2815 LLT ArgTy;
2816 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2817
2818 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2819 return false; // TODO: Handle these
2820 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2821}
2822
2823bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2824 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2825 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2826 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2827 return false;
2828
2829 MI.eraseFromParent();
2830 return true;
2831}
2832
2833bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2834 MachineRegisterInfo &MRI,
2835 MachineIRBuilder &B) const {
2836 Register Dst = MI.getOperand(0).getReg();
2837 LLT DstTy = MRI.getType(Dst);
2838 LLT S16 = LLT::scalar(16);
2839 LLT S32 = LLT::scalar(32);
2840 LLT S64 = LLT::scalar(64);
2841
2842 if (DstTy == S16)
2843 return legalizeFDIV16(MI, MRI, B);
2844 if (DstTy == S32)
2845 return legalizeFDIV32(MI, MRI, B);
2846 if (DstTy == S64)
2847 return legalizeFDIV64(MI, MRI, B);
2848
2849 return false;
2850}
2851
2852void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2853 Register DstDivReg,
2854 Register DstRemReg,
2855 Register X,
2856 Register Y) const {
2857 const LLT S1 = LLT::scalar(1);
2858 const LLT S32 = LLT::scalar(32);
2859
2860 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2861 // algorithm used here.
2862
2863 // Initial estimate of inv(y).
2864 auto FloatY = B.buildUITOFP(S32, Y);
2865 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2866 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2867 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2868 auto Z = B.buildFPTOUI(S32, ScaledY);
2869
2870 // One round of UNR.
2871 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2872 auto NegYZ = B.buildMul(S32, NegY, Z);
2873 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2874
2875 // Quotient/remainder estimate.
2876 auto Q = B.buildUMulH(S32, X, Z);
2877 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2878
2879 // First quotient/remainder refinement.
2880 auto One = B.buildConstant(S32, 1);
2881 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2882 if (DstDivReg)
2883 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2884 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2885
2886 // Second quotient/remainder refinement.
2887 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2888 if (DstDivReg)
2889 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2890
2891 if (DstRemReg)
2892 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2893}
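
A scalar transcription of the sequence above for nonzero y, with 1.0f / y standing in for G_AMDGPU_RCP_IFLAG and the same 0x4f7ffffe scale constant; the two trailing conditional fix-ups correspond to the two quotient/remainder refinements:

#include <cstdint>
#include <cstring>

static void udiv32_approx(uint32_t x, uint32_t y, uint32_t &q, uint32_t &r) {
  const uint32_t ScaleBits = 0x4f7ffffe;       // just below 2^32, as a float
  float scale;
  std::memcpy(&scale, &ScaleBits, sizeof(scale));
  float rcp = 1.0f / static_cast<float>(y);    // stand-in for RCP_IFLAG
  uint32_t z = static_cast<uint32_t>(rcp * scale);

  // One round of unsigned Newton-Raphson: z += umulh(z, (0 - y) * z).
  uint32_t neg_yz = (0u - y) * z;
  z += static_cast<uint32_t>((static_cast<uint64_t>(z) * neg_yz) >> 32);

  // Quotient/remainder estimate, then two correction rounds.
  q = static_cast<uint32_t>((static_cast<uint64_t>(x) * z) >> 32);
  r = x - q * y;
  if (r >= y) { ++q; r -= y; }
  if (r >= y) { ++q; r -= y; }
}
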
2894
2895// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2896//
2897// Return lo, hi of result
2898//
2899// %cvt.lo = G_UITOFP Val.lo
2900// %cvt.hi = G_UITOFP Val.hi
2901// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2902// %rcp = G_AMDGPU_RCP_IFLAG %mad
2903// %mul1 = G_FMUL %rcp, 0x5f7ffffc
2904// %mul2 = G_FMUL %mul1, 2**(-32)
2905// %trunc = G_INTRINSIC_TRUNC %mul2
2906// %mad2 = G_FMAD %trunc, -(2**32), %mul1
2907// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2908static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2909 Register Val) {
2910 const LLT S32 = LLT::scalar(32);
2911 auto Unmerge = B.buildUnmerge(S32, Val);
2912
2913 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2914 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2915
2916 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2917 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2918
2919 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2920 auto Mul1 =
2921 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2922
2923 // 2**(-32)
2924 auto Mul2 =
2925 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2926 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2927
2928 // -(2**32)
2929 auto Mad2 = B.buildFMAD(S32, Trunc,
2930 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2931
2932 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2933 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2934
2935 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2936}
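A standalone sketch of the sequence documented above, as an illustrative model only: the host's 1.0f/x is assumed in place of the hardware RCP_IFLAG, and the name reciprocalU64Model is made up.

#include <cmath>
#include <cstdint>
#include <utility>

// Returns {lo, hi} of an estimate of 2**64 / Val, mirroring emitReciprocalU64.
static std::pair<uint32_t, uint32_t> reciprocalU64Model(uint64_t Val) {
  float CvtLo = (float)(uint32_t)Val;
  float CvtHi = (float)(uint32_t)(Val >> 32);
  float Mad = std::fmaf(CvtHi, 0x1.0p+32f, CvtLo);     // 0x4f800000 == 2**32
  float Rcp = 1.0f / Mad;                              // stands in for G_AMDGPU_RCP_IFLAG
  float Mul1 = Rcp * 0x1.fffff8p+63f;                  // 0x5f7ffffc, just below 2**64
  float Trunc = std::truncf(Mul1 * 0x1.0p-32f);        // 0x2f800000 == 2**(-32)
  float Mad2 = std::fmaf(Trunc, -0x1.0p+32f, Mul1);    // 0xcf800000 == -(2**32)
  return {(uint32_t)Mad2, (uint32_t)Trunc};            // {ResultLo, ResultHi}
}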
2937
2938void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2939 Register DstDivReg,
2940 Register DstRemReg,
2941 Register Numer,
2942 Register Denom) const {
2943 const LLT S32 = LLT::scalar(32);
2944 const LLT S64 = LLT::scalar(64);
2945 const LLT S1 = LLT::scalar(1);
2946 Register RcpLo, RcpHi;
2947
2948 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2949
2950 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2951
2952 auto Zero64 = B.buildConstant(S64, 0);
2953 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2954
2955 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2956 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2957
2958 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2959 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2960 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2961
2962 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2963 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2964 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2965 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2966
2967 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2968 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2969 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2970 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2971 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2972
2973 auto Zero32 = B.buildConstant(S32, 0);
2974 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2975 auto Add2_HiC =
2976 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2977 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2978 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2979
2980 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2981 Register NumerLo = UnmergeNumer.getReg(0);
2982 Register NumerHi = UnmergeNumer.getReg(1);
2983
2984 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2985 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2986 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2987 Register Mul3_Lo = UnmergeMul3.getReg(0);
2988 Register Mul3_Hi = UnmergeMul3.getReg(1);
2989 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2990 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2991 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2992 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2993
2994 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2995 Register DenomLo = UnmergeDenom.getReg(0);
2996 Register DenomHi = UnmergeDenom.getReg(1);
2997
2998 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2999 auto C1 = B.buildSExt(S32, CmpHi);
3000
3001 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3002 auto C2 = B.buildSExt(S32, CmpLo);
3003
3004 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3005 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3006
3007 // TODO: Here and below, portions of the code could be enclosed in if/endif.
3008 // Currently the control flow is unconditional and we have 4 selects after the
3009 // potential endif to substitute for PHIs.
3010
3011 // if C3 != 0 ...
3012 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3013 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3014 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3015 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3016
3017 auto One64 = B.buildConstant(S64, 1);
3018 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3019
3020 auto C4 =
3021 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3022 auto C5 =
3023 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3024 auto C6 = B.buildSelect(
3025 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3026
3027 // if (C6 != 0)
3028 auto Add4 = B.buildAdd(S64, Add3, One64);
3029 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3030
3031 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3032 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3033 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3034
3035 // endif C6
3036 // endif C3
3037
3038 if (DstDivReg) {
3039 auto Sel1 = B.buildSelect(
3040 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3041 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3042 Sel1, MulHi3);
3043 }
3044
3045 if (DstRemReg) {
3046 auto Sel2 = B.buildSelect(
3047 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3048 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3049 Sel2, Sub1);
3050 }
3051}
3052
3053bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3054 MachineRegisterInfo &MRI,
3055 MachineIRBuilder &B) const {
3056 Register DstDivReg, DstRemReg;
3057 switch (MI.getOpcode()) {
3058 default:
3059 llvm_unreachable("Unexpected opcode!");
3060 case AMDGPU::G_UDIV: {
3061 DstDivReg = MI.getOperand(0).getReg();
3062 break;
3063 }
3064 case AMDGPU::G_UREM: {
3065 DstRemReg = MI.getOperand(0).getReg();
3066 break;
3067 }
3068 case AMDGPU::G_UDIVREM: {
3069 DstDivReg = MI.getOperand(0).getReg();
3070 DstRemReg = MI.getOperand(1).getReg();
3071 break;
3072 }
3073 }
3074
3075 const LLT S64 = LLT::scalar(64);
3076 const LLT S32 = LLT::scalar(32);
3077 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3078 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3079 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3080 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3081
3082 if (Ty == S32)
3083 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3084 else if (Ty == S64)
3085 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3086 else
3087 return false;
3088
3089 MI.eraseFromParent();
3090 return true;
3091}
3092
3093bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3094 MachineRegisterInfo &MRI,
3095 MachineIRBuilder &B) const {
3096 const LLT S64 = LLT::scalar(64);
3097 const LLT S32 = LLT::scalar(32);
3098
3099 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3100 if (Ty != S32 && Ty != S64)
3101 return false;
3102
3103 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3104 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3105 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3106
3107 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3108 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3109 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3110
3111 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3112 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3113
3114 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3115 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3116
3117 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3118 switch (MI.getOpcode()) {
3119 default:
3120 llvm_unreachable("Unexpected opcode!");
3121 case AMDGPU::G_SDIV: {
3122 DstDivReg = MI.getOperand(0).getReg();
3123 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3124 break;
3125 }
3126 case AMDGPU::G_SREM: {
3127 DstRemReg = MI.getOperand(0).getReg();
3128 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3129 break;
3130 }
3131 case AMDGPU::G_SDIVREM: {
3132 DstDivReg = MI.getOperand(0).getReg();
3133 DstRemReg = MI.getOperand(1).getReg();
3134 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3135 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3136 break;
3137 }
3138 }
3139
3140 if (Ty == S32)
3141 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3142 else
3143 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3144
3145 if (DstDivReg) {
3146 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3147 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3148 B.buildSub(DstDivReg, SignXor, Sign);
3149 }
3150
3151 if (DstRemReg) {
3152 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3153 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3154 B.buildSub(DstRemReg, SignXor, Sign);
3155 }
3156
3157 MI.eraseFromParent();
3158 return true;
3159}
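A minimal scalar sketch of the sign handling above, assuming plain '/' and '%' in place of the unsigned expansion; names are illustrative, not from the file.

#include <cstdint>
#include <utility>

static std::pair<int32_t, int32_t> sdivrem32Model(int32_t X, int32_t Y) {
  uint32_t XSign = X < 0 ? ~0u : 0u;              // matches the AShr by (bits - 1)
  uint32_t YSign = Y < 0 ? ~0u : 0u;
  uint32_t AbsX = ((uint32_t)X + XSign) ^ XSign;  // (x + sign) ^ sign == |x|
  uint32_t AbsY = ((uint32_t)Y + YSign) ^ YSign;
  uint32_t Q = AbsX / AbsY;                       // stands in for the unsigned expansion
  uint32_t R = AbsX % AbsY;
  uint32_t QSign = XSign ^ YSign;                 // quotient sign
  int32_t Div = (int32_t)((Q ^ QSign) - QSign);   // (q ^ s) - s re-applies the sign
  int32_t Rem = (int32_t)((R ^ XSign) - XSign);   // remainder takes the sign of LHS
  return {Div, Rem};
}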
3160
3161bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3162 MachineRegisterInfo &MRI,
3163 MachineIRBuilder &B) const {
3164 Register Res = MI.getOperand(0).getReg();
3165 Register LHS = MI.getOperand(1).getReg();
3166 Register RHS = MI.getOperand(2).getReg();
3167 uint16_t Flags = MI.getFlags();
3168 LLT ResTy = MRI.getType(Res);
3169
3170 const MachineFunction &MF = B.getMF();
3171 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3172 MI.getFlag(MachineInstr::FmAfn);
3173
3174 if (!AllowInaccurateRcp)
3175 return false;
3176
3177 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3178 // 1 / x -> RCP(x)
3179 if (CLHS->isExactlyValue(1.0)) {
3180 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3181 .addUse(RHS)
3182 .setMIFlags(Flags);
3183
3184 MI.eraseFromParent();
3185 return true;
3186 }
3187
3188 // -1 / x -> RCP( FNEG(x) )
3189 if (CLHS->isExactlyValue(-1.0)) {
3190 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3191 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3192 .addUse(FNeg.getReg(0))
3193 .setMIFlags(Flags);
3194
3195 MI.eraseFromParent();
3196 return true;
3197 }
3198 }
3199
3200 // x / y -> x * (1.0 / y)
3201 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3202 .addUse(RHS)
3203 .setMIFlags(Flags);
3204 B.buildFMul(Res, LHS, RCP, Flags);
3205
3206 MI.eraseFromParent();
3207 return true;
3208}
3209
3210bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3211 MachineRegisterInfo &MRI,
3212 MachineIRBuilder &B) const {
3213 Register Res = MI.getOperand(0).getReg();
3214 Register X = MI.getOperand(1).getReg();
3215 Register Y = MI.getOperand(2).getReg();
3216 uint16_t Flags = MI.getFlags();
3217 LLT ResTy = MRI.getType(Res);
3218
3219 const MachineFunction &MF = B.getMF();
3220 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3221 MI.getFlag(MachineInstr::FmAfn);
3222
3223 if (!AllowInaccurateRcp)
3224 return false;
3225
3226 auto NegY = B.buildFNeg(ResTy, Y);
3227 auto One = B.buildFConstant(ResTy, 1.0);
3228
3229 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3230 .addUse(Y)
3231 .setMIFlags(Flags);
3232
3233 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3234 R = B.buildFMA(ResTy, Tmp0, R, R);
3235
3236 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3237 R = B.buildFMA(ResTy, Tmp1, R, R);
3238
3239 auto Ret = B.buildFMul(ResTy, X, R);
3240 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3241
3242 B.buildFMA(Res, Tmp2, R, Ret);
3243 MI.eraseFromParent();
3244 return true;
3245}
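As a plain-C++ sketch of the refinement sequence above: std::fma is assumed in place of the G_FMA nodes and a host 1.0/y in place of amdgcn.rcp; the function name is invented.

#include <cmath>

static double fastFDiv64Model(double X, double Y) {
  double NegY = -Y;
  double R = 1.0 / Y;                              // stands in for amdgcn.rcp
  R = std::fma(std::fma(NegY, R, 1.0), R, R);      // Tmp0, first refinement
  R = std::fma(std::fma(NegY, R, 1.0), R, R);      // Tmp1, second refinement
  double Ret = X * R;
  return std::fma(std::fma(NegY, Ret, X), R, Ret); // Tmp2, residual correction
}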
3246
3247bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3248 MachineRegisterInfo &MRI,
3249 MachineIRBuilder &B) const {
3250 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3251 return true;
3252
3253 Register Res = MI.getOperand(0).getReg();
3254 Register LHS = MI.getOperand(1).getReg();
3255 Register RHS = MI.getOperand(2).getReg();
3256
3257 uint16_t Flags = MI.getFlags();
3258
3259 LLT S16 = LLT::scalar(16);
3260 LLT S32 = LLT::scalar(32);
3261
3262 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3263 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3264
3265 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3266 .addUse(RHSExt.getReg(0))
3267 .setMIFlags(Flags);
3268
3269 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3270 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3271
3272 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3273 .addUse(RDst.getReg(0))
3274 .addUse(RHS)
3275 .addUse(LHS)
3276 .setMIFlags(Flags);
3277
3278 MI.eraseFromParent();
3279 return true;
3280}
3281
3282// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3283// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3284static void toggleSPDenormMode(bool Enable,
3285 MachineIRBuilder &B,
3286 const GCNSubtarget &ST,
3287 AMDGPU::SIModeRegisterDefaults Mode) {
3288 // Set SP denorm mode to this value.
3289 unsigned SPDenormMode =
3290 Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();
3291
3292 if (ST.hasDenormModeInst()) {
3293 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3294 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3295
3296 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3297 B.buildInstr(AMDGPU::S_DENORM_MODE)
3298 .addImm(NewDenormModeValue);
3299
3300 } else {
3301 // Select FP32 bit field in mode register.
3302 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3303 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3304 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3305
3306 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3307 .addImm(SPDenormMode)
3308 .addImm(SPDenormModeBitField);
3309 }
3310}
3311
3312bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3313 MachineRegisterInfo &MRI,
3314 MachineIRBuilder &B) const {
3315 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3316 return true;
3317
3318 Register Res = MI.getOperand(0).getReg();
3319 Register LHS = MI.getOperand(1).getReg();
3320 Register RHS = MI.getOperand(2).getReg();
3321 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3322 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3323
3324 uint16_t Flags = MI.getFlags();
3325
3326 LLT S32 = LLT::scalar(32);
3327 LLT S1 = LLT::scalar(1);
3328
3329 auto One = B.buildFConstant(S32, 1.0f);
3330
3331 auto DenominatorScaled =
3332 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3333 .addUse(LHS)
3334 .addUse(RHS)
3335 .addImm(0)
3336 .setMIFlags(Flags);
3337 auto NumeratorScaled =
3338 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3339 .addUse(LHS)
3340 .addUse(RHS)
3341 .addImm(1)
3342 .setMIFlags(Flags);
3343
3344 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3345 .addUse(DenominatorScaled.getReg(0))
3346 .setMIFlags(Flags);
3347 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3348
3349 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3350 // aren't modeled as reading it.
3351 if (!Mode.allFP32Denormals())
3352 toggleSPDenormMode(true, B, ST, Mode);
3353
3354 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3355 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3356 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3357 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3358 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3359 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3360
3361 if (!Mode.allFP32Denormals())
3362 toggleSPDenormMode(false, B, ST, Mode);
3363
3364 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3365 .addUse(Fma4.getReg(0))
3366 .addUse(Fma1.getReg(0))
3367 .addUse(Fma3.getReg(0))
3368 .addUse(NumeratorScaled.getReg(1))
3369 .setMIFlags(Flags);
3370
3371 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3372 .addUse(Fmas.getReg(0))
3373 .addUse(RHS)
3374 .addUse(LHS)
3375 .setMIFlags(Flags);
3376
3377 MI.eraseFromParent();
3378 return true;
3379}
3380
3381bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3382 MachineRegisterInfo &MRI,
3383 MachineIRBuilder &B) const {
3384 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3385 return true;
3386
3387 Register Res = MI.getOperand(0).getReg();
3388 Register LHS = MI.getOperand(1).getReg();
3389 Register RHS = MI.getOperand(2).getReg();
3390
3391 uint16_t Flags = MI.getFlags();
3392
3393 LLT S64 = LLT::scalar(64);
3394 LLT S1 = LLT::scalar(1);
3395
3396 auto One = B.buildFConstant(S64, 1.0);
3397
3398 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3399 .addUse(LHS)
3400 .addUse(RHS)
3401 .addImm(0)
3402 .setMIFlags(Flags);
3403
3404 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3405
3406 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3407 .addUse(DivScale0.getReg(0))
3408 .setMIFlags(Flags);
3409
3410 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3411 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3412 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3413
3414 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3415 .addUse(LHS)
3416 .addUse(RHS)
3417 .addImm(1)
3418 .setMIFlags(Flags);
3419
3420 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3421 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3422 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3423
3424 Register Scale;
3425 if (!ST.hasUsableDivScaleConditionOutput()) {
3426 // Work around a hardware bug on SI where the condition output from div_scale
3427 // is not usable.
3428
3429 LLT S32 = LLT::scalar(32);
3430
3431 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3432 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3433 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3434 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3435
3436 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3437 Scale1Unmerge.getReg(1));
3438 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3439 Scale0Unmerge.getReg(1));
3440 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3441 } else {
3442 Scale = DivScale1.getReg(1);
3443 }
3444
3445 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3446 .addUse(Fma4.getReg(0))
3447 .addUse(Fma3.getReg(0))
3448 .addUse(Mul.getReg(0))
3449 .addUse(Scale)
3450 .setMIFlags(Flags);
3451
3452 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3453 .addUse(Fmas.getReg(0))
3454 .addUse(RHS)
3455 .addUse(LHS)
3456 .setMIFlags(Flags);
3457
3458 MI.eraseFromParent();
3459 return true;
3460}
3461
3462bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3463 MachineRegisterInfo &MRI,
3464 MachineIRBuilder &B) const {
3465 Register Res = MI.getOperand(0).getReg();
3466 Register LHS = MI.getOperand(2).getReg();
3467 Register RHS = MI.getOperand(3).getReg();
3468 uint16_t Flags = MI.getFlags();
3469
3470 LLT S32 = LLT::scalar(32);
3471 LLT S1 = LLT::scalar(1);
3472
3473 auto Abs = B.buildFAbs(S32, RHS, Flags);
3474 const APFloat C0Val(1.0f);
3475
3476 auto C0 = B.buildConstant(S32, 0x6f800000);
3477 auto C1 = B.buildConstant(S32, 0x2f800000);
3478 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3479
3480 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3481 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3482
3483 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3484
3485 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3486 .addUse(Mul0.getReg(0))
3487 .setMIFlags(Flags);
3488
3489 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3490
3491 B.buildFMul(Res, Sel, Mul1, Flags);
3492
3493 MI.eraseFromParent();
3494 return true;
3495}
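A rough scalar model of the scaling trick above; host std::fabs and 1.0f/x are assumed in place of the hardware operations, and the name is illustrative.

#include <cmath>

static float fdivFastModel(float LHS, float RHS) {
  const float C0 = 0x1.0p+96f;              // bit pattern 0x6f800000
  const float C1 = 0x1.0p-32f;              // bit pattern 0x2f800000
  float Sel = std::fabs(RHS) > C0 ? C1 : 1.0f;
  float Mul0 = RHS * Sel;                   // pre-scale huge denominators
  float Rcp = 1.0f / Mul0;                  // stands in for amdgcn.rcp
  float Mul1 = LHS * Rcp;
  return Sel * Mul1;                        // undo the scale on the result
}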
3496
3497// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3498// FIXME: Why do we handle this one but not other removed instructions?
3499//
3500// Reciprocal square root. The clamp prevents infinite results, clamping
3501// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3502// +-max_float.
3503bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3504 MachineRegisterInfo &MRI,
3505 MachineIRBuilder &B) const {
3506 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3507 return true;
3508
3509 Register Dst = MI.getOperand(0).getReg();
3510 Register Src = MI.getOperand(2).getReg();
3511 auto Flags = MI.getFlags();
3512
3513 LLT Ty = MRI.getType(Dst);
3514
3515 const fltSemantics *FltSemantics;
3516 if (Ty == LLT::scalar(32))
3517 FltSemantics = &APFloat::IEEEsingle();
3518 else if (Ty == LLT::scalar(64))
3519 FltSemantics = &APFloat::IEEEdouble();
3520 else
3521 return false;
3522
3523 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3524 .addUse(Src)
3525 .setMIFlags(Flags);
3526
3527 // We don't need to concern ourselves with the snan handling difference, since
3528 // the rsq has already quieted it (or not); use the variant that will select directly.
3529 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3530 const bool UseIEEE = MFI->getMode().IEEE;
3531
3532 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3533 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3534 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3535
3536 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3537
3538 if (UseIEEE)
3539 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3540 else
3541 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3542 MI.eraseFromParent();
3543 return true;
3544}
3545
3546static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3547 switch (IID) {
3548 case Intrinsic::amdgcn_ds_fadd:
3549 return AMDGPU::G_ATOMICRMW_FADD;
3550 case Intrinsic::amdgcn_ds_fmin:
3551 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3552 case Intrinsic::amdgcn_ds_fmax:
3553 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3554 default:
3555 llvm_unreachable("not a DS FP intrinsic");
3556 }
3557}
3558
3559bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3560 MachineInstr &MI,
3561 Intrinsic::ID IID) const {
3562 GISelChangeObserver &Observer = Helper.Observer;
3563 Observer.changingInstr(MI);
3564
3565 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3566
3567 // The remaining operands were used to set fields in the MemOperand on
3568 // construction.
3569 for (int I = 6; I > 3; --I)
3570 MI.RemoveOperand(I);
3571
3572 MI.RemoveOperand(1); // Remove the intrinsic ID.
3573 Observer.changedInstr(MI);
3574 return true;
3575}
3576
3577bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3578 MachineRegisterInfo &MRI,
3579 MachineIRBuilder &B) const {
3580 uint64_t Offset =
3581 ST.getTargetLowering()->getImplicitParameterOffset(
3582 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3583 LLT DstTy = MRI.getType(DstReg);
3584 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3585
3586 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3587 if (!loadInputValue(KernargPtrReg, B,
3588 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3589 return false;
3590
3591 // FIXME: This should be nuw
3592 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3593 return true;
3594}
3595
3596bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3597 MachineRegisterInfo &MRI,
3598 MachineIRBuilder &B) const {
3599 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3600 if (!MFI->isEntryFunction()) {
3601 return legalizePreloadedArgIntrin(MI, MRI, B,
3602 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3603 }
3604
3605 Register DstReg = MI.getOperand(0).getReg();
3606 if (!getImplicitArgPtr(DstReg, MRI, B))
3607 return false;
3608
3609 MI.eraseFromParent();
3610 return true;
3611}
3612
3613bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3614 MachineRegisterInfo &MRI,
3615 MachineIRBuilder &B,
3616 unsigned AddrSpace) const {
3617 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3618 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3619 Register Hi32 = Unmerge.getReg(1);
3620
3621 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3622 MI.eraseFromParent();
3623 return true;
3624}
3625
3626// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3627// offset (the offset that is included in bounds checking and swizzling, to be
3628// split between the instruction's voffset and immoffset fields) and soffset
3629// (the offset that is excluded from bounds checking and swizzling, to go in
3630// the instruction's soffset field). This function takes the first kind of
3631// offset and figures out how to split it between voffset and immoffset.
3632std::tuple<Register, unsigned, unsigned>
3633AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3634 Register OrigOffset) const {
3635 const unsigned MaxImm = 4095;
3636 Register BaseReg;
3637 unsigned TotalConstOffset;
3638 const LLT S32 = LLT::scalar(32);
3639 MachineRegisterInfo &MRI = *B.getMRI();
3640
3641 std::tie(BaseReg, TotalConstOffset) =
3642 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3643
3644 unsigned ImmOffset = TotalConstOffset;
3645
3646 // If BaseReg is a pointer, convert it to int.
3647 if (MRI.getType(BaseReg).isPointer())
3648 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3649
3650 // If the immediate value is too big for the immoffset field, put the value
3651 // and -4096 into the immoffset field so that the value that is copied/added
3652 // for the voffset field is a multiple of 4096, and it stands more chance
3653 // of being CSEd with the copy/add for another similar load/store.
3654 // However, do not do that rounding down to a multiple of 4096 if that is a
3655 // negative number, as it appears to be illegal to have a negative offset
3656 // in the vgpr, even if adding the immediate offset makes it positive.
3657 unsigned Overflow = ImmOffset & ~MaxImm;
3658 ImmOffset -= Overflow;
3659 if ((int32_t)Overflow < 0) {
3660 Overflow += ImmOffset;
3661 ImmOffset = 0;
3662 }
3663
3664 if (Overflow != 0) {
3665 if (!BaseReg) {
3666 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3667 } else {
3668 auto OverflowVal = B.buildConstant(S32, Overflow);
3669 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3670 }
3671 }
3672
3673 if (!BaseReg)
3674 BaseReg = B.buildConstant(S32, 0).getReg(0);
3675
3676 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3677}
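The arithmetic above can be summarized by this small standalone model; the constants mirror the code and the helper name is illustrative.

#include <cstdint>
#include <utility>

// Returns {RegisterOffset, ImmOffset}: at most 12 bits stay in the immediate and
// any 4096-aligned excess moves to the register; if the rounded-down excess would
// be negative, the whole offset goes to the register and the immediate is 0.
static std::pair<uint32_t, uint32_t> splitOffsetModel(uint32_t TotalConstOffset) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = TotalConstOffset;
  uint32_t Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}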
3678
3679/// Handle register layout difference for f16 images for some subtargets.
3680Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3681 MachineRegisterInfo &MRI,
3682 Register Reg,
3683 bool ImageStore) const {
3684 const LLT S16 = LLT::scalar(16);
3685 const LLT S32 = LLT::scalar(32);
3686 LLT StoreVT = MRI.getType(Reg);
3687 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3688
3689 if (ST.hasUnpackedD16VMem()) {
3690 auto Unmerge = B.buildUnmerge(S16, Reg);
3691
3692 SmallVector<Register, 4> WideRegs;
3693 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3694 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3695
3696 int NumElts = StoreVT.getNumElements();
3697
3698 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3699 }
3700
3701 if (ImageStore && ST.hasImageStoreD16Bug()) {
3702 if (StoreVT.getNumElements() == 2) {
3703 SmallVector<Register, 4> PackedRegs;
3704 Reg = B.buildBitcast(S32, Reg).getReg(0);
3705 PackedRegs.push_back(Reg);
3706 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3707 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3708 }
3709
3710 if (StoreVT.getNumElements() == 3) {
3711 SmallVector<Register, 4> PackedRegs;
3712 auto Unmerge = B.buildUnmerge(S16, Reg);
3713 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3714 PackedRegs.push_back(Unmerge.getReg(I));
3715 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3716 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3717 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3718 }
3719
3720 if (StoreVT.getNumElements() == 4) {
3721 SmallVector<Register, 4> PackedRegs;
3722 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3723 auto Unmerge = B.buildUnmerge(S32, Reg);
3724 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3725 PackedRegs.push_back(Unmerge.getReg(I));
3726 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3727 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3728 }
3729
3730 llvm_unreachable("invalid data type");
3731 }
3732
3733 return Reg;
3734}
3735
3736Register AMDGPULegalizerInfo::fixStoreSourceType(
3737 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3738 MachineRegisterInfo *MRI = B.getMRI();
3739 LLT Ty = MRI->getType(VData);
3740
3741 const LLT S16 = LLT::scalar(16);
3742
3743 // Fixup illegal register types for i8 stores.
3744 if (Ty == LLT::scalar(8) || Ty == S16) {
3745 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3746 return AnyExt;
3747 }
3748
3749 if (Ty.isVector()) {
3750 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3751 if (IsFormat)
3752 return handleD16VData(B, *MRI, VData);
3753 }
3754 }
3755
3756 return VData;
3757}
3758
3759bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3760 MachineRegisterInfo &MRI,
3761 MachineIRBuilder &B,
3762 bool IsTyped,
3763 bool IsFormat) const {
3764 Register VData = MI.getOperand(1).getReg();
3765 LLT Ty = MRI.getType(VData);
3766 LLT EltTy = Ty.getScalarType();
3767 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3768 const LLT S32 = LLT::scalar(32);
3769
3770 VData = fixStoreSourceType(B, VData, IsFormat);
3771 Register RSrc = MI.getOperand(2).getReg();
3772
3773 MachineMemOperand *MMO = *MI.memoperands_begin();
3774 const int MemSize = MMO->getSize();
3775
3776 unsigned ImmOffset;
3777 unsigned TotalOffset;
3778
3779 // The typed intrinsics add an immediate after the registers.
3780 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3781
3782 // The struct intrinsic variants add one additional operand over raw.
3783 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3784 Register VIndex;
3785 int OpOffset = 0;
3786 if (HasVIndex) {
3787 VIndex = MI.getOperand(3).getReg();
3788 OpOffset = 1;
3789 }
3790
3791 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3792 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3793
3794 unsigned Format = 0;
3795 if (IsTyped) {
3796 Format = MI.getOperand(5 + OpOffset).getImm();
3797 ++OpOffset;
3798 }
3799
3800 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3801
3802 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3803 if (TotalOffset != 0)
3804 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3805
3806 unsigned Opc;
3807 if (IsTyped) {
3808 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3809 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3810 } else if (IsFormat) {
3811 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3812 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3813 } else {
3814 switch (MemSize) {
3815 case 1:
3816 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3817 break;
3818 case 2:
3819 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3820 break;
3821 default:
3822 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3823 break;
3824 }
3825 }
3826
3827 if (!VIndex)
3828 VIndex = B.buildConstant(S32, 0).getReg(0);
3829
3830 auto MIB = B.buildInstr(Opc)
3831 .addUse(VData) // vdata
3832 .addUse(RSrc) // rsrc
3833 .addUse(VIndex) // vindex
3834 .addUse(VOffset) // voffset
3835 .addUse(SOffset) // soffset
3836 .addImm(ImmOffset); // offset(imm)
3837
3838 if (IsTyped)
3839 MIB.addImm(Format);
3840
3841 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3842 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3843 .addMemOperand(MMO);
3844
3845 MI.eraseFromParent();
3846 return true;
3847}
3848
3849bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3850 MachineRegisterInfo &MRI,
3851 MachineIRBuilder &B,
3852 bool IsFormat,
3853 bool IsTyped) const {
3854 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3855 MachineMemOperand *MMO = *MI.memoperands_begin();
3856 const int MemSize = MMO->getSize();
3857 const LLT S32 = LLT::scalar(32);
3858
3859 Register Dst = MI.getOperand(0).getReg();
3860 Register RSrc = MI.getOperand(2).getReg();
3861
3862 // The typed intrinsics add an immediate after the registers.
3863 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3864
3865 // The struct intrinsic variants add one additional operand over raw.
3866 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3867 Register VIndex;
3868 int OpOffset = 0;
3869 if (HasVIndex) {
3870 VIndex = MI.getOperand(3).getReg();
3871 OpOffset = 1;
3872 }
3873
3874 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3875 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3876
3877 unsigned Format = 0;
3878 if (IsTyped) {
3879 Format = MI.getOperand(5 + OpOffset).getImm();
3880 ++OpOffset;
3881 }
3882
3883 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3884 unsigned ImmOffset;
3885 unsigned TotalOffset;
3886
3887 LLT Ty = MRI.getType(Dst);
3888 LLT EltTy = Ty.getScalarType();
3889 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3890 const bool Unpacked = ST.hasUnpackedD16VMem();
3891
3892 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3893 if (TotalOffset != 0)
3894 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3895
3896 unsigned Opc;
3897
3898 if (IsTyped) {
3899 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3900 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3901 } else if (IsFormat) {
3902 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3903 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3904 } else {
3905 switch (MemSize) {
3906 case 1:
3907 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3908 break;
3909 case 2:
3910 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3911 break;
3912 default:
3913 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3914 break;
3915 }
3916 }
3917
3918 Register LoadDstReg;
3919
3920 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3921 LLT UnpackedTy = Ty.changeElementSize(32);
3922
3923 if (IsExtLoad)
3924 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3925 else if (Unpacked && IsD16 && Ty.isVector())
3926 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3927 else
3928 LoadDstReg = Dst;
3929
3930 if (!VIndex)
3931 VIndex = B.buildConstant(S32, 0).getReg(0);
3932
3933 auto MIB = B.buildInstr(Opc)
3934 .addDef(LoadDstReg) // vdata
3935 .addUse(RSrc) // rsrc
3936 .addUse(VIndex) // vindex
3937 .addUse(VOffset) // voffset
3938 .addUse(SOffset) // soffset
3939 .addImm(ImmOffset); // offset(imm)
3940
3941 if (IsTyped)
3942 MIB.addImm(Format);
3943
3944 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3945 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3946 .addMemOperand(MMO);
3947
3948 if (LoadDstReg != Dst) {
3949 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3950
3951 // The result was widened for the extending load; truncate back to the original type.
3952 if (IsExtLoad)
3953 B.buildTrunc(Dst, LoadDstReg);
3954 else {
3955 // Repack to original 16-bit vector result
3956 // FIXME: G_TRUNC should work, but legalization currently fails
3957 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3958 SmallVector<Register, 4> Repack;
3959 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3960 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3961 B.buildMerge(Dst, Repack);
3962 }
3963 }
3964
3965 MI.eraseFromParent();
3966 return true;
3967}
3968
3969bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3970 MachineIRBuilder &B,
3971 bool IsInc) const {
3972 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3973 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3974 B.buildInstr(Opc)
3975 .addDef(MI.getOperand(0).getReg())
3976 .addUse(MI.getOperand(2).getReg())
3977 .addUse(MI.getOperand(3).getReg())
3978 .cloneMemRefs(MI);
3979 MI.eraseFromParent();
3980 return true;
3981}
3982
3983static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3984 switch (IntrID) {
3985 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3986 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3987 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3988 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3989 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3990 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3991 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3992 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3993 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3994 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3995 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3996 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3997 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3998 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3999 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4000 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4001 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4002 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4003 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4004 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4005 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4006 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4007 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4008 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4009 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4010 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4011 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4012 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4013 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4014 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4015 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4016 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4017 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4018 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4019 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4020 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4021 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4022 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4023 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4024 case Intrinsic::amdgcn_buffer_atomic_fadd:
4025 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4026 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4027 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4028 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4029 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4030 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4031 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4032 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4033 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4034 default:
4035 llvm_unreachable("unhandled atomic opcode");
4036 }
4037}
4038
4039bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4040 MachineIRBuilder &B,
4041 Intrinsic::ID IID) const {
4042 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4043 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4044 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4045
4046 Register Dst;
4047
4048 int OpOffset = 0;
4049 if (HasReturn) {
4050 // A few FP atomics do not support return values.
4051 Dst = MI.getOperand(0).getReg();
4052 } else {
4053 OpOffset = -1;
4054 }
4055
4056 Register VData = MI.getOperand(2 + OpOffset).getReg();
4057 Register CmpVal;
4058
4059 if (IsCmpSwap) {
4060 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4061 ++OpOffset;
4062 }
4063
4064 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4065 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4066
4067 // The struct intrinsic variants add one additional operand over raw.
4068 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4069 Register VIndex;
4070 if (HasVIndex) {
4071 VIndex = MI.getOperand(4 + OpOffset).getReg();
4072 ++OpOffset;
4073 }
4074
4075 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4076 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4077 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4078
4079 MachineMemOperand *MMO = *MI.memoperands_begin();
4080
4081 unsigned ImmOffset;
4082 unsigned TotalOffset;
4083 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4084 if (TotalOffset != 0)
4085 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4086
4087 if (!VIndex)
4088 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4089
4090 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4091
4092 if (HasReturn)
4093 MIB.addDef(Dst);
4094
4095 MIB.addUse(VData); // vdata
4096
4097 if (IsCmpSwap)
4098 MIB.addReg(CmpVal);
4099
4100 MIB.addUse(RSrc) // rsrc
4101 .addUse(VIndex) // vindex
4102 .addUse(VOffset) // voffset
4103 .addUse(SOffset) // soffset
4104 .addImm(ImmOffset) // offset(imm)
4105 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4106 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4107 .addMemOperand(MMO);
4108
4109 MI.eraseFromParent();
4110 return true;
4111}
4112
4113/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4114/// vector with s16 typed elements.
4115static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4116 SmallVectorImpl<Register> &PackedAddrs,
4117 unsigned ArgOffset,
4118 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4119 bool IsA16, bool IsG16) {
4120 const LLT S16 = LLT::scalar(16);
4121 const LLT V2S16 = LLT::vector(2, 16);
4122 auto EndIdx = Intr->VAddrEnd;
4123
4124 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4125 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4126 if (!SrcOp.isReg())
4127 continue; // _L to _LZ may have eliminated this.
4128
4129 Register AddrReg = SrcOp.getReg();
4130
4131 if (I < Intr->GradientStart) {
4132 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4133 PackedAddrs.push_back(AddrReg);
4134 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4135 (I >= Intr->CoordStart && !IsA16)) {
4136 // Handle any gradient or coordinate operands that should not be packed
4137 PackedAddrs.push_back(AddrReg);
4138 } else {
4139 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4140 // derivatives dx/dh and dx/dv are packed with undef.
4141 if (((I + 1) >= EndIdx) ||
4142 ((Intr->NumGradients / 2) % 2 == 1 &&
4143 (I == static_cast<unsigned>(Intr->GradientStart +
4144 (Intr->NumGradients / 2) - 1) ||
4145 I == static_cast<unsigned>(Intr->GradientStart +
4146 Intr->NumGradients - 1))) ||
4147 // Check for _L to _LZ optimization
4148 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4149 PackedAddrs.push_back(
4150 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4151 .getReg(0));
4152 } else {
4153 PackedAddrs.push_back(
4154 B.buildBuildVector(
4155 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4156 .getReg(0));
4157 ++I;
4158 }
4159 }
4160 }
4161}
4162
4163/// Convert from separate vaddr components to a single vector address register,
4164/// and replace the remaining operands with $noreg.
4165static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4166 int DimIdx, int NumVAddrs) {
4167 const LLT S32 = LLT::scalar(32);
4168
4169 SmallVector<Register, 8> AddrRegs;
4170 for (int I = 0; I != NumVAddrs; ++I) {
4171 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4172 if (SrcOp.isReg()) {
4173 AddrRegs.push_back(SrcOp.getReg());
4174 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4175 }
4176 }
4177
4178 int NumAddrRegs = AddrRegs.size();
4179 if (NumAddrRegs != 1) {
4180 // Round up to 8 elements for v5-v7
4181 // FIXME: Missing intermediate sized register classes and instructions.
4182 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4183 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4184 auto Undef = B.buildUndef(S32);
4185 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4186 NumAddrRegs = RoundedNumRegs;
4187 }
4188
4189 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4190 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4191 }
4192
4193 for (int I = 1; I != NumVAddrs; ++I) {
4194 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4195 if (SrcOp.isReg())
4196 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4197 }
4198}
4199
4200/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4201///
4202/// Depending on the subtarget, load/store with 16-bit element data need to be
4203/// rewritten to use the low half of 32-bit registers, or directly use a packed
4204/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4205/// registers.
4206///
4207/// We don't want to directly select image instructions just yet, but also want
4208/// to expose all register repacking to the legalizer/combiners. We also don't
4209/// want a selected instruction entering RegBankSelect. In order to avoid
4210/// defining a multitude of intermediate image instructions, directly hack on
4211/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4212/// the now-unnecessary arguments with $noreg.
4213bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4214 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4215 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4216
4217 const unsigned NumDefs = MI.getNumExplicitDefs();
4218 const unsigned ArgOffset = NumDefs + 1;
4219 bool IsTFE = NumDefs == 2;
1
Assuming 'NumDefs' is not equal to 2
4220 // We are only processing the operands of d16 image operations on subtargets
4221 // that use the unpacked register layout, or need to repack the TFE result.
4222
4223 // TODO: Do we need to guard against already legalized intrinsics?
4224 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4225 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4226
4227 MachineRegisterInfo *MRI = B.getMRI();
4228 const LLT S32 = LLT::scalar(32);
4229 const LLT S16 = LLT::scalar(16);
4230 const LLT V2S16 = LLT::vector(2, 16);
4231
4232 unsigned DMask = 0;
4233
4234 // Check for 16 bit addresses and pack if true.
4235 LLT GradTy =
4236 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4237 LLT AddrTy =
4238 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4239 const bool IsG16 = GradTy == S16;
2
Calling 'LLT::operator=='
5
Returning from 'LLT::operator=='
4240 const bool IsA16 = AddrTy == S16;
6
Calling 'LLT::operator=='
8
Returning from 'LLT::operator=='
4241
4242 int DMaskLanes = 0;
4243 if (!BaseOpcode->Atomic) {
9
Assuming field 'Atomic' is true
10
Taking false branch
4244 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4245 if (BaseOpcode->Gather4) {
4246 DMaskLanes = 4;
4247 } else if (DMask != 0) {
4248 DMaskLanes = countPopulation(DMask);
4249 } else if (!IsTFE && !BaseOpcode->Store) {
4250 // If dmask is 0, this is a no-op load. This can be eliminated.
4251 B.buildUndef(MI.getOperand(0));
4252 MI.eraseFromParent();
4253 return true;
4254 }
4255 }
4256
4257 Observer.changingInstr(MI);
4258 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4259
4260 unsigned NewOpcode = NumDefs == 0 ?
11
Assuming 'NumDefs' is not equal to 0
12
'?' condition is false
4261 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4262
4263 // Track that we legalized this
4264 MI.setDesc(B.getTII().get(NewOpcode));
4265
4266 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4267 // dmask to be at least 1, otherwise the instruction will fail.
4268 if (IsTFE && DMask == 0) {
12.1
'IsTFE' is false
4269 DMask = 0x1;
4270 DMaskLanes = 1;
4271 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4272 }
4273
4274 if (BaseOpcode->Atomic) {
12.2
Field 'Atomic' is true
13
Taking true branch
4275 Register VData0 = MI.getOperand(2).getReg();
4276 LLT Ty = MRI->getType(VData0);
4277
4278 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4279 if (Ty.isVector())
14
Calling 'LLT::isVector'
16
Returning from 'LLT::isVector'
17
Taking false branch
4280 return false;
4281
4282 if (BaseOpcode->AtomicX2) {
18
Assuming field 'AtomicX2' is false
19
Taking false branch
4283 Register VData1 = MI.getOperand(3).getReg();
4284 // The two values are packed in one register.
4285 LLT PackedTy = LLT::vector(2, Ty);
4286 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4287 MI.getOperand(2).setReg(Concat.getReg(0));
4288 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4289 }
4290 }
4291
4292 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4293
4294 // Optimize _L to _LZ when _L is zero
4295 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
20
Assuming 'LZMappingInfo' is null
21
Taking false branch
4296 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4297 const ConstantFP *ConstantLod;
4298
4299 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4300 m_GFCst(ConstantLod))) {
4301 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4302 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4303 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4304 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4305 Intr->Dim);
4306
4307 // The starting indexes should remain in the same place.
4308 --CorrectedNumVAddrs;
4309
4310 MI.getOperand(MI.getNumExplicitDefs())
4311 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4312 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4313 Intr = NewImageDimIntr;
4314 }
4315 }
4316 }
4317
4318 // Optimize _mip away, when 'lod' is zero
4319 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
22
Assuming the condition is false
23
Taking false branch
4320 int64_t ConstantLod;
4321 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4322 m_ICst(ConstantLod))) {
4323 if (ConstantLod == 0) {
4324 // TODO: Change the intrinsic opcode and remove the operand instead of replacing
4325 // it with 0, as the _L to _LZ handling is done above.
4326 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4327 --CorrectedNumVAddrs;
4328 }
4329 }
4330 }
4331
4332 // Rewrite the addressing register layout before doing anything else.
4333 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
24
Assuming field 'Gradients' is false
4334 // 16 bit gradients are supported, but are tied to the A16 control
4335 // so both gradients and addresses must be 16 bit
4336 return false;
4337 }
4338
4339 if (IsA16 && !ST.hasA16()) {
24.1
'IsA16' is false
4340 // A16 not supported
4341 return false;
4342 }
4343
4344 if (IsA16 || IsG16) {
24.2
'IsA16' is false
24.3
'IsG16' is false
25
Taking false branch
4345 if (Intr->NumVAddrs > 1) {
4346 SmallVector<Register, 4> PackedRegs;
4347
4348 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4349 IsG16);
4350
4351 // See also below in the non-a16 branch
4352 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4353
4354 if (!UseNSA && PackedRegs.size() > 1) {
4355 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4356 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4357 PackedRegs[0] = Concat.getReg(0);
4358 PackedRegs.resize(1);
4359 }
4360
4361 const unsigned NumPacked = PackedRegs.size();
4362 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4363 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4364 if (!SrcOp.isReg()) {
4365 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4366 continue;
4367 }
4368
4369 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4370
4371 if (I - Intr->VAddrStart < NumPacked)
4372 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4373 else
4374 SrcOp.setReg(AMDGPU::NoRegister);
4375 }
4376 }
4377 } else {
4378 // If the register allocator cannot place the address registers contiguously
4379 // without introducing moves, then using the non-sequential address encoding
4380 // is always preferable, since it saves VALU instructions and is usually a
4381 // wash in terms of code size or even better.
4382 //
4383 // However, we currently have no way of hinting to the register allocator
4384 // that MIMG addresses should be placed contiguously when it is possible to
4385 // do so, so force non-NSA for the common 2-address case as a heuristic.
4386 //
4387 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4388 // allocation when possible.
4389 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
26
Assuming 'CorrectedNumVAddrs' is < 3
4390
4391 if (!UseNSA && Intr->NumVAddrs > 1)
26.1
'UseNSA' is false
27
Assuming field 'NumVAddrs' is <= 1
28
Taking false branch
4392 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4393 Intr->NumVAddrs);
4394 }
4395
4396 int Flags = 0;
4397 if (IsA16)
28.1
'IsA16' is false
29
Taking false branch
4398 Flags |= 1;
4399 if (IsG16)
29.1
'IsG16' is false
30
Taking false branch
4400 Flags |= 2;
4401 MI.addOperand(MachineOperand::CreateImm(Flags));
4402
4403 if (BaseOpcode->Store) { // No TFE for stores?
31
Assuming field 'Store' is false
32
Taking false branch
4404 // TODO: Handle dmask trim
4405 Register VData = MI.getOperand(1).getReg();
4406 LLT Ty = MRI->getType(VData);
4407 if (!Ty.isVector() || Ty.getElementType() != S16)
4408 return true;
4409
4410 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4411 if (RepackedReg != VData) {
4412 MI.getOperand(1).setReg(RepackedReg);
4413 }
4414
4415 return true;
4416 }
4417
4418 Register DstReg = MI.getOperand(0).getReg();
4419 LLT Ty = MRI->getType(DstReg);
4420 const LLT EltTy = Ty.getScalarType();
4421 const bool IsD16 = Ty.getScalarType() == S16;
33
Calling 'LLT::operator=='
36
Returning from 'LLT::operator=='
4422 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
37
Assuming the condition is true
38
'?' condition is true
4423
4424 // Confirm that the return type is large enough for the dmask specified
4425 if (NumElts < DMaskLanes)
38.1
'NumElts' is >= 'DMaskLanes'
39
Taking false branch
4426 return false;
4427
4428 if (NumElts > 4 || DMaskLanes > 4)
40
Assuming 'NumElts' is <= 4
40.1
'DMaskLanes' is <= 4
41
Taking false branch
4429 return false;
4430
4431 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
41.1
'DMaskLanes' is equal to 0
42
'?' condition is true
4432 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4433
4434 // The raw dword aligned data component of the load. The only legal cases
4435 // where this matters should be when using the packed D16 format, for
4436 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4437 LLT RoundedTy;
4438
4439 // S32 vector to cover all data, plus TFE result element.
4440 LLT TFETy;
4441
4442 // Register type to use for each loaded component. Will be S32 or V2S16.
4443 LLT RegTy;
4444
4445 if (IsD16 && ST.hasUnpackedD16VMem()) {
42.1
'IsD16' is false
4446 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4447 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4448 RegTy = S32;
4449 } else {
4450 unsigned EltSize = EltTy.getSizeInBits();
43
Calling 'LLT::getSizeInBits'
46
Returning from 'LLT::getSizeInBits'
47
'EltSize' initialized to 0
4451 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4452 unsigned RoundedSize = 32 * RoundedElts;
4453 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
48
Division by zero
4454 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4455 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4456 }
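The division-by-zero report above arises because the analyzer treats LLT::getSizeInBits() as able to return 0 for EltTy, so RoundedSize / EltSize divides by zero. A minimal defensive sketch, assuming that rejecting the legalization (returning false) is an acceptable policy and reusing the locals from the listing above:

    unsigned EltSize = EltTy.getSizeInBits();
    if (EltSize == 0)  // the analyzer's path: getSizeInBits() "Returning zero"
      return false;    // hypothetical guard; keeps RoundedSize / EltSize well defined
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);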
4457
4458 // The return type does not need adjustment.
4459 // TODO: Should we change s16 case to s32 or <2 x s16>?
4460 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4461 return true;
4462
4463 Register Dst1Reg;
4464
4465 // Insert after the instruction.
4466 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4467
4468 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4469 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4470 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4471 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4472
4473 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4474
4475 MI.getOperand(0).setReg(NewResultReg);
4476
4477 // In the IR, TFE is supposed to be used with a 2 element struct return
4478 // type. The instruction really returns these two values in one contiguous
4479 // register, with one additional dword beyond the loaded data. Rewrite the
4480 // return type to use a single register result.
4481
4482 if (IsTFE) {
4483 Dst1Reg = MI.getOperand(1).getReg();
4484 if (MRI->getType(Dst1Reg) != S32)
4485 return false;
4486
4487 // TODO: Make sure the TFE operand bit is set.
4488 MI.RemoveOperand(1);
4489
4490 // Handle the easy case that requires no repack instructions.
4491 if (Ty == S32) {
4492 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4493 return true;
4494 }
4495 }
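As a worked illustration of the TFE comment above (assumed values, not taken from a specific test): with DMaskLanes == 1 and TFE set, AdjustedNumElts is 1, RoundedSize is 32, TFETy is <2 x s32>, so ResultNumRegs == 2 and NumDataRegs == 1, and the single wide register is split back into the two IR-level results:

    // Sketch only; NewResultReg, DstReg and Dst1Reg mirror the locals in the listing.
    LLT TFETy = LLT::vector(2, 32);                     // data dword plus TFE status dword
    Register NewResultReg = MRI->createGenericVirtualRegister(TFETy);
    MI.getOperand(0).setReg(NewResultReg);              // instruction now defines both dwords
    B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);    // recover the 2-element struct halves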
4496
4497 // Now figure out how to copy the new result register back into the old
4498 // result.
4499 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4500
4501 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4502
4503 if (ResultNumRegs == 1) {
4504 assert(!IsTFE);
4505 ResultRegs[0] = NewResultReg;
4506 } else {
4507 // We have to repack into a new vector of some kind.
4508 for (int I = 0; I != NumDataRegs; ++I)
4509 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4510 B.buildUnmerge(ResultRegs, NewResultReg);
4511
4512 // Drop the final TFE element to get the data part. The TFE result is
4513 // directly written to the right place already.
4514 if (IsTFE)
4515 ResultRegs.resize(NumDataRegs);
4516 }
4517
4518 // For an s16 scalar result, we form an s32 result with a truncate regardless
4519 // of packed vs. unpacked.
4520 if (IsD16 && !Ty.isVector()) {
4521 B.buildTrunc(DstReg, ResultRegs[0]);
4522 return true;
4523 }
4524
4525 // Avoid a build/concat_vector of 1 entry.
4526 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4527 B.buildBitcast(DstReg, ResultRegs[0]);
4528 return true;
4529 }
4530
4531 assert(Ty.isVector());
4532
4533 if (IsD16) {
4534 // For packed D16 results with TFE enabled, all the data components are
4535 // S32. Cast back to the expected type.
4536 //
4537 // TODO: We don't really need to load s32 elements. We would only need one
4538 // cast for the TFE result if a multiple of v2s16 was used.
4539 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4540 for (Register &Reg : ResultRegs)
4541 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4542 } else if (ST.hasUnpackedD16VMem()) {
4543 for (Register &Reg : ResultRegs)
4544 Reg = B.buildTrunc(S16, Reg).getReg(0);
4545 }
4546 }
4547
4548 auto padWithUndef = [&](LLT Ty, int NumElts) {
4549 if (NumElts == 0)
4550 return;
4551 Register Undef = B.buildUndef(Ty).getReg(0);
4552 for (int I = 0; I != NumElts; ++I)
4553 ResultRegs.push_back(Undef);
4554 };
4555
4556 // Pad out any elements eliminated due to the dmask.
4557 LLT ResTy = MRI->getType(ResultRegs[0]);
4558 if (!ResTy.isVector()) {
4559 padWithUndef(ResTy, NumElts - ResultRegs.size());
4560 B.buildBuildVector(DstReg, ResultRegs);
4561 return true;
4562 }
4563
4564 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4565 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4566
4567 // Deal with the one annoying legal case.
4568 const LLT V3S16 = LLT::vector(3, 16);
4569 if (Ty == V3S16) {
4570 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4571 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4572 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4573 return true;
4574 }
4575
4576 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4577 B.buildConcatVectors(DstReg, ResultRegs);
4578 return true;
4579}
4580
4581bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4582 LegalizerHelper &Helper, MachineInstr &MI) const {
4583 MachineIRBuilder &B = Helper.MIRBuilder;
4584 GISelChangeObserver &Observer = Helper.Observer;
4585
4586 Register Dst = MI.getOperand(0).getReg();
4587 LLT Ty = B.getMRI()->getType(Dst);
4588 unsigned Size = Ty.getSizeInBits();
4589 MachineFunction &MF = B.getMF();
4590
4591 Observer.changingInstr(MI);
4592
4593 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4594 Ty = getBitcastRegisterType(Ty);
4595 Helper.bitcastDst(MI, Ty, 0);
4596 Dst = MI.getOperand(0).getReg();
4597 B.setInsertPt(B.getMBB(), MI);
4598 }
4599
4600 // FIXME: We don't really need this intermediate instruction. The intrinsic
4601 // should be fixed to have a memory operand. Since it's readnone, we're not
4602 // allowed to add one.
4603 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4604 MI.RemoveOperand(1); // Remove intrinsic ID
4605
4606 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4607 // TODO: Should this use datalayout alignment?
4608 const unsigned MemSize = (Size + 7) / 8;
4609 const Align MemAlign(4);
4610 MachineMemOperand *MMO = MF.getMachineMemOperand(
4611 MachinePointerInfo(),
4612 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4613 MachineMemOperand::MOInvariant,
4614 MemSize, MemAlign);
4615 MI.addMemOperand(MF, MMO);
4616
4617 // There are no 96-bit result scalar loads, but widening to 128-bit should
4618 // always be legal. We may need to restore this to a 96-bit result if it turns
4619 // out this needs to be converted to a vector load during RegBankSelect.
4620 if (!isPowerOf2_32(Size)) {
4621 if (Ty.isVector())
4622 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4623 else
4624 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4625 }
4626
4627 Observer.changedInstr(MI);
4628 return true;
4629}
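To make the widening comment above concrete (a sketch, not the exact output of getPow2VectorType): a 96-bit <3 x s32> s_buffer_load result is not a power-of-two size, so the destination is widened to a 128-bit <4 x s32>.

    // Illustrative only, using the LLT factories from LowLevelTypeImpl.h.
    LLT V3S32 = LLT::vector(3, 32);   // 96 bits: isPowerOf2_32(96) is false
    LLT V4S32 = LLT::vector(4, 32);   // 128 bits: the widened destination type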
4630
4631// TODO: Move to selection
4632bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4633 MachineRegisterInfo &MRI,
4634 MachineIRBuilder &B) const {
4635 if (!ST.isTrapHandlerEnabled() ||
4636 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4637 return legalizeTrapEndpgm(MI, MRI, B);
4638
4639 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4640 switch (*HsaAbiVer) {
4641 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4642 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4643 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4644 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4645 return ST.supportsGetDoorbellID() ?
4646 legalizeTrapHsa(MI, MRI, B) :
4647 legalizeTrapHsaQueuePtr(MI, MRI, B);
4648 }
4649 }
4650
4651 llvm_unreachable("Unknown trap handler");
4652}
4653
4654bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4655 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4656 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4657 MI.eraseFromParent();
4658 return true;
4659}
4660
4661bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4662 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4663 // Pass queue pointer to trap handler as input, and insert trap instruction
4664 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4665 Register LiveIn =
4666 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4667 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4668 return false;
4669
4670 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4671 B.buildCopy(SGPR01, LiveIn);
4672 B.buildInstr(AMDGPU::S_TRAP)
4673 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4674 .addReg(SGPR01, RegState::Implicit);
4675
4676 MI.eraseFromParent();
4677 return true;
4678}
4679
4680bool AMDGPULegalizerInfo::legalizeTrapHsa(
4681 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4682 B.buildInstr(AMDGPU::S_TRAP)
4683 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4684 MI.eraseFromParent();
4685 return true;
4686}
4687
4688bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4690 // If this is a non-HSA path or the trap handler is disabled, report a
4691 // warning accordingly.
4692 if (!ST.isTrapHandlerEnabled() ||
4693 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4694 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4695 "debugtrap handler not supported",
4696 MI.getDebugLoc(), DS_Warning);
4697 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4698 Ctx.diagnose(NoTrap);
4699 } else {
4700 // Insert debug-trap instruction
4701 B.buildInstr(AMDGPU::S_TRAP)
4702 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4703 }
4704
4705 MI.eraseFromParent();
4706 return true;
4707}
4708
4709bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4710 MachineIRBuilder &B) const {
4711 MachineRegisterInfo &MRI = *B.getMRI();
4712 const LLT S16 = LLT::scalar(16);
4713 const LLT S32 = LLT::scalar(32);
4714
4715 Register DstReg = MI.getOperand(0).getReg();
4716 Register NodePtr = MI.getOperand(2).getReg();
4717 Register RayExtent = MI.getOperand(3).getReg();
4718 Register RayOrigin = MI.getOperand(4).getReg();
4719 Register RayDir = MI.getOperand(5).getReg();
4720 Register RayInvDir = MI.getOperand(6).getReg();
4721 Register TDescr = MI.getOperand(7).getReg();
4722
4723 if (!ST.hasGFX10_AEncoding()) {
4724 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4725 "intrinsic not supported on subtarget",
4726 MI.getDebugLoc());
4727 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4728 return false;
4729 }
4730
4731 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4732 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4733 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4734 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4735 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4736 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4737
4738 SmallVector<Register, 12> Ops;
4739 if (Is64) {
4740 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4741 Ops.push_back(Unmerge.getReg(0));
4742 Ops.push_back(Unmerge.getReg(1));
4743 } else {
4744 Ops.push_back(NodePtr);
4745 }
4746 Ops.push_back(RayExtent);
4747
4748 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4749 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4750 Ops.push_back(Unmerge.getReg(0));
4751 Ops.push_back(Unmerge.getReg(1));
4752 Ops.push_back(Unmerge.getReg(2));
4753 };
4754
4755 packLanes(RayOrigin);
4756 if (IsA16) {
4757 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4758 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4759 Register R1 = MRI.createGenericVirtualRegister(S32);
4760 Register R2 = MRI.createGenericVirtualRegister(S32);
4761 Register R3 = MRI.createGenericVirtualRegister(S32);
4762 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4763 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4764 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4765 Ops.push_back(R1);
4766 Ops.push_back(R2);
4767 Ops.push_back(R3);
4768 } else {
4769 packLanes(RayDir);
4770 packLanes(RayInvDir);
4771 }
4772
4773 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4774 .addDef(DstReg)
4775 .addImm(Opcode);
4776
4777 for (Register R : Ops) {
4778 MIB.addUse(R);
4779 }
4780
4781 MIB.addUse(TDescr)
4782 .addImm(IsA16 ? 1 : 0)
4783 .cloneMemRefs(MI);
4784
4785 MI.eraseFromParent();
4786 return true;
4787}
4788
4789bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4790 MachineInstr &MI) const {
4791 MachineIRBuilder &B = Helper.MIRBuilder;
4792 MachineRegisterInfo &MRI = *B.getMRI();
4793
4794 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4795 auto IntrID = MI.getIntrinsicID();
4796 switch (IntrID) {
4797 case Intrinsic::amdgcn_if:
4798 case Intrinsic::amdgcn_else: {
4799 MachineInstr *Br = nullptr;
4800 MachineBasicBlock *UncondBrTarget = nullptr;
4801 bool Negated = false;
4802 if (MachineInstr *BrCond =
4803 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4804 const SIRegisterInfo *TRI
4805 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4806
4807 Register Def = MI.getOperand(1).getReg();
4808 Register Use = MI.getOperand(3).getReg();
4809
4810 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4811
4812 if (Negated)
4813 std::swap(CondBrTarget, UncondBrTarget);
4814
4815 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4816 if (IntrID == Intrinsic::amdgcn_if) {
4817 B.buildInstr(AMDGPU::SI_IF)
4818 .addDef(Def)
4819 .addUse(Use)
4820 .addMBB(UncondBrTarget);
4821 } else {
4822 B.buildInstr(AMDGPU::SI_ELSE)
4823 .addDef(Def)
4824 .addUse(Use)
4825 .addMBB(UncondBrTarget);
4826 }
4827
4828 if (Br) {
4829 Br->getOperand(0).setMBB(CondBrTarget);
4830 } else {
4831 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4832 // since we're swapping branch targets it needs to be reinserted.
4833 // FIXME: IRTranslator should probably not do this
4834 B.buildBr(*CondBrTarget);
4835 }
4836
4837 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4838 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4839 MI.eraseFromParent();
4840 BrCond->eraseFromParent();
4841 return true;
4842 }
4843
4844 return false;
4845 }
4846 case Intrinsic::amdgcn_loop: {
4847 MachineInstr *Br = nullptr;
4848 MachineBasicBlock *UncondBrTarget = nullptr;
4849 bool Negated = false;
4850 if (MachineInstr *BrCond =
4851 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4852 const SIRegisterInfo *TRI
4853 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4854
4855 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4856 Register Reg = MI.getOperand(2).getReg();
4857
4858 if (Negated)
4859 std::swap(CondBrTarget, UncondBrTarget);
4860
4861 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4862 B.buildInstr(AMDGPU::SI_LOOP)
4863 .addUse(Reg)
4864 .addMBB(UncondBrTarget);
4865
4866 if (Br)
4867 Br->getOperand(0).setMBB(CondBrTarget);
4868 else
4869 B.buildBr(*CondBrTarget);
4870
4871 MI.eraseFromParent();
4872 BrCond->eraseFromParent();
4873 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4874 return true;
4875 }
4876
4877 return false;
4878 }
4879 case Intrinsic::amdgcn_kernarg_segment_ptr:
4880 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4881 // This only makes sense to call in a kernel, so just lower to null.
4882 B.buildConstant(MI.getOperand(0).getReg(), 0);
4883 MI.eraseFromParent();
4884 return true;
4885 }
4886
4887 return legalizePreloadedArgIntrin(
4888 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4889 case Intrinsic::amdgcn_implicitarg_ptr:
4890 return legalizeImplicitArgPtr(MI, MRI, B);
4891 case Intrinsic::amdgcn_workitem_id_x:
4892 return legalizePreloadedArgIntrin(MI, MRI, B,
4893 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4894 case Intrinsic::amdgcn_workitem_id_y:
4895 return legalizePreloadedArgIntrin(MI, MRI, B,
4896 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4897 case Intrinsic::amdgcn_workitem_id_z:
4898 return legalizePreloadedArgIntrin(MI, MRI, B,
4899 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4900 case Intrinsic::amdgcn_workgroup_id_x:
4901 return legalizePreloadedArgIntrin(MI, MRI, B,
4902 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4903 case Intrinsic::amdgcn_workgroup_id_y:
4904 return legalizePreloadedArgIntrin(MI, MRI, B,
4905 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4906 case Intrinsic::amdgcn_workgroup_id_z:
4907 return legalizePreloadedArgIntrin(MI, MRI, B,
4908 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4909 case Intrinsic::amdgcn_dispatch_ptr:
4910 return legalizePreloadedArgIntrin(MI, MRI, B,
4911 AMDGPUFunctionArgInfo::DISPATCH_PTR);
4912 case Intrinsic::amdgcn_queue_ptr:
4913 return legalizePreloadedArgIntrin(MI, MRI, B,
4914 AMDGPUFunctionArgInfo::QUEUE_PTR);
4915 case Intrinsic::amdgcn_implicit_buffer_ptr:
4916 return legalizePreloadedArgIntrin(
4917 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4918 case Intrinsic::amdgcn_dispatch_id:
4919 return legalizePreloadedArgIntrin(MI, MRI, B,
4920 AMDGPUFunctionArgInfo::DISPATCH_ID);
4921 case Intrinsic::amdgcn_fdiv_fast:
4922 return legalizeFDIVFastIntrin(MI, MRI, B);
4923 case Intrinsic::amdgcn_is_shared:
4924 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4925 case Intrinsic::amdgcn_is_private:
4926 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4927 case Intrinsic::amdgcn_wavefrontsize: {
4928 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4929 MI.eraseFromParent();
4930 return true;
4931 }
4932 case Intrinsic::amdgcn_s_buffer_load:
4933 return legalizeSBufferLoad(Helper, MI);
4934 case Intrinsic::amdgcn_raw_buffer_store:
4935 case Intrinsic::amdgcn_struct_buffer_store:
4936 return legalizeBufferStore(MI, MRI, B, false, false);
4937 case Intrinsic::amdgcn_raw_buffer_store_format:
4938 case Intrinsic::amdgcn_struct_buffer_store_format:
4939 return legalizeBufferStore(MI, MRI, B, false, true);
4940 case Intrinsic::amdgcn_raw_tbuffer_store:
4941 case Intrinsic::amdgcn_struct_tbuffer_store:
4942 return legalizeBufferStore(MI, MRI, B, true, true);
4943 case Intrinsic::amdgcn_raw_buffer_load:
4944 case Intrinsic::amdgcn_struct_buffer_load:
4945 return legalizeBufferLoad(MI, MRI, B, false, false);
4946 case Intrinsic::amdgcn_raw_buffer_load_format:
4947 case Intrinsic::amdgcn_struct_buffer_load_format:
4948 return legalizeBufferLoad(MI, MRI, B, true, false);
4949 case Intrinsic::amdgcn_raw_tbuffer_load:
4950 case Intrinsic::amdgcn_struct_tbuffer_load:
4951 return legalizeBufferLoad(MI, MRI, B, true, true);
4952 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4953 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4954 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4955 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4956 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4957 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4958 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4959 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4960 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4961 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4962 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4963 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4964 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4965 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4966 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4967 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4968 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4969 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4970 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4971 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4972 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4973 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4974 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4975 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4976 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4977 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4978 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4979 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4980 case Intrinsic::amdgcn_buffer_atomic_fadd:
4981 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4982 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4983 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4984 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4985 return legalizeBufferAtomic(MI, B, IntrID);
4986 case Intrinsic::amdgcn_atomic_inc:
4987 return legalizeAtomicIncDec(MI, B, true);
4988 case Intrinsic::amdgcn_atomic_dec:
4989 return legalizeAtomicIncDec(MI, B, false);
4990 case Intrinsic::trap:
4991 return legalizeTrapIntrinsic(MI, MRI, B);
4992 case Intrinsic::debugtrap:
4993 return legalizeDebugTrapIntrinsic(MI, MRI, B);
4994 case Intrinsic::amdgcn_rsq_clamp:
4995 return legalizeRsqClampIntrinsic(MI, MRI, B);
4996 case Intrinsic::amdgcn_ds_fadd:
4997 case Intrinsic::amdgcn_ds_fmin:
4998 case Intrinsic::amdgcn_ds_fmax:
4999 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5000 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5001 return legalizeBVHIntrinsic(MI, B);
5002 default: {
5003 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5004 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5005 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5006 return true;
5007 }
5008 }
5009
5010 return true;
5011}

/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h

1//== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// Implement a low-level type suitable for MachineInstr level instruction
10/// selection.
11///
12/// For a type attached to a MachineInstr, we only care about 2 details: total
13/// size and the number of vector lanes (if any). Accordingly, there are 4
14/// possible valid type-kinds:
15///
16/// * `sN` for scalars and aggregates
17/// * `<N x sM>` for vectors, which must have at least 2 elements.
18/// * `pN` for pointers (and `<N x pM>` for vectors of pointers, the fourth kind)
19///
20/// Other information required for correct selection is expected to be carried
21/// by the opcode, or non-type flags. For example the distinction between G_ADD
22/// and G_FADD for int/float or fast-math flags.
23///
24//===----------------------------------------------------------------------===//
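A minimal usage sketch of the kinds listed above, using only the factory functions declared in this header (the function name lltKindExamples is illustrative):

    #include "llvm/Support/LowLevelTypeImpl.h"
    using namespace llvm;

    void lltKindExamples() {
      LLT S32 = LLT::scalar(32);               // sN: a 32-bit scalar, s32
      LLT V4S16 = LLT::vector(4, 16);          // <N x sM>: <4 x s16>, at least 2 elements
      LLT P0 = LLT::pointer(0, 64);            // pN: a 64-bit pointer in address space 0
      LLT Same = LLT::scalarOrVector(1, S32);  // NumElements == 1 collapses to plain s32
      (void)S32; (void)V4S16; (void)P0; (void)Same;
    }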
25
26#ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
27#define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
28
29#include "llvm/ADT/DenseMapInfo.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/MachineValueType.h"
32#include <cassert>
33
34namespace llvm {
35
36class DataLayout;
37class Type;
38class raw_ostream;
39
40class LLT {
41public:
42 /// Get a low-level scalar or aggregate "bag of bits".
43 static LLT scalar(unsigned SizeInBits) {
44 assert(SizeInBits > 0 && "invalid scalar size");
45 return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0,
46 SizeInBits, /*AddressSpace=*/0};
47 }
48
49 /// Get a low-level pointer in the given address space.
50 static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
51 assert(SizeInBits > 0 && "invalid pointer size");
52 return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0,
53 SizeInBits, AddressSpace};
54 }
55
56 /// Get a low-level vector of some number of elements and element width.
57 /// \p NumElements must be at least 2.
58 static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) {
59 assert(NumElements > 1 && "invalid number of vector elements");
60 assert(ScalarSizeInBits > 0 && "invalid vector element size");
61 return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements,
62 ScalarSizeInBits, /*AddressSpace=*/0};
63 }
64
65 /// Get a low-level vector of some number of elements and element type.
66 static LLT vector(uint16_t NumElements, LLT ScalarTy) {
67 assert(NumElements > 1 && "invalid number of vector elements");
68 assert(!ScalarTy.isVector() && "invalid vector element type");
69 return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements,
70 ScalarTy.getSizeInBits(),
71 ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0};
72 }
73
74 static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) {
75 return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy);
76 }
77
78 static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) {
79 return scalarOrVector(NumElements, LLT::scalar(ScalarSize));
80 }
81
82 explicit LLT(bool isPointer, bool isVector, uint16_t NumElements,
83 unsigned SizeInBits, unsigned AddressSpace) {
84 init(isPointer, isVector, NumElements, SizeInBits, AddressSpace);
85 }
86 explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {}
87
88 explicit LLT(MVT VT);
89
90 bool isValid() const { return RawData != 0; }
91
92 bool isScalar() const { return isValid() && !IsPointer && !IsVector; }
93
94 bool isPointer() const { return isValid() && IsPointer && !IsVector; }
95
96 bool isVector() const { return isValid() && IsVector; }
15
Returning zero, which participates in a condition later
97
98 /// Returns the number of elements in a vector LLT. Must only be called on
99 /// vector types.
100 uint16_t getNumElements() const {
101 assert(IsVector && "cannot get number of elements on scalar/aggregate");
102 if (!IsPointer)
103 return getFieldValue(VectorElementsFieldInfo);
104 else
105 return getFieldValue(PointerVectorElementsFieldInfo);
106 }
107
108 /// Returns the total size of the type. Must only be called on sized types.
109 unsigned getSizeInBits() const {
110 if (isPointer() || isScalar())
44
Taking false branch
111 return getScalarSizeInBits();
112 return getScalarSizeInBits() * getNumElements();
45
Returning zero
113 }
114
115 /// Returns the total size of the type in bytes, i.e. number of whole bytes
116 /// needed to represent the size in bits. Must only be called on sized types.
117 unsigned getSizeInBytes() const {
118 return (getSizeInBits() + 7) / 8;
119 }
120
121 LLT getScalarType() const {
122 return isVector() ? getElementType() : *this;
123 }
124
125 /// If this type is a vector, return a vector with the same number of elements
126 /// but the new element type. Otherwise, return the new element type.
127 LLT changeElementType(LLT NewEltTy) const {
128 return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy;
129 }
130
131 /// If this type is a vector, return a vector with the same number of elements
132 /// but the new element size. Otherwise, return the new element type. Invalid
133 /// for pointer types. For pointer types, use changeElementType.
134 LLT changeElementSize(unsigned NewEltSize) const {
135 assert(!getScalarType().isPointer() &&
136 "invalid to directly change element size for pointers");
137 return isVector() ? LLT::vector(getNumElements(), NewEltSize)
138 : LLT::scalar(NewEltSize);
139 }
140
141 /// Return a vector or scalar with the same element type and the new number of
142 /// elements.
143 LLT changeNumElements(unsigned NewNumElts) const {
144 return LLT::scalarOrVector(NewNumElts, getScalarType());
145 }
146
147 /// Return a type that is \p Factor times smaller. Reduces the number of
148 /// elements if this is a vector, or the bitwidth for scalar/pointers. Does
149 /// not attempt to handle cases that aren't evenly divisible.
150 LLT divide(int Factor) const {
151 assert(Factor != 1);
152 if (isVector()) {
153 assert(getNumElements() % Factor == 0);
154 return scalarOrVector(getNumElements() / Factor, getElementType());
155 }
156
157 assert(getSizeInBits() % Factor == 0);
158 return scalar(getSizeInBits() / Factor);
159 }
160
161 bool isByteSized() const { return (getSizeInBits() & 7) == 0; }
162
163 unsigned getScalarSizeInBits() const {
164 assert(RawData != 0 && "Invalid Type");
165 if (!IsVector) {
166 if (!IsPointer)
167 return getFieldValue(ScalarSizeFieldInfo);
168 else
169 return getFieldValue(PointerSizeFieldInfo);
170 } else {
171 if (!IsPointer)
172 return getFieldValue(VectorSizeFieldInfo);
173 else
174 return getFieldValue(PointerVectorSizeFieldInfo);
175 }
176 }
177
178 unsigned getAddressSpace() const {
179 assert(RawData != 0 && "Invalid Type");
180 assert(IsPointer && "cannot get address space of non-pointer type");
181 if (!IsVector)
182 return getFieldValue(PointerAddressSpaceFieldInfo);
183 else
184 return getFieldValue(PointerVectorAddressSpaceFieldInfo);
185 }
186
187 /// Returns the vector's element type. Only valid for vector types.
188 LLT getElementType() const {
189 assert(isVector() && "cannot get element type of scalar/aggregate");
190 if (IsPointer)
191 return pointer(getAddressSpace(), getScalarSizeInBits());
192 else
193 return scalar(getScalarSizeInBits());
194 }
195
196 void print(raw_ostream &OS) const;
197
198#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
199 LLVM_DUMP_METHOD void dump() const {
200 print(dbgs());
201 dbgs() << '\n';
202 }
203#endif
204
205 bool operator==(const LLT &RHS) const {
206 return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector &&
3
Assuming 'IsPointer' is not equal to 'RHS.IsPointer'
4
Returning zero, which participates in a condition later
6.1
'IsPointer' is not equal to 'RHS.IsPointer'
7
Returning zero, which participates in a condition later
34
Assuming 'IsPointer' is not equal to 'RHS.IsPointer'
35
Returning zero, which participates in a condition later
207 RHS.RawData == RawData;
208 }
209
210 bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
211
212 friend struct DenseMapInfo<LLT>;
213 friend class GISelInstProfileBuilder;
214
215private:
216 /// LLT is packed into 64 bits as follows:
217 /// isPointer : 1
218 /// isVector : 1
219 /// with 62 bits remaining for Kind-specific data, packed in bitfields
220 /// as described below. As there isn't a simple portable way to pack bits
221 /// into bitfields, here the different fields in the packed structure are
222 /// described in static const *Field variables. Each of these variables
223 /// is a 2-element array, with the first element describing the bitfield size
224 /// and the second element describing the bitfield offset.
225 typedef int BitFieldInfo[2];
226 ///
227 /// This is how the bitfields are packed per Kind:
228 /// * Invalid:
229 /// gets encoded as RawData == 0, as that is an invalid encoding, since for
230 /// valid encodings, SizeInBits/SizeOfElement must be larger than 0.
231 /// * Non-pointer scalar (isPointer == 0 && isVector == 0):
232 /// SizeInBits: 32;
233 static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0};
234 /// * Pointer (isPointer == 1 && isVector == 0):
235 /// SizeInBits: 16;
236 /// AddressSpace: 24;
237 static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0};
238 static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{
239 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]};
240 /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1):
241 /// NumElements: 16;
242 /// SizeOfElement: 32;
243 static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0};
244 static const constexpr BitFieldInfo VectorSizeFieldInfo{
245 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]};
246 /// * Vector-of-pointer (isPointer == 1 && isVector == 1):
247 /// NumElements: 16;
248 /// SizeOfElement: 16;
249 /// AddressSpace: 24;
250 static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0};
251 static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{
252 16,
253 PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]};
254 static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{
255 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]};
256
257 uint64_t IsPointer : 1;
258 uint64_t IsVector : 1;
259 uint64_t RawData : 62;
260
261 static uint64_t getMask(const BitFieldInfo FieldInfo) {
262 const int FieldSizeInBits = FieldInfo[0];
263 return (((uint64_t)1) << FieldSizeInBits) - 1;
264 }
265 static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) {
266 assert(Val <= Mask && "Value too large for field");
267 return (Val & Mask) << Shift;
268 }
269 static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) {
270 return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]);
271 }
272 uint64_t getFieldValue(const BitFieldInfo FieldInfo) const {
273 return getMask(FieldInfo) & (RawData >> FieldInfo[1]);
274 }
275
276 void init(bool IsPointer, bool IsVector, uint16_t NumElements,
277 unsigned SizeInBits, unsigned AddressSpace) {
278 this->IsPointer = IsPointer;
279 this->IsVector = IsVector;
280 if (!IsVector) {
281 if (!IsPointer)
282 RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo);
283 else
284 RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) |
285 maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo);
286 } else {
287 assert(NumElements > 1 && "invalid number of vector elements");
288 if (!IsPointer)
289 RawData = maskAndShift(NumElements, VectorElementsFieldInfo) |
290 maskAndShift(SizeInBits, VectorSizeFieldInfo);
291 else
292 RawData =
293 maskAndShift(NumElements, PointerVectorElementsFieldInfo) |
294 maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) |
295 maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo);
296 }
297 }
298
299 uint64_t getUniqueRAWLLTData() const {
300 return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 |
301 ((uint64_t)IsVector);
302 }
303};
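A worked example of the packing scheme described in the comments above, derived from the BitFieldInfo tables and checked here only through the public accessors (lltPackingExample is an illustrative name):

    #include "llvm/Support/LowLevelTypeImpl.h"
    #include <cassert>
    using namespace llvm;

    void lltPackingExample() {
      // s32:       IsPointer = 0, IsVector = 0, RawData = 32 (ScalarSizeFieldInfo{32, 0}).
      // <4 x s16>: IsPointer = 0, IsVector = 1,
      //            RawData = (16 << 16) | 4 (size in VectorSizeFieldInfo, count in VectorElementsFieldInfo).
      LLT S32 = LLT::scalar(32);
      LLT V4S16 = LLT::vector(4, 16);
      assert(S32.getSizeInBits() == 32);
      assert(V4S16.getNumElements() == 4 && V4S16.getScalarSizeInBits() == 16 &&
             V4S16.getSizeInBits() == 64);
    }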
304
305inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
306 Ty.print(OS);
307 return OS;
308}
309
310template<> struct DenseMapInfo<LLT> {
311 static inline LLT getEmptyKey() {
312 LLT Invalid;
313 Invalid.IsPointer = true;
314 return Invalid;
315 }
316 static inline LLT getTombstoneKey() {
317 LLT Invalid;
318 Invalid.IsVector = true;
319 return Invalid;
320 }
321 static inline unsigned getHashValue(const LLT &Ty) {
322 uint64_t Val = Ty.getUniqueRAWLLTData();
323 return DenseMapInfo<uint64_t>::getHashValue(Val);
324 }
325 static bool isEqual(const LLT &LHS, const LLT &RHS) {
326 return LHS == RHS;
327 }
328};
329
330}
331
332#endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H