Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 4351, column 49
Division by zero
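
The core.DivideZero checker reports at the first division it can reach on a path where the divisor is, or may be, zero. Line 4351 itself is not part of this excerpt, so the snippet below is only a minimal hypothetical sketch of that pattern in legalizer-style code; the name piecesForSplit, its parameters, and the guard are illustrative assumptions, not the actual code at the reported location.

// Hypothetical sketch of the flagged pattern: a size obtained from a type
// query is used as a divisor, and the analyzer can construct a path on which
// that divisor is zero. Guarding the division removes the report.
static unsigned piecesForSplit(unsigned TotalSizeInBits, unsigned EltSizeInBits) {
  if (EltSizeInBits == 0) // the path the analyzer explores without this guard
    return 0;
  // Round up: number of element-sized pieces needed to cover the total size.
  return (TotalSizeInBits + EltSizeInBits - 1) / EltSizeInBits;
}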

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-05-07-005843-9350-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use"
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeNumElements(Pow2NElts);
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
61
62/// \returns true if this is an odd sized vector which should widen by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99 };
100}
101
102static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 unsigned Size = Ty.getSizeInBits();
107 unsigned Pieces = (Size + 63) / 64;
108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110 };
111}
112
113// Increase the number of vector elements to reach the next multiple of 32-bit
114// type.
115static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116 return [=](const LegalityQuery &Query) {
117 const LLT Ty = Query.Types[TypeIdx];
118
119 const LLT EltTy = Ty.getElementType();
120 const int Size = Ty.getSizeInBits();
121 const int EltSize = EltTy.getSizeInBits();
122 const int NextMul32 = (Size + 31) / 32;
123
124 assert(EltSize < 32);
125
126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128 };
129}
130
131static LLT getBitcastRegisterType(const LLT Ty) {
132 const unsigned Size = Ty.getSizeInBits();
133
134 LLT CoercedTy;
135 if (Size <= 32) {
136 // <2 x s8> -> s16
137 // <4 x s8> -> s32
138 return LLT::scalar(Size);
139 }
140
141 return LLT::scalarOrVector(Size / 32, 32);
142}
143
144static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 const LLT Ty = Query.Types[TypeIdx];
147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148 };
149}
150
151static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152 return [=](const LegalityQuery &Query) {
153 const LLT Ty = Query.Types[TypeIdx];
154 unsigned Size = Ty.getSizeInBits();
155 assert(Size % 32 == 0);
156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157 };
158}
159
160static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161 return [=](const LegalityQuery &Query) {
162 const LLT QueryTy = Query.Types[TypeIdx];
163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164 };
165}
166
167static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168 return [=](const LegalityQuery &Query) {
169 const LLT QueryTy = Query.Types[TypeIdx];
170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171 };
172}
173
174static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175 return [=](const LegalityQuery &Query) {
176 const LLT QueryTy = Query.Types[TypeIdx];
177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178 };
179}
180
181static bool isRegisterSize(unsigned Size) {
182 return Size % 32 == 0 && Size <= MaxRegisterSize;
183}
184
185static bool isRegisterVectorElementType(LLT EltTy) {
186 const int EltSize = EltTy.getSizeInBits();
187 return EltSize == 16 || EltSize % 32 == 0;
188}
189
190static bool isRegisterVectorType(LLT Ty) {
191 const int EltSize = Ty.getElementType().getSizeInBits();
192 return EltSize == 32 || EltSize == 64 ||
193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194 EltSize == 128 || EltSize == 256;
195}
196
197static bool isRegisterType(LLT Ty) {
198 if (!isRegisterSize(Ty.getSizeInBits()))
199 return false;
200
201 if (Ty.isVector())
202 return isRegisterVectorType(Ty);
203
204 return true;
205}
206
207 // Any combination of 32 or 64-bit elements up to the maximum register size, and
208// multiples of v2s16.
209static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210 return [=](const LegalityQuery &Query) {
211 return isRegisterType(Query.Types[TypeIdx]);
212 };
213}
214
215static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216 return [=](const LegalityQuery &Query) {
217 const LLT QueryTy = Query.Types[TypeIdx];
218 if (!QueryTy.isVector())
219 return false;
220 const LLT EltTy = QueryTy.getElementType();
221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222 };
223}
224
225static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226 return [=](const LegalityQuery &Query) {
227 const LLT Ty = Query.Types[TypeIdx];
228 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230 };
231}
232
233// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234// handle some operations by just promoting the register during
235// selection. There are also d16 loads on GFX9+ which preserve the high bits.
236static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237 bool IsLoad) {
238 switch (AS) {
239 case AMDGPUAS::PRIVATE_ADDRESS:
240 // FIXME: Private element size.
241 return ST.enableFlatScratch() ? 128 : 32;
242 case AMDGPUAS::LOCAL_ADDRESS:
243 return ST.useDS128() ? 128 : 64;
244 case AMDGPUAS::GLOBAL_ADDRESS:
245 case AMDGPUAS::CONSTANT_ADDRESS:
246 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247 // Treat constant and global as identical. SMRD loads are sometimes usable for
248 // global loads (ideally constant address space should be eliminated)
249 // depending on the context. Legality cannot be context dependent, but
250 // RegBankSelect can split the load as necessary depending on the pointer
251 // register bank/uniformity and if the memory is invariant or not written in a
252 // kernel.
253 return IsLoad ? 512 : 128;
254 default:
255 // Flat addresses may contextually need to be split to 32-bit parts if they
256 // may alias scratch depending on the subtarget.
257 return 128;
258 }
259}
260
261static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262 const LegalityQuery &Query,
263 unsigned Opcode) {
264 const LLT Ty = Query.Types[0];
265
266 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267 const bool IsLoad = Opcode != AMDGPU::G_STORE;
268
269 unsigned RegSize = Ty.getSizeInBits();
270 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272 unsigned AS = Query.Types[1].getAddressSpace();
273
274 // All of these need to be custom lowered to cast the pointer operand.
275 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276 return false;
277
278 // TODO: We should be able to widen loads if the alignment is high enough, but
279 // we also need to modify the memory access size.
280#if 0
281 // Accept widening loads based on alignment.
282 if (IsLoad && MemSize < Size)
283 MemSize = std::max(MemSize, Align);
284#endif
285
286 // Only 1-byte and 2-byte to 32-bit extloads are valid.
287 if (MemSize != RegSize && RegSize != 32)
288 return false;
289
290 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291 return false;
292
293 switch (MemSize) {
294 case 8:
295 case 16:
296 case 32:
297 case 64:
298 case 128:
299 break;
300 case 96:
301 if (!ST.hasDwordx3LoadStores())
302 return false;
303 break;
304 case 256:
305 case 512:
306 // These may contextually need to be broken down.
307 break;
308 default:
309 return false;
310 }
311
312 assert(RegSize >= MemSize);
313
314 if (AlignBits < MemSize) {
315 const SITargetLowering *TLI = ST.getTargetLowering();
316 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317 Align(AlignBits / 8)))
318 return false;
319 }
320
321 return true;
322}
323
324// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
325// workaround this. Eventually it should ignore the type for loads and only care
326// about the size. Return true in cases where we will workaround this for now by
327// bitcasting.
328static bool loadStoreBitcastWorkaround(const LLT Ty) {
329 if (EnableNewLegality)
330 return false;
331
332 const unsigned Size = Ty.getSizeInBits();
333 if (Size <= 64)
334 return false;
335 if (!Ty.isVector())
336 return true;
337
338 LLT EltTy = Ty.getElementType();
339 if (EltTy.isPointer())
340 return true;
341
342 unsigned EltSize = EltTy.getSizeInBits();
343 return EltSize != 32 && EltSize != 64;
344}
345
346static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347 unsigned Opcode) {
348 const LLT Ty = Query.Types[0];
349 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350 !loadStoreBitcastWorkaround(Ty);
351}
352
353/// Return true if a load or store of the type should be lowered with a bitcast
354/// to a different type.
355static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356 const unsigned MemSizeInBits) {
357 const unsigned Size = Ty.getSizeInBits();
358 if (Size != MemSizeInBits)
359 return Size <= 32 && Ty.isVector();
360
361 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362 return true;
363 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364 !isRegisterVectorElementType(Ty.getElementType());
365}
366
367/// Return true if we should legalize a load by widening an odd sized memory
368/// access up to the alignment. Note this case when the memory access itself
369/// changes, not the size of the result register.
370static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371 unsigned AlignInBits, unsigned AddrSpace,
372 unsigned Opcode) {
373 // We don't want to widen cases that are naturally legal.
374 if (isPowerOf2_32(SizeInBits))
375 return false;
376
377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378 // end up widening these for a scalar load during RegBankSelect, since there
379 // aren't 96-bit scalar loads.
380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381 return false;
382
383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384 return false;
385
386 // A load is known dereferenceable up to the alignment, so it's legal to widen
387 // to it.
388 //
389 // TODO: Could check dereferenceable for less aligned cases.
390 unsigned RoundedSize = NextPowerOf2(SizeInBits);
391 if (AlignInBits < RoundedSize)
392 return false;
393
394 // Do not widen if it would introduce a slow unaligned load.
395 const SITargetLowering *TLI = ST.getTargetLowering();
396 bool Fast = false;
397 return TLI->allowsMisalignedMemoryAccessesImpl(
398 RoundedSize, AddrSpace, Align(AlignInBits / 8),
399 MachineMemOperand::MOLoad, &Fast) &&
400 Fast;
401}
402
403static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404 unsigned Opcode) {
405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406 return false;
407
408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409 Query.MMODescrs[0].AlignInBits,
410 Query.Types[1].getAddressSpace(), Opcode);
411}
412
413AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414 const GCNTargetMachine &TM)
415 : ST(ST_) {
416 using namespace TargetOpcode;
417
418 auto GetAddrSpacePtr = [&TM](unsigned AS) {
419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420 };
421
422 const LLT S1 = LLT::scalar(1);
423 const LLT S8 = LLT::scalar(8);
424 const LLT S16 = LLT::scalar(16);
425 const LLT S32 = LLT::scalar(32);
426 const LLT S64 = LLT::scalar(64);
427 const LLT S128 = LLT::scalar(128);
428 const LLT S256 = LLT::scalar(256);
429 const LLT S512 = LLT::scalar(512);
430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431
432 const LLT V2S8 = LLT::vector(2, 8);
433 const LLT V2S16 = LLT::vector(2, 16);
434 const LLT V4S16 = LLT::vector(4, 16);
435
436 const LLT V2S32 = LLT::vector(2, 32);
437 const LLT V3S32 = LLT::vector(3, 32);
438 const LLT V4S32 = LLT::vector(4, 32);
439 const LLT V5S32 = LLT::vector(5, 32);
440 const LLT V6S32 = LLT::vector(6, 32);
441 const LLT V7S32 = LLT::vector(7, 32);
442 const LLT V8S32 = LLT::vector(8, 32);
443 const LLT V9S32 = LLT::vector(9, 32);
444 const LLT V10S32 = LLT::vector(10, 32);
445 const LLT V11S32 = LLT::vector(11, 32);
446 const LLT V12S32 = LLT::vector(12, 32);
447 const LLT V13S32 = LLT::vector(13, 32);
448 const LLT V14S32 = LLT::vector(14, 32);
449 const LLT V15S32 = LLT::vector(15, 32);
450 const LLT V16S32 = LLT::vector(16, 32);
451 const LLT V32S32 = LLT::vector(32, 32);
452
453 const LLT V2S64 = LLT::vector(2, 64);
454 const LLT V3S64 = LLT::vector(3, 64);
455 const LLT V4S64 = LLT::vector(4, 64);
456 const LLT V5S64 = LLT::vector(5, 64);
457 const LLT V6S64 = LLT::vector(6, 64);
458 const LLT V7S64 = LLT::vector(7, 64);
459 const LLT V8S64 = LLT::vector(8, 64);
460 const LLT V16S64 = LLT::vector(16, 64);
461
462 std::initializer_list<LLT> AllS32Vectors =
463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465 std::initializer_list<LLT> AllS64Vectors =
466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467
468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475
476 const LLT CodePtr = FlatPtr;
477
478 const std::initializer_list<LLT> AddrSpaces64 = {
479 GlobalPtr, ConstantPtr, FlatPtr
480 };
481
482 const std::initializer_list<LLT> AddrSpaces32 = {
483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484 };
485
486 const std::initializer_list<LLT> FPTypesBase = {
487 S32, S64
488 };
489
490 const std::initializer_list<LLT> FPTypes16 = {
491 S32, S64, S16
492 };
493
494 const std::initializer_list<LLT> FPTypesPK16 = {
495 S32, S64, S16, V2S16
496 };
497
498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499
500 setAction({G_BRCOND, S1}, Legal); // VCC branches
501 setAction({G_BRCOND, S32}, Legal); // SCC branches
502
503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504 // elements for v3s16
505 getActionDefinitionsBuilder(G_PHI)
506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507 .legalFor(AllS32Vectors)
508 .legalFor(AllS64Vectors)
509 .legalFor(AddrSpaces64)
510 .legalFor(AddrSpaces32)
511 .legalIf(isPointer(0))
512 .clampScalar(0, S16, S256)
513 .widenScalarToNextPow2(0, 32)
514 .clampMaxNumElements(0, S32, 16)
515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516 .scalarize(0);
517
518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519 // Full set of gfx9 features.
520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521 .legalFor({S32, S16, V2S16})
522 .clampScalar(0, S16, S32)
523 .clampMaxNumElements(0, S16, 2)
524 .scalarize(0)
525 .widenScalarToNextPow2(0, 32);
526
527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528 .legalFor({S32, S16, V2S16}) // Clamp modifier
529 .minScalarOrElt(0, S16)
530 .clampMaxNumElements(0, S16, 2)
531 .scalarize(0)
532 .widenScalarToNextPow2(0, 32)
533 .lower();
534 } else if (ST.has16BitInsts()) {
535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536 .legalFor({S32, S16})
537 .clampScalar(0, S16, S32)
538 .scalarize(0)
539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540
541 // Technically the saturating operations require clamp bit support, but this
542 // was introduced at the same time as 16-bit operations.
543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544 .legalFor({S32, S16}) // Clamp modifier
545 .minScalar(0, S16)
546 .scalarize(0)
547 .widenScalarToNextPow2(0, 16)
548 .lower();
549
550 // We're just lowering this, but it helps get a better result to try to
551 // coerce to the desired type first.
552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553 .minScalar(0, S16)
554 .scalarize(0)
555 .lower();
556 } else {
557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558 .legalFor({S32})
559 .clampScalar(0, S32, S32)
560 .scalarize(0);
561
562 if (ST.hasIntClamp()) {
563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564 .legalFor({S32}) // Clamp modifier.
565 .scalarize(0)
566 .minScalarOrElt(0, S32)
567 .lower();
568 } else {
569 // Clamp bit support was added in VI, along with 16-bit operations.
570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571 .minScalar(0, S32)
572 .scalarize(0)
573 .lower();
574 }
575
576 // FIXME: DAG expansion gets better results. The widening uses the smaller
577 // range values and goes for the min/max lowering directly.
578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579 .minScalar(0, S32)
580 .scalarize(0)
581 .lower();
582 }
583
584 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585 .customFor({S32, S64})
586 .clampScalar(0, S32, S64)
587 .widenScalarToNextPow2(0, 32)
588 .scalarize(0);
589
590 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591 .legalFor({S32})
592 .maxScalarOrElt(0, S32);
593
594 if (ST.hasVOP3PInsts()) {
595 Mulh
596 .clampMaxNumElements(0, S8, 2)
597 .lowerFor({V2S8});
598 }
599
600 Mulh
601 .scalarize(0)
602 .lower();
603
604 // Report legal for any types we can handle anywhere. For the cases only legal
605 // on the SALU, RegBankSelect will be able to re-legalize.
606 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608 .clampScalar(0, S32, S64)
609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611 .widenScalarToNextPow2(0)
612 .scalarize(0);
613
614 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616 .legalFor({{S32, S1}, {S32, S32}})
617 .minScalar(0, S32)
618 // TODO: .scalarize(0)
619 .lower();
620
621 getActionDefinitionsBuilder(G_BITCAST)
622 // Don't worry about the size constraint.
623 .legalIf(all(isRegisterType(0), isRegisterType(1)))
624 .lower();
625
626
627 getActionDefinitionsBuilder(G_CONSTANT)
628 .legalFor({S1, S32, S64, S16, GlobalPtr,
629 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630 .legalIf(isPointer(0))
631 .clampScalar(0, S32, S64)
632 .widenScalarToNextPow2(0);
633
634 getActionDefinitionsBuilder(G_FCONSTANT)
635 .legalFor({S32, S64, S16})
636 .clampScalar(0, S16, S64);
637
638 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639 .legalIf(isRegisterType(0))
640 // s1 and s16 are special cases because they have legal operations on
641 // them, but don't really occupy registers in the normal way.
642 .legalFor({S1, S16})
643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644 .clampScalarOrElt(0, S32, MaxScalar)
645 .widenScalarToNextPow2(0, 32)
646 .clampMaxNumElements(0, S32, 16);
647
648 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649
650 // If the amount is divergent, we have to do a wave reduction to get the
651 // maximum value, so this is expanded during RegBankSelect.
652 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653 .legalFor({{PrivatePtr, S32}});
654
655 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656 .customIf(typeIsNot(0, PrivatePtr));
657
658 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659
660 auto &FPOpActions = getActionDefinitionsBuilder(
661 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662 .legalFor({S32, S64});
663 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664 .customFor({S32, S64});
665 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666 .customFor({S32, S64});
667
668 if (ST.has16BitInsts()) {
669 if (ST.hasVOP3PInsts())
670 FPOpActions.legalFor({S16, V2S16});
671 else
672 FPOpActions.legalFor({S16});
673
674 TrigActions.customFor({S16});
675 FDIVActions.customFor({S16});
676 }
677
678 auto &MinNumMaxNum = getActionDefinitionsBuilder({
679 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680
681 if (ST.hasVOP3PInsts()) {
682 MinNumMaxNum.customFor(FPTypesPK16)
683 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684 .clampMaxNumElements(0, S16, 2)
685 .clampScalar(0, S16, S64)
686 .scalarize(0);
687 } else if (ST.has16BitInsts()) {
688 MinNumMaxNum.customFor(FPTypes16)
689 .clampScalar(0, S16, S64)
690 .scalarize(0);
691 } else {
692 MinNumMaxNum.customFor(FPTypesBase)
693 .clampScalar(0, S32, S64)
694 .scalarize(0);
695 }
696
697 if (ST.hasVOP3PInsts())
698 FPOpActions.clampMaxNumElements(0, S16, 2);
699
700 FPOpActions
701 .scalarize(0)
702 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703
704 TrigActions
705 .scalarize(0)
706 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707
708 FDIVActions
709 .scalarize(0)
710 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711
712 getActionDefinitionsBuilder({G_FNEG, G_FABS})
713 .legalFor(FPTypesPK16)
714 .clampMaxNumElements(0, S16, 2)
715 .scalarize(0)
716 .clampScalar(0, S16, S64);
717
718 if (ST.has16BitInsts()) {
719 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720 .legalFor({S32, S64, S16})
721 .scalarize(0)
722 .clampScalar(0, S16, S64);
723 } else {
724 getActionDefinitionsBuilder(G_FSQRT)
725 .legalFor({S32, S64})
726 .scalarize(0)
727 .clampScalar(0, S32, S64);
728
729 if (ST.hasFractBug()) {
730 getActionDefinitionsBuilder(G_FFLOOR)
731 .customFor({S64})
732 .legalFor({S32, S64})
733 .scalarize(0)
734 .clampScalar(0, S32, S64);
735 } else {
736 getActionDefinitionsBuilder(G_FFLOOR)
737 .legalFor({S32, S64})
738 .scalarize(0)
739 .clampScalar(0, S32, S64);
740 }
741 }
742
743 getActionDefinitionsBuilder(G_FPTRUNC)
744 .legalFor({{S32, S64}, {S16, S32}})
745 .scalarize(0)
746 .lower();
747
748 getActionDefinitionsBuilder(G_FPEXT)
749 .legalFor({{S64, S32}, {S32, S16}})
750 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751 .scalarize(0);
752
753 getActionDefinitionsBuilder(G_FSUB)
754 // Use actual fsub instruction
755 .legalFor({S32})
756 // Must use fadd + fneg
757 .lowerFor({S64, S16, V2S16})
758 .scalarize(0)
759 .clampScalar(0, S32, S64);
760
761 // Whether this is legal depends on the floating point mode for the function.
762 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764 FMad.customFor({S32, S16});
765 else if (ST.hasMadMacF32Insts())
766 FMad.customFor({S32});
767 else if (ST.hasMadF16())
768 FMad.customFor({S16});
769 FMad.scalarize(0)
770 .lower();
771
772 auto &FRem = getActionDefinitionsBuilder(G_FREM);
773 if (ST.has16BitInsts()) {
774 FRem.customFor({S16, S32, S64});
775 } else {
776 FRem.minScalar(0, S32)
777 .customFor({S32, S64});
778 }
779 FRem.scalarize(0);
780
781 // TODO: Do we need to clamp maximum bitwidth?
782 getActionDefinitionsBuilder(G_TRUNC)
783 .legalIf(isScalar(0))
784 .legalFor({{V2S16, V2S32}})
785 .clampMaxNumElements(0, S16, 2)
786 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787 // situations (like an invalid implicit use), we don't want to infinite loop
788 // in the legalizer.
789 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790 .alwaysLegal();
791
792 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794 {S32, S1}, {S64, S1}, {S16, S1}})
795 .scalarize(0)
796 .clampScalar(0, S32, S64)
797 .widenScalarToNextPow2(1, 32);
798
799 // TODO: Split s1->s64 during regbankselect for VALU.
800 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802 .lowerFor({{S32, S64}})
803 .lowerIf(typeIs(1, S1))
804 .customFor({{S64, S64}});
805 if (ST.has16BitInsts())
806 IToFP.legalFor({{S16, S16}});
807 IToFP.clampScalar(1, S32, S64)
808 .minScalar(0, S32)
809 .scalarize(0)
810 .widenScalarToNextPow2(1);
811
812 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814 .customFor({{S64, S64}})
815 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816 if (ST.has16BitInsts())
817 FPToI.legalFor({{S16, S16}});
818 else
819 FPToI.minScalar(1, S32);
820
821 FPToI.minScalar(0, S32)
822 .widenScalarToNextPow2(0, 32)
823 .scalarize(0)
824 .lower();
825
826 // Lower roundeven into G_FRINT
827 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
828 .scalarize(0)
829 .lower();
830
831 if (ST.has16BitInsts()) {
832 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
833 .legalFor({S16, S32, S64})
834 .clampScalar(0, S16, S64)
835 .scalarize(0);
836 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
837 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838 .legalFor({S32, S64})
839 .clampScalar(0, S32, S64)
840 .scalarize(0);
841 } else {
842 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843 .legalFor({S32})
844 .customFor({S64})
845 .clampScalar(0, S32, S64)
846 .scalarize(0);
847 }
848
849 getActionDefinitionsBuilder(G_PTR_ADD)
850 .legalIf(all(isPointer(0), sameSize(0, 1)))
851 .scalarize(0)
852 .scalarSameSizeAs(1, 0);
853
854 getActionDefinitionsBuilder(G_PTRMASK)
855 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
856 .scalarSameSizeAs(1, 0)
857 .scalarize(0);
858
859 auto &CmpBuilder =
860 getActionDefinitionsBuilder(G_ICMP)
861 // The compare output type differs based on the register bank of the output,
862 // so make both s1 and s32 legal.
863 //
864 // Scalar compares producing output in scc will be promoted to s32, as that
865 // is the allocatable register type that will be needed for the copy from
866 // scc. This will be promoted during RegBankSelect, and we assume something
867 // before that won't try to use s32 result types.
868 //
869 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
870 // bank.
871 .legalForCartesianProduct(
872 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
873 .legalForCartesianProduct(
874 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
875 if (ST.has16BitInsts()) {
876 CmpBuilder.legalFor({{S1, S16}});
877 }
878
879 CmpBuilder
880 .widenScalarToNextPow2(1)
881 .clampScalar(1, S32, S64)
882 .scalarize(0)
883 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
884
885 getActionDefinitionsBuilder(G_FCMP)
886 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
887 .widenScalarToNextPow2(1)
888 .clampScalar(1, S32, S64)
889 .scalarize(0);
890
891 // FIXME: fpow has a selection pattern that should move to custom lowering.
892 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
893 if (ST.has16BitInsts())
894 Exp2Ops.legalFor({S32, S16});
895 else
896 Exp2Ops.legalFor({S32});
897 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
898 Exp2Ops.scalarize(0);
899
900 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
901 if (ST.has16BitInsts())
902 ExpOps.customFor({{S32}, {S16}});
903 else
904 ExpOps.customFor({S32});
905 ExpOps.clampScalar(0, MinScalarFPTy, S32)
906 .scalarize(0);
907
908 getActionDefinitionsBuilder(G_FPOWI)
909 .clampScalar(0, MinScalarFPTy, S32)
910 .lower();
911
912 // The 64-bit versions produce 32-bit results, but only on the SALU.
913 getActionDefinitionsBuilder(G_CTPOP)
914 .legalFor({{S32, S32}, {S32, S64}})
915 .clampScalar(0, S32, S32)
916 .clampScalar(1, S32, S64)
917 .scalarize(0)
918 .widenScalarToNextPow2(0, 32)
919 .widenScalarToNextPow2(1, 32);
920
921 // The hardware instructions return a different result on 0 than the generic
922 // instructions expect. The hardware produces -1, but these produce the
923 // bitwidth.
924 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
925 .scalarize(0)
926 .clampScalar(0, S32, S32)
927 .clampScalar(1, S32, S64)
928 .widenScalarToNextPow2(0, 32)
929 .widenScalarToNextPow2(1, 32)
930 .lower();
931
932 // The 64-bit versions produce 32-bit results, but only on the SALU.
933 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
934 .legalFor({{S32, S32}, {S32, S64}})
935 .clampScalar(0, S32, S32)
936 .clampScalar(1, S32, S64)
937 .scalarize(0)
938 .widenScalarToNextPow2(0, 32)
939 .widenScalarToNextPow2(1, 32);
940
941 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
942 // RegBankSelect.
943 getActionDefinitionsBuilder(G_BITREVERSE)
944 .legalFor({S32, S64})
945 .clampScalar(0, S32, S64)
946 .scalarize(0)
947 .widenScalarToNextPow2(0);
948
949 if (ST.has16BitInsts()) {
950 getActionDefinitionsBuilder(G_BSWAP)
951 .legalFor({S16, S32, V2S16})
952 .clampMaxNumElements(0, S16, 2)
953 // FIXME: Fixing non-power-of-2 before clamp is workaround for
954 // narrowScalar limitation.
955 .widenScalarToNextPow2(0)
956 .clampScalar(0, S16, S32)
957 .scalarize(0);
958
959 if (ST.hasVOP3PInsts()) {
960 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
961 .legalFor({S32, S16, V2S16})
962 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
963 .clampMaxNumElements(0, S16, 2)
964 .minScalar(0, S16)
965 .widenScalarToNextPow2(0)
966 .scalarize(0)
967 .lower();
968 } else {
969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970 .legalFor({S32, S16})
971 .widenScalarToNextPow2(0)
972 .minScalar(0, S16)
973 .scalarize(0)
974 .lower();
975 }
976 } else {
977 // TODO: Should have same legality without v_perm_b32
978 getActionDefinitionsBuilder(G_BSWAP)
979 .legalFor({S32})
980 .lowerIf(scalarNarrowerThan(0, 32))
981 // FIXME: Fixing non-power-of-2 before clamp is workaround for
982 // narrowScalar limitation.
983 .widenScalarToNextPow2(0)
984 .maxScalar(0, S32)
985 .scalarize(0)
986 .lower();
987
988 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
989 .legalFor({S32})
990 .minScalar(0, S32)
991 .widenScalarToNextPow2(0)
992 .scalarize(0)
993 .lower();
994 }
995
996 getActionDefinitionsBuilder(G_INTTOPTR)
997 // List the common cases
998 .legalForCartesianProduct(AddrSpaces64, {S64})
999 .legalForCartesianProduct(AddrSpaces32, {S32})
1000 .scalarize(0)
1001 // Accept any address space as long as the size matches
1002 .legalIf(sameSize(0, 1))
1003 .widenScalarIf(smallerThan(1, 0),
1004 [](const LegalityQuery &Query) {
1005 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1006 })
1007 .narrowScalarIf(largerThan(1, 0),
1008 [](const LegalityQuery &Query) {
1009 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1010 });
1011
1012 getActionDefinitionsBuilder(G_PTRTOINT)
1013 // List the common cases
1014 .legalForCartesianProduct(AddrSpaces64, {S64})
1015 .legalForCartesianProduct(AddrSpaces32, {S32})
1016 .scalarize(0)
1017 // Accept any address space as long as the size matches
1018 .legalIf(sameSize(0, 1))
1019 .widenScalarIf(smallerThan(0, 1),
1020 [](const LegalityQuery &Query) {
1021 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1022 })
1023 .narrowScalarIf(
1024 largerThan(0, 1),
1025 [](const LegalityQuery &Query) {
1026 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027 });
1028
1029 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1030 .scalarize(0)
1031 .custom();
1032
1033 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1034 bool IsLoad) -> bool {
1035 const LLT DstTy = Query.Types[0];
1036
1037 // Split vector extloads.
1038 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1039 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1040
1041 if (MemSize < DstTy.getSizeInBits())
1042 MemSize = std::max(MemSize, AlignBits);
1043
1044 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1045 return true;
1046
1047 const LLT PtrTy = Query.Types[1];
1048 unsigned AS = PtrTy.getAddressSpace();
1049 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1050 return true;
1051
1052 // Catch weird sized loads that don't evenly divide into the access sizes
1053 // TODO: May be able to widen depending on alignment etc.
1054 unsigned NumRegs = (MemSize + 31) / 32;
1055 if (NumRegs == 3) {
1056 if (!ST.hasDwordx3LoadStores())
1057 return true;
1058 } else {
1059 // If the alignment allows, these should have been widened.
1060 if (!isPowerOf2_32(NumRegs))
1061 return true;
1062 }
1063
1064 if (AlignBits < MemSize) {
1065 const SITargetLowering *TLI = ST.getTargetLowering();
1066 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1067 Align(AlignBits / 8));
1068 }
1069
1070 return false;
1071 };
1072
1073 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1074 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1075 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1076
1077 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1078 // LDS
1079 // TODO: Unsupported flat for SI.
1080
1081 for (unsigned Op : {G_LOAD, G_STORE}) {
1082 const bool IsStore = Op == G_STORE;
1083
1084 auto &Actions = getActionDefinitionsBuilder(Op);
1085 // Explicitly list some common cases.
1086 // TODO: Does this help compile time at all?
1087 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1088 {V2S32, GlobalPtr, 64, GlobalAlign32},
1089 {V4S32, GlobalPtr, 128, GlobalAlign32},
1090 {S64, GlobalPtr, 64, GlobalAlign32},
1091 {V2S64, GlobalPtr, 128, GlobalAlign32},
1092 {V2S16, GlobalPtr, 32, GlobalAlign32},
1093 {S32, GlobalPtr, 8, GlobalAlign8},
1094 {S32, GlobalPtr, 16, GlobalAlign16},
1095
1096 {S32, LocalPtr, 32, 32},
1097 {S64, LocalPtr, 64, 32},
1098 {V2S32, LocalPtr, 64, 32},
1099 {S32, LocalPtr, 8, 8},
1100 {S32, LocalPtr, 16, 16},
1101 {V2S16, LocalPtr, 32, 32},
1102
1103 {S32, PrivatePtr, 32, 32},
1104 {S32, PrivatePtr, 8, 8},
1105 {S32, PrivatePtr, 16, 16},
1106 {V2S16, PrivatePtr, 32, 32},
1107
1108 {S32, ConstantPtr, 32, GlobalAlign32},
1109 {V2S32, ConstantPtr, 64, GlobalAlign32},
1110 {V4S32, ConstantPtr, 128, GlobalAlign32},
1111 {S64, ConstantPtr, 64, GlobalAlign32},
1112 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1113 Actions.legalIf(
1114 [=](const LegalityQuery &Query) -> bool {
1115 return isLoadStoreLegal(ST, Query, Op);
1116 });
1117
1118 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1119 // 64-bits.
1120 //
1121 // TODO: Should generalize bitcast action into coerce, which will also cover
1122 // inserting addrspacecasts.
1123 Actions.customIf(typeIs(1, Constant32Ptr));
1124
1125 // Turn any illegal element vectors into something easier to deal
1126 // with. These will ultimately produce 32-bit scalar shifts to extract the
1127 // parts anyway.
1128 //
1129 // For odd 16-bit element vectors, prefer to split those into pieces with
1130 // 16-bit vector parts.
1131 Actions.bitcastIf(
1132 [=](const LegalityQuery &Query) -> bool {
1133 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1134 Query.MMODescrs[0].SizeInBits);
1135 }, bitcastToRegisterType(0));
1136
1137 if (!IsStore) {
1138 // Widen suitably aligned loads by loading extra bytes. The standard
1139 // legalization actions can't properly express widening memory operands.
1140 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1141 return shouldWidenLoad(ST, Query, G_LOAD);
1142 });
1143 }
1144
1145 // FIXME: load/store narrowing should be moved to lower action
1146 Actions
1147 .narrowScalarIf(
1148 [=](const LegalityQuery &Query) -> bool {
1149 return !Query.Types[0].isVector() &&
1150 needToSplitMemOp(Query, Op == G_LOAD);
1151 },
1152 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1153 const LLT DstTy = Query.Types[0];
1154 const LLT PtrTy = Query.Types[1];
1155
1156 const unsigned DstSize = DstTy.getSizeInBits();
1157 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1158
1159 // Split extloads.
1160 if (DstSize > MemSize)
1161 return std::make_pair(0, LLT::scalar(MemSize));
1162
1163 if (!isPowerOf2_32(DstSize)) {
1164 // We're probably decomposing an odd sized store. Try to split
1165 // to the widest type. TODO: Account for alignment. As-is it
1166 // should be OK, since the new parts will be further legalized.
1167 unsigned FloorSize = PowerOf2Floor(DstSize);
1168 return std::make_pair(0, LLT::scalar(FloorSize));
1169 }
1170
1171 if (DstSize > 32 && (DstSize % 32 != 0)) {
1172 // FIXME: Need a way to specify non-extload of larger size if
1173 // suitably aligned.
1174 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1175 }
1176
1177 unsigned MaxSize = maxSizeForAddrSpace(ST,
1178 PtrTy.getAddressSpace(),
1179 Op == G_LOAD);
1180 if (MemSize > MaxSize)
1181 return std::make_pair(0, LLT::scalar(MaxSize));
1182
1183 unsigned Align = Query.MMODescrs[0].AlignInBits;
1184 return std::make_pair(0, LLT::scalar(Align));
1185 })
1186 .fewerElementsIf(
1187 [=](const LegalityQuery &Query) -> bool {
1188 return Query.Types[0].isVector() &&
1189 needToSplitMemOp(Query, Op == G_LOAD);
1190 },
1191 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1192 const LLT DstTy = Query.Types[0];
1193 const LLT PtrTy = Query.Types[1];
1194
1195 LLT EltTy = DstTy.getElementType();
1196 unsigned MaxSize = maxSizeForAddrSpace(ST,
1197 PtrTy.getAddressSpace(),
1198 Op == G_LOAD);
1199
1200 // FIXME: Handle widened to power of 2 results better. This ends
1201 // up scalarizing.
1202 // FIXME: 3 element stores scalarized on SI
1203
1204 // Split if it's too large for the address space.
1205 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1206 unsigned NumElts = DstTy.getNumElements();
1207 unsigned EltSize = EltTy.getSizeInBits();
1208
1209 if (MaxSize % EltSize == 0) {
1210 return std::make_pair(
1211 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1212 }
1213
1214 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1215
1216 // FIXME: Refine when odd breakdowns handled
1217 // The scalars will need to be re-legalized.
1218 if (NumPieces == 1 || NumPieces >= NumElts ||
1219 NumElts % NumPieces != 0)
1220 return std::make_pair(0, EltTy);
1221
1222 return std::make_pair(0,
1223 LLT::vector(NumElts / NumPieces, EltTy));
1224 }
1225
1226 // FIXME: We could probably handle weird extending loads better.
1227 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1228 if (DstTy.getSizeInBits() > MemSize)
1229 return std::make_pair(0, EltTy);
1230
1231 unsigned EltSize = EltTy.getSizeInBits();
1232 unsigned DstSize = DstTy.getSizeInBits();
1233 if (!isPowerOf2_32(DstSize)) {
1234 // We're probably decomposing an odd sized store. Try to split
1235 // to the widest type. TODO: Account for alignment. As-is it
1236 // should be OK, since the new parts will be further legalized.
1237 unsigned FloorSize = PowerOf2Floor(DstSize);
1238 return std::make_pair(
1239 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1240 }
1241
1242 // Need to split because of alignment.
1243 unsigned Align = Query.MMODescrs[0].AlignInBits;
1244 if (EltSize > Align &&
1245 (EltSize / Align < DstTy.getNumElements())) {
1246 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1247 }
1248
1249 // May need relegalization for the scalars.
1250 return std::make_pair(0, EltTy);
1251 })
1252 .lowerIfMemSizeNotPow2()
1253 .minScalar(0, S32);
1254
1255 if (IsStore)
1256 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1257
1258 Actions
1259 .widenScalarToNextPow2(0)
1260 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1261 .lower();
1262 }
1263
1264 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1265 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1266 {S32, GlobalPtr, 16, 2 * 8},
1267 {S32, LocalPtr, 8, 8},
1268 {S32, LocalPtr, 16, 16},
1269 {S32, PrivatePtr, 8, 8},
1270 {S32, PrivatePtr, 16, 16},
1271 {S32, ConstantPtr, 8, 8},
1272 {S32, ConstantPtr, 16, 2 * 8}});
1273 if (ST.hasFlatAddressSpace()) {
1274 ExtLoads.legalForTypesWithMemDesc(
1275 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1276 }
1277
1278 ExtLoads.clampScalar(0, S32, S32)
1279 .widenScalarToNextPow2(0)
1280 .unsupportedIfMemSizeNotPow2()
1281 .lower();
1282
1283 auto &Atomics = getActionDefinitionsBuilder(
1284 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287 G_ATOMICRMW_UMIN})
1288 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289 {S64, GlobalPtr}, {S64, LocalPtr},
1290 {S32, RegionPtr}, {S64, RegionPtr}});
1291 if (ST.hasFlatAddressSpace()) {
1292 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293 }
1294
1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296 if (ST.hasLDSFPAtomics()) {
1297 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298 if (ST.hasGFX90AInsts())
1299 Atomic.legalFor({{S64, LocalPtr}});
1300 }
1301 if (ST.hasAtomicFaddInsts())
1302 Atomic.legalFor({{S32, GlobalPtr}});
1303
1304 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1305 // demarshalling
1306 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1307 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1308 {S32, FlatPtr}, {S64, FlatPtr}})
1309 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1310 {S32, RegionPtr}, {S64, RegionPtr}});
1311 // TODO: Pointer types, any 32-bit or 64-bit vector
1312
1313 // Condition should be s32 for scalar, s1 for vector.
1314 getActionDefinitionsBuilder(G_SELECT)
1315 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1316 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1317 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1318 .clampScalar(0, S16, S64)
1319 .scalarize(1)
1320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1321 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1322 .clampMaxNumElements(0, S32, 2)
1323 .clampMaxNumElements(0, LocalPtr, 2)
1324 .clampMaxNumElements(0, PrivatePtr, 2)
1325 .scalarize(0)
1326 .widenScalarToNextPow2(0)
1327 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1328
1329 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1330 // be more flexible with the shift amount type.
1331 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1332 .legalFor({{S32, S32}, {S64, S32}});
1333 if (ST.has16BitInsts()) {
1334 if (ST.hasVOP3PInsts()) {
1335 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1336 .clampMaxNumElements(0, S16, 2);
1337 } else
1338 Shifts.legalFor({{S16, S16}});
1339
1340 // TODO: Support 16-bit shift amounts for all types
1341 Shifts.widenScalarIf(
1342 [=](const LegalityQuery &Query) {
1343 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1344 // 32-bit amount.
1345 const LLT ValTy = Query.Types[0];
1346 const LLT AmountTy = Query.Types[1];
1347 return ValTy.getSizeInBits() <= 16 &&
1348 AmountTy.getSizeInBits() < 16;
1349 }, changeTo(1, S16));
1350 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1351 Shifts.clampScalar(1, S32, S32);
1352 Shifts.clampScalar(0, S16, S64);
1353 Shifts.widenScalarToNextPow2(0, 16);
1354
1355 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1356 .minScalar(0, S16)
1357 .scalarize(0)
1358 .lower();
1359 } else {
1360 // Make sure we legalize the shift amount type first, as the general
1361 // expansion for the shifted type will produce much worse code if it hasn't
1362 // been truncated already.
1363 Shifts.clampScalar(1, S32, S32);
1364 Shifts.clampScalar(0, S32, S64);
1365 Shifts.widenScalarToNextPow2(0, 32);
1366
1367 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1368 .minScalar(0, S32)
1369 .scalarize(0)
1370 .lower();
1371 }
1372 Shifts.scalarize(0);
1373
1374 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1375 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1376 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1377 unsigned IdxTypeIdx = 2;
1378
1379 getActionDefinitionsBuilder(Op)
1380 .customIf([=](const LegalityQuery &Query) {
1381 const LLT EltTy = Query.Types[EltTypeIdx];
1382 const LLT VecTy = Query.Types[VecTypeIdx];
1383 const LLT IdxTy = Query.Types[IdxTypeIdx];
1384 const unsigned EltSize = EltTy.getSizeInBits();
1385 return (EltSize == 32 || EltSize == 64) &&
1386 VecTy.getSizeInBits() % 32 == 0 &&
1387 VecTy.getSizeInBits() <= MaxRegisterSize &&
1388 IdxTy.getSizeInBits() == 32;
1389 })
1390 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1391 bitcastToVectorElement32(VecTypeIdx))
1392 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1393 .bitcastIf(
1394 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1395 [=](const LegalityQuery &Query) {
1396 // For > 64-bit element types, try to turn this into a 64-bit
1397 // element vector since we may be able to do better indexing
1398 // if this is scalar. If not, fall back to 32.
1399 const LLT EltTy = Query.Types[EltTypeIdx];
1400 const LLT VecTy = Query.Types[VecTypeIdx];
1401 const unsigned DstEltSize = EltTy.getSizeInBits();
1402 const unsigned VecSize = VecTy.getSizeInBits();
1403
1404 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1405 return std::make_pair(
1406 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1407 })
1408 .clampScalar(EltTypeIdx, S32, S64)
1409 .clampScalar(VecTypeIdx, S32, S64)
1410 .clampScalar(IdxTypeIdx, S32, S32)
1411 .clampMaxNumElements(VecTypeIdx, S32, 32)
1412 // TODO: Clamp elements for 64-bit vectors?
1413 // It should only be necessary with variable indexes.
1414 // As a last resort, lower to the stack
1415 .lower();
1416 }
1417
1418 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1419 .unsupportedIf([=](const LegalityQuery &Query) {
1420 const LLT &EltTy = Query.Types[1].getElementType();
1421 return Query.Types[0] != EltTy;
1422 });
1423
1424 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1425 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1426 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1427
1428 // FIXME: Doesn't handle extract of illegal sizes.
1429 getActionDefinitionsBuilder(Op)
1430 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1431 // FIXME: Multiples of 16 should not be legal.
1432 .legalIf([=](const LegalityQuery &Query) {
1433 const LLT BigTy = Query.Types[BigTyIdx];
1434 const LLT LitTy = Query.Types[LitTyIdx];
1435 return (BigTy.getSizeInBits() % 32 == 0) &&
1436 (LitTy.getSizeInBits() % 16 == 0);
1437 })
1438 .widenScalarIf(
1439 [=](const LegalityQuery &Query) {
1440 const LLT BigTy = Query.Types[BigTyIdx];
1441 return (BigTy.getScalarSizeInBits() < 16);
1442 },
1443 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1444 .widenScalarIf(
1445 [=](const LegalityQuery &Query) {
1446 const LLT LitTy = Query.Types[LitTyIdx];
1447 return (LitTy.getScalarSizeInBits() < 16);
1448 },
1449 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1450 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1451 .widenScalarToNextPow2(BigTyIdx, 32);
1452
1453 }
1454
1455 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1456 .legalForCartesianProduct(AllS32Vectors, {S32})
1457 .legalForCartesianProduct(AllS64Vectors, {S64})
1458 .clampNumElements(0, V16S32, V32S32)
1459 .clampNumElements(0, V2S64, V16S64)
1460 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1461
1462 if (ST.hasScalarPackInsts()) {
1463 BuildVector
1464 // FIXME: Should probably widen s1 vectors straight to s32
1465 .minScalarOrElt(0, S16)
1466 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1467 .minScalar(1, S32);
1468
1469 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1470 .legalFor({V2S16, S32})
1471 .lower();
1472 BuildVector.minScalarOrElt(0, S32);
1473 } else {
1474 BuildVector.customFor({V2S16, S16});
1475 BuildVector.minScalarOrElt(0, S32);
1476
1477 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1478 .customFor({V2S16, S32})
1479 .lower();
1480 }
1481
1482 BuildVector.legalIf(isRegisterType(0));
1483
1484 // FIXME: Clamp maximum size
1485 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1486 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1487 .clampMaxNumElements(0, S32, 32)
1488 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1489 .clampMaxNumElements(0, S16, 64);
1490
1491 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1492 // pre-legalize.
1493 if (ST.hasVOP3PInsts()) {
1494 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1495 .customFor({V2S16, V2S16})
1496 .lower();
1497 } else
1498 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1499
1500 // Merge/Unmerge
1501 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1502 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1503 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1504
1505 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1506 const LLT Ty = Query.Types[TypeIdx];
1507 if (Ty.isVector()) {
1508 const LLT &EltTy = Ty.getElementType();
1509 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1510 return true;
1511 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1512 return true;
1513 }
1514 return false;
1515 };
1516
1517 auto &Builder = getActionDefinitionsBuilder(Op)
1518 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1519 .lowerFor({{S16, V2S16}})
1520 .lowerIf([=](const LegalityQuery &Query) {
1521 const LLT BigTy = Query.Types[BigTyIdx];
1522 return BigTy.getSizeInBits() == 32;
1523 })
1524 // Try to widen to s16 first for small types.
1525 // TODO: Only do this on targets with legal s16 shifts
1526 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1527 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1528 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1529 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1530 elementTypeIs(1, S16)),
1531 changeTo(1, V2S16))
1532 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1533 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1534 // valid.
1535 .clampScalar(LitTyIdx, S32, S512)
1536 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1537 // Break up vectors with weird elements into scalars
1538 .fewerElementsIf(
1539 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1540 scalarize(0))
1541 .fewerElementsIf(
1542 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1543 scalarize(1))
1544 .clampScalar(BigTyIdx, S32, MaxScalar);
1545
1546 if (Op == G_MERGE_VALUES) {
1547 Builder.widenScalarIf(
1548 // TODO: Use 16-bit shifts if legal for 8-bit values?
1549 [=](const LegalityQuery &Query) {
1550 const LLT Ty = Query.Types[LitTyIdx];
1551 return Ty.getSizeInBits() < 32;
1552 },
1553 changeTo(LitTyIdx, S32));
1554 }
1555
1556 Builder.widenScalarIf(
1557 [=](const LegalityQuery &Query) {
1558 const LLT Ty = Query.Types[BigTyIdx];
1559 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1560 Ty.getSizeInBits() % 16 != 0;
1561 },
1562 [=](const LegalityQuery &Query) {
1563 // Pick the next power of 2, or a multiple of 64 over 128.
1564 // Whichever is smaller.
1565 const LLT &Ty = Query.Types[BigTyIdx];
1566 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1567 if (NewSizeInBits >= 256) {
1568 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1569 if (RoundedTo < NewSizeInBits)
1570 NewSizeInBits = RoundedTo;
1571 }
1572 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1573 })
1574 // Any vectors left are the wrong size. Scalarize them.
1575 .scalarize(0)
1576 .scalarize(1);
1577 }
1578
1579 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1580 // RegBankSelect.
1581 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1582 .legalFor({{S32}, {S64}});
1583
1584 if (ST.hasVOP3PInsts()) {
1585 SextInReg.lowerFor({{V2S16}})
1586 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1587 // get more vector shift opportunities, since we'll get those when
1588 // expanded.
1589 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1590 } else if (ST.has16BitInsts()) {
1591 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1592 } else {
1593 // Prefer to promote to s32 before lowering if we don't have 16-bit
1594 // shifts. This avoids a lot of intermediate truncate and extend operations.
1595 SextInReg.lowerFor({{S32}, {S64}});
1596 }
1597
1598 SextInReg
1599 .scalarize(0)
1600 .clampScalar(0, S32, S64)
1601 .lower();
1602
1603 // TODO: Only try to form v2s16 with legal packed instructions.
1604 getActionDefinitionsBuilder(G_FSHR)
1605 .legalFor({{S32, S32}})
1606 .lowerFor({{V2S16, V2S16}})
1607 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1608 .scalarize(0)
1609 .lower();
1610
1611 if (ST.hasVOP3PInsts()) {
1612 getActionDefinitionsBuilder(G_FSHL)
1613 .lowerFor({{V2S16, V2S16}})
1614 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1615 .scalarize(0)
1616 .lower();
1617 } else {
1618 getActionDefinitionsBuilder(G_FSHL)
1619 .scalarize(0)
1620 .lower();
1621 }
1622
1623 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1624 .legalFor({S64});
1625
1626 getActionDefinitionsBuilder(G_FENCE)
1627 .alwaysLegal();
1628
1629 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1630 .scalarize(0)
1631 .minScalar(0, S32)
1632 .lower();
1633
1634 getActionDefinitionsBuilder({
1635 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1636 G_FCOPYSIGN,
1637
1638 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1639 G_ATOMICRMW_NAND,
1640 G_ATOMICRMW_FSUB,
1641 G_READ_REGISTER,
1642 G_WRITE_REGISTER,
1643
1644 G_SADDO, G_SSUBO,
1645
1646 // TODO: Implement
1647 G_FMINIMUM, G_FMAXIMUM}).lower();
1648
1649 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1650 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1651 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1652 .unsupported();
1653
1654 computeTables();
1655 verify(*ST.getInstrInfo());
1656}
1657
1658bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1659 MachineInstr &MI) const {
1660 MachineIRBuilder &B = Helper.MIRBuilder;
1661 MachineRegisterInfo &MRI = *B.getMRI();
1662
1663 switch (MI.getOpcode()) {
1664 case TargetOpcode::G_ADDRSPACE_CAST:
1665 return legalizeAddrSpaceCast(MI, MRI, B);
1666 case TargetOpcode::G_FRINT:
1667 return legalizeFrint(MI, MRI, B);
1668 case TargetOpcode::G_FCEIL:
1669 return legalizeFceil(MI, MRI, B);
1670 case TargetOpcode::G_FREM:
1671 return legalizeFrem(MI, MRI, B);
1672 case TargetOpcode::G_INTRINSIC_TRUNC:
1673 return legalizeIntrinsicTrunc(MI, MRI, B);
1674 case TargetOpcode::G_SITOFP:
1675 return legalizeITOFP(MI, MRI, B, true);
1676 case TargetOpcode::G_UITOFP:
1677 return legalizeITOFP(MI, MRI, B, false);
1678 case TargetOpcode::G_FPTOSI:
1679 return legalizeFPTOI(MI, MRI, B, true);
1680 case TargetOpcode::G_FPTOUI:
1681 return legalizeFPTOI(MI, MRI, B, false);
1682 case TargetOpcode::G_FMINNUM:
1683 case TargetOpcode::G_FMAXNUM:
1684 case TargetOpcode::G_FMINNUM_IEEE:
1685 case TargetOpcode::G_FMAXNUM_IEEE:
1686 return legalizeMinNumMaxNum(Helper, MI);
1687 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1688 return legalizeExtractVectorElt(MI, MRI, B);
1689 case TargetOpcode::G_INSERT_VECTOR_ELT:
1690 return legalizeInsertVectorElt(MI, MRI, B);
1691 case TargetOpcode::G_SHUFFLE_VECTOR:
1692 return legalizeShuffleVector(MI, MRI, B);
1693 case TargetOpcode::G_FSIN:
1694 case TargetOpcode::G_FCOS:
1695 return legalizeSinCos(MI, MRI, B);
1696 case TargetOpcode::G_GLOBAL_VALUE:
1697 return legalizeGlobalValue(MI, MRI, B);
1698 case TargetOpcode::G_LOAD:
1699 return legalizeLoad(Helper, MI);
1700 case TargetOpcode::G_FMAD:
1701 return legalizeFMad(MI, MRI, B);
1702 case TargetOpcode::G_FDIV:
1703 return legalizeFDIV(MI, MRI, B);
1704 case TargetOpcode::G_UDIV:
1705 case TargetOpcode::G_UREM:
1706 return legalizeUDIV_UREM(MI, MRI, B);
1707 case TargetOpcode::G_SDIV:
1708 case TargetOpcode::G_SREM:
1709 return legalizeSDIV_SREM(MI, MRI, B);
1710 case TargetOpcode::G_ATOMIC_CMPXCHG:
1711 return legalizeAtomicCmpXChg(MI, MRI, B);
1712 case TargetOpcode::G_FLOG:
1713 return legalizeFlog(MI, B, numbers::ln2f);
1714 case TargetOpcode::G_FLOG10:
1715 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1716 case TargetOpcode::G_FEXP:
1717 return legalizeFExp(MI, B);
1718 case TargetOpcode::G_FPOW:
1719 return legalizeFPow(MI, B);
1720 case TargetOpcode::G_FFLOOR:
1721 return legalizeFFloor(MI, MRI, B);
1722 case TargetOpcode::G_BUILD_VECTOR:
1723 return legalizeBuildVector(MI, MRI, B);
1724 default:
1725 return false;
1726 }
1727
1728 llvm_unreachable("expected switch to return");
1729}
1730
1731Register AMDGPULegalizerInfo::getSegmentAperture(
1732 unsigned AS,
1733 MachineRegisterInfo &MRI,
1734 MachineIRBuilder &B) const {
1735 MachineFunction &MF = B.getMF();
1736 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1737 const LLT S32 = LLT::scalar(32);
1738
1739 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1740
1741 if (ST.hasApertureRegs()) {
1742 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1743 // getreg.
1744 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1745 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1746 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1747 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1748 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1749 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1750 unsigned Encoding =
1751 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1752 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1753 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1754
1755 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1756
1757 B.buildInstr(AMDGPU::S_GETREG_B32)
1758 .addDef(GetReg)
1759 .addImm(Encoding);
1760 MRI.setType(GetReg, S32);
1761
1762 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1763 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1764 }
1765
1766 Register QueuePtr = MRI.createGenericVirtualRegister(
1767 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1768
1769 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1770 return Register();
1771
1772 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1773 // private_segment_aperture_base_hi.
1774 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1775
1776 // TODO: can we be smarter about machine pointer info?
1777 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1778 MachineMemOperand *MMO = MF.getMachineMemOperand(
1779 PtrInfo,
1780 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1781 MachineMemOperand::MOInvariant,
1782 4, commonAlignment(Align(64), StructOffset));
1783
1784 Register LoadAddr;
1785
1786 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1787 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1788}
1789
1790bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1791 MachineInstr &MI, MachineRegisterInfo &MRI,
1792 MachineIRBuilder &B) const {
1793 MachineFunction &MF = B.getMF();
1794
1795 const LLT S32 = LLT::scalar(32);
1796 Register Dst = MI.getOperand(0).getReg();
1797 Register Src = MI.getOperand(1).getReg();
1798
1799 LLT DstTy = MRI.getType(Dst);
1800 LLT SrcTy = MRI.getType(Src);
1801 unsigned DestAS = DstTy.getAddressSpace();
1802 unsigned SrcAS = SrcTy.getAddressSpace();
1803
1804 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1805 // vector element.
1806 assert(!DstTy.isVector());
1807
1808 const AMDGPUTargetMachine &TM
1809 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1810
1811 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1812 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1813 return true;
1814 }
1815
1816 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1817 // Truncate.
1818 B.buildExtract(Dst, Src, 0);
1819 MI.eraseFromParent();
1820 return true;
1821 }
1822
1823 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1824 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1825 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1826
1827 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1828 // another. Merge operands are required to be the same type, but creating an
1829 // extra ptrtoint would be kind of pointless.
1830 auto HighAddr = B.buildConstant(
1831 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1832 B.buildMerge(Dst, {Src, HighAddr});
1833 MI.eraseFromParent();
1834 return true;
1835 }
1836
1837 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1838 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1839        DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1840 unsigned NullVal = TM.getNullPointerValue(DestAS);
1841
1842 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1843 auto FlatNull = B.buildConstant(SrcTy, 0);
1844
1845 // Extract low 32-bits of the pointer.
1846 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1847
1848 auto CmpRes =
1849 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1850 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1851
1852 MI.eraseFromParent();
1853 return true;
1854 }
1855
1856 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1857 return false;
1858
1859 if (!ST.hasFlatAddressSpace())
1860 return false;
1861
1862 auto SegmentNull =
1863 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1864 auto FlatNull =
1865 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1866
1867 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1868 if (!ApertureReg.isValid())
1869 return false;
1870
1871 auto CmpRes =
1872 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1873
1874 // Coerce the type of the low half of the result so we can use merge_values.
1875 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1876
1877 // TODO: Should we allow mismatched types but matching sizes in merges to
1878 // avoid the ptrtoint?
1879 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1880 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1881
1882 MI.eraseFromParent();
1883 return true;
1884}
1885
1886bool AMDGPULegalizerInfo::legalizeFrint(
1887 MachineInstr &MI, MachineRegisterInfo &MRI,
1888 MachineIRBuilder &B) const {
1889 Register Src = MI.getOperand(1).getReg();
1890 LLT Ty = MRI.getType(Src);
1891 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1892
1893 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1894 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
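 // C1 = 2^52 with the sign of Src copied onto it. Adding and then
 // subtracting this sign-matched 2^52 rounds Src to an integer using the
 // hardware rounding mode; C2 is just below 2^52, so inputs whose
 // magnitude exceeds it are already integral and the final select passes
 // them through unchanged.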
1895
1896 auto C1 = B.buildFConstant(Ty, C1Val);
1897 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1898
1899 // TODO: Should this propagate fast-math-flags?
1900 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1901 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1902
1903 auto C2 = B.buildFConstant(Ty, C2Val);
1904 auto Fabs = B.buildFAbs(Ty, Src);
1905
1906 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1907 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1908 MI.eraseFromParent();
1909 return true;
1910}
1911
1912bool AMDGPULegalizerInfo::legalizeFceil(
1913 MachineInstr &MI, MachineRegisterInfo &MRI,
1914 MachineIRBuilder &B) const {
1915
1916 const LLT S1 = LLT::scalar(1);
1917 const LLT S64 = LLT::scalar(64);
1918
1919 Register Src = MI.getOperand(1).getReg();
1920 assert(MRI.getType(Src) == S64);
1921
1922 // result = trunc(src)
1923 // if (src > 0.0 && src != result)
1924 // result += 1.0
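 // e.g. src = 1.25: trunc = 1.0, src > 0 and src != trunc, so 1.0 is added
 // and the result is 2.0. src = -1.25: trunc = -1.0 and src is not greater
 // than 0, so the select contributes 0.0 and the result stays -1.0.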
1925
1926 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1927
1928 const auto Zero = B.buildFConstant(S64, 0.0);
1929 const auto One = B.buildFConstant(S64, 1.0);
1930 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1931 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1932 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1933 auto Add = B.buildSelect(S64, And, One, Zero);
1934
1935 // TODO: Should this propagate fast-math-flags?
1936 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1937 return true;
1938}
1939
1940bool AMDGPULegalizerInfo::legalizeFrem(
1941 MachineInstr &MI, MachineRegisterInfo &MRI,
1942 MachineIRBuilder &B) const {
1943 Register DstReg = MI.getOperand(0).getReg();
1944 Register Src0Reg = MI.getOperand(1).getReg();
1945 Register Src1Reg = MI.getOperand(2).getReg();
1946 auto Flags = MI.getFlags();
1947 LLT Ty = MRI.getType(DstReg);
1948
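 // Expand frem as x - trunc(x / y) * y, with the multiply and subtract
 // fused into a single FMA: fma(-trunc(x / y), y, x).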
1949 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1950 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1951 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1952 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1953 MI.eraseFromParent();
1954 return true;
1955}
1956
1957static MachineInstrBuilder extractF64Exponent(Register Hi,
1958 MachineIRBuilder &B) {
1959 const unsigned FractBits = 52;
1960 const unsigned ExpBits = 11;
1961 LLT S32 = LLT::scalar(32);
1962
1963 auto Const0 = B.buildConstant(S32, FractBits - 32);
1964 auto Const1 = B.buildConstant(S32, ExpBits);
1965
1966 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1967 .addUse(Hi)
1968 .addUse(Const0.getReg(0))
1969 .addUse(Const1.getReg(0));
1970
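 // amdgcn_ubfe extracts the 11 exponent bits starting at bit 20 of the
 // high dword (bit 52 of the full f64); subtracting 1023 removes the
 // IEEE-754 double exponent bias.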
1971 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1972}
1973
1974bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1975 MachineInstr &MI, MachineRegisterInfo &MRI,
1976 MachineIRBuilder &B) const {
1977 const LLT S1 = LLT::scalar(1);
1978 const LLT S32 = LLT::scalar(32);
1979 const LLT S64 = LLT::scalar(64);
1980
1981 Register Src = MI.getOperand(1).getReg();
1982 assert(MRI.getType(Src) == S64);
1983
1984 // TODO: Should this use extract since the low half is unused?
1985 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1986 Register Hi = Unmerge.getReg(1);
1987
1988 // Extract the upper half, since this is where we will find the sign and
1989 // exponent.
1990 auto Exp = extractF64Exponent(Hi, B);
1991
1992 const unsigned FractBits = 52;
1993
1994 // Extract the sign bit.
1995 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1996 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1997
1998 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1999
2000 const auto Zero32 = B.buildConstant(S32, 0);
2001
2002 // Extend back to 64-bits.
2003 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2004
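 // For 0 <= Exp <= 51, FractMask >> Exp has set exactly the fraction bits
 // below the binary point, so ANDing Src with its complement truncates the
 // value. Exp < 0 means |Src| < 1.0 and only the sign bit is kept;
 // Exp > 51 means Src is already integral and is returned unchanged.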
2005 auto Shr = B.buildAShr(S64, FractMask, Exp);
2006 auto Not = B.buildNot(S64, Shr);
2007 auto Tmp0 = B.buildAnd(S64, Src, Not);
2008 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2009
2010 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2011 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2012
2013 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2014 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2015 MI.eraseFromParent();
2016 return true;
2017}
2018
2019bool AMDGPULegalizerInfo::legalizeITOFP(
2020 MachineInstr &MI, MachineRegisterInfo &MRI,
2021 MachineIRBuilder &B, bool Signed) const {
2022
2023 Register Dst = MI.getOperand(0).getReg();
2024 Register Src = MI.getOperand(1).getReg();
2025
2026 const LLT S64 = LLT::scalar(64);
2027 const LLT S32 = LLT::scalar(32);
2028
2029 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2030
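 // Convert the two 32-bit halves separately and recombine them as
 // (f64)Hi * 2^32 + (f64)Lo, applying the 2^32 scale via ldexp. Only the
 // high half is converted signed for G_SITOFP; the low half is always
 // unsigned.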
2031 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2032
2033 auto CvtHi = Signed ?
2034 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2035 B.buildUITOFP(S64, Unmerge.getReg(1));
2036
2037 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2038
2039 auto ThirtyTwo = B.buildConstant(S32, 32);
2040 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2041 .addUse(CvtHi.getReg(0))
2042 .addUse(ThirtyTwo.getReg(0));
2043
2044 // TODO: Should this propagate fast-math-flags?
2045 B.buildFAdd(Dst, LdExp, CvtLo);
2046 MI.eraseFromParent();
2047 return true;
2048}
2049
2050// TODO: Copied from DAG implementation. Verify logic and document how this
2051// actually works.
2052bool AMDGPULegalizerInfo::legalizeFPTOI(
2053 MachineInstr &MI, MachineRegisterInfo &MRI,
2054 MachineIRBuilder &B, bool Signed) const {
2055
2056 Register Dst = MI.getOperand(0).getReg();
2057 Register Src = MI.getOperand(1).getReg();
2058
2059 const LLT S64 = LLT::scalar(64);
2060 const LLT S32 = LLT::scalar(32);
2061
2062 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2063
2064 unsigned Flags = MI.getFlags();
2065
2066 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2067 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2068 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
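 // K0 = 2^-32 and K1 = -2^32. FloorMul = floor(trunc(x) * 2^-32) is the
 // high 32 bits of the result, and the FMA recovers the low 32 bits as
 // trunc(x) - FloorMul * 2^32.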
2069
2070 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2071 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2072 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2073
2074 auto Hi = Signed ?
2075 B.buildFPTOSI(S32, FloorMul) :
2076 B.buildFPTOUI(S32, FloorMul);
2077 auto Lo = B.buildFPTOUI(S32, Fma);
2078
2079 B.buildMerge(Dst, { Lo, Hi });
2080 MI.eraseFromParent();
2081
2082 return true;
2083}
2084
2085bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2086 MachineInstr &MI) const {
2087 MachineFunction &MF = Helper.MIRBuilder.getMF();
2088 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2089
2090 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2091 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2092
2093 // With ieee_mode disabled, the instructions have the correct behavior
2094 // already for G_FMINNUM/G_FMAXNUM
2095 if (!MFI->getMode().IEEE)
2096 return !IsIEEEOp;
2097
2098 if (IsIEEEOp)
2099 return true;
2100
2101 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2102}
2103
2104bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2105 MachineInstr &MI, MachineRegisterInfo &MRI,
2106 MachineIRBuilder &B) const {
2107 // TODO: Should move some of this into LegalizerHelper.
2108
2109 // TODO: Promote dynamic indexing of s16 to s32
2110
2111 // FIXME: Artifact combiner probably should have replaced the truncated
2112 // constant before this, so we shouldn't need
2113 // getConstantVRegValWithLookThrough.
2114 Optional<ValueAndVReg> MaybeIdxVal =
2115 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2116 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2117 return true;
2118 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2119
2120 Register Dst = MI.getOperand(0).getReg();
2121 Register Vec = MI.getOperand(1).getReg();
2122
2123 LLT VecTy = MRI.getType(Vec);
2124 LLT EltTy = VecTy.getElementType();
2125 assert(EltTy == MRI.getType(Dst));
2126
2127 if (IdxVal < VecTy.getNumElements())
2128 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2129 else
2130 B.buildUndef(Dst);
2131
2132 MI.eraseFromParent();
2133 return true;
2134}
2135
2136bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2137 MachineInstr &MI, MachineRegisterInfo &MRI,
2138 MachineIRBuilder &B) const {
2139 // TODO: Should move some of this into LegalizerHelper.
2140
2141 // TODO: Promote dynamic indexing of s16 to s32
2142
2143 // FIXME: Artifact combiner probably should have replaced the truncated
2144 // constant before this, so we shouldn't need
2145 // getConstantVRegValWithLookThrough.
2146 Optional<ValueAndVReg> MaybeIdxVal =
2147 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2148 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2149 return true;
2150
2151 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2152 Register Dst = MI.getOperand(0).getReg();
2153 Register Vec = MI.getOperand(1).getReg();
2154 Register Ins = MI.getOperand(2).getReg();
2155
2156 LLT VecTy = MRI.getType(Vec);
2157 LLT EltTy = VecTy.getElementType();
2158 assert(EltTy == MRI.getType(Ins));
2159
2160 if (IdxVal < VecTy.getNumElements())
2161 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2162 else
2163 B.buildUndef(Dst);
2164
2165 MI.eraseFromParent();
2166 return true;
2167}
2168
2169bool AMDGPULegalizerInfo::legalizeShuffleVector(
2170 MachineInstr &MI, MachineRegisterInfo &MRI,
2171 MachineIRBuilder &B) const {
2172 const LLT V2S16 = LLT::vector(2, 16);
2173
2174 Register Dst = MI.getOperand(0).getReg();
2175 Register Src0 = MI.getOperand(1).getReg();
2176 LLT DstTy = MRI.getType(Dst);
2177 LLT SrcTy = MRI.getType(Src0);
2178
2179 if (SrcTy == V2S16 && DstTy == V2S16 &&
2180 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2181 return true;
2182
2183 MachineIRBuilder HelperBuilder(MI);
2184 GISelObserverWrapper DummyObserver;
2185 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2186 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2187}
2188
2189bool AMDGPULegalizerInfo::legalizeSinCos(
2190 MachineInstr &MI, MachineRegisterInfo &MRI,
2191 MachineIRBuilder &B) const {
2192
2193 Register DstReg = MI.getOperand(0).getReg();
2194 Register SrcReg = MI.getOperand(1).getReg();
2195 LLT Ty = MRI.getType(DstReg);
2196 unsigned Flags = MI.getFlags();
2197
2198 Register TrigVal;
2199 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2200 if (ST.hasTrigReducedRange()) {
2201 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2202 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2203 .addUse(MulVal.getReg(0))
2204 .setMIFlags(Flags).getReg(0);
2205 } else
2206 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2207
2208 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2209 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2210 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2211 .addUse(TrigVal)
2212 .setMIFlags(Flags);
2213 MI.eraseFromParent();
2214 return true;
2215}
2216
2217bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2218 MachineIRBuilder &B,
2219 const GlobalValue *GV,
2220 int64_t Offset,
2221 unsigned GAFlags) const {
2222 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2223 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2224 // to the following code sequence:
2225 //
2226 // For constant address space:
2227 // s_getpc_b64 s[0:1]
2228 // s_add_u32 s0, s0, $symbol
2229 // s_addc_u32 s1, s1, 0
2230 //
2231 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2232 // a fixup or relocation is emitted to replace $symbol with a literal
2233 // constant, which is a pc-relative offset from the encoding of the $symbol
2234 // operand to the global variable.
2235 //
2236 // For global address space:
2237 // s_getpc_b64 s[0:1]
2238 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2239 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2240 //
2241 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2242 // fixups or relocations are emitted to replace $symbol@*@lo and
2243 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2244 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2245 // operand to the global variable.
2246 //
2247 // What we want here is an offset from the value returned by s_getpc
2248 // (which is the address of the s_add_u32 instruction) to the global
2249 // variable, but since the encoding of $symbol starts 4 bytes after the start
2250 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2251 // small. This requires us to add 4 to the global variable offset in order to
2252 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2253 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2254 // instruction.
2255
2256 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2257
2258 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2259 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2260
2261 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2262 .addDef(PCReg);
2263
2264 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2265 if (GAFlags == SIInstrInfo::MO_NONE)
2266 MIB.addImm(0);
2267 else
2268 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2269
2270 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2271
2272 if (PtrTy.getSizeInBits() == 32)
2273 B.buildExtract(DstReg, PCReg, 0);
2274 return true;
2275 }
2276
2277bool AMDGPULegalizerInfo::legalizeGlobalValue(
2278 MachineInstr &MI, MachineRegisterInfo &MRI,
2279 MachineIRBuilder &B) const {
2280 Register DstReg = MI.getOperand(0).getReg();
2281 LLT Ty = MRI.getType(DstReg);
2282 unsigned AS = Ty.getAddressSpace();
2283
2284 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2285 MachineFunction &MF = B.getMF();
2286 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2287
2288 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2289 if (!MFI->isModuleEntryFunction()) {
2290 const Function &Fn = MF.getFunction();
2291 DiagnosticInfoUnsupported BadLDSDecl(
2292 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2293 DS_Warning);
2294 Fn.getContext().diagnose(BadLDSDecl);
2295
2296 // We currently don't have a way to correctly allocate LDS objects that
2297 // aren't directly associated with a kernel. We do force inlining of
2298 // functions that use local objects. However, if these dead functions are
2299 // not eliminated, we don't want a compile time error. Just emit a warning
2300 // and a trap, since there should be no callable path here.
2301 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2302 B.buildUndef(DstReg);
2303 MI.eraseFromParent();
2304 return true;
2305 }
2306
2307 // TODO: We could emit code to handle the initialization somewhere.
2308 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2309 const SITargetLowering *TLI = ST.getTargetLowering();
2310 if (!TLI->shouldUseLDSConstAddress(GV)) {
2311 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2312 return true; // Leave in place;
2313 }
2314
2315 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2316 Type *Ty = GV->getValueType();
2317 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2318 // zero-sized type in other languages to declare the dynamic shared
2319 // memory whose size is not known at compile time. They will be
2320 // allocated by the runtime and placed directly after the statically
2321 // allocated ones. They all share the same offset.
2322 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2323 // Adjust alignment for that dynamic shared memory array.
2324 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2325 LLT S32 = LLT::scalar(32);
2326 auto Sz =
2327 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2328 B.buildIntToPtr(DstReg, Sz);
2329 MI.eraseFromParent();
2330 return true;
2331 }
2332 }
2333
2334 B.buildConstant(
2335 DstReg,
2336 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2337 MI.eraseFromParent();
2338 return true;
2339 }
2340
2341 const Function &Fn = MF.getFunction();
2342 DiagnosticInfoUnsupported BadInit(
2343 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2344 Fn.getContext().diagnose(BadInit);
2345 return true;
2346 }
2347
2348 const SITargetLowering *TLI = ST.getTargetLowering();
2349
2350 if (TLI->shouldEmitFixup(GV)) {
2351 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2352 MI.eraseFromParent();
2353 return true;
2354 }
2355
2356 if (TLI->shouldEmitPCReloc(GV)) {
2357 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2358 MI.eraseFromParent();
2359 return true;
2360 }
2361
2362 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2363 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2364
2365 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2366 MachinePointerInfo::getGOT(MF),
2367 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2368 MachineMemOperand::MOInvariant,
2369 8 /*Size*/, Align(8));
2370
2371 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2372
2373 if (Ty.getSizeInBits() == 32) {
2374 // Truncate if this is a 32-bit constant address.
2375 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2376 B.buildExtract(DstReg, Load, 0);
2377 } else
2378 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2379
2380 MI.eraseFromParent();
2381 return true;
2382}
2383
2384static LLT widenToNextPowerOf2(LLT Ty) {
2385 if (Ty.isVector())
2386 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2387 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2388}
2389
2390bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2391 MachineInstr &MI) const {
2392 MachineIRBuilder &B = Helper.MIRBuilder;
2393 MachineRegisterInfo &MRI = *B.getMRI();
2394 GISelChangeObserver &Observer = Helper.Observer;
2395
2396 Register PtrReg = MI.getOperand(1).getReg();
2397 LLT PtrTy = MRI.getType(PtrReg);
2398 unsigned AddrSpace = PtrTy.getAddressSpace();
2399
2400 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2401 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2402 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2403 Observer.changingInstr(MI);
2404 MI.getOperand(1).setReg(Cast.getReg(0));
2405 Observer.changedInstr(MI);
2406 return true;
2407 }
2408
2409 Register ValReg = MI.getOperand(0).getReg();
2410 LLT ValTy = MRI.getType(ValReg);
2411
2412 MachineMemOperand *MMO = *MI.memoperands_begin();
2413 const unsigned ValSize = ValTy.getSizeInBits();
2414 const unsigned MemSize = 8 * MMO->getSize();
2415 const Align MemAlign = MMO->getAlign();
2416 const unsigned AlignInBits = 8 * MemAlign.value();
2417
2418 // Widen non-power-of-2 loads to the alignment if needed
2419 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2420 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2421
2422 // This was already the correct extending load result type, so just adjust
2423 // the memory type.
2424 if (WideMemSize == ValSize) {
2425 MachineFunction &MF = B.getMF();
2426
2427 MachineMemOperand *WideMMO =
2428 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2429 Observer.changingInstr(MI);
2430 MI.setMemRefs(MF, {WideMMO});
2431 Observer.changedInstr(MI);
2432 return true;
2433 }
2434
2435 // Don't bother handling an edge case that should probably never be produced.
2436 if (ValSize > WideMemSize)
2437 return false;
2438
2439 LLT WideTy = widenToNextPowerOf2(ValTy);
2440
2441 Register WideLoad;
2442 if (!WideTy.isVector()) {
2443 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2444 B.buildTrunc(ValReg, WideLoad).getReg(0);
2445 } else {
2446 // Extract the subvector.
2447
2448 if (isRegisterType(ValTy)) {
2449 // If this a case where G_EXTRACT is legal, use it.
2450 // (e.g. <3 x s32> -> <4 x s32>)
2451 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2452 B.buildExtract(ValReg, WideLoad, 0);
2453 } else {
2454 // For cases where the widened type isn't a nice register value, unmerge
2455 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2456 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2457 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2458 B.setInsertPt(B.getMBB(), MI.getIterator());
2459 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2460 }
2461 }
2462
2463 MI.eraseFromParent();
2464 return true;
2465 }
2466
2467 return false;
2468}
2469
2470bool AMDGPULegalizerInfo::legalizeFMad(
2471 MachineInstr &MI, MachineRegisterInfo &MRI,
2472 MachineIRBuilder &B) const {
2473 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2474 assert(Ty.isScalar());
2475
2476 MachineFunction &MF = B.getMF();
2477 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2478
2479 // TODO: Always legal with future ftz flag.
2480 // FIXME: Do we need just output?
2481 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2482 return true;
2483 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2484 return true;
2485
2486 MachineIRBuilder HelperBuilder(MI);
2487 GISelObserverWrapper DummyObserver;
2488 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2489 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2490}
2491
2492bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2493 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2494 Register DstReg = MI.getOperand(0).getReg();
2495 Register PtrReg = MI.getOperand(1).getReg();
2496 Register CmpVal = MI.getOperand(2).getReg();
2497 Register NewVal = MI.getOperand(3).getReg();
2498
2499 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2500        "this should not have been custom lowered");
2501
2502 LLT ValTy = MRI.getType(CmpVal);
2503 LLT VecTy = LLT::vector(2, ValTy);
2504
2505 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2506
2507 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2508 .addDef(DstReg)
2509 .addUse(PtrReg)
2510 .addUse(PackedVal)
2511 .setMemRefs(MI.memoperands());
2512
2513 MI.eraseFromParent();
2514 return true;
2515}
2516
2517bool AMDGPULegalizerInfo::legalizeFlog(
2518 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2519 Register Dst = MI.getOperand(0).getReg();
2520 Register Src = MI.getOperand(1).getReg();
2521 LLT Ty = B.getMRI()->getType(Dst);
2522 unsigned Flags = MI.getFlags();
2523
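 // log_b(x) = log2(x) * (1 / log2(b)); callers pass Log2BaseInverted as
 // ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10.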
2524 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2525 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2526
2527 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2528 MI.eraseFromParent();
2529 return true;
2530}
2531
2532bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2533 MachineIRBuilder &B) const {
2534 Register Dst = MI.getOperand(0).getReg();
2535 Register Src = MI.getOperand(1).getReg();
2536 unsigned Flags = MI.getFlags();
2537 LLT Ty = B.getMRI()->getType(Dst);
2538
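 // exp(x) = exp2(x * log2(e)).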
2539 auto K = B.buildFConstant(Ty, numbers::log2e);
2540 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2541 B.buildFExp2(Dst, Mul, Flags);
2542 MI.eraseFromParent();
2543 return true;
2544}
2545
2546bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2547 MachineIRBuilder &B) const {
2548 Register Dst = MI.getOperand(0).getReg();
2549 Register Src0 = MI.getOperand(1).getReg();
2550 Register Src1 = MI.getOperand(2).getReg();
2551 unsigned Flags = MI.getFlags();
2552 LLT Ty = B.getMRI()->getType(Dst);
2553 const LLT S16 = LLT::scalar(16);
2554 const LLT S32 = LLT::scalar(32);
2555
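 // pow(x, y) = exp2(y * log2(x)). The multiply goes through
 // amdgcn_fmul_legacy, presumably for the DX-style rule that a zero
 // operand yields zero; there is no f16 legacy multiply, so the f16 case
 // extends the operands to f32 around it.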
2556 if (Ty == S32) {
2557 auto Log = B.buildFLog2(S32, Src0, Flags);
2558 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2559 .addUse(Log.getReg(0))
2560 .addUse(Src1)
2561 .setMIFlags(Flags);
2562 B.buildFExp2(Dst, Mul, Flags);
2563 } else if (Ty == S16) {
2564 // There's no f16 fmul_legacy, so we need to convert for it.
2565 auto Log = B.buildFLog2(S16, Src0, Flags);
2566 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2567 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2568 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2569 .addUse(Ext0.getReg(0))
2570 .addUse(Ext1.getReg(0))
2571 .setMIFlags(Flags);
2572
2573 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2574 } else
2575 return false;
2576
2577 MI.eraseFromParent();
2578 return true;
2579}
2580
2581// Find a source register, ignoring any possible source modifiers.
2582static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2583 Register ModSrc = OrigSrc;
2584 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2585 ModSrc = SrcFNeg->getOperand(1).getReg();
2586 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2587 ModSrc = SrcFAbs->getOperand(1).getReg();
2588 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2589 ModSrc = SrcFAbs->getOperand(1).getReg();
2590 return ModSrc;
2591}
2592
2593bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2594 MachineRegisterInfo &MRI,
2595 MachineIRBuilder &B) const {
2596
2597 const LLT S1 = LLT::scalar(1);
2598 const LLT S64 = LLT::scalar(64);
2599 Register Dst = MI.getOperand(0).getReg();
2600 Register OrigSrc = MI.getOperand(1).getReg();
2601 unsigned Flags = MI.getFlags();
2602 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2603        "this should not have been custom lowered");
2604
2605 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2606 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2607 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2608 // V_FRACT bug is:
2609 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2610 //
2611 // Convert floor(x) to (x - fract(x))
2612
2613 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2614 .addUse(OrigSrc)
2615 .setMIFlags(Flags);
2616
2617 // Give source modifier matching some assistance before obscuring a foldable
2618 // pattern.
2619
2620 // TODO: We can avoid the neg on the fract? The input sign to fract
2621 // shouldn't matter?
2622 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2623
2624 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2625
2626 Register Min = MRI.createGenericVirtualRegister(S64);
2627
2628 // We don't need to concern ourselves with the snan handling difference, so
2629 // use the one which will directly select.
2630 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2631 if (MFI->getMode().IEEE)
2632 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2633 else
2634 B.buildFMinNum(Min, Fract, Const, Flags);
2635
2636 Register CorrectedFract = Min;
2637 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2638 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2639 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2640 }
2641
2642 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2643 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2644
2645 MI.eraseFromParent();
2646 return true;
2647}
2648
2649// Turn an illegal packed v2s16 build vector into bit operations.
2650// TODO: This should probably be a bitcast action in LegalizerHelper.
2651bool AMDGPULegalizerInfo::legalizeBuildVector(
2652 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2653 Register Dst = MI.getOperand(0).getReg();
2654 const LLT S32 = LLT::scalar(32);
2655 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2656
2657 Register Src0 = MI.getOperand(1).getReg();
2658 Register Src1 = MI.getOperand(2).getReg();
2659 assert(MRI.getType(Src0) == LLT::scalar(16));
2660
2661 auto Merge = B.buildMerge(S32, {Src0, Src1});
2662 B.buildBitcast(Dst, Merge);
2663
2664 MI.eraseFromParent();
2665 return true;
2666}
2667
2668// Check that this is a G_XOR x, -1
2669static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2670 if (MI.getOpcode() != TargetOpcode::G_XOR)
2671 return false;
2672 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2673 return ConstVal && *ConstVal == -1;
2674}
2675
2676// Return the use branch instruction, otherwise null if the usage is invalid.
2677static MachineInstr *
2678verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2679 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2680 Register CondDef = MI.getOperand(0).getReg();
2681 if (!MRI.hasOneNonDBGUse(CondDef))
2682 return nullptr;
2683
2684 MachineBasicBlock *Parent = MI.getParent();
2685 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2686
2687 if (isNot(MRI, *UseMI)) {
2688 Register NegatedCond = UseMI->getOperand(0).getReg();
2689 if (!MRI.hasOneNonDBGUse(NegatedCond))
2690 return nullptr;
2691
2692 // We're deleting the def of this value, so we need to remove it.
2693 UseMI->eraseFromParent();
2694
2695 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2696 Negated = true;
2697 }
2698
2699 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2700 return nullptr;
2701
2702 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2703 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2704 if (Next == Parent->end()) {
2705 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2706 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2707 return nullptr;
2708 UncondBrTarget = &*NextMBB;
2709 } else {
2710 if (Next->getOpcode() != AMDGPU::G_BR)
2711 return nullptr;
2712 Br = &*Next;
2713 UncondBrTarget = Br->getOperand(0).getMBB();
2714 }
2715
2716 return UseMI;
2717}
2718
2719bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2720 const ArgDescriptor *Arg,
2721 const TargetRegisterClass *ArgRC,
2722 LLT ArgTy) const {
2723 MCRegister SrcReg = Arg->getRegister();
2724 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2725 assert(DstReg.isVirtual() && "Virtual register expected");
2726
2727 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2728 ArgTy);
2729 if (Arg->isMasked()) {
2730 // TODO: Should we try to emit this once in the entry block?
2731 const LLT S32 = LLT::scalar(32);
2732 const unsigned Mask = Arg->getMask();
2733 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
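 // Extract the bit-field described by Mask: shift the live-in value down
 // to the field's least significant bit, then AND with Mask >> Shift to
 // keep only the field's bits.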
2734
2735 Register AndMaskSrc = LiveIn;
2736
2737 if (Shift != 0) {
2738 auto ShiftAmt = B.buildConstant(S32, Shift);
2739 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2740 }
2741
2742 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2743 } else {
2744 B.buildCopy(DstReg, LiveIn);
2745 }
2746
2747 return true;
2748}
2749
2750bool AMDGPULegalizerInfo::loadInputValue(
2751 Register DstReg, MachineIRBuilder &B,
2752 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2753 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2754 const ArgDescriptor *Arg;
2755 const TargetRegisterClass *ArgRC;
2756 LLT ArgTy;
2757 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2758
2759 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2760 return false; // TODO: Handle these
2761 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2762}
2763
2764bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2765 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2766 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2767 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2768 return false;
2769
2770 MI.eraseFromParent();
2771 return true;
2772}
2773
2774bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2775 MachineRegisterInfo &MRI,
2776 MachineIRBuilder &B) const {
2777 Register Dst = MI.getOperand(0).getReg();
2778 LLT DstTy = MRI.getType(Dst);
2779 LLT S16 = LLT::scalar(16);
2780 LLT S32 = LLT::scalar(32);
2781 LLT S64 = LLT::scalar(64);
2782
2783 if (DstTy == S16)
2784 return legalizeFDIV16(MI, MRI, B);
2785 if (DstTy == S32)
2786 return legalizeFDIV32(MI, MRI, B);
2787 if (DstTy == S64)
2788 return legalizeFDIV64(MI, MRI, B);
2789
2790 return false;
2791}
2792
2793void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2794 Register DstReg,
2795 Register X,
2796 Register Y,
2797 bool IsDiv) const {
2798 const LLT S1 = LLT::scalar(1);
2799 const LLT S32 = LLT::scalar(32);
2800
2801 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2802 // algorithm used here.
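 // In short: Z starts as a float reciprocal estimate of Y scaled by just
 // under 2^32 (0x4f7ffffe), one Newton-Raphson style refinement is applied
 // in the integer domain, then Q = umulh(X, Z) and R = X - Q * Y are fixed
 // up by at most two conditional corrections.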
2803
2804 // Initial estimate of inv(y).
2805 auto FloatY = B.buildUITOFP(S32, Y);
2806 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2807 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2808 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2809 auto Z = B.buildFPTOUI(S32, ScaledY);
2810
2811 // One round of UNR.
2812 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2813 auto NegYZ = B.buildMul(S32, NegY, Z);
2814 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2815
2816 // Quotient/remainder estimate.
2817 auto Q = B.buildUMulH(S32, X, Z);
2818 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2819
2820 // First quotient/remainder refinement.
2821 auto One = B.buildConstant(S32, 1);
2822 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2823 if (IsDiv)
2824 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2825 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2826
2827 // Second quotient/remainder refinement.
2828 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2829 if (IsDiv)
2830 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2831 else
2832 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2833}
2834
2835 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2836//
2837// Return lo, hi of result
2838//
2839// %cvt.lo = G_UITOFP Val.lo
2840// %cvt.hi = G_UITOFP Val.hi
2841// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2842// %rcp = G_AMDGPU_RCP_IFLAG %mad
2843// %mul1 = G_FMUL %rcp, 0x5f7ffffc
2844// %mul2 = G_FMUL %mul1, 2**(-32)
2845// %trunc = G_INTRINSIC_TRUNC %mul2
2846// %mad2 = G_FMAD %trunc, -(2**32), %mul1
2847// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
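 // i.e. a fixed-point estimate of 2^64 / Val, returned as its low and high
 // 32-bit halves (0x5f7ffffc is just below 2^64 as an f32).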
2848static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2849 Register Val) {
2850 const LLT S32 = LLT::scalar(32);
2851 auto Unmerge = B.buildUnmerge(S32, Val);
2852
2853 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2854 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2855
2856 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2857 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2858
2859 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2860 auto Mul1 =
2861 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2862
2863 // 2**(-32)
2864 auto Mul2 =
2865 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2866 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2867
2868 // -(2**32)
2869 auto Mad2 = B.buildFMAD(S32, Trunc,
2870 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2871
2872 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2873 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2874
2875 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2876}
2877
2878void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2879 Register DstReg,
2880 Register Numer,
2881 Register Denom,
2882 bool IsDiv) const {
2883 const LLT S32 = LLT::scalar(32);
2884 const LLT S64 = LLT::scalar(64);
2885 const LLT S1 = LLT::scalar(1);
2886 Register RcpLo, RcpHi;
2887
2888 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2889
2890 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2891
2892 auto Zero64 = B.buildConstant(S64, 0);
2893 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2894
2895 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2896 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2897
2898 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2899 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2900 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2901
2902 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2903 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2904 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2905 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2906
2907 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2908 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2909 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2910 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2911 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2912
2913 auto Zero32 = B.buildConstant(S32, 0);
2914 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2915 auto Add2_HiC =
2916 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2917 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2918 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2919
2920 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2921 Register NumerLo = UnmergeNumer.getReg(0);
2922 Register NumerHi = UnmergeNumer.getReg(1);
2923
2924 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2925 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2926 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2927 Register Mul3_Lo = UnmergeMul3.getReg(0);
2928 Register Mul3_Hi = UnmergeMul3.getReg(1);
2929 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2930 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2931 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2932 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2933
2934 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2935 Register DenomLo = UnmergeDenom.getReg(0);
2936 Register DenomHi = UnmergeDenom.getReg(1);
2937
2938 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2939 auto C1 = B.buildSExt(S32, CmpHi);
2940
2941 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2942 auto C2 = B.buildSExt(S32, CmpLo);
2943
2944 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2945 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2946
2947 // TODO: Here and below portions of the code can be enclosed into if/endif.
2948 // Currently control flow is unconditional and we have 4 selects after
2949 // potential endif to substitute PHIs.
2950
2951 // if C3 != 0 ...
2952 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2953 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2954 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2955 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2956
2957 auto One64 = B.buildConstant(S64, 1);
2958 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2959
2960 auto C4 =
2961 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2962 auto C5 =
2963 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2964 auto C6 = B.buildSelect(
2965 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2966
2967 // if (C6 != 0)
2968 auto Add4 = B.buildAdd(S64, Add3, One64);
2969 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2970
2971 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2972 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2973 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2974
2975 // endif C6
2976 // endif C3
2977
2978 if (IsDiv) {
2979 auto Sel1 = B.buildSelect(
2980 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2981 B.buildSelect(DstReg,
2982 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2983 } else {
2984 auto Sel2 = B.buildSelect(
2985 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2986 B.buildSelect(DstReg,
2987 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2988 }
2989}
2990
2991bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2992 MachineRegisterInfo &MRI,
2993 MachineIRBuilder &B) const {
2994 const LLT S64 = LLT::scalar(64);
2995 const LLT S32 = LLT::scalar(32);
2996 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2997 Register DstReg = MI.getOperand(0).getReg();
2998 Register Num = MI.getOperand(1).getReg();
2999 Register Den = MI.getOperand(2).getReg();
3000 LLT Ty = MRI.getType(DstReg);
3001
3002 if (Ty == S32)
3003 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3004 else if (Ty == S64)
3005 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3006 else
3007 return false;
3008
3009 MI.eraseFromParent();
3010 return true;
3011
3012}
3013
3014bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3015 MachineRegisterInfo &MRI,
3016 MachineIRBuilder &B) const {
3017 const LLT S64 = LLT::scalar(64);
3018 const LLT S32 = LLT::scalar(32);
3019
3020 Register DstReg = MI.getOperand(0).getReg();
3021 const LLT Ty = MRI.getType(DstReg);
3022 if (Ty != S32 && Ty != S64)
3023 return false;
3024
3025 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3026
3027 Register LHS = MI.getOperand(1).getReg();
3028 Register RHS = MI.getOperand(2).getReg();
3029
3030 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3031 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3032 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3033
3034 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3035 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3036
3037 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3038 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3039
3040 Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3041 if (Ty == S32)
3042 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3043 else
3044 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3045
3046 Register Sign;
3047 if (IsDiv)
3048 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3049 else
3050 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3051
3052 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3053 B.buildSub(DstReg, UDivRem, Sign);
3054
3055 MI.eraseFromParent();
3056 return true;
3057}
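A minimal scalar sketch of the sign handling above (hypothetical helper; the real code operates on MIR, INT_MIN corner cases are ignored, and an arithmetic right shift for negatives is assumed): abs(x) is formed as (x + s) ^ s with s = x >> 31, and the unsigned result is re-signed with the same xor/subtract pattern.

  #include <cstdint>

  // Illustration only: 32-bit signed div/rem via the unsigned expansion,
  // mirroring legalizeSDIV_SREM. Arithmetic is done in uint32_t to avoid UB.
  static int32_t sdivSremViaUnsigned(int32_t LHS, int32_t RHS, bool IsDiv) {
    uint32_t LSign = static_cast<uint32_t>(LHS >> 31); // 0 or 0xffffffff
    uint32_t RSign = static_cast<uint32_t>(RHS >> 31);
    uint32_t UL = (static_cast<uint32_t>(LHS) + LSign) ^ LSign; // |LHS|
    uint32_t UR = (static_cast<uint32_t>(RHS) + RSign) ^ RSign; // |RHS|
    uint32_t U = IsDiv ? UL / UR : UL % UR;
    uint32_t Sign = IsDiv ? (LSign ^ RSign) : LSign; // remainder follows LHS sign
    return static_cast<int32_t>((U ^ Sign) - Sign);  // conditional negate
  }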
3058
3059bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3060 MachineRegisterInfo &MRI,
3061 MachineIRBuilder &B) const {
3062 Register Res = MI.getOperand(0).getReg();
3063 Register LHS = MI.getOperand(1).getReg();
3064 Register RHS = MI.getOperand(2).getReg();
3065 uint16_t Flags = MI.getFlags();
3066 LLT ResTy = MRI.getType(Res);
3067
3068 const MachineFunction &MF = B.getMF();
3069 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3070 MI.getFlag(MachineInstr::FmAfn);
3071
3072 if (!AllowInaccurateRcp)
3073 return false;
3074
3075 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3076 // 1 / x -> RCP(x)
3077 if (CLHS->isExactlyValue(1.0)) {
3078 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3079 .addUse(RHS)
3080 .setMIFlags(Flags);
3081
3082 MI.eraseFromParent();
3083 return true;
3084 }
3085
3086 // -1 / x -> RCP( FNEG(x) )
3087 if (CLHS->isExactlyValue(-1.0)) {
3088 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3089 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3090 .addUse(FNeg.getReg(0))
3091 .setMIFlags(Flags);
3092
3093 MI.eraseFromParent();
3094 return true;
3095 }
3096 }
3097
3098 // x / y -> x * (1.0 / y)
3099 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3100 .addUse(RHS)
3101 .setMIFlags(Flags);
3102 B.buildFMul(Res, LHS, RCP, Flags);
3103
3104 MI.eraseFromParent();
3105 return true;
3106}
3107
3108bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3109 MachineRegisterInfo &MRI,
3110 MachineIRBuilder &B) const {
3111 Register Res = MI.getOperand(0).getReg();
3112 Register X = MI.getOperand(1).getReg();
3113 Register Y = MI.getOperand(2).getReg();
3114 uint16_t Flags = MI.getFlags();
3115 LLT ResTy = MRI.getType(Res);
3116
3117 const MachineFunction &MF = B.getMF();
3118 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3119 MI.getFlag(MachineInstr::FmAfn);
3120
3121 if (!AllowInaccurateRcp)
3122 return false;
3123
3124 auto NegY = B.buildFNeg(ResTy, Y);
3125 auto One = B.buildFConstant(ResTy, 1.0);
3126
3127 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3128 .addUse(Y)
3129 .setMIFlags(Flags);
3130
3131 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3132 R = B.buildFMA(ResTy, Tmp0, R, R);
3133
3134 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3135 R = B.buildFMA(ResTy, Tmp1, R, R);
3136
3137 auto Ret = B.buildFMul(ResTy, X, R);
3138 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3139
3140 B.buildFMA(Res, Tmp2, R, Ret);
3141 MI.eraseFromParent();
3142 return true;
3143}
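The FMA chain above is a standard Newton-Raphson refinement of the rcp estimate. A scalar sketch, assuming <cmath> and using 1.0 / Y in place of amdgcn.rcp (hypothetical helper, not the documented intrinsic lowering):

  #include <cmath>

  // Illustration only: each step computes r <- r + r * (1 - y*r), roughly
  // doubling the number of accurate bits in the reciprocal.
  static double fastUnsafeFDiv64(double X, double Y) {
    double R = 1.0 / Y;                              // rough rcp estimate
    R = std::fma(std::fma(-Y, R, 1.0), R, R);        // first Newton step
    R = std::fma(std::fma(-Y, R, 1.0), R, R);        // second Newton step
    double Ret = X * R;
    return std::fma(std::fma(-Y, Ret, X), R, Ret);   // correct the final product
  }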
3144
3145bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3146 MachineRegisterInfo &MRI,
3147 MachineIRBuilder &B) const {
3148 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3149 return true;
3150
3151 Register Res = MI.getOperand(0).getReg();
3152 Register LHS = MI.getOperand(1).getReg();
3153 Register RHS = MI.getOperand(2).getReg();
3154
3155 uint16_t Flags = MI.getFlags();
3156
3157 LLT S16 = LLT::scalar(16);
3158 LLT S32 = LLT::scalar(32);
3159
3160 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3161 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3162
3163 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3164 .addUse(RHSExt.getReg(0))
3165 .setMIFlags(Flags);
3166
3167 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3168 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3169
3170 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3171 .addUse(RDst.getReg(0))
3172 .addUse(RHS)
3173 .addUse(LHS)
3174 .setMIFlags(Flags);
3175
3176 MI.eraseFromParent();
3177 return true;
3178}
3179
3180// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3181// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3182static void toggleSPDenormMode(bool Enable,
3183 MachineIRBuilder &B,
3184 const GCNSubtarget &ST,
3185 AMDGPU::SIModeRegisterDefaults Mode) {
3186 // Set SP denorm mode to this value.
3187 unsigned SPDenormMode =
3188 Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();
3189
3190 if (ST.hasDenormModeInst()) {
3191 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3192 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3193
3194 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3195 B.buildInstr(AMDGPU::S_DENORM_MODE)
3196 .addImm(NewDenormModeValue);
3197
3198 } else {
3199 // Select FP32 bit field in mode register.
3200 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3201 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3202 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3203
3204 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3205 .addImm(SPDenormMode)
3206 .addImm(SPDenormModeBitField);
3207 }
3208}
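The immediate built in the S_SETREG branch above is a packed hwreg descriptor. A sketch of the packing, assuming the conventional SIMM16 layout (register id in bits [5:0], field offset in [10:6], width-1 in [15:11]); the constants stand in for AMDGPU::Hwreg::* and are not taken from this file:

  // Illustration only: hand-rolled hwreg(id, offset, width) encoding.
  static unsigned encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
    return Id | (Offset << 6) | ((Width - 1) << 11);
  }
  // The FP32 denorm field is offset 4, width 2 of the MODE register, i.e.
  // encodeHwreg(ID_MODE, 4, 2), matching (4 << OFFSET_SHIFT_) | (1 << WIDTH_M1_SHIFT_).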
3209
3210bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3211 MachineRegisterInfo &MRI,
3212 MachineIRBuilder &B) const {
3213 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3214 return true;
3215
3216 Register Res = MI.getOperand(0).getReg();
3217 Register LHS = MI.getOperand(1).getReg();
3218 Register RHS = MI.getOperand(2).getReg();
3219 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3220 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3221
3222 uint16_t Flags = MI.getFlags();
3223
3224 LLT S32 = LLT::scalar(32);
3225 LLT S1 = LLT::scalar(1);
3226
3227 auto One = B.buildFConstant(S32, 1.0f);
3228
3229 auto DenominatorScaled =
3230 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3231 .addUse(LHS)
3232 .addUse(RHS)
3233 .addImm(0)
3234 .setMIFlags(Flags);
3235 auto NumeratorScaled =
3236 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3237 .addUse(LHS)
3238 .addUse(RHS)
3239 .addImm(1)
3240 .setMIFlags(Flags);
3241
3242 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3243 .addUse(DenominatorScaled.getReg(0))
3244 .setMIFlags(Flags);
3245 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3246
3247 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3248 // aren't modeled as reading it.
3249 if (!Mode.allFP32Denormals())
3250 toggleSPDenormMode(true, B, ST, Mode);
3251
3252 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3253 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3254 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3255 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3256 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3257 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3258
3259 if (!Mode.allFP32Denormals())
3260 toggleSPDenormMode(false, B, ST, Mode);
3261
3262 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3263 .addUse(Fma4.getReg(0))
3264 .addUse(Fma1.getReg(0))
3265 .addUse(Fma3.getReg(0))
3266 .addUse(NumeratorScaled.getReg(1))
3267 .setMIFlags(Flags);
3268
3269 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3270 .addUse(Fmas.getReg(0))
3271 .addUse(RHS)
3272 .addUse(LHS)
3273 .setMIFlags(Flags);
3274
3275 MI.eraseFromParent();
3276 return true;
3277}
3278
3279bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3280 MachineRegisterInfo &MRI,
3281 MachineIRBuilder &B) const {
3282 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3283 return true;
3284
3285 Register Res = MI.getOperand(0).getReg();
3286 Register LHS = MI.getOperand(1).getReg();
3287 Register RHS = MI.getOperand(2).getReg();
3288
3289 uint16_t Flags = MI.getFlags();
3290
3291 LLT S64 = LLT::scalar(64);
3292 LLT S1 = LLT::scalar(1);
3293
3294 auto One = B.buildFConstant(S64, 1.0);
3295
3296 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3297 .addUse(LHS)
3298 .addUse(RHS)
3299 .addImm(0)
3300 .setMIFlags(Flags);
3301
3302 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3303
3304 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3305 .addUse(DivScale0.getReg(0))
3306 .setMIFlags(Flags);
3307
3308 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3309 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3310 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3311
3312 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3313 .addUse(LHS)
3314 .addUse(RHS)
3315 .addImm(1)
3316 .setMIFlags(Flags);
3317
3318 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3319 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3320 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3321
3322 Register Scale;
3323 if (!ST.hasUsableDivScaleConditionOutput()) {
3324 // Workaround a hardware bug on SI where the condition output from div_scale
3325 // is not usable.
3326
3327 LLT S32 = LLT::scalar(32);
3328
3329 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3330 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3331 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3332 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3333
3334 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3335 Scale1Unmerge.getReg(1));
3336 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3337 Scale0Unmerge.getReg(1));
3338 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3339 } else {
3340 Scale = DivScale1.getReg(1);
3341 }
3342
3343 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3344 .addUse(Fma4.getReg(0))
3345 .addUse(Fma3.getReg(0))
3346 .addUse(Mul.getReg(0))
3347 .addUse(Scale)
3348 .setMIFlags(Flags);
3349
3350 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3351 .addUse(Fmas.getReg(0))
3352 .addUse(RHS)
3353 .addUse(LHS)
3354 .setMIFlags(Flags);
3355
3356 MI.eraseFromParent();
3357 return true;
3358}
3359
3360bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3361 MachineRegisterInfo &MRI,
3362 MachineIRBuilder &B) const {
3363 Register Res = MI.getOperand(0).getReg();
3364 Register LHS = MI.getOperand(2).getReg();
3365 Register RHS = MI.getOperand(3).getReg();
3366 uint16_t Flags = MI.getFlags();
3367
3368 LLT S32 = LLT::scalar(32);
3369 LLT S1 = LLT::scalar(1);
3370
3371 auto Abs = B.buildFAbs(S32, RHS, Flags);
3372 const APFloat C0Val(1.0f);
3373
3374 auto C0 = B.buildConstant(S32, 0x6f800000);
3375 auto C1 = B.buildConstant(S32, 0x2f800000);
3376 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3377
3378 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3379 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3380
3381 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3382
3383 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3384 .addUse(Mul0.getReg(0))
3385 .setMIFlags(Flags);
3386
3387 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3388
3389 B.buildFMul(Res, Sel, Mul1, Flags);
3390
3391 MI.eraseFromParent();
3392 return true;
3393}
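The 0x6f800000 and 0x2f800000 constants above are the bit patterns of 2**96 and 2**-32: the select pre-scales very large denominators so the reciprocal does not flush to zero, then rescales the quotient. A scalar sketch, assuming a plain float reciprocal in place of amdgcn.rcp (hypothetical helper):

  #include <cmath>

  // Illustration only: scalar form of the fdiv.fast expansion above.
  static float fdivFast(float LHS, float RHS) {
    const float Big   = 0x1.0p96f;   // 0x6f800000
    const float Small = 0x1.0p-32f;  // 0x2f800000
    float Sel = (std::fabs(RHS) > Big) ? Small : 1.0f; // pre-scale huge divisors
    float Rcp = 1.0f / (RHS * Sel);                    // stand-in for amdgcn.rcp
    return Sel * (LHS * Rcp);                          // undo the scaling
  }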
3394
3395// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3396// FIXME: Why do we handle this one but not other removed instructions?
3397//
3398// Reciprocal square root. The clamp prevents infinite results, clamping
3399// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3400// +-max_float.
3401bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3402 MachineRegisterInfo &MRI,
3403 MachineIRBuilder &B) const {
3404 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3405 return true;
3406
3407 Register Dst = MI.getOperand(0).getReg();
3408 Register Src = MI.getOperand(2).getReg();
3409 auto Flags = MI.getFlags();
3410
3411 LLT Ty = MRI.getType(Dst);
3412
3413 const fltSemantics *FltSemantics;
3414 if (Ty == LLT::scalar(32))
3415 FltSemantics = &APFloat::IEEEsingle();
3416 else if (Ty == LLT::scalar(64))
3417 FltSemantics = &APFloat::IEEEdouble();
3418 else
3419 return false;
3420
3421 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3422 .addUse(Src)
3423 .setMIFlags(Flags);
3424
3425 // We don't need to concern ourselves with the snan handling difference, since
3426 // the rsq is quieted (or not) either way; use the form that will directly select.
3427 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3428 const bool UseIEEE = MFI->getMode().IEEE;
3429
3430 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3431 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3432 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3433
3434 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3435
3436 if (UseIEEE)
3437 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3438 else
3439 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3440 MI.eraseFromParent();
3441 return true;
3442}
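In scalar terms the min/max pair above clamps the rsq result into [-max_float, +max_float]. A tiny sketch, assuming <cmath>/<cfloat> and ignoring the IEEE vs. non-IEEE min/max distinction the code selects on:

  #include <cfloat>
  #include <cmath>

  // Illustration only: D.f = 1.0 / sqrt(S0.f), clamped to +/-max_float.
  static float rsqClamp(float X) {
    float Rsq = 1.0f / std::sqrt(X);          // stand-in for amdgcn.rsq
    float ClampMax = std::fmin(Rsq, FLT_MAX); // clamp +inf down
    return std::fmax(ClampMax, -FLT_MAX);     // clamp -inf up
  }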
3443
3444static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3445 switch (IID) {
3446 case Intrinsic::amdgcn_ds_fadd:
3447 return AMDGPU::G_ATOMICRMW_FADD;
3448 case Intrinsic::amdgcn_ds_fmin:
3449 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3450 case Intrinsic::amdgcn_ds_fmax:
3451 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3452 default:
3453 llvm_unreachable("not a DS FP intrinsic");
3454 }
3455}
3456
3457bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3458 MachineInstr &MI,
3459 Intrinsic::ID IID) const {
3460 GISelChangeObserver &Observer = Helper.Observer;
3461 Observer.changingInstr(MI);
3462
3463 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3464
3465 // The remaining operands were used to set fields in the MemOperand on
3466 // construction.
3467 for (int I = 6; I > 3; --I)
3468 MI.RemoveOperand(I);
3469
3470 MI.RemoveOperand(1); // Remove the intrinsic ID.
3471 Observer.changedInstr(MI);
3472 return true;
3473}
3474
3475bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3476 MachineRegisterInfo &MRI,
3477 MachineIRBuilder &B) const {
3478 uint64_t Offset =
3479 ST.getTargetLowering()->getImplicitParameterOffset(
3480 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3481 LLT DstTy = MRI.getType(DstReg);
3482 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3483
3484 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3485 if (!loadInputValue(KernargPtrReg, B,
3486 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3487 return false;
3488
3489 // FIXME: This should be nuw
3490 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3491 return true;
3492}
3493
3494bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3495 MachineRegisterInfo &MRI,
3496 MachineIRBuilder &B) const {
3497 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3498 if (!MFI->isEntryFunction()) {
3499 return legalizePreloadedArgIntrin(MI, MRI, B,
3500 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3501 }
3502
3503 Register DstReg = MI.getOperand(0).getReg();
3504 if (!getImplicitArgPtr(DstReg, MRI, B))
3505 return false;
3506
3507 MI.eraseFromParent();
3508 return true;
3509}
3510
3511bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3512 MachineRegisterInfo &MRI,
3513 MachineIRBuilder &B,
3514 unsigned AddrSpace) const {
3515 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3516 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3517 Register Hi32 = Unmerge.getReg(1);
3518
3519 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3520 MI.eraseFromParent();
3521 return true;
3522}
3523
3524// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3525// offset (the offset that is included in bounds checking and swizzling, to be
3526// split between the instruction's voffset and immoffset fields) and soffset
3527// (the offset that is excluded from bounds checking and swizzling, to go in
3528// the instruction's soffset field). This function takes the first kind of
3529// offset and figures out how to split it between voffset and immoffset.
3530std::tuple<Register, unsigned, unsigned>
3531AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3532 Register OrigOffset) const {
3533 const unsigned MaxImm = 4095;
3534 Register BaseReg;
3535 unsigned TotalConstOffset;
3536 const LLT S32 = LLT::scalar(32);
3537 MachineRegisterInfo &MRI = *B.getMRI();
3538
3539 std::tie(BaseReg, TotalConstOffset) =
3540 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3541
3542 unsigned ImmOffset = TotalConstOffset;
3543
3544 // If BaseReg is a pointer, convert it to int.
3545 if (MRI.getType(BaseReg).isPointer())
3546 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3547
3548 // If the immediate value is too big for the immoffset field, put the value
3549 // and -4096 into the immoffset field so that the value that is copied/added
3550 // for the voffset field is a multiple of 4096, and it stands more chance
3551 // of being CSEd with the copy/add for another similar load/store.
3552 // However, do not do that rounding down to a multiple of 4096 if that is a
3553 // negative number, as it appears to be illegal to have a negative offset
3554 // in the vgpr, even if adding the immediate offset makes it positive.
3555 unsigned Overflow = ImmOffset & ~MaxImm;
3556 ImmOffset -= Overflow;
3557 if ((int32_t)Overflow < 0) {
3558 Overflow += ImmOffset;
3559 ImmOffset = 0;
3560 }
3561
3562 if (Overflow != 0) {
3563 if (!BaseReg) {
3564 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3565 } else {
3566 auto OverflowVal = B.buildConstant(S32, Overflow);
3567 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3568 }
3569 }
3570
3571 if (!BaseReg)
3572 BaseReg = B.buildConstant(S32, 0).getReg(0);
3573
3574 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3575}
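A standalone sketch of the constant-offset split above, plus a couple of worked values (hypothetical helper, operating on plain integers rather than MIR):

  #include <cstdint>
  #include <utility>

  // Illustration only: returns {amount added to voffset, immoffset field value}.
  static std::pair<unsigned, unsigned> splitConstOffset(unsigned TotalConstOffset) {
    const unsigned MaxImm = 4095;
    unsigned ImmOffset = TotalConstOffset;
    unsigned Overflow = ImmOffset & ~MaxImm;  // multiple-of-4096 part for voffset
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {              // never leave a negative voffset
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    return {Overflow, ImmOffset};
  }
  // e.g. 100 -> {0, 100}; 5000 -> {4096, 904}; (unsigned)-8 -> {(unsigned)-8, 0}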
3576
3577/// Handle register layout difference for f16 images for some subtargets.
3578Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3579 MachineRegisterInfo &MRI,
3580 Register Reg,
3581 bool ImageStore) const {
3582 const LLT S16 = LLT::scalar(16);
3583 const LLT S32 = LLT::scalar(32);
3584 LLT StoreVT = MRI.getType(Reg);
3585 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3586
3587 if (ST.hasUnpackedD16VMem()) {
3588 auto Unmerge = B.buildUnmerge(S16, Reg);
3589
3590 SmallVector<Register, 4> WideRegs;
3591 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3592 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3593
3594 int NumElts = StoreVT.getNumElements();
3595
3596 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3597 }
3598
3599 if (ImageStore && ST.hasImageStoreD16Bug()) {
3600 if (StoreVT.getNumElements() == 2) {
3601 SmallVector<Register, 4> PackedRegs;
3602 Reg = B.buildBitcast(S32, Reg).getReg(0);
3603 PackedRegs.push_back(Reg);
3604 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3605 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3606 }
3607
3608 if (StoreVT.getNumElements() == 3) {
3609 SmallVector<Register, 4> PackedRegs;
3610 auto Unmerge = B.buildUnmerge(S16, Reg);
3611 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3612 PackedRegs.push_back(Unmerge.getReg(I));
3613 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3614 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3615 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3616 }
3617
3618 if (StoreVT.getNumElements() == 4) {
3619 SmallVector<Register, 4> PackedRegs;
3620 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3621 auto Unmerge = B.buildUnmerge(S32, Reg);
3622 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3623 PackedRegs.push_back(Unmerge.getReg(I));
3624 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3625 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3626 }
3627
3628 llvm_unreachable("invalid data type");
3629 }
3630
3631 return Reg;
3632}
3633
3634Register AMDGPULegalizerInfo::fixStoreSourceType(
3635 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3636 MachineRegisterInfo *MRI = B.getMRI();
3637 LLT Ty = MRI->getType(VData);
3638
3639 const LLT S16 = LLT::scalar(16);
3640
3641 // Fixup illegal register types for i8 stores.
3642 if (Ty == LLT::scalar(8) || Ty == S16) {
3643 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3644 return AnyExt;
3645 }
3646
3647 if (Ty.isVector()) {
3648 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3649 if (IsFormat)
3650 return handleD16VData(B, *MRI, VData);
3651 }
3652 }
3653
3654 return VData;
3655}
3656
3657bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3658 MachineRegisterInfo &MRI,
3659 MachineIRBuilder &B,
3660 bool IsTyped,
3661 bool IsFormat) const {
3662 Register VData = MI.getOperand(1).getReg();
3663 LLT Ty = MRI.getType(VData);
3664 LLT EltTy = Ty.getScalarType();
3665 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3666 const LLT S32 = LLT::scalar(32);
3667
3668 VData = fixStoreSourceType(B, VData, IsFormat);
3669 Register RSrc = MI.getOperand(2).getReg();
3670
3671 MachineMemOperand *MMO = *MI.memoperands_begin();
3672 const int MemSize = MMO->getSize();
3673
3674 unsigned ImmOffset;
3675 unsigned TotalOffset;
3676
3677 // The typed intrinsics add an immediate after the registers.
3678 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3679
3680 // The struct intrinsic variants add one additional operand over raw.
3681 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3682 Register VIndex;
3683 int OpOffset = 0;
3684 if (HasVIndex) {
3685 VIndex = MI.getOperand(3).getReg();
3686 OpOffset = 1;
3687 }
3688
3689 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3690 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3691
3692 unsigned Format = 0;
3693 if (IsTyped) {
3694 Format = MI.getOperand(5 + OpOffset).getImm();
3695 ++OpOffset;
3696 }
3697
3698 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3699
3700 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3701 if (TotalOffset != 0)
3702 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3703
3704 unsigned Opc;
3705 if (IsTyped) {
3706 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3707 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3708 } else if (IsFormat) {
3709 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3710 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3711 } else {
3712 switch (MemSize) {
3713 case 1:
3714 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3715 break;
3716 case 2:
3717 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3718 break;
3719 default:
3720 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3721 break;
3722 }
3723 }
3724
3725 if (!VIndex)
3726 VIndex = B.buildConstant(S32, 0).getReg(0);
3727
3728 auto MIB = B.buildInstr(Opc)
3729 .addUse(VData) // vdata
3730 .addUse(RSrc) // rsrc
3731 .addUse(VIndex) // vindex
3732 .addUse(VOffset) // voffset
3733 .addUse(SOffset) // soffset
3734 .addImm(ImmOffset); // offset(imm)
3735
3736 if (IsTyped)
3737 MIB.addImm(Format);
3738
3739 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3740 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3741 .addMemOperand(MMO);
3742
3743 MI.eraseFromParent();
3744 return true;
3745}
3746
3747bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3748 MachineRegisterInfo &MRI,
3749 MachineIRBuilder &B,
3750 bool IsFormat,
3751 bool IsTyped) const {
3752 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3753 MachineMemOperand *MMO = *MI.memoperands_begin();
3754 const int MemSize = MMO->getSize();
3755 const LLT S32 = LLT::scalar(32);
3756
3757 Register Dst = MI.getOperand(0).getReg();
3758 Register RSrc = MI.getOperand(2).getReg();
3759
3760 // The typed intrinsics add an immediate after the registers.
3761 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3762
3763 // The struct intrinsic variants add one additional operand over raw.
3764 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3765 Register VIndex;
3766 int OpOffset = 0;
3767 if (HasVIndex) {
3768 VIndex = MI.getOperand(3).getReg();
3769 OpOffset = 1;
3770 }
3771
3772 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3773 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3774
3775 unsigned Format = 0;
3776 if (IsTyped) {
3777 Format = MI.getOperand(5 + OpOffset).getImm();
3778 ++OpOffset;
3779 }
3780
3781 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3782 unsigned ImmOffset;
3783 unsigned TotalOffset;
3784
3785 LLT Ty = MRI.getType(Dst);
3786 LLT EltTy = Ty.getScalarType();
3787 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3788 const bool Unpacked = ST.hasUnpackedD16VMem();
3789
3790 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3791 if (TotalOffset != 0)
3792 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3793
3794 unsigned Opc;
3795
3796 if (IsTyped) {
3797 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3798 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3799 } else if (IsFormat) {
3800 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3801 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3802 } else {
3803 switch (MemSize) {
3804 case 1:
3805 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3806 break;
3807 case 2:
3808 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3809 break;
3810 default:
3811 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3812 break;
3813 }
3814 }
3815
3816 Register LoadDstReg;
3817
3818 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3819 LLT UnpackedTy = Ty.changeElementSize(32);
3820
3821 if (IsExtLoad)
3822 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3823 else if (Unpacked && IsD16 && Ty.isVector())
3824 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3825 else
3826 LoadDstReg = Dst;
3827
3828 if (!VIndex)
3829 VIndex = B.buildConstant(S32, 0).getReg(0);
3830
3831 auto MIB = B.buildInstr(Opc)
3832 .addDef(LoadDstReg) // vdata
3833 .addUse(RSrc) // rsrc
3834 .addUse(VIndex) // vindex
3835 .addUse(VOffset) // voffset
3836 .addUse(SOffset) // soffset
3837 .addImm(ImmOffset); // offset(imm)
3838
3839 if (IsTyped)
3840 MIB.addImm(Format);
3841
3842 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3843 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3844 .addMemOperand(MMO);
3845
3846 if (LoadDstReg != Dst) {
3847 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3848
3849 // The result for extending loads was widened; truncate it back down.
3850 if (IsExtLoad)
3851 B.buildTrunc(Dst, LoadDstReg);
3852 else {
3853 // Repack to original 16-bit vector result
3854 // FIXME: G_TRUNC should work, but legalization currently fails
3855 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3856 SmallVector<Register, 4> Repack;
3857 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3858 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3859 B.buildMerge(Dst, Repack);
3860 }
3861 }
3862
3863 MI.eraseFromParent();
3864 return true;
3865}
3866
3867bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3868 MachineIRBuilder &B,
3869 bool IsInc) const {
3870 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3871 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3872 B.buildInstr(Opc)
3873 .addDef(MI.getOperand(0).getReg())
3874 .addUse(MI.getOperand(2).getReg())
3875 .addUse(MI.getOperand(3).getReg())
3876 .cloneMemRefs(MI);
3877 MI.eraseFromParent();
3878 return true;
3879}
3880
3881static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3882 switch (IntrID) {
3883 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3884 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3886 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3887 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3888 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3889 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3890 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3891 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3892 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3893 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3894 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3895 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3896 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3898 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3899 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3901 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3902 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3903 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3904 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3905 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3906 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3907 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3908 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3909 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3910 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3911 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3913 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3914 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3916 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3917 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3918 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3919 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3920 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3921 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3922 case Intrinsic::amdgcn_buffer_atomic_fadd:
3923 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3924 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3926 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3927 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3928 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3929 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3930 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3931 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3932 default:
3933 llvm_unreachable("unhandled atomic opcode");
3934 }
3935}
3936
3937bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3938 MachineIRBuilder &B,
3939 Intrinsic::ID IID) const {
3940 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3941 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3942 const bool HasReturn = MI.getNumExplicitDefs() != 0;
3943
3944 Register Dst;
3945
3946 int OpOffset = 0;
3947 if (HasReturn) {
3948 // A few FP atomics do not support return values.
3949 Dst = MI.getOperand(0).getReg();
3950 } else {
3951 OpOffset = -1;
3952 }
3953
3954 Register VData = MI.getOperand(2 + OpOffset).getReg();
3955 Register CmpVal;
3956
3957 if (IsCmpSwap) {
3958 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3959 ++OpOffset;
3960 }
3961
3962 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3963 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3964
3965 // The struct intrinsic variants add one additional operand over raw.
3966 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3967 Register VIndex;
3968 if (HasVIndex) {
3969 VIndex = MI.getOperand(4 + OpOffset).getReg();
3970 ++OpOffset;
3971 }
3972
3973 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3974 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3975 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3976
3977 MachineMemOperand *MMO = *MI.memoperands_begin();
3978
3979 unsigned ImmOffset;
3980 unsigned TotalOffset;
3981 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3982 if (TotalOffset != 0)
3983 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3984
3985 if (!VIndex)
3986 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3987
3988 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3989
3990 if (HasReturn)
3991 MIB.addDef(Dst);
3992
3993 MIB.addUse(VData); // vdata
3994
3995 if (IsCmpSwap)
3996 MIB.addReg(CmpVal);
3997
3998 MIB.addUse(RSrc) // rsrc
3999 .addUse(VIndex) // vindex
4000 .addUse(VOffset) // voffset
4001 .addUse(SOffset) // soffset
4002 .addImm(ImmOffset) // offset(imm)
4003 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4004 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4005 .addMemOperand(MMO);
4006
4007 MI.eraseFromParent();
4008 return true;
4009}
4010
4011/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
4012/// vector with s16 typed elements.
4013static void packImageA16AddressToDwords(
4014 MachineIRBuilder &B, MachineInstr &MI,
4015 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4016 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4017 const LLT S16 = LLT::scalar(16);
4018 const LLT V2S16 = LLT::vector(2, 16);
4019
4020 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4021 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4022 if (!SrcOp.isReg())
4023 continue; // _L to _LZ may have eliminated this.
4024
4025 Register AddrReg = SrcOp.getReg();
4026
4027 if (I < Intr->GradientStart) {
4028 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4029 PackedAddrs.push_back(AddrReg);
4030 } else {
4031 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4032 // derivatives dx/dh and dx/dv are packed with undef.
4033 if (((I + 1) >= EndIdx) ||
4034 ((Intr->NumGradients / 2) % 2 == 1 &&
4035 (I == static_cast<unsigned>(Intr->GradientStart +
4036 (Intr->NumGradients / 2) - 1) ||
4037 I == static_cast<unsigned>(Intr->GradientStart +
4038 Intr->NumGradients - 1))) ||
4039 // Check for _L to _LZ optimization
4040 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4041 PackedAddrs.push_back(
4042 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4043 .getReg(0));
4044 } else {
4045 PackedAddrs.push_back(
4046 B.buildBuildVector(
4047 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4048 .getReg(0));
4049 ++I;
4050 }
4051 }
4052 }
4053}
4054
4055/// Convert from separate vaddr components to a single vector address register,
4056/// and replace the remaining operands with $noreg.
4057static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4058 int DimIdx, int NumVAddrs) {
4059 const LLT S32 = LLT::scalar(32);
4060
4061 SmallVector<Register, 8> AddrRegs;
4062 for (int I = 0; I != NumVAddrs; ++I) {
4063 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4064 if (SrcOp.isReg()) {
4065 AddrRegs.push_back(SrcOp.getReg());
4066 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4067 }
4068 }
4069
4070 int NumAddrRegs = AddrRegs.size();
4071 if (NumAddrRegs != 1) {
4072 // Round up to 8 elements for v5-v7
4073 // FIXME: Missing intermediate sized register classes and instructions.
4074 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4075 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4076 auto Undef = B.buildUndef(S32);
4077 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4078 NumAddrRegs = RoundedNumRegs;
4079 }
4080
4081 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4082 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4083 }
4084
4085 for (int I = 1; I != NumVAddrs; ++I) {
4086 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4087 if (SrcOp.isReg())
4088 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4089 }
4090}
4091
4092/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4093///
4094/// Depending on the subtarget, load/store with 16-bit element data need to be
4095/// rewritten to use the low half of 32-bit registers, or directly use a packed
4096/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4097/// registers.
4098///
4099/// We don't want to directly select image instructions just yet, but also want
4100/// to expose all register repacking to the legalizer/combiners. We also don't
4101/// want a selected instruction entering RegBankSelect. In order to avoid
4102/// defining a multitude of intermediate image instructions, directly hack on
4103/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4104/// the now-unnecessary arguments with $noreg.
4105bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4106 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4107 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4108
4109 const unsigned NumDefs = MI.getNumExplicitDefs();
4110 const unsigned ArgOffset = NumDefs + 1;
4111 bool IsTFE = NumDefs == 2;
1. Assuming 'NumDefs' is not equal to 2
4112 // We are only processing the operands of d16 image operations on subtargets
4113 // that use the unpacked register layout, or need to repack the TFE result.
4114
4115 // TODO: Do we need to guard against already legalized intrinsics?
4116 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4117 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4118
4119 MachineRegisterInfo *MRI = B.getMRI();
4120 const LLT S32 = LLT::scalar(32);
4121 const LLT S16 = LLT::scalar(16);
4122 const LLT V2S16 = LLT::vector(2, 16);
4123
4124 unsigned DMask = 0;
4125
4126 // Check for 16 bit addresses and pack if true.
4127 LLT GradTy =
4128 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4129 LLT AddrTy =
4130 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4131 const bool IsG16 = GradTy == S16;
2. Calling 'LLT::operator=='
5. Returning from 'LLT::operator=='
4132 const bool IsA16 = AddrTy == S16;
6. Calling 'LLT::operator=='
8. Returning from 'LLT::operator=='
4133
4134 int DMaskLanes = 0;
4135 if (!BaseOpcode->Atomic) {
9. Assuming field 'Atomic' is true
10. Taking false branch
4136 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4137 if (BaseOpcode->Gather4) {
4138 DMaskLanes = 4;
4139 } else if (DMask != 0) {
4140 DMaskLanes = countPopulation(DMask);
4141 } else if (!IsTFE && !BaseOpcode->Store) {
4142 // If dmask is 0, this is a no-op load. This can be eliminated.
4143 B.buildUndef(MI.getOperand(0));
4144 MI.eraseFromParent();
4145 return true;
4146 }
4147 }
4148
4149 Observer.changingInstr(MI);
4150 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4151
4152 unsigned NewOpcode = NumDefs == 0 ?
11. Assuming 'NumDefs' is not equal to 0
12. '?' condition is false
4153 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4154
4155 // Track that we legalized this
4156 MI.setDesc(B.getTII().get(NewOpcode));
4157
4158 // Expecting to get an error flag since TFC is on - and dmask is 0. Force
4159 // dmask to be at least 1, otherwise the instruction will fail.
4160 if (IsTFE && DMask == 0) {
12.1. 'IsTFE' is false
4161 DMask = 0x1;
4162 DMaskLanes = 1;
4163 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4164 }
4165
4166 if (BaseOpcode->Atomic) {
12.2. Field 'Atomic' is true
13. Taking true branch
4167 Register VData0 = MI.getOperand(2).getReg();
4168 LLT Ty = MRI->getType(VData0);
4169
4170 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4171 if (Ty.isVector())
14. Calling 'LLT::isVector'
16. Returning from 'LLT::isVector'
17. Taking false branch
4172 return false;
4173
4174 if (BaseOpcode->AtomicX2) {
18. Assuming field 'AtomicX2' is false
19. Taking false branch
4175 Register VData1 = MI.getOperand(3).getReg();
4176 // The two values are packed in one register.
4177 LLT PackedTy = LLT::vector(2, Ty);
4178 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4179 MI.getOperand(2).setReg(Concat.getReg(0));
4180 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4181 }
4182 }
4183
4184 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4185
4186 // Optimize _L to _LZ when _L is zero
4187 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
20. Assuming 'LZMappingInfo' is null
21. Taking false branch
4188 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4189 const ConstantFP *ConstantLod;
4190
4191 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4192 m_GFCst(ConstantLod))) {
4193 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4194 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4195 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4196 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4197 Intr->Dim);
4198
4199 // The starting indexes should remain in the same place.
4200 --CorrectedNumVAddrs;
4201
4202 MI.getOperand(MI.getNumExplicitDefs())
4203 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4204 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4205 Intr = NewImageDimIntr;
4206 }
4207 }
4208 }
4209
4210 // Optimize _mip away, when 'lod' is zero
4211 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
22. Assuming the condition is false
23. Taking false branch
4212 int64_t ConstantLod;
4213 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4214 m_ICst(ConstantLod))) {
4215 if (ConstantLod == 0) {
4216 // TODO: Change intrinsic opcode and remove operand instead of replacing
4217 // it with 0, as the _L to _LZ handling is done above.
4218 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4219 --CorrectedNumVAddrs;
4220 }
4221 }
4222 }
4223
4224 // Rewrite the addressing register layout before doing anything else.
4225 if (IsA16 || IsG16) {
23.1. 'IsA16' is false
23.2. 'IsG16' is false
24. Taking false branch
4226 if (IsA16) {
4227 // Target must support the feature and gradients need to be 16 bit too
4228 if (!ST.hasA16() || !IsG16)
4229 return false;
4230 } else if (!ST.hasG16())
4231 return false;
4232
4233 if (Intr->NumVAddrs > 1) {
4234 SmallVector<Register, 4> PackedRegs;
4235 // Don't compress addresses for G16
4236 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4237 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4238 PackEndIdx);
4239
4240 if (!IsA16) {
4241 // Add uncompressed address
4242 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4243 int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4244 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4245 PackedRegs.push_back(AddrReg);
4246 }
4247 }
4248
4249 // See also below in the non-a16 branch
4250 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4251
4252 if (!UseNSA && PackedRegs.size() > 1) {
4253 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4254 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4255 PackedRegs[0] = Concat.getReg(0);
4256 PackedRegs.resize(1);
4257 }
4258
4259 const unsigned NumPacked = PackedRegs.size();
4260 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4261 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4262 if (!SrcOp.isReg()) {
4263 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4264 continue;
4265 }
4266
4267 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4268
4269 if (I - Intr->VAddrStart < NumPacked)
4270 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4271 else
4272 SrcOp.setReg(AMDGPU::NoRegister);
4273 }
4274 }
4275 } else {
4276 // If the register allocator cannot place the address registers contiguously
4277 // without introducing moves, then using the non-sequential address encoding
4278 // is always preferable, since it saves VALU instructions and is usually a
4279 // wash in terms of code size or even better.
4280 //
4281 // However, we currently have no way of hinting to the register allocator
4282 // that MIMG addresses should be placed contiguously when it is possible to
4283 // do so, so force non-NSA for the common 2-address case as a heuristic.
4284 //
4285 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4286 // allocation when possible.
4287 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
25. Assuming 'CorrectedNumVAddrs' is < 3
4288
4289 if (!UseNSA && Intr->NumVAddrs > 1)
25.1. 'UseNSA' is false
26. Assuming field 'NumVAddrs' is <= 1
27. Taking false branch
4290 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4291 Intr->NumVAddrs);
4292 }
4293
4294 int Flags = 0;
4295 if (IsA16)
27.1. 'IsA16' is false
28. Taking false branch
4296 Flags |= 1;
4297 if (IsG16)
28.1. 'IsG16' is false
29. Taking false branch
4298 Flags |= 2;
4299 MI.addOperand(MachineOperand::CreateImm(Flags));
4300
4301 if (BaseOpcode->Store) { // No TFE for stores?
30. Assuming field 'Store' is false
31. Taking false branch
4302 // TODO: Handle dmask trim
4303 Register VData = MI.getOperand(1).getReg();
4304 LLT Ty = MRI->getType(VData);
4305 if (!Ty.isVector() || Ty.getElementType() != S16)
4306 return true;
4307
4308 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4309 if (RepackedReg != VData) {
4310 MI.getOperand(1).setReg(RepackedReg);
4311 }
4312
4313 return true;
4314 }
4315
4316 Register DstReg = MI.getOperand(0).getReg();
4317 LLT Ty = MRI->getType(DstReg);
4318 const LLT EltTy = Ty.getScalarType();
4319 const bool IsD16 = Ty.getScalarType() == S16;
32. Calling 'LLT::operator=='
35. Returning from 'LLT::operator=='
4320 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
36. Assuming the condition is true
37. '?' condition is true
4321
4322 // Confirm that the return type is large enough for the dmask specified
4323 if (NumElts < DMaskLanes)
37.1. 'NumElts' is >= 'DMaskLanes'
38. Taking false branch
4324 return false;
4325
4326 if (NumElts > 4 || DMaskLanes > 4)
39. Assuming 'NumElts' is <= 4
39.1. 'DMaskLanes' is <= 4
40. Taking false branch
4327 return false;
4328
4329 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
40.1. 'DMaskLanes' is equal to 0
41. '?' condition is true
4330 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4331
4332 // The raw dword aligned data component of the load. The only legal cases
4333 // where this matters should be when using the packed D16 format, for
4334 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4335 LLT RoundedTy;
4336
4337 // S32 vector to cover all data, plus TFE result element.
4338 LLT TFETy;
4339
4340 // Register type to use for each loaded component. Will be S32 or V2S16.
4341 LLT RegTy;
4342
4343 if (IsD16 && ST.hasUnpackedD16VMem()) {
41.1. 'IsD16' is false
4344 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4345 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4346 RegTy = S32;
4347 } else {
4348 unsigned EltSize = EltTy.getSizeInBits();
42. Calling 'LLT::getSizeInBits'
45. Returning from 'LLT::getSizeInBits'
46. 'EltSize' initialized to 0
4349 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4350 unsigned RoundedSize = 32 * RoundedElts;
4351 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
47. Division by zero
4352 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4353 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4354 }
4355
4356 // The return type does not need adjustment.
4357 // TODO: Should we change s16 case to s32 or <2 x s16>?
4358 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4359 return true;
4360
4361 Register Dst1Reg;
4362
4363 // Insert after the instruction.
4364 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4365
4366 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4367 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4368 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4369 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4370
4371 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4372
4373 MI.getOperand(0).setReg(NewResultReg);
4374
4375 // In the IR, TFE is supposed to be used with a 2 element struct return
4376 // type. The instruction really returns these two values in one contiguous
4377 // register, with one additional dword beyond the loaded data. Rewrite the
4378 // return type to use a single register result.
4379
4380 if (IsTFE) {
4381 Dst1Reg = MI.getOperand(1).getReg();
4382 if (MRI->getType(Dst1Reg) != S32)
4383 return false;
4384
4385 // TODO: Make sure the TFE operand bit is set.
4386 MI.RemoveOperand(1);
4387
4388 // Handle the easy case that requires no repack instructions.
4389 if (Ty == S32) {
4390 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4391 return true;
4392 }
4393 }
4394
4395 // Now figure out how to copy the new result register back into the old
4396 // result.
4397 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4398
4399 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4400
4401 if (ResultNumRegs == 1) {
4402 assert(!IsTFE);
4403 ResultRegs[0] = NewResultReg;
4404 } else {
4405 // We have to repack into a new vector of some kind.
4406 for (int I = 0; I != NumDataRegs; ++I)
4407 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4408 B.buildUnmerge(ResultRegs, NewResultReg);
4409
4410 // Drop the final TFE element to get the data part. The TFE result is
4411 // directly written to the right place already.
4412 if (IsTFE)
4413 ResultRegs.resize(NumDataRegs);
4414 }
4415
4416 // For an s16 scalar result, we form an s32 result with a truncate regardless
4417 // of packed vs. unpacked.
4418 if (IsD16 && !Ty.isVector()) {
4419 B.buildTrunc(DstReg, ResultRegs[0]);
4420 return true;
4421 }
4422
4423 // Avoid a build/concat_vector of 1 entry.
4424 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4425 B.buildBitcast(DstReg, ResultRegs[0]);
4426 return true;
4427 }
4428
4429 assert(Ty.isVector());
4430
4431 if (IsD16) {
4432 // For packed D16 results with TFE enabled, all the data components are
4433 // S32. Cast back to the expected type.
4434 //
4435 // TODO: We don't really need to load s32 elements. We would only need one
4436 // cast for the TFE result if a multiple of v2s16 was used.
4437 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4438 for (Register &Reg : ResultRegs)
4439 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4440 } else if (ST.hasUnpackedD16VMem()) {
4441 for (Register &Reg : ResultRegs)
4442 Reg = B.buildTrunc(S16, Reg).getReg(0);
4443 }
4444 }
4445
4446 auto padWithUndef = [&](LLT Ty, int NumElts) {
4447 if (NumElts == 0)
4448 return;
4449 Register Undef = B.buildUndef(Ty).getReg(0);
4450 for (int I = 0; I != NumElts; ++I)
4451 ResultRegs.push_back(Undef);
4452 };
4453
4454 // Pad out any elements eliminated due to the dmask.
4455 LLT ResTy = MRI->getType(ResultRegs[0]);
4456 if (!ResTy.isVector()) {
4457 padWithUndef(ResTy, NumElts - ResultRegs.size());
4458 B.buildBuildVector(DstReg, ResultRegs);
4459 return true;
4460 }
4461
4462 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4463 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4464
4465 // Deal with the one annoying legal case.
4466 const LLT V3S16 = LLT::vector(3, 16);
4467 if (Ty == V3S16) {
4468 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4469 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4470 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4471 return true;
4472 }
4473
4474 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4475 B.buildConcatVectors(DstReg, ResultRegs);
4476 return true;
4477}
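A worked example of the tail of this function, derived from the code above, for the one "annoying legal case" assuming a full dmask, packed D16, and no TFE:

//   Ty          = <3 x s16>                  -> Ty.getSizeInBits() == 48
//   RoundedTy   = <4 x s16>, RegTy = <2 x s16>
//   ResultRegs  = two <2 x s16> registers after the unmerge (line 4408)
//   RegsToCover = (48 + 31) / 32 == 2        (line 4463)
//   padWithUndef adds RegsToCover - ResultRegs.size() + 1 == 1 undef register,
//   the three <2 x s16> values are concatenated into <6 x s16> (line 4469),
//   and the unmerge on line 4470 splits that into DstReg plus a dead <3 x s16>.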
4478
4479bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4480 LegalizerHelper &Helper, MachineInstr &MI) const {
4481 MachineIRBuilder &B = Helper.MIRBuilder;
4482 GISelChangeObserver &Observer = Helper.Observer;
4483
4484 Register Dst = MI.getOperand(0).getReg();
4485 LLT Ty = B.getMRI()->getType(Dst);
4486 unsigned Size = Ty.getSizeInBits();
4487 MachineFunction &MF = B.getMF();
4488
4489 Observer.changingInstr(MI);
4490
4491 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4492 Ty = getBitcastRegisterType(Ty);
4493 Helper.bitcastDst(MI, Ty, 0);
4494 Dst = MI.getOperand(0).getReg();
4495 B.setInsertPt(B.getMBB(), MI);
4496 }
4497
4498 // FIXME: We don't really need this intermediate instruction. The intrinsic
4499 // should be fixed to have a memory operand. Since it's readnone, we're not
4500 // allowed to add one.
4501 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4502 MI.RemoveOperand(1); // Remove intrinsic ID
4503
4504 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4505 // TODO: Should this use datalayout alignment?
4506 const unsigned MemSize = (Size + 7) / 8;
4507 const Align MemAlign(4);
4508 MachineMemOperand *MMO = MF.getMachineMemOperand(
4509 MachinePointerInfo(),
4510 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4511 MachineMemOperand::MOInvariant,
4512 MemSize, MemAlign);
4513 MI.addMemOperand(MF, MMO);
4514
4515 // There are no 96-bit result scalar loads, but widening to 128-bit should
4516 // always be legal. We may need to restore this to a 96-bit result if it turns
4517 // out this needs to be converted to a vector load during RegBankSelect.
4518 if (!isPowerOf2_32(Size)) {
4519 if (Ty.isVector())
4520 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4521 else
4522 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4523 }
4524
4525 Observer.changedInstr(MI);
4526 return true;
4527}
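A worked example of the widening logic above, assuming a <3 x s32> (96-bit) s_buffer_load result:

//   Size    = 96                      -> not a power of two
//   MemSize = (96 + 7) / 8 == 12      -> the MMO still records 12 loaded bytes
//   moreElementsVectorDst widens the destination to <4 x s32> (128 bits); as the
//   comment above notes, it may be restored to 96 bits during RegBankSelect.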
4528
4529// TODO: Move to selection
4530bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4531 MachineRegisterInfo &MRI,
4532 MachineIRBuilder &B) const {
4533 if (!ST.isTrapHandlerEnabled() ||
4534 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4535 return legalizeTrapEndpgm(MI, MRI, B);
4536
4537 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4538 switch (*HsaAbiVer) {
4539 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4540 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4541 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4542 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4543 return ST.supportsGetDoorbellID() ?
4544 legalizeTrapHsa(MI, MRI, B) :
4545 legalizeTrapHsaQueuePtr(MI, MRI, B);
4546 }
4547 }
4548
4549 llvm_unreachable("Unknown trap handler");
4550}
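Summarizing the dispatch above, as read from the code:

//   trap handler disabled or non-AMDHSA ABI      -> legalizeTrapEndpgm      (S_ENDPGM 0)
//   ELFABIVERSION_AMDGPU_HSA_V2 / _V3            -> legalizeTrapHsaQueuePtr
//   ELFABIVERSION_AMDGPU_HSA_V4, doorbell ID     -> legalizeTrapHsa         (bare S_TRAP)
//   ELFABIVERSION_AMDGPU_HSA_V4, no doorbell ID  -> legalizeTrapHsaQueuePtr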
4551
4552bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4553 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4554 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4555 MI.eraseFromParent();
4556 return true;
4557}
4558
4559bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4560 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4561 // Pass queue pointer to trap handler as input, and insert trap instruction
4562 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4563 Register LiveIn =
4564 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4565 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4566 return false;
4567
4568 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4569 B.buildCopy(SGPR01, LiveIn);
4570 B.buildInstr(AMDGPU::S_TRAP)
4571 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4572 .addReg(SGPR01, RegState::Implicit);
4573
4574 MI.eraseFromParent();
4575 return true;
4576}
4577
4578bool AMDGPULegalizerInfo::legalizeTrapHsa(
4579 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4580 B.buildInstr(AMDGPU::S_TRAP)
4581 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4582 MI.eraseFromParent();
4583 return true;
4584}
4585
4586bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4587 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4588 // If this is a non-HSA path or the trap handler is disabled, report a
4589 // warning accordingly.
4590 if (!ST.isTrapHandlerEnabled() ||
4591 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4592 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4593 "debugtrap handler not supported",
4594 MI.getDebugLoc(), DS_Warning);
4595 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4596 Ctx.diagnose(NoTrap);
4597 } else {
4598 // Insert debug-trap instruction
4599 B.buildInstr(AMDGPU::S_TRAP)
4600 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4601 }
4602
4603 MI.eraseFromParent();
4604 return true;
4605}
4606
4607bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4608 MachineIRBuilder &B) const {
4609 MachineRegisterInfo &MRI = *B.getMRI();
4610 const LLT S16 = LLT::scalar(16);
4611 const LLT S32 = LLT::scalar(32);
4612
4613 Register DstReg = MI.getOperand(0).getReg();
4614 Register NodePtr = MI.getOperand(2).getReg();
4615 Register RayExtent = MI.getOperand(3).getReg();
4616 Register RayOrigin = MI.getOperand(4).getReg();
4617 Register RayDir = MI.getOperand(5).getReg();
4618 Register RayInvDir = MI.getOperand(6).getReg();
4619 Register TDescr = MI.getOperand(7).getReg();
4620
4621 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4622 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4623 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4624 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4625 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4626 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4627
4628 SmallVector<Register, 12> Ops;
4629 if (Is64) {
4630 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4631 Ops.push_back(Unmerge.getReg(0));
4632 Ops.push_back(Unmerge.getReg(1));
4633 } else {
4634 Ops.push_back(NodePtr);
4635 }
4636 Ops.push_back(RayExtent);
4637
4638 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4639 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4640 Ops.push_back(Unmerge.getReg(0));
4641 Ops.push_back(Unmerge.getReg(1));
4642 Ops.push_back(Unmerge.getReg(2));
4643 };
4644
4645 packLanes(RayOrigin);
4646 if (IsA16) {
4647 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4648 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4649 Register R1 = MRI.createGenericVirtualRegister(S32);
4650 Register R2 = MRI.createGenericVirtualRegister(S32);
4651 Register R3 = MRI.createGenericVirtualRegister(S32);
4652 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4653 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4654 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4655 Ops.push_back(R1);
4656 Ops.push_back(R2);
4657 Ops.push_back(R3);
4658 } else {
4659 packLanes(RayDir);
4660 packLanes(RayInvDir);
4661 }
4662
4663 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4664 .addDef(DstReg)
4665 .addImm(Opcode);
4666
4667 for (Register R : Ops) {
4668 MIB.addUse(R);
4669 }
4670
4671 MIB.addUse(TDescr)
4672 .addImm(IsA16 ? 1 : 0)
4673 .cloneMemRefs(MI);
4674
4675 MI.eraseFromParent();
4676 return true;
4677}
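The operand list built above for G_AMDGPU_INTRIN_BVH_INTERSECT_RAY, summarized from the code (all operands are 32-bit values):

//   node_ptr       1 dword, or 2 dwords unmerged from a 64-bit pointer
//   ray_extent     1 dword
//   ray_origin     3 dwords (x, y, z)
//   !IsA16         ray_dir x,y,z and ray_inv_dir x,y,z, 1 dword each
//   IsA16          3 dwords packing (dir.x,dir.y), (dir.z,inv.x), (inv.y,inv.z)
//   followed by the texture descriptor and an immediate A16 flag.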
4678
4679bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4680 MachineInstr &MI) const {
4681 MachineIRBuilder &B = Helper.MIRBuilder;
4682 MachineRegisterInfo &MRI = *B.getMRI();
4683
4684 // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4685 auto IntrID = MI.getIntrinsicID();
4686 switch (IntrID) {
4687 case Intrinsic::amdgcn_if:
4688 case Intrinsic::amdgcn_else: {
4689 MachineInstr *Br = nullptr;
4690 MachineBasicBlock *UncondBrTarget = nullptr;
4691 bool Negated = false;
4692 if (MachineInstr *BrCond =
4693 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4694 const SIRegisterInfo *TRI
4695 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4696
4697 Register Def = MI.getOperand(1).getReg();
4698 Register Use = MI.getOperand(3).getReg();
4699
4700 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4701
4702 if (Negated)
4703 std::swap(CondBrTarget, UncondBrTarget);
4704
4705 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4706 if (IntrID == Intrinsic::amdgcn_if) {
4707 B.buildInstr(AMDGPU::SI_IF)
4708 .addDef(Def)
4709 .addUse(Use)
4710 .addMBB(UncondBrTarget);
4711 } else {
4712 B.buildInstr(AMDGPU::SI_ELSE)
4713 .addDef(Def)
4714 .addUse(Use)
4715 .addMBB(UncondBrTarget);
4716 }
4717
4718 if (Br) {
4719 Br->getOperand(0).setMBB(CondBrTarget);
4720 } else {
4721 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4722 // since we're swapping branch targets it needs to be reinserted.
4723 // FIXME: IRTranslator should probably not do this
4724 B.buildBr(*CondBrTarget);
4725 }
4726
4727 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4728 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4729 MI.eraseFromParent();
4730 BrCond->eraseFromParent();
4731 return true;
4732 }
4733
4734 return false;
4735 }
4736 case Intrinsic::amdgcn_loop: {
4737 MachineInstr *Br = nullptr;
4738 MachineBasicBlock *UncondBrTarget = nullptr;
4739 bool Negated = false;
4740 if (MachineInstr *BrCond =
4741 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4742 const SIRegisterInfo *TRI
4743 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4744
4745 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4746 Register Reg = MI.getOperand(2).getReg();
4747
4748 if (Negated)
4749 std::swap(CondBrTarget, UncondBrTarget);
4750
4751 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4752 B.buildInstr(AMDGPU::SI_LOOP)
4753 .addUse(Reg)
4754 .addMBB(UncondBrTarget);
4755
4756 if (Br)
4757 Br->getOperand(0).setMBB(CondBrTarget);
4758 else
4759 B.buildBr(*CondBrTarget);
4760
4761 MI.eraseFromParent();
4762 BrCond->eraseFromParent();
4763 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4764 return true;
4765 }
4766
4767 return false;
4768 }
4769 case Intrinsic::amdgcn_kernarg_segment_ptr:
4770 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4771 // This only makes sense to call in a kernel, so just lower to null.
4772 B.buildConstant(MI.getOperand(0).getReg(), 0);
4773 MI.eraseFromParent();
4774 return true;
4775 }
4776
4777 return legalizePreloadedArgIntrin(
4778 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4779 case Intrinsic::amdgcn_implicitarg_ptr:
4780 return legalizeImplicitArgPtr(MI, MRI, B);
4781 case Intrinsic::amdgcn_workitem_id_x:
4782 return legalizePreloadedArgIntrin(MI, MRI, B,
4783 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4784 case Intrinsic::amdgcn_workitem_id_y:
4785 return legalizePreloadedArgIntrin(MI, MRI, B,
4786 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4787 case Intrinsic::amdgcn_workitem_id_z:
4788 return legalizePreloadedArgIntrin(MI, MRI, B,
4789 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4790 case Intrinsic::amdgcn_workgroup_id_x:
4791 return legalizePreloadedArgIntrin(MI, MRI, B,
4792 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4793 case Intrinsic::amdgcn_workgroup_id_y:
4794 return legalizePreloadedArgIntrin(MI, MRI, B,
4795 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4796 case Intrinsic::amdgcn_workgroup_id_z:
4797 return legalizePreloadedArgIntrin(MI, MRI, B,
4798 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4799 case Intrinsic::amdgcn_dispatch_ptr:
4800 return legalizePreloadedArgIntrin(MI, MRI, B,
4801 AMDGPUFunctionArgInfo::DISPATCH_PTR);
4802 case Intrinsic::amdgcn_queue_ptr:
4803 return legalizePreloadedArgIntrin(MI, MRI, B,
4804 AMDGPUFunctionArgInfo::QUEUE_PTR);
4805 case Intrinsic::amdgcn_implicit_buffer_ptr:
4806 return legalizePreloadedArgIntrin(
4807 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4808 case Intrinsic::amdgcn_dispatch_id:
4809 return legalizePreloadedArgIntrin(MI, MRI, B,
4810 AMDGPUFunctionArgInfo::DISPATCH_ID);
4811 case Intrinsic::amdgcn_fdiv_fast:
4812 return legalizeFDIVFastIntrin(MI, MRI, B);
4813 case Intrinsic::amdgcn_is_shared:
4814 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4815 case Intrinsic::amdgcn_is_private:
4816 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4817 case Intrinsic::amdgcn_wavefrontsize: {
4818 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4819 MI.eraseFromParent();
4820 return true;
4821 }
4822 case Intrinsic::amdgcn_s_buffer_load:
4823 return legalizeSBufferLoad(Helper, MI);
4824 case Intrinsic::amdgcn_raw_buffer_store:
4825 case Intrinsic::amdgcn_struct_buffer_store:
4826 return legalizeBufferStore(MI, MRI, B, false, false);
4827 case Intrinsic::amdgcn_raw_buffer_store_format:
4828 case Intrinsic::amdgcn_struct_buffer_store_format:
4829 return legalizeBufferStore(MI, MRI, B, false, true);
4830 case Intrinsic::amdgcn_raw_tbuffer_store:
4831 case Intrinsic::amdgcn_struct_tbuffer_store:
4832 return legalizeBufferStore(MI, MRI, B, true, true);
4833 case Intrinsic::amdgcn_raw_buffer_load:
4834 case Intrinsic::amdgcn_struct_buffer_load:
4835 return legalizeBufferLoad(MI, MRI, B, false, false);
4836 case Intrinsic::amdgcn_raw_buffer_load_format:
4837 case Intrinsic::amdgcn_struct_buffer_load_format:
4838 return legalizeBufferLoad(MI, MRI, B, true, false);
4839 case Intrinsic::amdgcn_raw_tbuffer_load:
4840 case Intrinsic::amdgcn_struct_tbuffer_load:
4841 return legalizeBufferLoad(MI, MRI, B, true, true);
4842 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4843 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4844 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4845 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4846 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4847 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4848 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4849 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4850 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4851 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4852 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4853 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4854 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4855 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4856 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4857 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4858 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4859 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4860 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4861 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4862 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4863 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4864 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4865 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4866 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4867 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4868 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4869 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4870 case Intrinsic::amdgcn_buffer_atomic_fadd:
4871 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4872 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4873 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4874 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4875 return legalizeBufferAtomic(MI, B, IntrID);
4876 case Intrinsic::amdgcn_atomic_inc:
4877 return legalizeAtomicIncDec(MI, B, true);
4878 case Intrinsic::amdgcn_atomic_dec:
4879 return legalizeAtomicIncDec(MI, B, false);
4880 case Intrinsic::trap:
4881 return legalizeTrapIntrinsic(MI, MRI, B);
4882 case Intrinsic::debugtrap:
4883 return legalizeDebugTrapIntrinsic(MI, MRI, B);
4884 case Intrinsic::amdgcn_rsq_clamp:
4885 return legalizeRsqClampIntrinsic(MI, MRI, B);
4886 case Intrinsic::amdgcn_ds_fadd:
4887 case Intrinsic::amdgcn_ds_fmin:
4888 case Intrinsic::amdgcn_ds_fmax:
4889 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4890 case Intrinsic::amdgcn_image_bvh_intersect_ray:
4891 return legalizeBVHIntrinsic(MI, B);
4892 default: {
4893 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4894 AMDGPU::getImageDimIntrinsicInfo(IntrID))
4895 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4896 return true;
4897 }
4898 }
4899
4900 return true;
4901}

/build/llvm-toolchain-snapshot-13~++20210506100649+6304c0836a4d/llvm/include/llvm/Support/LowLevelTypeImpl.h

1//== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// Implement a low-level type suitable for MachineInstr level instruction
10/// selection.
11///
12/// For a type attached to a MachineInstr, we only care about 2 details: total
13/// size and the number of vector lanes (if any). Accordingly, there are 4
14/// possible valid type-kinds:
15///
16/// * `sN` for scalars and aggregates
17/// * `<N x sM>` for vectors, which must have at least 2 elements.
18/// * `pN` for pointers, and `<N x pM>` for vectors of pointers
19///
20/// Other information required for correct selection is expected to be carried
21/// by the opcode, or non-type flags. For example the distinction between G_ADD
22/// and G_FADD for int/float or fast-math flags.
23///
24//===----------------------------------------------------------------------===//
25
26#ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
27#define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
28
29#include "llvm/ADT/DenseMapInfo.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/MachineValueType.h"
32#include <cassert>
33
34namespace llvm {
35
36class DataLayout;
37class Type;
38class raw_ostream;
39
40class LLT {
41public:
42 /// Get a low-level scalar or aggregate "bag of bits".
43 static LLT scalar(unsigned SizeInBits) {
44 assert(SizeInBits > 0 && "invalid scalar size");
45 return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0,
46 SizeInBits, /*AddressSpace=*/0};
47 }
48
49 /// Get a low-level pointer in the given address space.
50 static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
51 assert(SizeInBits > 0 && "invalid pointer size");
52 return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0,
53 SizeInBits, AddressSpace};
54 }
55
56 /// Get a low-level vector of some number of elements and element width.
57 /// \p NumElements must be at least 2.
58 static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) {
59 assert(NumElements > 1 && "invalid number of vector elements");
60 assert(ScalarSizeInBits > 0 && "invalid vector element size");
61 return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements,
62 ScalarSizeInBits, /*AddressSpace=*/0};
63 }
64
65 /// Get a low-level vector of some number of elements and element type.
66 static LLT vector(uint16_t NumElements, LLT ScalarTy) {
67 assert(NumElements > 1 && "invalid number of vector elements");
68 assert(!ScalarTy.isVector() && "invalid vector element type");
69 return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements,
70 ScalarTy.getSizeInBits(),
71 ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0};
72 }
73
74 static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) {
75 return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy);
76 }
77
78 static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) {
79 return scalarOrVector(NumElements, LLT::scalar(ScalarSize));
80 }
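A few sanity-check values that follow directly from the factory functions above:

//   LLT::scalarOrVector(1, 32)                  == LLT::scalar(32)
//   LLT::scalarOrVector(4, 32)                  == LLT::vector(4, 32)
//   LLT::scalarOrVector(2, LLT::pointer(0, 64)) == LLT::vector(2, LLT::pointer(0, 64))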
81
82 explicit LLT(bool isPointer, bool isVector, uint16_t NumElements,
83 unsigned SizeInBits, unsigned AddressSpace) {
84 init(isPointer, isVector, NumElements, SizeInBits, AddressSpace);
85 }
86 explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {}
87
88 explicit LLT(MVT VT);
89
90 bool isValid() const { return RawData != 0; }
91
92 bool isScalar() const { return isValid() && !IsPointer && !IsVector; }
93
94 bool isPointer() const { return isValid() && IsPointer && !IsVector; }
95
96 bool isVector() const { return isValid() && IsVector; }
15
Returning zero, which participates in a condition later
97
98 /// Returns the number of elements in a vector LLT. Must only be called on
99 /// vector types.
100 uint16_t getNumElements() const {
101 assert(IsVector && "cannot get number of elements on scalar/aggregate");
102 if (!IsPointer)
103 return getFieldValue(VectorElementsFieldInfo);
104 else
105 return getFieldValue(PointerVectorElementsFieldInfo);
106 }
107
108 /// Returns the total size of the type. Must only be called on sized types.
109 unsigned getSizeInBits() const {
110 if (isPointer() || isScalar())
43
Taking false branch
111 return getScalarSizeInBits();
112 return getScalarSizeInBits() * getNumElements();
44
Returning zero
113 }
114
115 /// Returns the total size of the type in bytes, i.e. number of whole bytes
116 /// needed to represent the size in bits. Must only be called on sized types.
117 unsigned getSizeInBytes() const {
118 return (getSizeInBits() + 7) / 8;
119 }
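Example sizes that follow from the two accessors above:

//   LLT::scalar(32).getSizeInBits()     == 32
//   LLT::vector(4, 16).getSizeInBits()  == 64      // 4 elements of s16
//   LLT::vector(4, 16).getSizeInBytes() == 8
//   LLT::pointer(1, 64).getSizeInBits() == 64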
120
121 LLT getScalarType() const {
122 return isVector() ? getElementType() : *this;
123 }
124
125 /// If this type is a vector, return a vector with the same number of elements
126 /// but the new element type. Otherwise, return the new element type.
127 LLT changeElementType(LLT NewEltTy) const {
128 return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy;
129 }
130
131 /// If this type is a vector, return a vector with the same number of elements
132 /// but the new element size. Otherwise, return the new element type. Invalid
133 /// for pointer types. For pointer types, use changeElementType.
134 LLT changeElementSize(unsigned NewEltSize) const {
135 assert(!getScalarType().isPointer() &&
136        "invalid to directly change element size for pointers");
137 return isVector() ? LLT::vector(getNumElements(), NewEltSize)
138 : LLT::scalar(NewEltSize);
139 }
140
141 /// Return a vector or scalar with the same element type and the new number of
142 /// elements.
143 LLT changeNumElements(unsigned NewNumElts) const {
144 return LLT::scalarOrVector(NewNumElts, getScalarType());
145 }
146
147 /// Return a type that is \p Factor times smaller. Reduces the number of
148 /// elements if this is a vector, or the bitwidth for scalar/pointers. Does
149 /// not attempt to handle cases that aren't evenly divisible.
150 LLT divide(int Factor) const {
151 assert(Factor != 1);
152 if (isVector()) {
153 assert(getNumElements() % Factor == 0);
154 return scalarOrVector(getNumElements() / Factor, getElementType());
155 }
156
157 assert(getSizeInBits() % Factor == 0);
158 return scalar(getSizeInBits() / Factor);
159 }
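For example, per the two branches above:

//   LLT::vector(4, 32).divide(2) == LLT::vector(2, 32)   // halves the element count
//   LLT::scalar(64).divide(2)    == LLT::scalar(32)      // halves the bit width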
160
161 bool isByteSized() const { return (getSizeInBits() & 7) == 0; }
162
163 unsigned getScalarSizeInBits() const {
164 assert(RawData != 0 && "Invalid Type");
165 if (!IsVector) {
166 if (!IsPointer)
167 return getFieldValue(ScalarSizeFieldInfo);
168 else
169 return getFieldValue(PointerSizeFieldInfo);
170 } else {
171 if (!IsPointer)
172 return getFieldValue(VectorSizeFieldInfo);
173 else
174 return getFieldValue(PointerVectorSizeFieldInfo);
175 }
176 }
177
178 unsigned getAddressSpace() const {
179 assert(RawData != 0 && "Invalid Type");
180 assert(IsPointer && "cannot get address space of non-pointer type");
181 if (!IsVector)
182 return getFieldValue(PointerAddressSpaceFieldInfo);
183 else
184 return getFieldValue(PointerVectorAddressSpaceFieldInfo);
185 }
186
187 /// Returns the vector's element type. Only valid for vector types.
188 LLT getElementType() const {
189 assert(isVector() && "cannot get element type of scalar/aggregate");
190 if (IsPointer)
191 return pointer(getAddressSpace(), getScalarSizeInBits());
192 else
193 return scalar(getScalarSizeInBits());
194 }
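Examples of the element and scalar accessors above:

//   LLT::vector(4, 16).getElementType()                  == LLT::scalar(16)
//   LLT::vector(2, LLT::pointer(1, 64)).getElementType() == LLT::pointer(1, 64)
//   LLT::scalar(32).getScalarType()                      == LLT::scalar(32)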
195
196 void print(raw_ostream &OS) const;
197
198#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
199 LLVM_DUMP_METHOD void dump() const {
200 print(dbgs());
201 dbgs() << '\n';
202 }
203#endif
204
205 bool operator==(const LLT &RHS) const {
206 return IsPointer
6.1
'IsPointer' is not equal to 'RHS.IsPointer'
== RHS.IsPointer
&& IsVector == RHS.IsVector &&
3
Assuming 'IsPointer' is not equal to 'RHS.IsPointer'
4
Returning zero, which participates in a condition later
7
Returning zero, which participates in a condition later
33
Assuming 'IsPointer' is not equal to 'RHS.IsPointer'
34
Returning zero, which participates in a condition later
207 RHS.RawData == RawData;
208 }
209
210 bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
211
212 friend struct DenseMapInfo<LLT>;
213 friend class GISelInstProfileBuilder;
214
215private:
216 /// LLT is packed into 64 bits as follows:
217 /// isPointer : 1
218 /// isVector : 1
219 /// with 62 bits remaining for Kind-specific data, packed in bitfields
220 /// as described below. As there isn't a simple portable way to pack bits
221 /// into bitfields, here the different fields in the packed structure are
222 /// described in static const *Field variables. Each of these variables
223 /// is a 2-element array, with the first element describing the bitfield size
224 /// and the second element describing the bitfield offset.
225 typedef int BitFieldInfo[2];
226 ///
227 /// This is how the bitfields are packed per Kind:
228 /// * Invalid:
229 /// gets encoded as RawData == 0, as that is an invalid encoding, since for
230 /// valid encodings, SizeInBits/SizeOfElement must be larger than 0.
231 /// * Non-pointer scalar (isPointer == 0 && isVector == 0):
232 /// SizeInBits: 32;
233 static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0};
234 /// * Pointer (isPointer == 1 && isVector == 0):
235 /// SizeInBits: 16;
236 /// AddressSpace: 24;
237 static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0};
238 static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{
239 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]};
240 /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1):
241 /// NumElements: 16;
242 /// SizeOfElement: 32;
243 static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0};
244 static const constexpr BitFieldInfo VectorSizeFieldInfo{
245 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]};
246 /// * Vector-of-pointer (isPointer == 1 && isVector == 1):
247 /// NumElements: 16;
248 /// SizeOfElement: 16;
249 /// AddressSpace: 24;
250 static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0};
251 static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{
252 16,
253 PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]};
254 static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{
255 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]};
256
257 uint64_t IsPointer : 1;
258 uint64_t IsVector : 1;
259 uint64_t RawData : 62;
260
261 static uint64_t getMask(const BitFieldInfo FieldInfo) {
262 const int FieldSizeInBits = FieldInfo[0];
263 return (((uint64_t)1) << FieldSizeInBits) - 1;
264 }
265 static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) {
266 assert(Val <= Mask && "Value too large for field");
267 return (Val & Mask) << Shift;
268 }
269 static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) {
270 return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]);
271 }
272 uint64_t getFieldValue(const BitFieldInfo FieldInfo) const {
273 return getMask(FieldInfo) & (RawData >> FieldInfo[1]);
274 }
275
276 void init(bool IsPointer, bool IsVector, uint16_t NumElements,
277 unsigned SizeInBits, unsigned AddressSpace) {
278 this->IsPointer = IsPointer;
279 this->IsVector = IsVector;
280 if (!IsVector) {
281 if (!IsPointer)
282 RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo);
283 else
284 RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) |
285 maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo);
286 } else {
287 assert(NumElements > 1 && "invalid number of vector elements");
288 if (!IsPointer)
289 RawData = maskAndShift(NumElements, VectorElementsFieldInfo) |
290 maskAndShift(SizeInBits, VectorSizeFieldInfo);
291 else
292 RawData =
293 maskAndShift(NumElements, PointerVectorElementsFieldInfo) |
294 maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) |
295 maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo);
296 }
297 }
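A worked packing example using the field layouts above, for the vector-of-non-pointer case <4 x s32>:

//   init(/*IsPointer=*/false, /*IsVector=*/true, /*NumElements=*/4,
//        /*SizeInBits=*/32, /*AddressSpace=*/0)
//   RawData = maskAndShift(4, VectorElementsFieldInfo)   // 4 in bits [0, 16)
//           | maskAndShift(32, VectorSizeFieldInfo)      // 32 in bits [16, 48)
//           = 0x200004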
298
299 uint64_t getUniqueRAWLLTData() const {
300 return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 |
301 ((uint64_t)IsVector);
302 }
303};
304
305inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
306 Ty.print(OS);
307 return OS;
308}
309
310template<> struct DenseMapInfo<LLT> {
311 static inline LLT getEmptyKey() {
312 LLT Invalid;
313 Invalid.IsPointer = true;
314 return Invalid;
315 }
316 static inline LLT getTombstoneKey() {
317 LLT Invalid;
318 Invalid.IsVector = true;
319 return Invalid;
320 }
321 static inline unsigned getHashValue(const LLT &Ty) {
322 uint64_t Val = Ty.getUniqueRAWLLTData();
323 return DenseMapInfo<uint64_t>::getHashValue(Val);
324 }
325 static bool isEqual(const LLT &LHS, const LLT &RHS) {
326 return LHS == RHS;
327 }
328};
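Because of this DenseMapInfo specialization, LLT can be used directly as a DenseMap key; a minimal sketch:

//   llvm::DenseMap<llvm::LLT, unsigned> NumRegs;
//   NumRegs[llvm::LLT::scalar(32)] = 1;                 // hashed via getUniqueRAWLLTData()
//   assert(NumRegs.count(llvm::LLT::scalar(32)) == 1);  // keys compared with operator==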
329
330}
331
332#endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H