Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2704, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
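
In C++, shifting a value by an amount greater than or equal to its bit width is undefined behavior, and the analyzer is reporting a path on which a 32-bit unsigned int is right-shifted by exactly 32 at line 2704 (that line falls outside the excerpt reproduced below). The following sketch is illustrative only (the function names are hypothetical, and it is not the code the analyzer flagged); it shows the hazard and two common guards: checking the shift amount explicitly, or widening the operand so the count stays below the operand width.

  #include <cstdint>

  // Hypothetical helper: the shift count may legitimately reach 32.
  uint32_t shiftDownUnsafe(uint32_t Value, unsigned Amount) {
    return Value >> Amount; // undefined behavior when Amount >= 32
  }

  // Guard 1: handle the boundary case explicitly.
  uint32_t shiftDownChecked(uint32_t Value, unsigned Amount) {
    return Amount >= 32 ? 0u : (Value >> Amount);
  }

  // Guard 2: widen to 64 bits first, so a count of 32 is still smaller than
  // the operand width (valid for counts up to 63).
  uint32_t shiftDownWidened(uint32_t Value, unsigned Amount) {
    return static_cast<uint32_t>(static_cast<uint64_t>(Value) >> Amount);
  }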

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-09-17-195756-12974-1 -x c++ /build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "SIMachineFunctionInfo.h"
20#include "llvm/ADT/ScopeExit.h"
21#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24#include "llvm/CodeGen/TargetOpcodes.h"
25#include "llvm/CodeGen/ValueTypes.h"
26#include "llvm/IR/DerivedTypes.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Debug.h"
30
31#define DEBUG_TYPE "amdgpu-legalinfo"
32
33using namespace llvm;
34using namespace LegalizeActions;
35using namespace LegalizeMutations;
36using namespace LegalityPredicates;
37using namespace MIPatternMatch;
38
39// Hack until load/store selection patterns support any tuple of legal types.
40static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use "
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47static constexpr unsigned MaxRegisterSize = 1024;
48
49// Round the number of elements to the next power of two elements
50static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeNumElements(Pow2NElts);
54}
55
56// Round the number of bits to the next power of two bits
57static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61}
62
63/// \returns true if this is an odd sized vector which should be widened by adding an
64/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65/// excludes s1 vectors, which should always be scalarized.
66static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78}
79
80static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85}
86
87static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93}
94
95static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100 };
101}
102
103static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 unsigned Size = Ty.getSizeInBits();
108 unsigned Pieces = (Size + 63) / 64;
109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111 };
112}
113
114// Increase the number of vector elements to reach the next multiple of 32-bit
115// type.
116static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117 return [=](const LegalityQuery &Query) {
118 const LLT Ty = Query.Types[TypeIdx];
119
120 const LLT EltTy = Ty.getElementType();
121 const int Size = Ty.getSizeInBits();
122 const int EltSize = EltTy.getSizeInBits();
123 const int NextMul32 = (Size + 31) / 32;
124
125 assert(EltSize < 32);
126
127 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129 };
130}
131
132static LLT getBitcastRegisterType(const LLT Ty) {
133 const unsigned Size = Ty.getSizeInBits();
134
135 LLT CoercedTy;
136 if (Size <= 32) {
137 // <2 x s8> -> s16
138 // <4 x s8> -> s32
139 return LLT::scalar(Size);
140 }
141
142 return LLT::scalarOrVector(Size / 32, 32);
143}
144
145static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146 return [=](const LegalityQuery &Query) {
147 const LLT Ty = Query.Types[TypeIdx];
148 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149 };
150}
151
152static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 unsigned Size = Ty.getSizeInBits();
156 assert(Size % 32 == 0);
157 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158 };
159}
160
161static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162 return [=](const LegalityQuery &Query) {
163 const LLT QueryTy = Query.Types[TypeIdx];
164 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165 };
166}
167
168static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169 return [=](const LegalityQuery &Query) {
170 const LLT QueryTy = Query.Types[TypeIdx];
171 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172 };
173}
174
175static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176 return [=](const LegalityQuery &Query) {
177 const LLT QueryTy = Query.Types[TypeIdx];
178 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179 };
180}
181
182static bool isRegisterSize(unsigned Size) {
183 return Size % 32 == 0 && Size <= MaxRegisterSize;
184}
185
186static bool isRegisterVectorElementType(LLT EltTy) {
187 const int EltSize = EltTy.getSizeInBits();
188 return EltSize == 16 || EltSize % 32 == 0;
189}
190
191static bool isRegisterVectorType(LLT Ty) {
192 const int EltSize = Ty.getElementType().getSizeInBits();
193 return EltSize == 32 || EltSize == 64 ||
194 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195 EltSize == 128 || EltSize == 256;
196}
197
198static bool isRegisterType(LLT Ty) {
199 if (!isRegisterSize(Ty.getSizeInBits()))
200 return false;
201
202 if (Ty.isVector())
203 return isRegisterVectorType(Ty);
204
205 return true;
206}
207
208// Any combination of 32 or 64-bit elements up to the maximum register size, and
209// multiples of v2s16.
210static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211 return [=](const LegalityQuery &Query) {
212 return isRegisterType(Query.Types[TypeIdx]);
213 };
214}
215
216static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 if (!QueryTy.isVector())
220 return false;
221 const LLT EltTy = QueryTy.getElementType();
222 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223 };
224}
225
226static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227 return [=](const LegalityQuery &Query) {
228 const LLT Ty = Query.Types[TypeIdx];
229 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231 };
232}
233
234// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235// handle some operations by just promoting the register during
236// selection. There are also d16 loads on GFX9+ which preserve the high bits.
237static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238 bool IsLoad) {
239 switch (AS) {
240 case AMDGPUAS::PRIVATE_ADDRESS:
241 // FIXME: Private element size.
242 return 32;
243 case AMDGPUAS::LOCAL_ADDRESS:
244 return ST.useDS128() ? 128 : 64;
245 case AMDGPUAS::GLOBAL_ADDRESS:
246 case AMDGPUAS::CONSTANT_ADDRESS:
247 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
248 // Treat constant and global as identical. SMRD loads are sometimes usable for
249 // global loads (ideally constant address space should be eliminated)
250 // depending on the context. Legality cannot be context dependent, but
251 // RegBankSelect can split the load as necessary depending on the pointer
252 // register bank/uniformity and if the memory is invariant or not written in a
253 // kernel.
254 return IsLoad ? 512 : 128;
255 default:
256 // Flat addresses may contextually need to be split to 32-bit parts if they
257 // may alias scratch depending on the subtarget.
258 return 128;
259 }
260}
261
262static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263 const LegalityQuery &Query,
264 unsigned Opcode) {
265 const LLT Ty = Query.Types[0];
266
267 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268 const bool IsLoad = Opcode != AMDGPU::G_STORE;
269
270 unsigned RegSize = Ty.getSizeInBits();
271 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273 unsigned AS = Query.Types[1].getAddressSpace();
274
275 // All of these need to be custom lowered to cast the pointer operand.
276 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277 return false;
278
279 // TODO: We should be able to widen loads if the alignment is high enough, but
280 // we also need to modify the memory access size.
281#if 0
282 // Accept widening loads based on alignment.
283 if (IsLoad && MemSize < Size)
284 MemSize = std::max(MemSize, Align);
285#endif
286
287 // Only 1-byte and 2-byte to 32-bit extloads are valid.
288 if (MemSize != RegSize && RegSize != 32)
289 return false;
290
291 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292 return false;
293
294 switch (MemSize) {
295 case 8:
296 case 16:
297 case 32:
298 case 64:
299 case 128:
300 break;
301 case 96:
302 if (!ST.hasDwordx3LoadStores())
303 return false;
304 break;
305 case 256:
306 case 512:
307 // These may contextually need to be broken down.
308 break;
309 default:
310 return false;
311 }
312
313 assert(RegSize >= MemSize);
314
315 if (AlignBits < MemSize) {
316 const SITargetLowering *TLI = ST.getTargetLowering();
317 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318 Align(AlignBits / 8)))
319 return false;
320 }
321
322 return true;
323}
324
325// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
326// work around this. Eventually it should ignore the type for loads and only care
327// about the size. Return true in cases where we will workaround this for now by
328// bitcasting.
329static bool loadStoreBitcastWorkaround(const LLT Ty) {
330 if (EnableNewLegality)
331 return false;
332
333 const unsigned Size = Ty.getSizeInBits();
334 if (Size <= 64)
335 return false;
336 if (!Ty.isVector())
337 return true;
338
339 LLT EltTy = Ty.getElementType();
340 if (EltTy.isPointer())
341 return true;
342
343 unsigned EltSize = EltTy.getSizeInBits();
344 return EltSize != 32 && EltSize != 64;
345}
346
347static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
348 unsigned Opcode) {
349 const LLT Ty = Query.Types[0];
350 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
351 !loadStoreBitcastWorkaround(Ty);
352}
353
354/// Return true if a load or store of the type should be lowered with a bitcast
355/// to a different type.
356static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
357 const unsigned MemSizeInBits) {
358 const unsigned Size = Ty.getSizeInBits();
359 if (Size != MemSizeInBits)
360 return Size <= 32 && Ty.isVector();
361
362 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
363 return true;
364 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
365 !isRegisterVectorElementType(Ty.getElementType());
366}
367
368/// Return true if we should legalize a load by widening an odd sized memory
369/// access up to the alignment. Note this is the case where the memory access itself
370/// changes, not the size of the result register.
371static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
372 unsigned AlignInBits, unsigned AddrSpace,
373 unsigned Opcode) {
374 // We don't want to widen cases that are naturally legal.
375 if (isPowerOf2_32(SizeInBits))
376 return false;
377
378 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
379 // end up widening these for a scalar load during RegBankSelect, since there
380 // aren't 96-bit scalar loads.
381 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
382 return false;
383
384 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
385 return false;
386
387 // A load is known dereferenceable up to the alignment, so it's legal to widen
388 // to it.
389 //
390 // TODO: Could check dereferenceable for less aligned cases.
391 unsigned RoundedSize = NextPowerOf2(SizeInBits);
392 if (AlignInBits < RoundedSize)
393 return false;
394
395 // Do not widen if it would introduce a slow unaligned load.
396 const SITargetLowering *TLI = ST.getTargetLowering();
397 bool Fast = false;
398 return TLI->allowsMisalignedMemoryAccessesImpl(
399 RoundedSize, AddrSpace, Align(AlignInBits / 8),
400 MachineMemOperand::MOLoad, &Fast) &&
401 Fast;
402}
403
404static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
405 unsigned Opcode) {
406 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
407 return false;
408
409 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
410 Query.MMODescrs[0].AlignInBits,
411 Query.Types[1].getAddressSpace(), Opcode);
412}
413
414AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
415 const GCNTargetMachine &TM)
416 : ST(ST_) {
417 using namespace TargetOpcode;
418
419 auto GetAddrSpacePtr = [&TM](unsigned AS) {
420 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
421 };
422
423 const LLT S1 = LLT::scalar(1);
424 const LLT S16 = LLT::scalar(16);
425 const LLT S32 = LLT::scalar(32);
426 const LLT S64 = LLT::scalar(64);
427 const LLT S128 = LLT::scalar(128);
428 const LLT S256 = LLT::scalar(256);
429 const LLT S512 = LLT::scalar(512);
430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431
432 const LLT V2S16 = LLT::vector(2, 16);
433 const LLT V4S16 = LLT::vector(4, 16);
434
435 const LLT V2S32 = LLT::vector(2, 32);
436 const LLT V3S32 = LLT::vector(3, 32);
437 const LLT V4S32 = LLT::vector(4, 32);
438 const LLT V5S32 = LLT::vector(5, 32);
439 const LLT V6S32 = LLT::vector(6, 32);
440 const LLT V7S32 = LLT::vector(7, 32);
441 const LLT V8S32 = LLT::vector(8, 32);
442 const LLT V9S32 = LLT::vector(9, 32);
443 const LLT V10S32 = LLT::vector(10, 32);
444 const LLT V11S32 = LLT::vector(11, 32);
445 const LLT V12S32 = LLT::vector(12, 32);
446 const LLT V13S32 = LLT::vector(13, 32);
447 const LLT V14S32 = LLT::vector(14, 32);
448 const LLT V15S32 = LLT::vector(15, 32);
449 const LLT V16S32 = LLT::vector(16, 32);
450 const LLT V32S32 = LLT::vector(32, 32);
451
452 const LLT V2S64 = LLT::vector(2, 64);
453 const LLT V3S64 = LLT::vector(3, 64);
454 const LLT V4S64 = LLT::vector(4, 64);
455 const LLT V5S64 = LLT::vector(5, 64);
456 const LLT V6S64 = LLT::vector(6, 64);
457 const LLT V7S64 = LLT::vector(7, 64);
458 const LLT V8S64 = LLT::vector(8, 64);
459 const LLT V16S64 = LLT::vector(16, 64);
460
461 std::initializer_list<LLT> AllS32Vectors =
462 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
463 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
464 std::initializer_list<LLT> AllS64Vectors =
465 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
466
467 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
468 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
469 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
470 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
471 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
472 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
473 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
474
475 const LLT CodePtr = FlatPtr;
476
477 const std::initializer_list<LLT> AddrSpaces64 = {
478 GlobalPtr, ConstantPtr, FlatPtr
479 };
480
481 const std::initializer_list<LLT> AddrSpaces32 = {
482 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
483 };
484
485 const std::initializer_list<LLT> FPTypesBase = {
486 S32, S64
487 };
488
489 const std::initializer_list<LLT> FPTypes16 = {
490 S32, S64, S16
491 };
492
493 const std::initializer_list<LLT> FPTypesPK16 = {
494 S32, S64, S16, V2S16
495 };
496
497 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
498
499 setAction({G_BRCOND, S1}, Legal); // VCC branches
500 setAction({G_BRCOND, S32}, Legal); // SCC branches
501
502 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
503 // elements for v3s16
504 getActionDefinitionsBuilder(G_PHI)
505 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
506 .legalFor(AllS32Vectors)
507 .legalFor(AllS64Vectors)
508 .legalFor(AddrSpaces64)
509 .legalFor(AddrSpaces32)
510 .legalIf(isPointer(0))
511 .clampScalar(0, S16, S256)
512 .widenScalarToNextPow2(0, 32)
513 .clampMaxNumElements(0, S32, 16)
514 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
515 .scalarize(0);
516
517 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
518 // Full set of gfx9 features.
519 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
520 .legalFor({S32, S16, V2S16})
521 .clampScalar(0, S16, S32)
522 .clampMaxNumElements(0, S16, 2)
523 .scalarize(0)
524 .widenScalarToNextPow2(0, 32);
525
526 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
527 .legalFor({S32, S16, V2S16}) // Clamp modifier
528 .minScalarOrElt(0, S16)
529 .clampMaxNumElements(0, S16, 2)
530 .scalarize(0)
531 .widenScalarToNextPow2(0, 32)
532 .lower();
533 } else if (ST.has16BitInsts()) {
534 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
535 .legalFor({S32, S16})
536 .clampScalar(0, S16, S32)
537 .scalarize(0)
538 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
539
540 // Technically the saturating operations require clamp bit support, but this
541 // was introduced at the same time as 16-bit operations.
542 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
543 .legalFor({S32, S16}) // Clamp modifier
544 .minScalar(0, S16)
545 .scalarize(0)
546 .widenScalarToNextPow2(0, 16)
547 .lower();
548
549 // We're just lowering this, but it helps get a better result to try to
550 // coerce to the desired type first.
551 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
552 .minScalar(0, S16)
553 .scalarize(0)
554 .lower();
555 } else {
556 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
557 .legalFor({S32})
558 .clampScalar(0, S32, S32)
559 .scalarize(0);
560
561 if (ST.hasIntClamp()) {
562 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
563 .legalFor({S32}) // Clamp modifier.
564 .scalarize(0)
565 .minScalarOrElt(0, S32)
566 .lower();
567 } else {
568 // Clamp bit support was added in VI, along with 16-bit operations.
569 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
570 .minScalar(0, S32)
571 .scalarize(0)
572 .lower();
573 }
574
575 // FIXME: DAG expansion gets better results. The widening uses the smaller
576 // range values and goes for the min/max lowering directly.
577 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
578 .minScalar(0, S32)
579 .scalarize(0)
580 .lower();
581 }
582
583 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
584 .customFor({S32, S64})
585 .clampScalar(0, S32, S64)
586 .widenScalarToNextPow2(0, 32)
587 .scalarize(0);
588
589 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
590 .legalFor({S32})
591 .clampScalar(0, S32, S32)
592 .scalarize(0);
593
594 // Report legal for any types we can handle anywhere. For the cases only legal
595 // on the SALU, RegBankSelect will be able to re-legalize.
596 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
597 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
598 .clampScalar(0, S32, S64)
599 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
600 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
601 .widenScalarToNextPow2(0)
602 .scalarize(0);
603
604 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
605 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
606 .legalFor({{S32, S1}, {S32, S32}})
607 .minScalar(0, S32)
608 // TODO: .scalarize(0)
609 .lower();
610
611 getActionDefinitionsBuilder(G_BITCAST)
612 // Don't worry about the size constraint.
613 .legalIf(all(isRegisterType(0), isRegisterType(1)))
614 .lower();
615
616
617 getActionDefinitionsBuilder(G_CONSTANT)
618 .legalFor({S1, S32, S64, S16, GlobalPtr,
619 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
620 .legalIf(isPointer(0))
621 .clampScalar(0, S32, S64)
622 .widenScalarToNextPow2(0);
623
624 getActionDefinitionsBuilder(G_FCONSTANT)
625 .legalFor({S32, S64, S16})
626 .clampScalar(0, S16, S64);
627
628 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
629 .legalIf(isRegisterType(0))
630 // s1 and s16 are special cases because they have legal operations on
631 // them, but don't really occupy registers in the normal way.
632 .legalFor({S1, S16})
633 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
634 .clampScalarOrElt(0, S32, MaxScalar)
635 .widenScalarToNextPow2(0, 32)
636 .clampMaxNumElements(0, S32, 16);
637
638 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
639
640 // If the amount is divergent, we have to do a wave reduction to get the
641 // maximum value, so this is expanded during RegBankSelect.
642 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
643 .legalFor({{PrivatePtr, S32}});
644
645 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
646 .customIf(typeIsNot(0, PrivatePtr));
647
648 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
649
650 auto &FPOpActions = getActionDefinitionsBuilder(
651 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
652 .legalFor({S32, S64});
653 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
654 .customFor({S32, S64});
655 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
656 .customFor({S32, S64});
657
658 if (ST.has16BitInsts()) {
659 if (ST.hasVOP3PInsts())
660 FPOpActions.legalFor({S16, V2S16});
661 else
662 FPOpActions.legalFor({S16});
663
664 TrigActions.customFor({S16});
665 FDIVActions.customFor({S16});
666 }
667
668 auto &MinNumMaxNum = getActionDefinitionsBuilder({
669 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
670
671 if (ST.hasVOP3PInsts()) {
672 MinNumMaxNum.customFor(FPTypesPK16)
673 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
674 .clampMaxNumElements(0, S16, 2)
675 .clampScalar(0, S16, S64)
676 .scalarize(0);
677 } else if (ST.has16BitInsts()) {
678 MinNumMaxNum.customFor(FPTypes16)
679 .clampScalar(0, S16, S64)
680 .scalarize(0);
681 } else {
682 MinNumMaxNum.customFor(FPTypesBase)
683 .clampScalar(0, S32, S64)
684 .scalarize(0);
685 }
686
687 if (ST.hasVOP3PInsts())
688 FPOpActions.clampMaxNumElements(0, S16, 2);
689
690 FPOpActions
691 .scalarize(0)
692 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
693
694 TrigActions
695 .scalarize(0)
696 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
697
698 FDIVActions
699 .scalarize(0)
700 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701
702 getActionDefinitionsBuilder({G_FNEG, G_FABS})
703 .legalFor(FPTypesPK16)
704 .clampMaxNumElements(0, S16, 2)
705 .scalarize(0)
706 .clampScalar(0, S16, S64);
707
708 if (ST.has16BitInsts()) {
709 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
710 .legalFor({S32, S64, S16})
711 .scalarize(0)
712 .clampScalar(0, S16, S64);
713 } else {
714 getActionDefinitionsBuilder(G_FSQRT)
715 .legalFor({S32, S64})
716 .scalarize(0)
717 .clampScalar(0, S32, S64);
718
719 if (ST.hasFractBug()) {
720 getActionDefinitionsBuilder(G_FFLOOR)
721 .customFor({S64})
722 .legalFor({S32, S64})
723 .scalarize(0)
724 .clampScalar(0, S32, S64);
725 } else {
726 getActionDefinitionsBuilder(G_FFLOOR)
727 .legalFor({S32, S64})
728 .scalarize(0)
729 .clampScalar(0, S32, S64);
730 }
731 }
732
733 getActionDefinitionsBuilder(G_FPTRUNC)
734 .legalFor({{S32, S64}, {S16, S32}})
735 .scalarize(0)
736 .lower();
737
738 getActionDefinitionsBuilder(G_FPEXT)
739 .legalFor({{S64, S32}, {S32, S16}})
740 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
741 .scalarize(0);
742
743 getActionDefinitionsBuilder(G_FSUB)
744 // Use actual fsub instruction
745 .legalFor({S32})
746 // Must use fadd + fneg
747 .lowerFor({S64, S16, V2S16})
748 .scalarize(0)
749 .clampScalar(0, S32, S64);
750
751 // Whether this is legal depends on the floating point mode for the function.
752 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
753 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
754 FMad.customFor({S32, S16});
755 else if (ST.hasMadMacF32Insts())
756 FMad.customFor({S32});
757 else if (ST.hasMadF16())
758 FMad.customFor({S16});
759 FMad.scalarize(0)
760 .lower();
761
762 auto &FRem = getActionDefinitionsBuilder(G_FREM);
763 if (ST.has16BitInsts()) {
764 FRem.customFor({S16, S32, S64});
765 } else {
766 FRem.minScalar(0, S32)
767 .customFor({S32, S64});
768 }
769 FRem.scalarize(0);
770
771 // TODO: Do we need to clamp maximum bitwidth?
772 getActionDefinitionsBuilder(G_TRUNC)
773 .legalIf(isScalar(0))
774 .legalFor({{V2S16, V2S32}})
775 .clampMaxNumElements(0, S16, 2)
776 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
777 // situations (like an invalid implicit use), we don't want to infinite loop
778 // in the legalizer.
779 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
780 .alwaysLegal();
781
782 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
783 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
784 {S32, S1}, {S64, S1}, {S16, S1}})
785 .scalarize(0)
786 .clampScalar(0, S32, S64)
787 .widenScalarToNextPow2(1, 32);
788
789 // TODO: Split s1->s64 during regbankselect for VALU.
790 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
791 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
792 .lowerFor({{S32, S64}})
793 .lowerIf(typeIs(1, S1))
794 .customFor({{S64, S64}});
795 if (ST.has16BitInsts())
796 IToFP.legalFor({{S16, S16}});
797 IToFP.clampScalar(1, S32, S64)
798 .minScalar(0, S32)
799 .scalarize(0)
800 .widenScalarToNextPow2(1);
801
802 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
803 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
804 .customFor({{S64, S64}})
805 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
806 if (ST.has16BitInsts())
807 FPToI.legalFor({{S16, S16}});
808 else
809 FPToI.minScalar(1, S32);
810
811 FPToI.minScalar(0, S32)
812 .scalarize(0)
813 .lower();
814
815 // Lower roundeven into G_FRINT
816 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
817 .scalarize(0)
818 .lower();
819
820 if (ST.has16BitInsts()) {
821 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
822 .legalFor({S16, S32, S64})
823 .clampScalar(0, S16, S64)
824 .scalarize(0);
825 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
826 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
827 .legalFor({S32, S64})
828 .clampScalar(0, S32, S64)
829 .scalarize(0);
830 } else {
831 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832 .legalFor({S32})
833 .customFor({S64})
834 .clampScalar(0, S32, S64)
835 .scalarize(0);
836 }
837
838 getActionDefinitionsBuilder(G_PTR_ADD)
839 .legalIf(all(isPointer(0), sameSize(0, 1)))
840 .scalarize(0)
841 .scalarSameSizeAs(1, 0);
842
843 getActionDefinitionsBuilder(G_PTRMASK)
844 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
845 .scalarSameSizeAs(1, 0)
846 .scalarize(0);
847
848 auto &CmpBuilder =
849 getActionDefinitionsBuilder(G_ICMP)
850 // The compare output type differs based on the register bank of the output,
851 // so make both s1 and s32 legal.
852 //
853 // Scalar compares producing output in scc will be promoted to s32, as that
854 // is the allocatable register type that will be needed for the copy from
855 // scc. This will be promoted during RegBankSelect, and we assume something
856 // before that won't try to use s32 result types.
857 //
858 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
859 // bank.
860 .legalForCartesianProduct(
861 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
862 .legalForCartesianProduct(
863 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
864 if (ST.has16BitInsts()) {
865 CmpBuilder.legalFor({{S1, S16}});
866 }
867
868 CmpBuilder
869 .widenScalarToNextPow2(1)
870 .clampScalar(1, S32, S64)
871 .scalarize(0)
872 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
873
874 getActionDefinitionsBuilder(G_FCMP)
875 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
876 .widenScalarToNextPow2(1)
877 .clampScalar(1, S32, S64)
878 .scalarize(0);
879
880 // FIXME: fpow has a selection pattern that should move to custom lowering.
881 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
882 if (ST.has16BitInsts())
883 Exp2Ops.legalFor({S32, S16});
884 else
885 Exp2Ops.legalFor({S32});
886 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
887 Exp2Ops.scalarize(0);
888
889 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
890 if (ST.has16BitInsts())
891 ExpOps.customFor({{S32}, {S16}});
892 else
893 ExpOps.customFor({S32});
894 ExpOps.clampScalar(0, MinScalarFPTy, S32)
895 .scalarize(0);
896
897 getActionDefinitionsBuilder(G_FPOWI)
898 .clampScalar(0, MinScalarFPTy, S32)
899 .lower();
900
901 // The 64-bit versions produce 32-bit results, but only on the SALU.
902 getActionDefinitionsBuilder(G_CTPOP)
903 .legalFor({{S32, S32}, {S32, S64}})
904 .clampScalar(0, S32, S32)
905 .clampScalar(1, S32, S64)
906 .scalarize(0)
907 .widenScalarToNextPow2(0, 32)
908 .widenScalarToNextPow2(1, 32);
909
910 // The hardware instructions return a different result on 0 than the generic
911 // instructions expect. The hardware produces -1, but these produce the
912 // bitwidth.
913 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
914 .scalarize(0)
915 .clampScalar(0, S32, S32)
916 .clampScalar(1, S32, S64)
917 .widenScalarToNextPow2(0, 32)
918 .widenScalarToNextPow2(1, 32)
919 .lower();
920
921 // The 64-bit versions produce 32-bit results, but only on the SALU.
922 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
923 .legalFor({{S32, S32}, {S32, S64}})
924 .clampScalar(0, S32, S32)
925 .clampScalar(1, S32, S64)
926 .scalarize(0)
927 .widenScalarToNextPow2(0, 32)
928 .widenScalarToNextPow2(1, 32);
929
930 getActionDefinitionsBuilder(G_BITREVERSE)
931 .legalFor({S32})
932 .clampScalar(0, S32, S32)
933 .scalarize(0);
934
935 if (ST.has16BitInsts()) {
936 getActionDefinitionsBuilder(G_BSWAP)
937 .legalFor({S16, S32, V2S16})
938 .clampMaxNumElements(0, S16, 2)
939 // FIXME: Fixing non-power-of-2 before clamp is workaround for
940 // narrowScalar limitation.
941 .widenScalarToNextPow2(0)
942 .clampScalar(0, S16, S32)
943 .scalarize(0);
944
945 if (ST.hasVOP3PInsts()) {
946 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
947 .legalFor({S32, S16, V2S16})
948 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
949 .clampMaxNumElements(0, S16, 2)
950 .minScalar(0, S16)
951 .widenScalarToNextPow2(0)
952 .scalarize(0)
953 .lower();
954 } else {
955 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
956 .legalFor({S32, S16})
957 .widenScalarToNextPow2(0)
958 .minScalar(0, S16)
959 .scalarize(0)
960 .lower();
961 }
962 } else {
963 // TODO: Should have same legality without v_perm_b32
964 getActionDefinitionsBuilder(G_BSWAP)
965 .legalFor({S32})
966 .lowerIf(scalarNarrowerThan(0, 32))
967 // FIXME: Fixing non-power-of-2 before clamp is workaround for
968 // narrowScalar limitation.
969 .widenScalarToNextPow2(0)
970 .maxScalar(0, S32)
971 .scalarize(0)
972 .lower();
973
974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
975 .legalFor({S32})
976 .minScalar(0, S32)
977 .widenScalarToNextPow2(0)
978 .scalarize(0)
979 .lower();
980 }
981
982 getActionDefinitionsBuilder(G_INTTOPTR)
983 // List the common cases
984 .legalForCartesianProduct(AddrSpaces64, {S64})
985 .legalForCartesianProduct(AddrSpaces32, {S32})
986 .scalarize(0)
987 // Accept any address space as long as the size matches
988 .legalIf(sameSize(0, 1))
989 .widenScalarIf(smallerThan(1, 0),
990 [](const LegalityQuery &Query) {
991 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
992 })
993 .narrowScalarIf(largerThan(1, 0),
994 [](const LegalityQuery &Query) {
995 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
996 });
997
998 getActionDefinitionsBuilder(G_PTRTOINT)
999 // List the common cases
1000 .legalForCartesianProduct(AddrSpaces64, {S64})
1001 .legalForCartesianProduct(AddrSpaces32, {S32})
1002 .scalarize(0)
1003 // Accept any address space as long as the size matches
1004 .legalIf(sameSize(0, 1))
1005 .widenScalarIf(smallerThan(0, 1),
1006 [](const LegalityQuery &Query) {
1007 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1008 })
1009 .narrowScalarIf(
1010 largerThan(0, 1),
1011 [](const LegalityQuery &Query) {
1012 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1013 });
1014
1015 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1016 .scalarize(0)
1017 .custom();
1018
1019 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1020 bool IsLoad) -> bool {
1021 const LLT DstTy = Query.Types[0];
1022
1023 // Split vector extloads.
1024 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1025 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1026
1027 if (MemSize < DstTy.getSizeInBits())
1028 MemSize = std::max(MemSize, AlignBits);
1029
1030 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1031 return true;
1032
1033 const LLT PtrTy = Query.Types[1];
1034 unsigned AS = PtrTy.getAddressSpace();
1035 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1036 return true;
1037
1038 // Catch weird sized loads that don't evenly divide into the access sizes
1039 // TODO: May be able to widen depending on alignment etc.
1040 unsigned NumRegs = (MemSize + 31) / 32;
1041 if (NumRegs == 3) {
1042 if (!ST.hasDwordx3LoadStores())
1043 return true;
1044 } else {
1045 // If the alignment allows, these should have been widened.
1046 if (!isPowerOf2_32(NumRegs))
1047 return true;
1048 }
1049
1050 if (AlignBits < MemSize) {
1051 const SITargetLowering *TLI = ST.getTargetLowering();
1052 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1053 Align(AlignBits / 8));
1054 }
1055
1056 return false;
1057 };
1058
1059 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1060 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1061 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1062
1063 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1064 // LDS
1065 // TODO: Unsupported flat for SI.
1066
1067 for (unsigned Op : {G_LOAD, G_STORE}) {
1068 const bool IsStore = Op == G_STORE;
1069
1070 auto &Actions = getActionDefinitionsBuilder(Op);
1071 // Explicitly list some common cases.
1072 // TODO: Does this help compile time at all?
1073 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1074 {V2S32, GlobalPtr, 64, GlobalAlign32},
1075 {V4S32, GlobalPtr, 128, GlobalAlign32},
1076 {S64, GlobalPtr, 64, GlobalAlign32},
1077 {V2S64, GlobalPtr, 128, GlobalAlign32},
1078 {V2S16, GlobalPtr, 32, GlobalAlign32},
1079 {S32, GlobalPtr, 8, GlobalAlign8},
1080 {S32, GlobalPtr, 16, GlobalAlign16},
1081
1082 {S32, LocalPtr, 32, 32},
1083 {S64, LocalPtr, 64, 32},
1084 {V2S32, LocalPtr, 64, 32},
1085 {S32, LocalPtr, 8, 8},
1086 {S32, LocalPtr, 16, 16},
1087 {V2S16, LocalPtr, 32, 32},
1088
1089 {S32, PrivatePtr, 32, 32},
1090 {S32, PrivatePtr, 8, 8},
1091 {S32, PrivatePtr, 16, 16},
1092 {V2S16, PrivatePtr, 32, 32},
1093
1094 {S32, ConstantPtr, 32, GlobalAlign32},
1095 {V2S32, ConstantPtr, 64, GlobalAlign32},
1096 {V4S32, ConstantPtr, 128, GlobalAlign32},
1097 {S64, ConstantPtr, 64, GlobalAlign32},
1098 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1099 Actions.legalIf(
1100 [=](const LegalityQuery &Query) -> bool {
1101 return isLoadStoreLegal(ST, Query, Op);
1102 });
1103
1104 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1105 // 64-bits.
1106 //
1107 // TODO: Should generalize bitcast action into coerce, which will also cover
1108 // inserting addrspacecasts.
1109 Actions.customIf(typeIs(1, Constant32Ptr));
1110
1111 // Turn any illegal element vectors into something easier to deal
1112 // with. These will ultimately produce 32-bit scalar shifts to extract the
1113 // parts anyway.
1114 //
1115 // For odd 16-bit element vectors, prefer to split those into pieces with
1116 // 16-bit vector parts.
1117 Actions.bitcastIf(
1118 [=](const LegalityQuery &Query) -> bool {
1119 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1120 Query.MMODescrs[0].SizeInBits);
1121 }, bitcastToRegisterType(0));
1122
1123 if (!IsStore) {
1124 // Widen suitably aligned loads by loading extra bytes. The standard
1125 // legalization actions can't properly express widening memory operands.
1126 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1127 return shouldWidenLoad(ST, Query, G_LOAD);
1128 });
1129 }
1130
1131 // FIXME: load/store narrowing should be moved to lower action
1132 Actions
1133 .narrowScalarIf(
1134 [=](const LegalityQuery &Query) -> bool {
1135 return !Query.Types[0].isVector() &&
1136 needToSplitMemOp(Query, Op == G_LOAD);
1137 },
1138 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1139 const LLT DstTy = Query.Types[0];
1140 const LLT PtrTy = Query.Types[1];
1141
1142 const unsigned DstSize = DstTy.getSizeInBits();
1143 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1144
1145 // Split extloads.
1146 if (DstSize > MemSize)
1147 return std::make_pair(0, LLT::scalar(MemSize));
1148
1149 if (!isPowerOf2_32(DstSize)) {
1150 // We're probably decomposing an odd sized store. Try to split
1151 // to the widest type. TODO: Account for alignment. As-is it
1152 // should be OK, since the new parts will be further legalized.
1153 unsigned FloorSize = PowerOf2Floor(DstSize);
1154 return std::make_pair(0, LLT::scalar(FloorSize));
1155 }
1156
1157 if (DstSize > 32 && (DstSize % 32 != 0)) {
1158 // FIXME: Need a way to specify non-extload of larger size if
1159 // suitably aligned.
1160 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1161 }
1162
1163 unsigned MaxSize = maxSizeForAddrSpace(ST,
1164 PtrTy.getAddressSpace(),
1165 Op == G_LOAD);
1166 if (MemSize > MaxSize)
1167 return std::make_pair(0, LLT::scalar(MaxSize));
1168
1169 unsigned Align = Query.MMODescrs[0].AlignInBits;
1170 return std::make_pair(0, LLT::scalar(Align));
1171 })
1172 .fewerElementsIf(
1173 [=](const LegalityQuery &Query) -> bool {
1174 return Query.Types[0].isVector() &&
1175 needToSplitMemOp(Query, Op == G_LOAD);
1176 },
1177 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1178 const LLT DstTy = Query.Types[0];
1179 const LLT PtrTy = Query.Types[1];
1180
1181 LLT EltTy = DstTy.getElementType();
1182 unsigned MaxSize = maxSizeForAddrSpace(ST,
1183 PtrTy.getAddressSpace(),
1184 Op == G_LOAD);
1185
1186 // FIXME: Handle widened to power of 2 results better. This ends
1187 // up scalarizing.
1188 // FIXME: 3 element stores scalarized on SI
1189
1190 // Split if it's too large for the address space.
1191 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1192 unsigned NumElts = DstTy.getNumElements();
1193 unsigned EltSize = EltTy.getSizeInBits();
1194
1195 if (MaxSize % EltSize == 0) {
1196 return std::make_pair(
1197 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1198 }
1199
1200 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1201
1202 // FIXME: Refine when odd breakdowns handled
1203 // The scalars will need to be re-legalized.
1204 if (NumPieces == 1 || NumPieces >= NumElts ||
1205 NumElts % NumPieces != 0)
1206 return std::make_pair(0, EltTy);
1207
1208 return std::make_pair(0,
1209 LLT::vector(NumElts / NumPieces, EltTy));
1210 }
1211
1212 // FIXME: We could probably handle weird extending loads better.
1213 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1214 if (DstTy.getSizeInBits() > MemSize)
1215 return std::make_pair(0, EltTy);
1216
1217 unsigned EltSize = EltTy.getSizeInBits();
1218 unsigned DstSize = DstTy.getSizeInBits();
1219 if (!isPowerOf2_32(DstSize)) {
1220 // We're probably decomposing an odd sized store. Try to split
1221 // to the widest type. TODO: Account for alignment. As-is it
1222 // should be OK, since the new parts will be further legalized.
1223 unsigned FloorSize = PowerOf2Floor(DstSize);
1224 return std::make_pair(
1225 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1226 }
1227
1228 // Need to split because of alignment.
1229 unsigned Align = Query.MMODescrs[0].AlignInBits;
1230 if (EltSize > Align &&
1231 (EltSize / Align < DstTy.getNumElements())) {
1232 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1233 }
1234
1235 // May need relegalization for the scalars.
1236 return std::make_pair(0, EltTy);
1237 })
1238 .lowerIfMemSizeNotPow2()
1239 .minScalar(0, S32);
1240
1241 if (IsStore)
1242 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1243
1244 Actions
1245 .widenScalarToNextPow2(0)
1246 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1247 .lower();
1248 }
1249
1250 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1251 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1252 {S32, GlobalPtr, 16, 2 * 8},
1253 {S32, LocalPtr, 8, 8},
1254 {S32, LocalPtr, 16, 16},
1255 {S32, PrivatePtr, 8, 8},
1256 {S32, PrivatePtr, 16, 16},
1257 {S32, ConstantPtr, 8, 8},
1258 {S32, ConstantPtr, 16, 2 * 8}});
1259 if (ST.hasFlatAddressSpace()) {
1260 ExtLoads.legalForTypesWithMemDesc(
1261 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1262 }
1263
1264 ExtLoads.clampScalar(0, S32, S32)
1265 .widenScalarToNextPow2(0)
1266 .unsupportedIfMemSizeNotPow2()
1267 .lower();
1268
1269 auto &Atomics = getActionDefinitionsBuilder(
1270 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1271 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1272 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1273 G_ATOMICRMW_UMIN})
1274 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1275 {S64, GlobalPtr}, {S64, LocalPtr},
1276 {S32, RegionPtr}, {S64, RegionPtr}});
1277 if (ST.hasFlatAddressSpace()) {
1278 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1279 }
1280
1281 if (ST.hasLDSFPAtomics()) {
1282 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1283 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1284 }
1285
1286 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1287 // demarshalling
1288 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1289 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1290 {S32, FlatPtr}, {S64, FlatPtr}})
1291 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1292 {S32, RegionPtr}, {S64, RegionPtr}});
1293 // TODO: Pointer types, any 32-bit or 64-bit vector
1294
1295 // Condition should be s32 for scalar, s1 for vector.
1296 getActionDefinitionsBuilder(G_SELECT)
1297 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1298 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1299 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1300 .clampScalar(0, S16, S64)
1301 .scalarize(1)
1302 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1303 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1304 .clampMaxNumElements(0, S32, 2)
1305 .clampMaxNumElements(0, LocalPtr, 2)
1306 .clampMaxNumElements(0, PrivatePtr, 2)
1307 .scalarize(0)
1308 .widenScalarToNextPow2(0)
1309 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1310
1311 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1312 // be more flexible with the shift amount type.
1313 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1314 .legalFor({{S32, S32}, {S64, S32}});
1315 if (ST.has16BitInsts()) {
1316 if (ST.hasVOP3PInsts()) {
1317 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1318 .clampMaxNumElements(0, S16, 2);
1319 } else
1320 Shifts.legalFor({{S16, S16}});
1321
1322 // TODO: Support 16-bit shift amounts for all types
1323 Shifts.widenScalarIf(
1324 [=](const LegalityQuery &Query) {
1325 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1326 // 32-bit amount.
1327 const LLT ValTy = Query.Types[0];
1328 const LLT AmountTy = Query.Types[1];
1329 return ValTy.getSizeInBits() <= 16 &&
1330 AmountTy.getSizeInBits() < 16;
1331 }, changeTo(1, S16));
1332 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1333 Shifts.clampScalar(1, S32, S32);
1334 Shifts.clampScalar(0, S16, S64);
1335 Shifts.widenScalarToNextPow2(0, 16);
1336
1337 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1338 .minScalar(0, S16)
1339 .scalarize(0)
1340 .lower();
1341 } else {
1342 // Make sure we legalize the shift amount type first, as the general
1343 // expansion for the shifted type will produce much worse code if it hasn't
1344 // been truncated already.
1345 Shifts.clampScalar(1, S32, S32);
1346 Shifts.clampScalar(0, S32, S64);
1347 Shifts.widenScalarToNextPow2(0, 32);
1348
1349 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350 .minScalar(0, S32)
1351 .scalarize(0)
1352 .lower();
1353 }
1354 Shifts.scalarize(0);
1355
1356 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1357 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1358 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1359 unsigned IdxTypeIdx = 2;
1360
1361 getActionDefinitionsBuilder(Op)
1362 .customIf([=](const LegalityQuery &Query) {
1363 const LLT EltTy = Query.Types[EltTypeIdx];
1364 const LLT VecTy = Query.Types[VecTypeIdx];
1365 const LLT IdxTy = Query.Types[IdxTypeIdx];
1366 const unsigned EltSize = EltTy.getSizeInBits();
1367 return (EltSize == 32 || EltSize == 64) &&
1368 VecTy.getSizeInBits() % 32 == 0 &&
1369 VecTy.getSizeInBits() <= MaxRegisterSize &&
1370 IdxTy.getSizeInBits() == 32;
1371 })
1372 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1373 bitcastToVectorElement32(VecTypeIdx))
1374 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1375 .bitcastIf(
1376 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1377 [=](const LegalityQuery &Query) {
1378 // For > 64-bit element types, try to turn this into a 64-bit
1379 // element vector since we may be able to do better indexing
1380 // if this is scalar. If not, fall back to 32.
1381 const LLT EltTy = Query.Types[EltTypeIdx];
1382 const LLT VecTy = Query.Types[VecTypeIdx];
1383 const unsigned DstEltSize = EltTy.getSizeInBits();
1384 const unsigned VecSize = VecTy.getSizeInBits();
1385
1386 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1387 return std::make_pair(
1388 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1389 })
1390 .clampScalar(EltTypeIdx, S32, S64)
1391 .clampScalar(VecTypeIdx, S32, S64)
1392 .clampScalar(IdxTypeIdx, S32, S32)
1393 .clampMaxNumElements(VecTypeIdx, S32, 32)
1394 // TODO: Clamp elements for 64-bit vectors?
1395 // It should only be necessary with variable indexes.
1396 // As a last resort, lower to the stack
1397 .lower();
1398 }
1399
1400 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1401 .unsupportedIf([=](const LegalityQuery &Query) {
1402 const LLT &EltTy = Query.Types[1].getElementType();
1403 return Query.Types[0] != EltTy;
1404 });
1405
1406 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1407 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1408 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1409
1410 // FIXME: Doesn't handle extract of illegal sizes.
1411 getActionDefinitionsBuilder(Op)
1412 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1413 // FIXME: Multiples of 16 should not be legal.
1414 .legalIf([=](const LegalityQuery &Query) {
1415 const LLT BigTy = Query.Types[BigTyIdx];
1416 const LLT LitTy = Query.Types[LitTyIdx];
1417 return (BigTy.getSizeInBits() % 32 == 0) &&
1418 (LitTy.getSizeInBits() % 16 == 0);
1419 })
1420 .widenScalarIf(
1421 [=](const LegalityQuery &Query) {
1422 const LLT BigTy = Query.Types[BigTyIdx];
1423 return (BigTy.getScalarSizeInBits() < 16);
1424 },
1425 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1426 .widenScalarIf(
1427 [=](const LegalityQuery &Query) {
1428 const LLT LitTy = Query.Types[LitTyIdx];
1429 return (LitTy.getScalarSizeInBits() < 16);
1430 },
1431 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1432 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1433 .widenScalarToNextPow2(BigTyIdx, 32);
1434
1435 }
1436
1437 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1438 .legalForCartesianProduct(AllS32Vectors, {S32})
1439 .legalForCartesianProduct(AllS64Vectors, {S64})
1440 .clampNumElements(0, V16S32, V32S32)
1441 .clampNumElements(0, V2S64, V16S64)
1442 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1443
1444 if (ST.hasScalarPackInsts()) {
1445 BuildVector
1446 // FIXME: Should probably widen s1 vectors straight to s32
1447 .minScalarOrElt(0, S16)
1448 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1449 .minScalar(1, S32);
1450
1451 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1452 .legalFor({V2S16, S32})
1453 .lower();
1454 BuildVector.minScalarOrElt(0, S32);
1455 } else {
1456 BuildVector.customFor({V2S16, S16});
1457 BuildVector.minScalarOrElt(0, S32);
1458
1459 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460 .customFor({V2S16, S32})
1461 .lower();
1462 }
1463
1464 BuildVector.legalIf(isRegisterType(0));
1465
1466 // FIXME: Clamp maximum size
1467 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1468 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1469 .clampMaxNumElements(0, S32, 32)
1470 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1471 .clampMaxNumElements(0, S16, 64);
1472
1473 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1474 // pre-legalize.
1475 if (ST.hasVOP3PInsts()) {
1476 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1477 .customFor({V2S16, V2S16})
1478 .lower();
1479 } else
1480 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1481
1482 // Merge/Unmerge
1483 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1484 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1485 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1486
1487 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1488 const LLT Ty = Query.Types[TypeIdx];
1489 if (Ty.isVector()) {
1490 const LLT &EltTy = Ty.getElementType();
1491 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1492 return true;
1493 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1494 return true;
1495 }
1496 return false;
1497 };
1498
1499 auto &Builder = getActionDefinitionsBuilder(Op)
1500 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1501 .lowerFor({{S16, V2S16}})
1502 .lowerIf([=](const LegalityQuery &Query) {
1503 const LLT BigTy = Query.Types[BigTyIdx];
1504 return BigTy.getSizeInBits() == 32;
1505 })
1506 // Try to widen to s16 first for small types.
1507 // TODO: Only do this on targets with legal s16 shifts
1508 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1509 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1510 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1511 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1512 elementTypeIs(1, S16)),
1513 changeTo(1, V2S16))
1514 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1515 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1516 // valid.
1517 .clampScalar(LitTyIdx, S32, S512)
1518 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1519 // Break up vectors with weird elements into scalars
1520 .fewerElementsIf(
1521 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1522 scalarize(0))
1523 .fewerElementsIf(
1524 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1525 scalarize(1))
1526 .clampScalar(BigTyIdx, S32, MaxScalar);
1527
1528 if (Op == G_MERGE_VALUES) {
1529 Builder.widenScalarIf(
1530 // TODO: Use 16-bit shifts if legal for 8-bit values?
1531 [=](const LegalityQuery &Query) {
1532 const LLT Ty = Query.Types[LitTyIdx];
1533 return Ty.getSizeInBits() < 32;
1534 },
1535 changeTo(LitTyIdx, S32));
1536 }
1537
1538 Builder.widenScalarIf(
1539 [=](const LegalityQuery &Query) {
1540 const LLT Ty = Query.Types[BigTyIdx];
1541 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1542 Ty.getSizeInBits() % 16 != 0;
1543 },
1544 [=](const LegalityQuery &Query) {
1545 // Pick the next power of 2, or a multiple of 64 over 128,
1546 // whichever is smaller.
1547 const LLT &Ty = Query.Types[BigTyIdx];
1548 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1549 if (NewSizeInBits >= 256) {
1550 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1551 if (RoundedTo < NewSizeInBits)
1552 NewSizeInBits = RoundedTo;
1553 }
1554 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1555 })
1556 // Any vectors left are the wrong size. Scalarize them.
1557 .scalarize(0)
1558 .scalarize(1);
1559 }
1560
1561 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1562 // RegBankSelect.
1563 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1564 .legalFor({{S32}, {S64}});
1565
1566 if (ST.hasVOP3PInsts()) {
1567 SextInReg.lowerFor({{V2S16}})
1568 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1569 // get more vector shift opportunities, since we'll get those when
1570 // expanded.
1571 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1572 } else if (ST.has16BitInsts()) {
1573 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1574 } else {
1575 // Prefer to promote to s32 before lowering if we don't have 16-bit
1576 // shifts. This avoids a lot of intermediate truncate and extend operations.
1577 SextInReg.lowerFor({{S32}, {S64}});
1578 }
1579
1580 SextInReg
1581 .scalarize(0)
1582 .clampScalar(0, S32, S64)
1583 .lower();
1584
1585 getActionDefinitionsBuilder(G_FSHR)
1586 .legalFor({{S32, S32}})
1587 .scalarize(0)
1588 .lower();
1589
1590 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1591 .legalFor({S64});
1592
1593 getActionDefinitionsBuilder(G_FENCE)
1594 .alwaysLegal();
1595
1596 getActionDefinitionsBuilder({
1597 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1598 G_FCOPYSIGN,
1599
1600 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1601 G_ATOMICRMW_NAND,
1602 G_ATOMICRMW_FSUB,
1603 G_READ_REGISTER,
1604 G_WRITE_REGISTER,
1605
1606 G_SADDO, G_SSUBO,
1607
1608 // TODO: Implement
1609 G_FMINIMUM, G_FMAXIMUM,
1610 G_FSHL
1611 }).lower();
1612
1613 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1614 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1615 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1616 .unsupported();
1617
1618 computeTables();
1619 verify(*ST.getInstrInfo());
1620}
1621
1622bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1623 MachineInstr &MI) const {
1624 MachineIRBuilder &B = Helper.MIRBuilder;
1625 MachineRegisterInfo &MRI = *B.getMRI();
1626
1627 switch (MI.getOpcode()) {
1628 case TargetOpcode::G_ADDRSPACE_CAST:
1629 return legalizeAddrSpaceCast(MI, MRI, B);
1630 case TargetOpcode::G_FRINT:
1631 return legalizeFrint(MI, MRI, B);
1632 case TargetOpcode::G_FCEIL:
1633 return legalizeFceil(MI, MRI, B);
1634 case TargetOpcode::G_FREM:
1635 return legalizeFrem(MI, MRI, B);
1636 case TargetOpcode::G_INTRINSIC_TRUNC:
1637 return legalizeIntrinsicTrunc(MI, MRI, B);
1638 case TargetOpcode::G_SITOFP:
1639 return legalizeITOFP(MI, MRI, B, true);
1640 case TargetOpcode::G_UITOFP:
1641 return legalizeITOFP(MI, MRI, B, false);
1642 case TargetOpcode::G_FPTOSI:
1643 return legalizeFPTOI(MI, MRI, B, true);
1644 case TargetOpcode::G_FPTOUI:
1645 return legalizeFPTOI(MI, MRI, B, false);
1646 case TargetOpcode::G_FMINNUM:
1647 case TargetOpcode::G_FMAXNUM:
1648 case TargetOpcode::G_FMINNUM_IEEE:
1649 case TargetOpcode::G_FMAXNUM_IEEE:
1650 return legalizeMinNumMaxNum(Helper, MI);
1651 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1652 return legalizeExtractVectorElt(MI, MRI, B);
1653 case TargetOpcode::G_INSERT_VECTOR_ELT:
1654 return legalizeInsertVectorElt(MI, MRI, B);
1655 case TargetOpcode::G_SHUFFLE_VECTOR:
1656 return legalizeShuffleVector(MI, MRI, B);
1657 case TargetOpcode::G_FSIN:
1658 case TargetOpcode::G_FCOS:
1659 return legalizeSinCos(MI, MRI, B);
1660 case TargetOpcode::G_GLOBAL_VALUE:
1661 return legalizeGlobalValue(MI, MRI, B);
1662 case TargetOpcode::G_LOAD:
1663 return legalizeLoad(Helper, MI);
1664 case TargetOpcode::G_FMAD:
1665 return legalizeFMad(MI, MRI, B);
1666 case TargetOpcode::G_FDIV:
1667 return legalizeFDIV(MI, MRI, B);
1668 case TargetOpcode::G_UDIV:
1669 case TargetOpcode::G_UREM:
1670 return legalizeUDIV_UREM(MI, MRI, B);
1671 case TargetOpcode::G_SDIV:
1672 case TargetOpcode::G_SREM:
1673 return legalizeSDIV_SREM(MI, MRI, B);
1674 case TargetOpcode::G_ATOMIC_CMPXCHG:
1675 return legalizeAtomicCmpXChg(MI, MRI, B);
1676 case TargetOpcode::G_FLOG:
1677 return legalizeFlog(MI, B, numbers::ln2f);
1678 case TargetOpcode::G_FLOG10:
1679 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1680 case TargetOpcode::G_FEXP:
1681 return legalizeFExp(MI, B);
1682 case TargetOpcode::G_FPOW:
1683 return legalizeFPow(MI, B);
1684 case TargetOpcode::G_FFLOOR:
1685 return legalizeFFloor(MI, MRI, B);
1686 case TargetOpcode::G_BUILD_VECTOR:
1687 return legalizeBuildVector(MI, MRI, B);
1688 default:
1689 return false;
1690 }
1691
1692 llvm_unreachable("expected switch to return");
1693}
1694
1695Register AMDGPULegalizerInfo::getSegmentAperture(
1696 unsigned AS,
1697 MachineRegisterInfo &MRI,
1698 MachineIRBuilder &B) const {
1699 MachineFunction &MF = B.getMF();
1700 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1701 const LLT S32 = LLT::scalar(32);
1702
1703 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1704
1705 if (ST.hasApertureRegs()) {
1706 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1707 // getreg.
1708 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1709 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1710 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1711 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1712 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1713 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1714 unsigned Encoding =
1715 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1716 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1717 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1718
1719 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1720
1721 B.buildInstr(AMDGPU::S_GETREG_B32)
1722 .addDef(GetReg)
1723 .addImm(Encoding);
1724 MRI.setType(GetReg, S32);
1725
1726 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1727 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1728 }
1729
1730 Register QueuePtr = MRI.createGenericVirtualRegister(
1731 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1732
1733 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1734 return Register();
1735
1736 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1737 // private_segment_aperture_base_hi.
1738 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1739
1740 // TODO: can we be smarter about machine pointer info?
1741 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1742 MachineMemOperand *MMO = MF.getMachineMemOperand(
1743 PtrInfo,
1744 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1745 MachineMemOperand::MOInvariant,
1746 4, commonAlignment(Align(64), StructOffset));
1747
1748 Register LoadAddr;
1749
1750 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1751 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1752}
1753
1754bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1755 MachineInstr &MI, MachineRegisterInfo &MRI,
1756 MachineIRBuilder &B) const {
1757 MachineFunction &MF = B.getMF();
1758
1759 const LLT S32 = LLT::scalar(32);
1760 Register Dst = MI.getOperand(0).getReg();
1761 Register Src = MI.getOperand(1).getReg();
1762
1763 LLT DstTy = MRI.getType(Dst);
1764 LLT SrcTy = MRI.getType(Src);
1765 unsigned DestAS = DstTy.getAddressSpace();
1766 unsigned SrcAS = SrcTy.getAddressSpace();
1767
1768 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1769 // vector element.
1770 assert(!DstTy.isVector());
1771
1772 const AMDGPUTargetMachine &TM
1773 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1774
1775 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1776 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1777 return true;
1778 }
1779
1780 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1781 // Truncate.
1782 B.buildExtract(Dst, Src, 0);
1783 MI.eraseFromParent();
1784 return true;
1785 }
1786
1787 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1788 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1789 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1790
1791 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1792 // form another pointer. Merge operands are required to be the same type, but
1793 // creating an extra ptrtoint would be kind of pointless.
1794 auto HighAddr = B.buildConstant(
1795 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1796 B.buildMerge(Dst, {Src, HighAddr});
1797 MI.eraseFromParent();
1798 return true;
1799 }
1800
1801 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1802 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1803 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1804 unsigned NullVal = TM.getNullPointerValue(DestAS);
1805
1806 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1807 auto FlatNull = B.buildConstant(SrcTy, 0);
1808
1809 // Extract low 32-bits of the pointer.
1810 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1811
1812 auto CmpRes =
1813 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1814 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1815
1816 MI.eraseFromParent();
1817 return true;
1818 }
1819
1820 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1821 return false;
1822
1823 if (!ST.hasFlatAddressSpace())
1824 return false;
1825
1826 auto SegmentNull =
1827 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1828 auto FlatNull =
1829 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1830
1831 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1832 if (!ApertureReg.isValid())
1833 return false;
1834
1835 auto CmpRes =
1836 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1837
1838 // Coerce the type of the low half of the result so we can use merge_values.
1839 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1840
1841 // TODO: Should we allow mismatched types but matching sizes in merges to
1842 // avoid the ptrtoint?
1843 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1844 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1845
1846 MI.eraseFromParent();
1847 return true;
1848}
1849
1850bool AMDGPULegalizerInfo::legalizeFrint(
1851 MachineInstr &MI, MachineRegisterInfo &MRI,
1852 MachineIRBuilder &B) const {
1853 Register Src = MI.getOperand(1).getReg();
1854 LLT Ty = MRI.getType(Src);
1855 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1856
1857 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1858 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1859
1860 auto C1 = B.buildFConstant(Ty, C1Val);
1861 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1862
1863 // TODO: Should this propagate fast-math-flags?
1864 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1865 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1866
1867 auto C2 = B.buildFConstant(Ty, C2Val);
1868 auto Fabs = B.buildFAbs(Ty, Src);
1869
1870 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1871 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1872 MI.eraseFromParent();
1873 return true;
1874}
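
The two constants above implement the classic add-and-subtract rounding trick; a short restatement of the identity (mine, not code from this file):

  // For |x| <= 0x1.fffffffffffffp+51 the sum x + copysign(2^52, x) lands where
  // the f64 spacing is 1.0, so the addition rounds away the fraction and the
  // subtraction recovers the rounded integer:
  //   rint(x) == (x + copysign(0x1.0p52, x)) - copysign(0x1.0p52, x)
  // Larger inputs are already integral and are passed through by the select.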
1875
1876bool AMDGPULegalizerInfo::legalizeFceil(
1877 MachineInstr &MI, MachineRegisterInfo &MRI,
1878 MachineIRBuilder &B) const {
1879
1880 const LLT S1 = LLT::scalar(1);
1881 const LLT S64 = LLT::scalar(64);
1882
1883 Register Src = MI.getOperand(1).getReg();
1884 assert(MRI.getType(Src) == S64);
1885
1886 // result = trunc(src)
1887 // if (src > 0.0 && src != result)
1888 // result += 1.0
1889
1890 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1891
1892 const auto Zero = B.buildFConstant(S64, 0.0);
1893 const auto One = B.buildFConstant(S64, 1.0);
1894 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1895 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1896 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1897 auto Add = B.buildSelect(S64, And, One, Zero);
1898
1899 // TODO: Should this propagate fast-math-flags?
1900 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1901 return true;
1902}
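
The select sequence above is just the identity stated in the comment; a scalar restatement for reference (a sketch, not the builder code):

  #include <cmath>

  static double ceilViaTrunc(double Src) {
    double Trunc = std::trunc(Src);
    double Add = (Src > 0.0 && Src != Trunc) ? 1.0 : 0.0;  // the And/Select pair
    return Trunc + Add;
  }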
1903
1904bool AMDGPULegalizerInfo::legalizeFrem(
1905 MachineInstr &MI, MachineRegisterInfo &MRI,
1906 MachineIRBuilder &B) const {
1907 Register DstReg = MI.getOperand(0).getReg();
1908 Register Src0Reg = MI.getOperand(1).getReg();
1909 Register Src1Reg = MI.getOperand(2).getReg();
1910 auto Flags = MI.getFlags();
1911 LLT Ty = MRI.getType(DstReg);
1912
1913 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1914 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1915 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1916 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1917 MI.eraseFromParent();
1918 return true;
1919}
1920
1921static MachineInstrBuilder extractF64Exponent(Register Hi,
1922 MachineIRBuilder &B) {
1923 const unsigned FractBits = 52;
1924 const unsigned ExpBits = 11;
1925 LLT S32 = LLT::scalar(32);
1926
1927 auto Const0 = B.buildConstant(S32, FractBits - 32);
1928 auto Const1 = B.buildConstant(S32, ExpBits);
1929
1930 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1931 .addUse(Hi)
1932 .addUse(Const0.getReg(0))
1933 .addUse(Const1.getReg(0));
1934
1935 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1936}
1937
1938bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1939 MachineInstr &MI, MachineRegisterInfo &MRI,
1940 MachineIRBuilder &B) const {
1941 const LLT S1 = LLT::scalar(1);
1942 const LLT S32 = LLT::scalar(32);
1943 const LLT S64 = LLT::scalar(64);
1944
1945 Register Src = MI.getOperand(1).getReg();
1946 assert(MRI.getType(Src) == S64);
1947
1948 // TODO: Should this use extract since the low half is unused?
1949 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1950 Register Hi = Unmerge.getReg(1);
1951
1952 // Extract the upper half, since this is where we will find the sign and
1953 // exponent.
1954 auto Exp = extractF64Exponent(Hi, B);
1955
1956 const unsigned FractBits = 52;
1957
1958 // Extract the sign bit.
1959 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1960 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1961
1962 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1963
1964 const auto Zero32 = B.buildConstant(S32, 0);
1965
1966 // Extend back to 64-bits.
1967 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1968
1969 auto Shr = B.buildAShr(S64, FractMask, Exp);
1970 auto Not = B.buildNot(S64, Shr);
1971 auto Tmp0 = B.buildAnd(S64, Src, Not);
1972 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1973
1974 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1975 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1976
1977 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1978 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1979 MI.eraseFromParent();
1980 return true;
1981}
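
A scalar sketch of the same bit manipulation (my illustration of the sequence above, not code from this file; it mirrors the sign/exponent/fraction handling for finite doubles):

  #include <cstdint>
  #include <cstring>

  static double truncF64Bits(double X) {
    uint64_t Bits;
    std::memcpy(&Bits, &X, sizeof(Bits));
    const int Exp = int((Bits >> 52) & 0x7ff) - 1023;      // extractF64Exponent
    const uint64_t SignOnly = Bits & (UINT64_C(1) << 63);  // SignBit64
    if (Exp < 0)
      Bits = SignOnly;                                     // |X| < 1.0 -> +/-0.0
    else if (Exp <= 51)
      Bits &= ~(((UINT64_C(1) << 52) - 1) >> Exp);         // clear fraction bits
    // Exp > 51: X is already integral (or inf/nan); leave it unchanged.
    double Result;
    std::memcpy(&Result, &Bits, sizeof(Result));
    return Result;
  }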
1982
1983bool AMDGPULegalizerInfo::legalizeITOFP(
1984 MachineInstr &MI, MachineRegisterInfo &MRI,
1985 MachineIRBuilder &B, bool Signed) const {
1986
1987 Register Dst = MI.getOperand(0).getReg();
1988 Register Src = MI.getOperand(1).getReg();
1989
1990 const LLT S64 = LLT::scalar(64);
1991 const LLT S32 = LLT::scalar(32);
1992
1993 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1994
1995 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1996
1997 auto CvtHi = Signed ?
1998 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1999 B.buildUITOFP(S64, Unmerge.getReg(1));
2000
2001 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2002
2003 auto ThirtyTwo = B.buildConstant(S32, 32);
2004 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2005 .addUse(CvtHi.getReg(0))
2006 .addUse(ThirtyTwo.getReg(0));
2007
2008 // TODO: Should this propagate fast-math-flags?
2009 B.buildFAdd(Dst, LdExp, CvtLo);
2010 MI.eraseFromParent();
2011 return true;
2012}
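
A scalar model of the split conversion above (a sketch, not the MIR; Signed selects the signed conversion of the high half exactly as in the code):

  #include <cmath>
  #include <cstdint>

  static double i64ToF64(uint64_t SrcBits, bool Signed) {
    uint32_t Lo = uint32_t(SrcBits);
    uint32_t Hi = uint32_t(SrcBits >> 32);
    double CvtHi = Signed ? double(int32_t(Hi)) : double(Hi);
    double CvtLo = double(Lo);             // low half is always unsigned
    return std::ldexp(CvtHi, 32) + CvtLo;  // CvtHi * 2^32 + CvtLo
  }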
2013
2014// TODO: Copied from DAG implementation. Verify logic and document how this
2015// actually works.
2016bool AMDGPULegalizerInfo::legalizeFPTOI(
2017 MachineInstr &MI, MachineRegisterInfo &MRI,
2018 MachineIRBuilder &B, bool Signed) const {
2019
2020 Register Dst = MI.getOperand(0).getReg();
2021 Register Src = MI.getOperand(1).getReg();
2022
2023 const LLT S64 = LLT::scalar(64);
2024 const LLT S32 = LLT::scalar(32);
2025
2026 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2027
2028 unsigned Flags = MI.getFlags();
2029
2030 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2031 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2032 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2033
2034 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2035 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2036 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2037
2038 auto Hi = Signed ?
2039 B.buildFPTOSI(S32, FloorMul) :
2040 B.buildFPTOUI(S32, FloorMul);
2041 auto Lo = B.buildFPTOUI(S32, Fma);
2042
2043 B.buildMerge(Dst, { Lo, Hi });
2044 MI.eraseFromParent();
2045
2046 return true;
2047}
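
For reference, K0 and K1 above are the bit patterns of 2^-32 and -(2^32). A scalar model of the split (a sketch for values whose halves fit the conversions; the final merge places Lo in the low word and Hi in the high word):

  #include <cmath>
  #include <cstdint>

  static uint64_t f64ToI64(double Src, bool Signed) {
    double Trunc = std::trunc(Src);
    double FloorMul = std::floor(Trunc * 0x1.0p-32);    // high 32 bits, as f64
    double Rem = std::fma(FloorMul, -0x1.0p32, Trunc);  // Trunc - FloorMul * 2^32
    uint32_t Hi = Signed ? uint32_t(int32_t(FloorMul)) : uint32_t(FloorMul);
    uint32_t Lo = uint32_t(Rem);
    return (uint64_t(Hi) << 32) | Lo;
  }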
2048
2049bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2050 MachineInstr &MI) const {
2051 MachineFunction &MF = Helper.MIRBuilder.getMF();
2052 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2053
2054 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2055 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2056
2057 // With ieee_mode disabled, the instructions have the correct behavior
2058 // already for G_FMINNUM/G_FMAXNUM
2059 if (!MFI->getMode().IEEE)
2060 return !IsIEEEOp;
2061
2062 if (IsIEEEOp)
2063 return true;
2064
2065 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2066}
2067
2068bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2069 MachineInstr &MI, MachineRegisterInfo &MRI,
2070 MachineIRBuilder &B) const {
2071 // TODO: Should move some of this into LegalizerHelper.
2072
2073 // TODO: Promote dynamic indexing of s16 to s32
2074
2075 // FIXME: Artifact combiner probably should have replaced the truncated
2076 // constant before this, so we shouldn't need
2077 // getConstantVRegValWithLookThrough.
2078 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2079 MI.getOperand(2).getReg(), MRI);
2080 if (!IdxVal) // Dynamic case will be selected to register indexing.
2081 return true;
2082
2083 Register Dst = MI.getOperand(0).getReg();
2084 Register Vec = MI.getOperand(1).getReg();
2085
2086 LLT VecTy = MRI.getType(Vec);
2087 LLT EltTy = VecTy.getElementType();
2088 assert(EltTy == MRI.getType(Dst));
2089
2090 if (IdxVal->Value < VecTy.getNumElements())
2091 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2092 else
2093 B.buildUndef(Dst);
2094
2095 MI.eraseFromParent();
2096 return true;
2097}
2098
2099bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2100 MachineInstr &MI, MachineRegisterInfo &MRI,
2101 MachineIRBuilder &B) const {
2102 // TODO: Should move some of this into LegalizerHelper.
2103
2104 // TODO: Promote dynamic indexing of s16 to s32
2105
2106 // FIXME: Artifact combiner probably should have replaced the truncated
2107 // constant before this, so we shouldn't need
2108 // getConstantVRegValWithLookThrough.
2109 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2110 MI.getOperand(3).getReg(), MRI);
2111 if (!IdxVal) // Dynamic case will be selected to register indexing.
2112 return true;
2113
2114 Register Dst = MI.getOperand(0).getReg();
2115 Register Vec = MI.getOperand(1).getReg();
2116 Register Ins = MI.getOperand(2).getReg();
2117
2118 LLT VecTy = MRI.getType(Vec);
2119 LLT EltTy = VecTy.getElementType();
2120 assert(EltTy == MRI.getType(Ins));
2121
2122 if (IdxVal->Value < VecTy.getNumElements())
2123 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2124 else
2125 B.buildUndef(Dst);
2126
2127 MI.eraseFromParent();
2128 return true;
2129}
2130
2131bool AMDGPULegalizerInfo::legalizeShuffleVector(
2132 MachineInstr &MI, MachineRegisterInfo &MRI,
2133 MachineIRBuilder &B) const {
2134 const LLT V2S16 = LLT::vector(2, 16);
2135
2136 Register Dst = MI.getOperand(0).getReg();
2137 Register Src0 = MI.getOperand(1).getReg();
2138 LLT DstTy = MRI.getType(Dst);
2139 LLT SrcTy = MRI.getType(Src0);
2140
2141 if (SrcTy == V2S16 && DstTy == V2S16 &&
2142 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2143 return true;
2144
2145 MachineIRBuilder HelperBuilder(MI);
2146 GISelObserverWrapper DummyObserver;
2147 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2148 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2149}
2150
2151bool AMDGPULegalizerInfo::legalizeSinCos(
2152 MachineInstr &MI, MachineRegisterInfo &MRI,
2153 MachineIRBuilder &B) const {
2154
2155 Register DstReg = MI.getOperand(0).getReg();
2156 Register SrcReg = MI.getOperand(1).getReg();
2157 LLT Ty = MRI.getType(DstReg);
2158 unsigned Flags = MI.getFlags();
2159
2160 Register TrigVal;
2161 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2162 if (ST.hasTrigReducedRange()) {
2163 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2164 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2165 .addUse(MulVal.getReg(0))
2166 .setMIFlags(Flags).getReg(0);
2167 } else
2168 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2169
2170 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2171 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2172 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2173 .addUse(TrigVal)
2174 .setMIFlags(Flags);
2175 MI.eraseFromParent();
2176 return true;
2177}
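
A note on the scaling above (my summary, inferred from the 1/(2*pi) factor in this code): the hardware sin/cos intrinsics take their operand in units of full turns rather than radians, i.e.

  //   sin(x) = amdgcn_sin(fract(x * (1 / (2*pi))))   // ST.hasTrigReducedRange()
  //   sin(x) = amdgcn_sin(x * (1 / (2*pi)))          // otherwise
  // and likewise for cos; the fract wrap keeps the operand inside the reduced
  // input range on subtargets that require it.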
2178
2179bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2180 MachineIRBuilder &B,
2181 const GlobalValue *GV,
2182 int64_t Offset,
2183 unsigned GAFlags) const {
2184 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2185 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2186 // to the following code sequence:
2187 //
2188 // For constant address space:
2189 // s_getpc_b64 s[0:1]
2190 // s_add_u32 s0, s0, $symbol
2191 // s_addc_u32 s1, s1, 0
2192 //
2193 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2194 // a fixup or relocation is emitted to replace $symbol with a literal
2195 // constant, which is a pc-relative offset from the encoding of the $symbol
2196 // operand to the global variable.
2197 //
2198 // For global address space:
2199 // s_getpc_b64 s[0:1]
2200 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2201 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2202 //
2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204 // fixups or relocations are emitted to replace $symbol@*@lo and
2205 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2206 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2207 // operand to the global variable.
2208 //
2209 // What we want here is an offset from the value returned by s_getpc
2210 // (which is the address of the s_add_u32 instruction) to the global
2211 // variable, but since the encoding of $symbol starts 4 bytes after the start
2212 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2213 // small. This requires us to add 4 to the global variable offset in order to
2214 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2215 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2216 // instruction.
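 // Worked example of the "+ 4" / "+ 12" below (hypothetical addresses, added
 // for illustration): let s_getpc_b64 produce P and let the variable live at
 // address G; we want s[0:1] = P + ((G + Offset) - P). A REL32-style fixup at
 // address F resolves to (G + addend) - F. The lo literal is encoded at
 // F = P + 4, so addend = Offset + 4 gives (G + Offset + 4) - (P + 4)
 // = (G + Offset) - P, the required displacement; the hi literal is encoded
 // at P + 12, hence addend = Offset + 12.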
2217
2218 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2219
2220 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2221 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2222
2223 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2224 .addDef(PCReg);
2225
2226 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2227 if (GAFlags == SIInstrInfo::MO_NONE)
2228 MIB.addImm(0);
2229 else
2230 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2231
2232 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2233
2234 if (PtrTy.getSizeInBits() == 32)
2235 B.buildExtract(DstReg, PCReg, 0);
2236 return true;
2237 }
2238
2239bool AMDGPULegalizerInfo::legalizeGlobalValue(
2240 MachineInstr &MI, MachineRegisterInfo &MRI,
2241 MachineIRBuilder &B) const {
2242 Register DstReg = MI.getOperand(0).getReg();
2243 LLT Ty = MRI.getType(DstReg);
2244 unsigned AS = Ty.getAddressSpace();
2245
2246 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2247 MachineFunction &MF = B.getMF();
2248 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2249
2250 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2251 if (!MFI->isEntryFunction()) {
2252 const Function &Fn = MF.getFunction();
2253 DiagnosticInfoUnsupported BadLDSDecl(
2254 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2255 DS_Warning);
2256 Fn.getContext().diagnose(BadLDSDecl);
2257
2258 // We currently don't have a way to correctly allocate LDS objects that
2259 // aren't directly associated with a kernel. We do force inlining of
2260 // functions that use local objects. However, if these dead functions are
2261 // not eliminated, we don't want a compile time error. Just emit a warning
2262 // and a trap, since there should be no callable path here.
2263 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2264 B.buildUndef(DstReg);
2265 MI.eraseFromParent();
2266 return true;
2267 }
2268
2269 // TODO: We could emit code to handle the initialization somewhere.
2270 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2271 const SITargetLowering *TLI = ST.getTargetLowering();
2272 if (!TLI->shouldUseLDSConstAddress(GV)) {
2273 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2274 return true; // Leave in place;
2275 }
2276
2277 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2278 Type *Ty = GV->getValueType();
2279 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
2280 // zero-sized type in other languages) to declare dynamic shared memory
2281 // whose size is not known at compile time. Such variables are allocated
2282 // by the runtime and placed directly after the statically allocated ones,
2283 // so they all share the same offset.
2284 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2285 // Adjust alignment for that dynamic shared memory array.
2286 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2287 LLT S32 = LLT::scalar(32);
2288 auto Sz =
2289 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2290 B.buildIntToPtr(DstReg, Sz);
2291 MI.eraseFromParent();
2292 return true;
2293 }
2294 }
2295
2296 B.buildConstant(
2297 DstReg,
2298 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2299 MI.eraseFromParent();
2300 return true;
2301 }
2302
2303 const Function &Fn = MF.getFunction();
2304 DiagnosticInfoUnsupported BadInit(
2305 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2306 Fn.getContext().diagnose(BadInit);
2307 return true;
2308 }
2309
2310 const SITargetLowering *TLI = ST.getTargetLowering();
2311
2312 if (TLI->shouldEmitFixup(GV)) {
2313 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2314 MI.eraseFromParent();
2315 return true;
2316 }
2317
2318 if (TLI->shouldEmitPCReloc(GV)) {
2319 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2320 MI.eraseFromParent();
2321 return true;
2322 }
2323
2324 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2325 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2326
2327 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2328 MachinePointerInfo::getGOT(MF),
2329 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2330 MachineMemOperand::MOInvariant,
2331 8 /*Size*/, Align(8));
2332
2333 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2334
2335 if (Ty.getSizeInBits() == 32) {
2336 // Truncate if this is a 32-bit constant address.
2337 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2338 B.buildExtract(DstReg, Load, 0);
2339 } else
2340 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2341
2342 MI.eraseFromParent();
2343 return true;
2344}
2345
2346static LLT widenToNextPowerOf2(LLT Ty) {
2347 if (Ty.isVector())
2348 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2349 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2350}
2351
2352bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2353 MachineInstr &MI) const {
2354 MachineIRBuilder &B = Helper.MIRBuilder;
2355 MachineRegisterInfo &MRI = *B.getMRI();
2356 GISelChangeObserver &Observer = Helper.Observer;
2357
2358 Register PtrReg = MI.getOperand(1).getReg();
2359 LLT PtrTy = MRI.getType(PtrReg);
2360 unsigned AddrSpace = PtrTy.getAddressSpace();
2361
2362 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2363 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2364 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2365 Observer.changingInstr(MI);
2366 MI.getOperand(1).setReg(Cast.getReg(0));
2367 Observer.changedInstr(MI);
2368 return true;
2369 }
2370
2371 Register ValReg = MI.getOperand(0).getReg();
2372 LLT ValTy = MRI.getType(ValReg);
2373
2374 MachineMemOperand *MMO = *MI.memoperands_begin();
2375 const unsigned ValSize = ValTy.getSizeInBits();
2376 const unsigned MemSize = 8 * MMO->getSize();
2377 const Align MemAlign = MMO->getAlign();
2378 const unsigned AlignInBits = 8 * MemAlign.value();
2379
2380 // Widen non-power-of-2 loads to the alignment if needed
2381 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2382 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2383
2384 // This was already the correct extending load result type, so just adjust
2385 // the memory type.
2386 if (WideMemSize == ValSize) {
2387 MachineFunction &MF = B.getMF();
2388
2389 MachineMemOperand *WideMMO =
2390 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2391 Observer.changingInstr(MI);
2392 MI.setMemRefs(MF, {WideMMO});
2393 Observer.changedInstr(MI);
2394 return true;
2395 }
2396
2397 // Don't bother handling an edge case that should probably never be produced.
2398 if (ValSize > WideMemSize)
2399 return false;
2400
2401 LLT WideTy = widenToNextPowerOf2(ValTy);
2402
2403 Register WideLoad;
2404 if (!WideTy.isVector()) {
2405 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2406 B.buildTrunc(ValReg, WideLoad).getReg(0);
2407 } else {
2408 // Extract the subvector.
2409
2410 if (isRegisterType(ValTy)) {
2411 // If this a case where G_EXTRACT is legal, use it.
2412 // (e.g. <3 x s32> -> <4 x s32>)
2413 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2414 B.buildExtract(ValReg, WideLoad, 0);
2415 } else {
2416 // For cases where the widened type isn't a nice register value, unmerge
2417 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2418 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2419 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2420 B.setInsertPt(B.getMBB(), MI.getIterator());
2421 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2422 }
2423 }
2424
2425 MI.eraseFromParent();
2426 return true;
2427 }
2428
2429 return false;
2430}
2431
2432bool AMDGPULegalizerInfo::legalizeFMad(
2433 MachineInstr &MI, MachineRegisterInfo &MRI,
2434 MachineIRBuilder &B) const {
2435 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2436 assert(Ty.isScalar());
2437
2438 MachineFunction &MF = B.getMF();
2439 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2440
2441 // TODO: Always legal with future ftz flag.
2442 // FIXME: Do we need just output?
2443 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2444 return true;
2445 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2446 return true;
2447
2448 MachineIRBuilder HelperBuilder(MI);
2449 GISelObserverWrapper DummyObserver;
2450 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2451 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2452}
2453
2454bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2455 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2456 Register DstReg = MI.getOperand(0).getReg();
2457 Register PtrReg = MI.getOperand(1).getReg();
2458 Register CmpVal = MI.getOperand(2).getReg();
2459 Register NewVal = MI.getOperand(3).getReg();
2460
2461 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2462 "this should not have been custom lowered");
2464 LLT ValTy = MRI.getType(CmpVal);
2465 LLT VecTy = LLT::vector(2, ValTy);
2466
2467 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2468
2469 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2470 .addDef(DstReg)
2471 .addUse(PtrReg)
2472 .addUse(PackedVal)
2473 .setMemRefs(MI.memoperands());
2474
2475 MI.eraseFromParent();
2476 return true;
2477}
2478
2479bool AMDGPULegalizerInfo::legalizeFlog(
2480 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2481 Register Dst = MI.getOperand(0).getReg();
2482 Register Src = MI.getOperand(1).getReg();
2483 LLT Ty = B.getMRI()->getType(Dst);
2484 unsigned Flags = MI.getFlags();
2485
2486 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2487 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2488
2489 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2490 MI.eraseFromParent();
2491 return true;
2492}
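
For reference, the Log2BaseInverted values passed by the G_FLOG and G_FLOG10 cases of legalizeCustom follow from the change-of-base identity:

  //   log_b(x) = log2(x) * (ln(2) / ln(b))
  //   ln(x)    = log2(x) * ln(2)              -> numbers::ln2f
  //   log10(x) = log2(x) * (ln(2) / ln(10))   -> numbers::ln2f / numbers::ln10f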
2493
2494bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2495 MachineIRBuilder &B) const {
2496 Register Dst = MI.getOperand(0).getReg();
2497 Register Src = MI.getOperand(1).getReg();
2498 unsigned Flags = MI.getFlags();
2499 LLT Ty = B.getMRI()->getType(Dst);
2500
2501 auto K = B.buildFConstant(Ty, numbers::log2e);
2502 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2503 B.buildFExp2(Dst, Mul, Flags);
2504 MI.eraseFromParent();
2505 return true;
2506}
2507
2508bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2509 MachineIRBuilder &B) const {
2510 Register Dst = MI.getOperand(0).getReg();
2511 Register Src0 = MI.getOperand(1).getReg();
2512 Register Src1 = MI.getOperand(2).getReg();
2513 unsigned Flags = MI.getFlags();
2514 LLT Ty = B.getMRI()->getType(Dst);
2515 const LLT S16 = LLT::scalar(16);
2516 const LLT S32 = LLT::scalar(32);
2517
2518 if (Ty == S32) {
2519 auto Log = B.buildFLog2(S32, Src0, Flags);
2520 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2521 .addUse(Log.getReg(0))
2522 .addUse(Src1)
2523 .setMIFlags(Flags);
2524 B.buildFExp2(Dst, Mul, Flags);
2525 } else if (Ty == S16) {
2526 // There's no f16 fmul_legacy, so we need to convert for it.
2527 auto Log = B.buildFLog2(S16, Src0, Flags);
2528 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2529 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2530 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2531 .addUse(Ext0.getReg(0))
2532 .addUse(Ext1.getReg(0))
2533 .setMIFlags(Flags);
2534
2535 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2536 } else
2537 return false;
2538
2539 MI.eraseFromParent();
2540 return true;
2541}
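
Both branches above use the same exp2/log2 factoring; the only target-specific part is doing the multiply through amdgcn_fmul_legacy, widened to f32 in the f16 case since no f16 variant exists:

  //   pow(x, y) = exp2(y * log2(x))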
2542
2543// Find a source register, ignoring any possible source modifiers.
2544static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2545 Register ModSrc = OrigSrc;
2546 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2547 ModSrc = SrcFNeg->getOperand(1).getReg();
2548 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2549 ModSrc = SrcFAbs->getOperand(1).getReg();
2550 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2551 ModSrc = SrcFAbs->getOperand(1).getReg();
2552 return ModSrc;
2553}
2554
2555bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2556 MachineRegisterInfo &MRI,
2557 MachineIRBuilder &B) const {
2558
2559 const LLT S1 = LLT::scalar(1);
2560 const LLT S64 = LLT::scalar(64);
2561 Register Dst = MI.getOperand(0).getReg();
2562 Register OrigSrc = MI.getOperand(1).getReg();
2563 unsigned Flags = MI.getFlags();
2564 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2565 "this should not have been custom lowered");
2566
2567 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2568 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2569 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2570 // V_FRACT bug is:
2571 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2572 //
2573 // Convert floor(x) to (x - fract(x))
2574
2575 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2576 .addUse(OrigSrc)
2577 .setMIFlags(Flags);
2578
2579 // Give source modifier matching some assistance before obscuring a foldable
2580 // pattern.
2581
2582 // TODO: We can avoid the neg on the fract? The input sign to fract
2583 // shouldn't matter?
2584 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2585
2586 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2587
2588 Register Min = MRI.createGenericVirtualRegister(S64);
2589
2590 // We don't need to concern ourselves with the snan handling difference, so
2591 // use the one which will directly select.
2592 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2593 if (MFI->getMode().IEEE)
2594 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2595 else
2596 B.buildFMinNum(Min, Fract, Const, Flags);
2597
2598 Register CorrectedFract = Min;
2599 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2600 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2601 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2602 }
2603
2604 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2605 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2606
2607 MI.eraseFromParent();
2608 return true;
2609}
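
A scalar model of the fract-based formula in the comment above (a sketch only; V_FRACT_F64 is stood in for by its mathematical definition x - floor(x), which keeps the clamp and NaN handling visible):

  #include <cmath>

  static double floorViaFract(double X) {
    double Fract = X - std::floor(X);                     // stand-in for V_FRACT_F64
    double Min = std::fmin(Fract, 0x1.fffffffffffffp-1);  // largest double below 1.0
    double Corrected = std::isnan(X) ? X : Min;           // fract(x) per the comment
    return X - Corrected;                                 // floor(x) = x - fract(x)
  }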
2610
2611// Turn an illegal packed v2s16 build vector into bit operations.
2612// TODO: This should probably be a bitcast action in LegalizerHelper.
2613bool AMDGPULegalizerInfo::legalizeBuildVector(
2614 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2615 Register Dst = MI.getOperand(0).getReg();
2616 const LLT S32 = LLT::scalar(32);
2617 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2618
2619 Register Src0 = MI.getOperand(1).getReg();
2620 Register Src1 = MI.getOperand(2).getReg();
2621 assert(MRI.getType(Src0) == LLT::scalar(16));
2622
2623 auto Merge = B.buildMerge(S32, {Src0, Src1});
2624 B.buildBitcast(Dst, Merge);
2625
2626 MI.eraseFromParent();
2627 return true;
2628}
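
Concretely, since G_MERGE_VALUES places its first source operand in the least significant bits, the packing above amounts to the following bit layout (a sketch, not additional MIR):

  //   %merge:s32 = (zext(Src1) << 16) | zext(Src0)
  //   %dst:v2s16 = bitcast %merge       ; element 0 = Src0, element 1 = Src1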
2629
2630// Check that this is a G_XOR x, -1
2631static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2632 if (MI.getOpcode() != TargetOpcode::G_XOR)
2633 return false;
2634 auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
2635 return ConstVal && *ConstVal == -1;
2636}
2637
2638// Return the use branch instruction, otherwise null if the usage is invalid.
2639static MachineInstr *
2640verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2641 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2642 Register CondDef = MI.getOperand(0).getReg();
2643 if (!MRI.hasOneNonDBGUse(CondDef))
2644 return nullptr;
2645
2646 MachineBasicBlock *Parent = MI.getParent();
2647 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2648
2649 if (isNot(MRI, *UseMI)) {
2650 Register NegatedCond = UseMI->getOperand(0).getReg();
2651 if (!MRI.hasOneNonDBGUse(NegatedCond))
2652 return nullptr;
2653
2654 // We're deleting the def of this value, so we need to remove it.
2655 UseMI->eraseFromParent();
2656
2657 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2658 Negated = true;
2659 }
2660
2661 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2662 return nullptr;
2663
2664 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2665 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2666 if (Next == Parent->end()) {
2667 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2668 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2669 return nullptr;
2670 UncondBrTarget = &*NextMBB;
2671 } else {
2672 if (Next->getOpcode() != AMDGPU::G_BR)
2673 return nullptr;
2674 Br = &*Next;
2675 UncondBrTarget = Br->getOperand(0).getMBB();
2676 }
2677
2678 return UseMI;
2679}
2680
2681bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2682 const ArgDescriptor *Arg,
2683 const TargetRegisterClass *ArgRC,
2684 LLT ArgTy) const {
2685 MCRegister SrcReg = Arg->getRegister();
2686 assert(SrcReg.isPhysical() && "Physical register expected");
6
'?' condition is true
2687 assert(DstReg.isVirtual() && "Virtual register expected");
7
Assuming the condition is true
8
'?' condition is true
2688
2689 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2690 ArgTy);
2691 if (Arg->isMasked()) {
9
Calling 'ArgDescriptor::isMasked'
12
Returning from 'ArgDescriptor::isMasked'
13
Taking true branch
2692 // TODO: Should we try to emit this once in the entry block?
2693 const LLT S32 = LLT::scalar(32);
2694 const unsigned Mask = Arg->getMask();
2695 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
14
Calling 'countTrailingZeros<unsigned int>'
21
Returning from 'countTrailingZeros<unsigned int>'
22
'Shift' initialized to 32
2696
2697 Register AndMaskSrc = LiveIn;
2698
2699 if (Shift
22.1
'Shift' is not equal to 0
22.1
'Shift' is not equal to 0
22.1
'Shift' is not equal to 0
!= 0) {
23
Taking true branch
2700 auto ShiftAmt = B.buildConstant(S32, Shift);
2701 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2702 }
2703
2704 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
24
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
2705 } else {
2706 B.buildCopy(DstReg, LiveIn);
2707 }
2708
2709 return true;
2710}
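
To make the warning above concrete: the reported path corresponds to a masked argument descriptor whose mask is zero (a hypothetical value, shown only for illustration; countTrailingZeros is llvm::countTrailingZeros from Support/MathExtras.h, whose default ZB_Width behaviour returns the bit width for a zero input):

  unsigned Mask = 0;                                    // value assumed by the analyzer
  unsigned Shift = countTrailingZeros<unsigned>(Mask);  // returns 32 when Mask == 0
  unsigned Upper = Mask >> Shift;                       // shift by 32: undefined behaviour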
2711
2712bool AMDGPULegalizerInfo::loadInputValue(
2713 Register DstReg, MachineIRBuilder &B,
2714 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2715 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2716 const ArgDescriptor *Arg;
2717 const TargetRegisterClass *ArgRC;
2718 LLT ArgTy;
2719 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2720
2721 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4
Taking false branch
2722 return false; // TODO: Handle these
2723 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5
Calling 'AMDGPULegalizerInfo::loadInputValue'
2724}
2725
2726bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2727 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2728 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2729 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3
Calling 'AMDGPULegalizerInfo::loadInputValue'
2730 return false;
2731
2732 MI.eraseFromParent();
2733 return true;
2734}
2735
2736bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2737 MachineRegisterInfo &MRI,
2738 MachineIRBuilder &B) const {
2739 Register Dst = MI.getOperand(0).getReg();
2740 LLT DstTy = MRI.getType(Dst);
2741 LLT S16 = LLT::scalar(16);
2742 LLT S32 = LLT::scalar(32);
2743 LLT S64 = LLT::scalar(64);
2744
2745 if (legalizeFastUnsafeFDIV(MI, MRI, B))
2746 return true;
2747
2748 if (DstTy == S16)
2749 return legalizeFDIV16(MI, MRI, B);
2750 if (DstTy == S32)
2751 return legalizeFDIV32(MI, MRI, B);
2752 if (DstTy == S64)
2753 return legalizeFDIV64(MI, MRI, B);
2754
2755 return false;
2756}
2757
2758void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2759 Register DstReg,
2760 Register X,
2761 Register Y,
2762 bool IsDiv) const {
2763 const LLT S1 = LLT::scalar(1);
2764 const LLT S32 = LLT::scalar(32);
2765
2766 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2767 // algorithm used here.
2768
2769 // Initial estimate of inv(y).
2770 auto FloatY = B.buildUITOFP(S32, Y);
2771 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2772 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2773 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2774 auto Z = B.buildFPTOUI(S32, ScaledY);
2775
2776 // One round of UNR.
2777 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2778 auto NegYZ = B.buildMul(S32, NegY, Z);
2779 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2780
2781 // Quotient/remainder estimate.
2782 auto Q = B.buildUMulH(S32, X, Z);
2783 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2784
2785 // First quotient/remainder refinement.
2786 auto One = B.buildConstant(S32, 1);
2787 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2788 if (IsDiv)
2789 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2790 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2791
2792 // Second quotient/remainder refinement.
2793 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2794 if (IsDiv)
2795 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2796 else
2797 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2798}
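
A scalar sketch of the refinement performed above (my restatement; it assumes the single UNR round leaves the quotient estimate at most two below the true quotient, which is why the MIR applies exactly two conditional corrections):

  #include <cstdint>

  static void udivRem32(uint32_t X, uint32_t Y, uint32_t Z, // Z: slight underestimate of 2^32 / Y
                        uint32_t &Q, uint32_t &R) {
    Q = uint32_t(((uint64_t)X * Z) >> 32);  // G_UMULH of X and the reciprocal
    R = X - Q * Y;                          // remainder estimate
    for (int I = 0; I < 2; ++I) {           // the two select-based corrections
      if (R >= Y) {
        ++Q;
        R -= Y;
      }
    }
  }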
2799
2800bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2801 MachineRegisterInfo &MRI,
2802 MachineIRBuilder &B) const {
2803 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2804 Register DstReg = MI.getOperand(0).getReg();
2805 Register Num = MI.getOperand(1).getReg();
2806 Register Den = MI.getOperand(2).getReg();
2807 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2808 MI.eraseFromParent();
2809 return true;
2810}
2811
2812 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2813//
2814// Return lo, hi of result
2815//
2816// %cvt.lo = G_UITOFP Val.lo
2817// %cvt.hi = G_UITOFP Val.hi
2818// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2819// %rcp = G_AMDGPU_RCP_IFLAG %mad
2820// %mul1 = G_FMUL %rcp, 0x5f7ffffc
2821// %mul2 = G_FMUL %mul1, 2**(-32)
2822// %trunc = G_INTRINSIC_TRUNC %mul2
2823// %mad2 = G_FMAD %trunc, -(2**32), %mul1
2824// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2825static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2826 Register Val) {
2827 const LLT S32 = LLT::scalar(32);
2828 auto Unmerge = B.buildUnmerge(S32, Val);
2829
2830 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2831 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2832
2833 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2834 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2835
2836 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2837 auto Mul1 =
2838 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2839
2840 // 2**(-32)
2841 auto Mul2 =
2842 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2843 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2844
2845 // -(2**32)
2846 auto Mad2 = B.buildFMAD(S32, Trunc,
2847 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2848
2849 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2850 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2851
2852 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2853}
2854
2855void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2856 Register DstReg,
2857 Register Numer,
2858 Register Denom,
2859 bool IsDiv) const {
2860 const LLT S32 = LLT::scalar(32);
2861 const LLT S64 = LLT::scalar(64);
2862 const LLT S1 = LLT::scalar(1);
2863 Register RcpLo, RcpHi;
2864
2865 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2866
2867 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2868
2869 auto Zero64 = B.buildConstant(S64, 0);
2870 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2871
2872 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2873 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2874
2875 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2876 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2877 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2878
2879 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2880 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2881 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2882 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2883
2884 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2885 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2886 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2887 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2888 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2889
2890 auto Zero32 = B.buildConstant(S32, 0);
2891 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2892 auto Add2_HiC =
2893 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2894 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2895 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2896
2897 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2898 Register NumerLo = UnmergeNumer.getReg(0);
2899 Register NumerHi = UnmergeNumer.getReg(1);
2900
2901 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2902 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2903 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2904 Register Mul3_Lo = UnmergeMul3.getReg(0);
2905 Register Mul3_Hi = UnmergeMul3.getReg(1);
2906 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2907 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2908 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2909 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2910
2911 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2912 Register DenomLo = UnmergeDenom.getReg(0);
2913 Register DenomHi = UnmergeDenom.getReg(1);
2914
2915 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2916 auto C1 = B.buildSExt(S32, CmpHi);
2917
2918 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2919 auto C2 = B.buildSExt(S32, CmpLo);
2920
2921 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2922 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2923
2924 // TODO: Here and below portions of the code can be enclosed into if/endif.
2925 // Currently control flow is unconditional and we have 4 selects after
2926 // potential endif to substitute PHIs.
2927
2928 // if C3 != 0 ...
2929 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2930 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2931 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2932 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2933
2934 auto One64 = B.buildConstant(S64, 1);
2935 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2936
2937 auto C4 =
2938 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2939 auto C5 =
2940 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2941 auto C6 = B.buildSelect(
2942 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2943
2944 // if (C6 != 0)
2945 auto Add4 = B.buildAdd(S64, Add3, One64);
2946 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2947
2948 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2949 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2950 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2951
2952 // endif C6
2953 // endif C3
2954
2955 if (IsDiv) {
2956 auto Sel1 = B.buildSelect(
2957 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2958 B.buildSelect(DstReg,
2959 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2960 } else {
2961 auto Sel2 = B.buildSelect(
2962 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2963 B.buildSelect(DstReg,
2964 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2965 }
2966}
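The long MIR sequence above follows the usual shape of reciprocal-based 64-bit division: an estimated quotient followed by up to two conditional corrections. A minimal scalar sketch of the correction step, assuming the estimate is never more than two below the true quotient (the names are illustrative, not from the code above):

#include <cstdint>

// Scalar shape of the correction steps: Q plays the role of MulHi3, R of Sub1,
// and the two conditional fixes correspond to the "if C3 != 0" and
// "if (C6 != 0)" regions above.
uint64_t udiv64Corrected(uint64_t Num, uint64_t Den, uint64_t QuotEstimate) {
  uint64_t Q = QuotEstimate;
  uint64_t R = Num - Q * Den;
  if (R >= Den) { ++Q; R -= Den; }  // first correction
  if (R >= Den) { ++Q; R -= Den; }  // second correction
  return Q;                         // R would be the remainder for UREM
}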
2967
2968bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2969 MachineRegisterInfo &MRI,
2970 MachineIRBuilder &B) const {
2971 const LLT S64 = LLT::scalar(64);
2972 const LLT S32 = LLT::scalar(32);
2973 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2974 Register DstReg = MI.getOperand(0).getReg();
2975 Register Num = MI.getOperand(1).getReg();
2976 Register Den = MI.getOperand(2).getReg();
2977 LLT Ty = MRI.getType(DstReg);
2978
2979 if (Ty == S32)
2980 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2981 else if (Ty == S64)
2982 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2983 else
2984 return false;
2985
2986 MI.eraseFromParent();
2987 return true;
2988
2989}
2990
2991bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2992 MachineRegisterInfo &MRI,
2993 MachineIRBuilder &B) const {
2994 const LLT S64 = LLT::scalar(64);
2995 const LLT S32 = LLT::scalar(32);
2996
2997 Register DstReg = MI.getOperand(0).getReg();
2998 const LLT Ty = MRI.getType(DstReg);
2999 if (Ty != S32 && Ty != S64)
3000 return false;
3001
3002 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3003
3004 Register LHS = MI.getOperand(1).getReg();
3005 Register RHS = MI.getOperand(2).getReg();
3006
3007 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3008 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3009 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3010
3011 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3012 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3013
3014 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3015 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3016
3017 Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3018 if (Ty == S32)
3019 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3020 else
3021 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3022
3023 Register Sign;
3024 if (IsDiv)
3025 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3026 else
3027 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3028
3029 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3030 B.buildSub(DstReg, UDivRem, Sign);
3031
3032 MI.eraseFromParent();
3033 return true;
3034}
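The signed path reduces to the unsigned routine via the standard sign trick: abs(x) == (x + s) ^ s with s = x >> 31, and the result is re-signed as (u ^ s) - s. A 32-bit scalar sketch of that identity, assuming an arithmetic right shift and wrap-around arithmetic as the generated MIR uses; this is a model, not the emitted code:

#include <cstdint>

// Scalar model of the sign handling above; RHS must be non-zero.
int32_t signedDivRemViaUnsigned(int32_t LHS, int32_t RHS, bool IsDiv) {
  int32_t LHSign = LHS >> 31; // 0 or -1
  int32_t RHSign = RHS >> 31;
  uint32_t AbsL = ((uint32_t)LHS + (uint32_t)LHSign) ^ (uint32_t)LHSign;
  uint32_t AbsR = ((uint32_t)RHS + (uint32_t)RHSign) ^ (uint32_t)RHSign;
  uint32_t U = IsDiv ? AbsL / AbsR : AbsL % AbsR;
  // Quotient sign is the XOR of the operand signs; remainder takes LHS's sign.
  uint32_t Sign = (uint32_t)(IsDiv ? (LHSign ^ RHSign) : LHSign);
  return (int32_t)((U ^ Sign) - Sign);
}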
3035
3036bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3037 MachineRegisterInfo &MRI,
3038 MachineIRBuilder &B) const {
3039 Register Res = MI.getOperand(0).getReg();
3040 Register LHS = MI.getOperand(1).getReg();
3041 Register RHS = MI.getOperand(2).getReg();
3042
3043 uint16_t Flags = MI.getFlags();
3044
3045 LLT ResTy = MRI.getType(Res);
3046 LLT S32 = LLT::scalar(32);
3047 LLT S64 = LLT::scalar(64);
3048
3049 const MachineFunction &MF = B.getMF();
3050 bool Unsafe =
3051 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3052
3053 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3054 return false;
3055
3056 if (!Unsafe && ResTy == S32 &&
3057 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3058 return false;
3059
3060 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3061 // 1 / x -> RCP(x)
3062 if (CLHS->isExactlyValue(1.0)) {
3063 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3064 .addUse(RHS)
3065 .setMIFlags(Flags);
3066
3067 MI.eraseFromParent();
3068 return true;
3069 }
3070
3071 // -1 / x -> RCP( FNEG(x) )
3072 if (CLHS->isExactlyValue(-1.0)) {
3073 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3074 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3075 .addUse(FNeg.getReg(0))
3076 .setMIFlags(Flags);
3077
3078 MI.eraseFromParent();
3079 return true;
3080 }
3081 }
3082
3083 // x / y -> x * (1.0 / y)
3084 if (Unsafe) {
3085 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3086 .addUse(RHS)
3087 .setMIFlags(Flags);
3088 B.buildFMul(Res, LHS, RCP, Flags);
3089
3090 MI.eraseFromParent();
3091 return true;
3092 }
3093
3094 return false;
3095}
3096
3097bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3098 MachineRegisterInfo &MRI,
3099 MachineIRBuilder &B) const {
3100 Register Res = MI.getOperand(0).getReg();
3101 Register LHS = MI.getOperand(1).getReg();
3102 Register RHS = MI.getOperand(2).getReg();
3103
3104 uint16_t Flags = MI.getFlags();
3105
3106 LLT S16 = LLT::scalar(16);
3107 LLT S32 = LLT::scalar(32);
3108
3109 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3110 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3111
3112 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3113 .addUse(RHSExt.getReg(0))
3114 .setMIFlags(Flags);
3115
3116 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3117 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3118
3119 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3120 .addUse(RDst.getReg(0))
3121 .addUse(RHS)
3122 .addUse(LHS)
3123 .setMIFlags(Flags);
3124
3125 MI.eraseFromParent();
3126 return true;
3127}
3128
3129// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3130// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3131static void toggleSPDenormMode(bool Enable,
3132 MachineIRBuilder &B,
3133 const GCNSubtarget &ST,
3134 AMDGPU::SIModeRegisterDefaults Mode) {
3135 // Set SP denorm mode to this value.
3136 unsigned SPDenormMode =
3137 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3138
3139 if (ST.hasDenormModeInst()) {
3140 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3141 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3142
3143 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3144 B.buildInstr(AMDGPU::S_DENORM_MODE)
3145 .addImm(NewDenormModeValue);
3146
3147 } else {
3148 // Select FP32 bit field in mode register.
3149 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3150 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3151 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3152
3153 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3154 .addImm(SPDenormMode)
3155 .addImm(SPDenormModeBitField);
3156 }
3157}
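When S_DENORM_MODE is unavailable, the S_SETREG immediate above packs the hardware register ID together with the bit field's offset and width. A small sketch of that packing, assuming the usual hwreg layout (ID in bits [5:0], offset in [10:6], width-1 in [15:11]); the authoritative shift constants live in AMDGPU::Hwreg:

#include <cstdio>

// Assumed layout constants; the real values come from the AMDGPU headers.
constexpr unsigned ID_MODE = 1;        // the MODE hardware register
constexpr unsigned OFFSET_SHIFT = 6;
constexpr unsigned WIDTH_M1_SHIFT = 11;

int main() {
  // FP32 denorm control: a 2-bit field starting at bit 4 of MODE.
  unsigned SPDenormModeBitField =
      ID_MODE | (4u << OFFSET_SHIFT) | (1u << WIDTH_M1_SHIFT);
  std::printf("S_SETREG field immediate = 0x%x\n", SPDenormModeBitField);
  return 0;
}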
3158
3159bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3160 MachineRegisterInfo &MRI,
3161 MachineIRBuilder &B) const {
3162 Register Res = MI.getOperand(0).getReg();
3163 Register LHS = MI.getOperand(1).getReg();
3164 Register RHS = MI.getOperand(2).getReg();
3165 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3166 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3167
3168 uint16_t Flags = MI.getFlags();
3169
3170 LLT S32 = LLT::scalar(32);
3171 LLT S1 = LLT::scalar(1);
3172
3173 auto One = B.buildFConstant(S32, 1.0f);
3174
3175 auto DenominatorScaled =
3176 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3177 .addUse(LHS)
3178 .addUse(RHS)
3179 .addImm(0)
3180 .setMIFlags(Flags);
3181 auto NumeratorScaled =
3182 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3183 .addUse(LHS)
3184 .addUse(RHS)
3185 .addImm(1)
3186 .setMIFlags(Flags);
3187
3188 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3189 .addUse(DenominatorScaled.getReg(0))
3190 .setMIFlags(Flags);
3191 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3192
3193 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3194 // aren't modeled as reading it.
3195 if (!Mode.allFP32Denormals())
3196 toggleSPDenormMode(true, B, ST, Mode);
3197
3198 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3199 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3200 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3201 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3202 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3203 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3204
3205 if (!Mode.allFP32Denormals())
3206 toggleSPDenormMode(false, B, ST, Mode);
3207
3208 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3209 .addUse(Fma4.getReg(0))
3210 .addUse(Fma1.getReg(0))
3211 .addUse(Fma3.getReg(0))
3212 .addUse(NumeratorScaled.getReg(1))
3213 .setMIFlags(Flags);
3214
3215 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3216 .addUse(Fmas.getReg(0))
3217 .addUse(RHS)
3218 .addUse(LHS)
3219 .setMIFlags(Flags);
3220
3221 MI.eraseFromParent();
3222 return true;
3223}
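Ignoring the div_scale/div_fmas/div_fixup bookkeeping and the denorm-mode toggling, the FMA chain above is a Newton-Raphson refinement of an initial reciprocal estimate. A scalar sketch of just that refinement; the 1.0f/RHS stands in for the hardware rcp, so this is a model of the shape, not the emitted sequence:

#include <cmath>

// Scalar model of the refinement chain in legalizeFDIV32.
float fdiv32Refine(float LHS, float RHS) {
  float ApproxRcp = 1.0f / RHS;              // stand-in for V_RCP_F32
  float NegDen = -RHS;
  float Fma0 = std::fma(NegDen, ApproxRcp, 1.0f);
  float Fma1 = std::fma(Fma0, ApproxRcp, ApproxRcp);
  float Mul  = LHS * Fma1;
  float Fma2 = std::fma(NegDen, Mul, LHS);
  float Fma3 = std::fma(Fma2, Fma1, Mul);
  float Fma4 = std::fma(NegDen, Fma3, LHS);
  return std::fma(Fma4, Fma1, Fma3);         // roughly what div_fmas computes
}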
3224
3225bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3226 MachineRegisterInfo &MRI,
3227 MachineIRBuilder &B) const {
3228 Register Res = MI.getOperand(0).getReg();
3229 Register LHS = MI.getOperand(1).getReg();
3230 Register RHS = MI.getOperand(2).getReg();
3231
3232 uint16_t Flags = MI.getFlags();
3233
3234 LLT S64 = LLT::scalar(64);
3235 LLT S1 = LLT::scalar(1);
3236
3237 auto One = B.buildFConstant(S64, 1.0);
3238
3239 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3240 .addUse(LHS)
3241 .addUse(RHS)
3242 .addImm(0)
3243 .setMIFlags(Flags);
3244
3245 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3246
3247 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3248 .addUse(DivScale0.getReg(0))
3249 .setMIFlags(Flags);
3250
3251 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3252 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3253 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3254
3255 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3256 .addUse(LHS)
3257 .addUse(RHS)
3258 .addImm(1)
3259 .setMIFlags(Flags);
3260
3261 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3262 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3263 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3264
3265 Register Scale;
3266 if (!ST.hasUsableDivScaleConditionOutput()) {
3267 // Workaround a hardware bug on SI where the condition output from div_scale
3268 // is not usable.
3269
3270 LLT S32 = LLT::scalar(32);
3271
3272 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3273 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3274 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3275 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3276
3277 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3278 Scale1Unmerge.getReg(1));
3279 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3280 Scale0Unmerge.getReg(1));
3281 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3282 } else {
3283 Scale = DivScale1.getReg(1);
3284 }
3285
3286 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3287 .addUse(Fma4.getReg(0))
3288 .addUse(Fma3.getReg(0))
3289 .addUse(Mul.getReg(0))
3290 .addUse(Scale)
3291 .setMIFlags(Flags);
3292
3293 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3294 .addUse(Fmas.getReg(0))
3295 .addUse(RHS)
3296 .addUse(LHS)
3297 .setMIFlags(Flags);
3298
3299 MI.eraseFromParent();
3300 return true;
3301}
3302
3303bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3304 MachineRegisterInfo &MRI,
3305 MachineIRBuilder &B) const {
3306 Register Res = MI.getOperand(0).getReg();
3307 Register LHS = MI.getOperand(2).getReg();
3308 Register RHS = MI.getOperand(3).getReg();
3309 uint16_t Flags = MI.getFlags();
3310
3311 LLT S32 = LLT::scalar(32);
3312 LLT S1 = LLT::scalar(1);
3313
3314 auto Abs = B.buildFAbs(S32, RHS, Flags);
3315 const APFloat C0Val(1.0f);
3316
3317 auto C0 = B.buildConstant(S32, 0x6f800000);
3318 auto C1 = B.buildConstant(S32, 0x2f800000);
3319 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3320
3321 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3322 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3323
3324 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3325
3326 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3327 .addUse(Mul0.getReg(0))
3328 .setMIFlags(Flags);
3329
3330 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3331
3332 B.buildFMul(Res, Sel, Mul1, Flags);
3333
3334 MI.eraseFromParent();
3335 return true;
3336}
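The constants here implement a range-scaling trick: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32, so a huge denominator is pre-scaled before taking the reciprocal and the same scale is folded back into the result. A scalar sketch under those assumptions, with 1.0f/x standing in for the rcp intrinsic:

#include <cmath>

// Scalar model of the fast-fdiv scaling above.
float fdivFast(float X, float Y) {
  const float Big   = 0x1.0p96f;   // bit pattern 0x6f800000
  const float Scale = 0x1.0p-32f;  // bit pattern 0x2f800000
  float Sel = std::fabs(Y) > Big ? Scale : 1.0f;
  float Rcp = 1.0f / (Y * Sel);    // stand-in for V_RCP_F32
  return Sel * (X * Rcp);
}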
3337
3338// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3339// FIXME: Why do we handle this one but not other removed instructions?
3340//
3341// Reciprocal square root. The clamp prevents infinite results, clamping
3342// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3343// +-max_float.
3344bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3345 MachineRegisterInfo &MRI,
3346 MachineIRBuilder &B) const {
3347 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3348 return true;
3349
3350 Register Dst = MI.getOperand(0).getReg();
3351 Register Src = MI.getOperand(2).getReg();
3352 auto Flags = MI.getFlags();
3353
3354 LLT Ty = MRI.getType(Dst);
3355
3356 const fltSemantics *FltSemantics;
3357 if (Ty == LLT::scalar(32))
3358 FltSemantics = &APFloat::IEEEsingle();
3359 else if (Ty == LLT::scalar(64))
3360 FltSemantics = &APFloat::IEEEdouble();
3361 else
3362 return false;
3363
3364 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3365 .addUse(Src)
3366 .setMIFlags(Flags);
3367
3368 // We don't need to concern ourselves with the snan handling difference, since
3369 // the rsq already quieted (or not), so use the one which will directly select.
3370 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3371 const bool UseIEEE = MFI->getMode().IEEE;
3372
3373 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3374 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3375 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3376
3377 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3378
3379 if (UseIEEE)
3380 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3381 else
3382 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3383 MI.eraseFromParent();
3384 return true;
3385}
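The clamp built above is just a min/max against +/-max_float so that 1/sqrt(0) does not produce an infinity. A scalar double-precision sketch of the same idea, leaving out the IEEE/non-IEEE min/max distinction:

#include <cmath>
#include <limits>

// Scalar model of rsq.clamp: reciprocal square root clamped to +/-max_float.
double rsqClamp(double X) {
  const double MaxFlt = std::numeric_limits<double>::max();
  double Rsq = 1.0 / std::sqrt(X);
  return std::fmax(std::fmin(Rsq, MaxFlt), -MaxFlt);
}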
3386
3387static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3388 switch (IID) {
3389 case Intrinsic::amdgcn_ds_fadd:
3390 return AMDGPU::G_ATOMICRMW_FADD;
3391 case Intrinsic::amdgcn_ds_fmin:
3392 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3393 case Intrinsic::amdgcn_ds_fmax:
3394 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3395 default:
3396 llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 3396)
;
3397 }
3398}
3399
3400bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3401 MachineInstr &MI,
3402 Intrinsic::ID IID) const {
3403 GISelChangeObserver &Observer = Helper.Observer;
3404 Observer.changingInstr(MI);
3405
3406 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3407
3408 // The remaining operands were used to set fields in the MemOperand on
3409 // construction.
3410 for (int I = 6; I > 3; --I)
3411 MI.RemoveOperand(I);
3412
3413 MI.RemoveOperand(1); // Remove the intrinsic ID.
3414 Observer.changedInstr(MI);
3415 return true;
3416}
3417
3418bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3419 MachineRegisterInfo &MRI,
3420 MachineIRBuilder &B) const {
3421 uint64_t Offset =
3422 ST.getTargetLowering()->getImplicitParameterOffset(
3423 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3424 LLT DstTy = MRI.getType(DstReg);
3425 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3426
3427 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3428 if (!loadInputValue(KernargPtrReg, B,
3429 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3430 return false;
3431
3432 // FIXME: This should be nuw
3433 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3434 return true;
3435}
3436
3437bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3438 MachineRegisterInfo &MRI,
3439 MachineIRBuilder &B) const {
3440 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3441 if (!MFI->isEntryFunction()) {
3442 return legalizePreloadedArgIntrin(MI, MRI, B,
3443 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3444 }
3445
3446 Register DstReg = MI.getOperand(0).getReg();
3447 if (!getImplicitArgPtr(DstReg, MRI, B))
3448 return false;
3449
3450 MI.eraseFromParent();
3451 return true;
3452}
3453
3454bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3455 MachineRegisterInfo &MRI,
3456 MachineIRBuilder &B,
3457 unsigned AddrSpace) const {
3458 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3459 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3460 Register Hi32 = Unmerge.getReg(1);
3461
3462 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3463 MI.eraseFromParent();
3464 return true;
3465}
3466
3467// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3468// offset (the offset that is included in bounds checking and swizzling, to be
3469// split between the instruction's voffset and immoffset fields) and soffset
3470// (the offset that is excluded from bounds checking and swizzling, to go in
3471// the instruction's soffset field). This function takes the first kind of
3472// offset and figures out how to split it between voffset and immoffset.
3473std::tuple<Register, unsigned, unsigned>
3474AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3475 Register OrigOffset) const {
3476 const unsigned MaxImm = 4095;
3477 Register BaseReg;
3478 unsigned TotalConstOffset;
3479 MachineInstr *OffsetDef;
3480 const LLT S32 = LLT::scalar(32);
3481
3482 std::tie(BaseReg, TotalConstOffset, OffsetDef)
3483 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3484
3485 unsigned ImmOffset = TotalConstOffset;
3486
3487 // If the immediate value is too big for the immoffset field, put the value
3488 // and -4096 into the immoffset field so that the value that is copied/added
3489 // for the voffset field is a multiple of 4096, and it stands more chance
3490 // of being CSEd with the copy/add for another similar load/store.
3491 // However, do not do that rounding down to a multiple of 4096 if that is a
3492 // negative number, as it appears to be illegal to have a negative offset
3493 // in the vgpr, even if adding the immediate offset makes it positive.
3494 unsigned Overflow = ImmOffset & ~MaxImm;
3495 ImmOffset -= Overflow;
3496 if ((int32_t)Overflow < 0) {
3497 Overflow += ImmOffset;
3498 ImmOffset = 0;
3499 }
3500
3501 if (Overflow != 0) {
3502 if (!BaseReg) {
3503 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3504 } else {
3505 auto OverflowVal = B.buildConstant(S32, Overflow);
3506 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3507 }
3508 }
3509
3510 if (!BaseReg)
3511 BaseReg = B.buildConstant(S32, 0).getReg(0);
3512
3513 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3514}
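The arithmetic in splitBufferOffsets is easy to check in isolation: keep at most 12 bits in the immediate, push the 4096-aligned part into the register offset, and fall back to a zero immediate if the register part would come out negative. A scalar sketch of just that split; the names are illustrative:

#include <cstdint>
#include <utility>

// Returns {register part, immediate part} of a constant buffer offset.
std::pair<uint32_t, uint32_t> splitConstOffset(uint32_t TotalConstOffset) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = TotalConstOffset;
  uint32_t Overflow = ImmOffset & ~MaxImm; // 4096-aligned part
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Don't leave a negative value for the register offset.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}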
3515
3516/// Handle register layout difference for f16 images for some subtargets.
3517Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3518 MachineRegisterInfo &MRI,
3519 Register Reg) const {
3520 if (!ST.hasUnpackedD16VMem())
3521 return Reg;
3522
3523 const LLT S16 = LLT::scalar(16);
3524 const LLT S32 = LLT::scalar(32);
3525 LLT StoreVT = MRI.getType(Reg);
3526 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3527
3528 auto Unmerge = B.buildUnmerge(S16, Reg);
3529
3530 SmallVector<Register, 4> WideRegs;
3531 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3532 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3533
3534 int NumElts = StoreVT.getNumElements();
3535
3536 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3537}
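On subtargets with unpacked D16 VMEM, each 16-bit element ends up occupying a full 32-bit register instead of sharing a dword with its neighbor, which is what the any-extend loop above produces. A tiny value-level sketch of that layout change, modeled with a zero-extend (the MIR any-extend leaves the high half undefined):

#include <cstdint>
#include <vector>

// Widen each 16-bit element into its own 32-bit slot (unpacked D16 layout).
std::vector<uint32_t> unpackD16(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Wide;
  Wide.reserve(Elts.size());
  for (uint16_t E : Elts)
    Wide.push_back(E); // low 16 bits carry the value
  return Wide;
}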
3538
3539Register AMDGPULegalizerInfo::fixStoreSourceType(
3540 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3541 MachineRegisterInfo *MRI = B.getMRI();
3542 LLT Ty = MRI->getType(VData);
3543
3544 const LLT S16 = LLT::scalar(16);
3545
3546 // Fixup illegal register types for i8 stores.
3547 if (Ty == LLT::scalar(8) || Ty == S16) {
3548 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3549 return AnyExt;
3550 }
3551
3552 if (Ty.isVector()) {
3553 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3554 if (IsFormat)
3555 return handleD16VData(B, *MRI, VData);
3556 }
3557 }
3558
3559 return VData;
3560}
3561
3562bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3563 MachineRegisterInfo &MRI,
3564 MachineIRBuilder &B,
3565 bool IsTyped,
3566 bool IsFormat) const {
3567 Register VData = MI.getOperand(1).getReg();
3568 LLT Ty = MRI.getType(VData);
3569 LLT EltTy = Ty.getScalarType();
3570 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3571 const LLT S32 = LLT::scalar(32);
3572
3573 VData = fixStoreSourceType(B, VData, IsFormat);
3574 Register RSrc = MI.getOperand(2).getReg();
3575
3576 MachineMemOperand *MMO = *MI.memoperands_begin();
3577 const int MemSize = MMO->getSize();
3578
3579 unsigned ImmOffset;
3580 unsigned TotalOffset;
3581
3582 // The typed intrinsics add an immediate after the registers.
3583 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3584
3585 // The struct intrinsic variants add one additional operand over raw.
3586 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3587 Register VIndex;
3588 int OpOffset = 0;
3589 if (HasVIndex) {
3590 VIndex = MI.getOperand(3).getReg();
3591 OpOffset = 1;
3592 }
3593
3594 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3595 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3596
3597 unsigned Format = 0;
3598 if (IsTyped) {
3599 Format = MI.getOperand(5 + OpOffset).getImm();
3600 ++OpOffset;
3601 }
3602
3603 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3604
3605 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3606 if (TotalOffset != 0)
3607 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3608
3609 unsigned Opc;
3610 if (IsTyped) {
3611 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3612 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3613 } else if (IsFormat) {
3614 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3615 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3616 } else {
3617 switch (MemSize) {
3618 case 1:
3619 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3620 break;
3621 case 2:
3622 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3623 break;
3624 default:
3625 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3626 break;
3627 }
3628 }
3629
3630 if (!VIndex)
3631 VIndex = B.buildConstant(S32, 0).getReg(0);
3632
3633 auto MIB = B.buildInstr(Opc)
3634 .addUse(VData) // vdata
3635 .addUse(RSrc) // rsrc
3636 .addUse(VIndex) // vindex
3637 .addUse(VOffset) // voffset
3638 .addUse(SOffset) // soffset
3639 .addImm(ImmOffset); // offset(imm)
3640
3641 if (IsTyped)
3642 MIB.addImm(Format);
3643
3644 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3645 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3646 .addMemOperand(MMO);
3647
3648 MI.eraseFromParent();
3649 return true;
3650}
3651
3652bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3653 MachineRegisterInfo &MRI,
3654 MachineIRBuilder &B,
3655 bool IsFormat,
3656 bool IsTyped) const {
3657 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3658 MachineMemOperand *MMO = *MI.memoperands_begin();
3659 const int MemSize = MMO->getSize();
3660 const LLT S32 = LLT::scalar(32);
3661
3662 Register Dst = MI.getOperand(0).getReg();
3663 Register RSrc = MI.getOperand(2).getReg();
3664
3665 // The typed intrinsics add an immediate after the registers.
3666 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3667
3668 // The struct intrinsic variants add one additional operand over raw.
3669 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3670 Register VIndex;
3671 int OpOffset = 0;
3672 if (HasVIndex) {
3673 VIndex = MI.getOperand(3).getReg();
3674 OpOffset = 1;
3675 }
3676
3677 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3678 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3679
3680 unsigned Format = 0;
3681 if (IsTyped) {
3682 Format = MI.getOperand(5 + OpOffset).getImm();
3683 ++OpOffset;
3684 }
3685
3686 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3687 unsigned ImmOffset;
3688 unsigned TotalOffset;
3689
3690 LLT Ty = MRI.getType(Dst);
3691 LLT EltTy = Ty.getScalarType();
3692 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3693 const bool Unpacked = ST.hasUnpackedD16VMem();
3694
3695 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3696 if (TotalOffset != 0)
3697 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3698
3699 unsigned Opc;
3700
3701 if (IsTyped) {
3702 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3703 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3704 } else if (IsFormat) {
3705 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3706 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3707 } else {
3708 switch (MemSize) {
3709 case 1:
3710 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3711 break;
3712 case 2:
3713 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3714 break;
3715 default:
3716 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3717 break;
3718 }
3719 }
3720
3721 Register LoadDstReg;
3722
3723 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3724 LLT UnpackedTy = Ty.changeElementSize(32);
3725
3726 if (IsExtLoad)
3727 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3728 else if (Unpacked && IsD16 && Ty.isVector())
3729 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3730 else
3731 LoadDstReg = Dst;
3732
3733 if (!VIndex)
3734 VIndex = B.buildConstant(S32, 0).getReg(0);
3735
3736 auto MIB = B.buildInstr(Opc)
3737 .addDef(LoadDstReg) // vdata
3738 .addUse(RSrc) // rsrc
3739 .addUse(VIndex) // vindex
3740 .addUse(VOffset) // voffset
3741 .addUse(SOffset) // soffset
3742 .addImm(ImmOffset); // offset(imm)
3743
3744 if (IsTyped)
3745 MIB.addImm(Format);
3746
3747 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3748 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3749 .addMemOperand(MMO);
3750
3751 if (LoadDstReg != Dst) {
3752 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3753
3754 // Widen result for extending loads.
3755 if (IsExtLoad)
3756 B.buildTrunc(Dst, LoadDstReg);
3757 else {
3758 // Repack to original 16-bit vector result
3759 // FIXME: G_TRUNC should work, but legalization currently fails
3760 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3761 SmallVector<Register, 4> Repack;
3762 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3763 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3764 B.buildMerge(Dst, Repack);
3765 }
3766 }
3767
3768 MI.eraseFromParent();
3769 return true;
3770}
3771
3772bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3773 MachineIRBuilder &B,
3774 bool IsInc) const {
3775 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3776 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3777 B.buildInstr(Opc)
3778 .addDef(MI.getOperand(0).getReg())
3779 .addUse(MI.getOperand(2).getReg())
3780 .addUse(MI.getOperand(3).getReg())
3781 .cloneMemRefs(MI);
3782 MI.eraseFromParent();
3783 return true;
3784}
3785
3786static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3787 switch (IntrID) {
3788 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3789 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3790 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3791 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3792 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3793 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3794 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3795 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3796 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3797 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3798 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3799 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3800 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3801 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3802 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3803 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3804 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3805 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3806 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3807 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3808 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3809 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3810 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3811 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3812 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3813 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3814 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3815 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3816 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3817 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3818 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3819 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3820 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3821 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3822 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3823 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3824 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3825 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3826 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3827 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3828 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3829 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3830 default:
3831 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 3831)
;
3832 }
3833}
3834
3835bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3836 MachineIRBuilder &B,
3837 Intrinsic::ID IID) const {
3838 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3839 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3840 const bool HasReturn = MI.getNumExplicitDefs() != 0;
3841
3842 Register Dst;
3843
3844 int OpOffset = 0;
3845 if (HasReturn) {
3846 // A few FP atomics do not support return values.
3847 Dst = MI.getOperand(0).getReg();
3848 } else {
3849 OpOffset = -1;
3850 }
3851
3852 Register VData = MI.getOperand(2 + OpOffset).getReg();
3853 Register CmpVal;
3854
3855 if (IsCmpSwap) {
3856 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3857 ++OpOffset;
3858 }
3859
3860 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3861 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3862
3863 // The struct intrinsic variants add one additional operand over raw.
3864 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3865 Register VIndex;
3866 if (HasVIndex) {
3867 VIndex = MI.getOperand(4 + OpOffset).getReg();
3868 ++OpOffset;
3869 }
3870
3871 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3872 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3873 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3874
3875 MachineMemOperand *MMO = *MI.memoperands_begin();
3876
3877 unsigned ImmOffset;
3878 unsigned TotalOffset;
3879 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3880 if (TotalOffset != 0)
3881 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3882
3883 if (!VIndex)
3884 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3885
3886 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3887
3888 if (HasReturn)
3889 MIB.addDef(Dst);
3890
3891 MIB.addUse(VData); // vdata
3892
3893 if (IsCmpSwap)
3894 MIB.addReg(CmpVal);
3895
3896 MIB.addUse(RSrc) // rsrc
3897 .addUse(VIndex) // vindex
3898 .addUse(VOffset) // voffset
3899 .addUse(SOffset) // soffset
3900 .addImm(ImmOffset) // offset(imm)
3901 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3902 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3903 .addMemOperand(MMO);
3904
3905 MI.eraseFromParent();
3906 return true;
3907}
3908
3909/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3910/// vector with s16 typed elements.
3911static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3912 SmallVectorImpl<Register> &PackedAddrs,
3913 int AddrIdx, int DimIdx, int EndIdx,
3914 int NumGradients) {
3915 const LLT S16 = LLT::scalar(16);
3916 const LLT V2S16 = LLT::vector(2, 16);
3917
3918 for (int I = AddrIdx; I < EndIdx; ++I) {
3919 MachineOperand &SrcOp = MI.getOperand(I);
3920 if (!SrcOp.isReg())
3921 continue; // _L to _LZ may have eliminated this.
3922
3923 Register AddrReg = SrcOp.getReg();
3924
3925 if (I < DimIdx) {
3926 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3927 PackedAddrs.push_back(AddrReg);
3928 } else {
3929 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3930 // derivatives dx/dh and dx/dv are packed with undef.
3931 if (((I + 1) >= EndIdx) ||
3932 ((NumGradients / 2) % 2 == 1 &&
3933 (I == DimIdx + (NumGradients / 2) - 1 ||
3934 I == DimIdx + NumGradients - 1)) ||
3935 // Check for _L to _LZ optimization
3936 !MI.getOperand(I + 1).isReg()) {
3937 PackedAddrs.push_back(
3938 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3939 .getReg(0));
3940 } else {
3941 PackedAddrs.push_back(
3942 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3943 .getReg(0));
3944 ++I;
3945 }
3946 }
3947 }
3948}
3949
3950/// Convert from separate vaddr components to a single vector address register,
3951/// and replace the remaining operands with $noreg.
3952static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3953 int DimIdx, int NumVAddrs) {
3954 const LLT S32 = LLT::scalar(32);
3955
3956 SmallVector<Register, 8> AddrRegs;
3957 for (int I = 0; I != NumVAddrs; ++I) {
3958 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3959 if (SrcOp.isReg()) {
3960 AddrRegs.push_back(SrcOp.getReg());
3961 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3962 }
3963 }
3964
3965 int NumAddrRegs = AddrRegs.size();
3966 if (NumAddrRegs != 1) {
3967 // Round up to 8 elements for v5-v7
3968 // FIXME: Missing intermediate sized register classes and instructions.
3969 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3970 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3971 auto Undef = B.buildUndef(S32);
3972 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3973 NumAddrRegs = RoundedNumRegs;
3974 }
3975
3976 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3977 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3978 }
3979
3980 for (int I = 1; I != NumVAddrs; ++I) {
3981 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3982 if (SrcOp.isReg())
3983 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3984 }
3985}
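The padding rule above exists because there is no v5/v6/v7 register class: 5, 6 or 7 address registers get padded with undefs up to 8. A small sketch of the resulting register count; this loop matches the effect of the NextPowerOf2 call for the 5..7 case but is not the LLVM helper itself:

#include <cstdint>

// Number of registers actually used for the packed vaddr vector.
uint32_t roundAddrRegCount(uint32_t NumAddrRegs) {
  if (NumAddrRegs <= 4)
    return NumAddrRegs;       // v1..v4 exist as register classes
  uint32_t Rounded = 1;
  while (Rounded < NumAddrRegs)
    Rounded <<= 1;
  return Rounded;             // 5..7 -> 8
}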
3986
3987/// Rewrite image intrinsics to use register layouts expected by the subtarget.
3988///
3989/// Depending on the subtarget, load/store with 16-bit element data need to be
3990/// rewritten to use the low half of 32-bit registers, or directly use a packed
3991/// layout. 16-bit addresses should also sometimes be packed into 32-bit
3992/// registers.
3993///
3994/// We don't want to directly select image instructions just yet, but also want
3995 /// to expose all register repacking to the legalizer/combiners. We also don't
3996 /// want a selected instruction entering RegBankSelect. In order to avoid
3997 /// defining a multitude of intermediate image instructions, directly hack on
3998 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3999/// now unnecessary arguments with $noreg.
4000bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4001 MachineInstr &MI, MachineIRBuilder &B,
4002 GISelChangeObserver &Observer,
4003 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
4004
4005 const int NumDefs = MI.getNumExplicitDefs();
4006 bool IsTFE = NumDefs == 2;
4007 // We are only processing the operands of d16 image operations on subtargets
4008 // that use the unpacked register layout, or need to repack the TFE result.
4009
4010 // TODO: Do we need to guard against already legalized intrinsics?
4011 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4012 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
4013
4014 MachineRegisterInfo *MRI = B.getMRI();
4015 const LLT S32 = LLT::scalar(32);
4016 const LLT S16 = LLT::scalar(16);
4017 const LLT V2S16 = LLT::vector(2, 16);
4018
4019 // Index of first address argument
4020 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
4021
4022 int NumVAddrs, NumGradients;
4023 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
4024 const int DMaskIdx = BaseOpcode->Atomic ? -1 :
4025 getDMaskIdx(BaseOpcode, NumDefs);
4026 unsigned DMask = 0;
4027
4028 // Check for 16 bit addresses and pack if true.
4029 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4030 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
4031 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
4032 const bool IsG16 = GradTy == S16;
4033 const bool IsA16 = AddrTy == S16;
4034
4035 int DMaskLanes = 0;
4036 if (!BaseOpcode->Atomic) {
4037 DMask = MI.getOperand(DMaskIdx).getImm();
4038 if (BaseOpcode->Gather4) {
4039 DMaskLanes = 4;
4040 } else if (DMask != 0) {
4041 DMaskLanes = countPopulation(DMask);
4042 } else if (!IsTFE && !BaseOpcode->Store) {
4043 // If dmask is 0, this is a no-op load. This can be eliminated.
4044 B.buildUndef(MI.getOperand(0));
4045 MI.eraseFromParent();
4046 return true;
4047 }
4048 }
4049
4050 Observer.changingInstr(MI);
4051 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4052
4053 unsigned NewOpcode = NumDefs == 0 ?
4054 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4055
4056 // Track that we legalized this
4057 MI.setDesc(B.getTII().get(NewOpcode));
4058
4059 // Expecting to get an error flag since TFC is on - and dmask is 0. Force
4060 // dmask to be at least 1 otherwise the instruction will fail
4061 if (IsTFE && DMask == 0) {
4062 DMask = 0x1;
4063 DMaskLanes = 1;
4064 MI.getOperand(DMaskIdx).setImm(DMask);
4065 }
4066
4067 if (BaseOpcode->Atomic) {
4068 Register VData0 = MI.getOperand(2).getReg();
4069 LLT Ty = MRI->getType(VData0);
4070
4071 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4072 if (Ty.isVector())
4073 return false;
4074
4075 if (BaseOpcode->AtomicX2) {
4076 Register VData1 = MI.getOperand(3).getReg();
4077 // The two values are packed in one register.
4078 LLT PackedTy = LLT::vector(2, Ty);
4079 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4080 MI.getOperand(2).setReg(Concat.getReg(0));
4081 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4082 }
4083 }
4084
4085 int CorrectedNumVAddrs = NumVAddrs;
4086
4087 // Optimize _L to _LZ when _L is zero
4088 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4089 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
4090 const ConstantFP *ConstantLod;
4091 const int LodIdx = AddrIdx + NumVAddrs - 1;
4092
4093 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
4094 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4095 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4096 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
4097 LZMappingInfo->LZ, ImageDimIntr->Dim);
4098
4099 // The starting indexes should remain in the same place.
4100 --NumVAddrs;
4101 --CorrectedNumVAddrs;
4102
4103 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
4104 static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
4105 MI.RemoveOperand(LodIdx);
4106 }
4107 }
4108 }
4109
4110 // Optimize _mip away, when 'lod' is zero
4111 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
4112 int64_t ConstantLod;
4113 const int LodIdx = AddrIdx + NumVAddrs - 1;
4114
4115 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
4116 if (ConstantLod == 0) {
4117 // TODO: Change intrinsic opcode and remove operand instead of replacing
4118 // it with 0, as the _L to _LZ handling is done above.
4119 MI.getOperand(LodIdx).ChangeToImmediate(0);
4120 --CorrectedNumVAddrs;
4121 }
4122 }
4123 }
4124
4125 // Rewrite the addressing register layout before doing anything else.
4126 if (IsA16 || IsG16) {
4127 if (IsA16) {
4128 // Target must support the feature and gradients need to be 16 bit too
4129 if (!ST.hasA16() || !IsG16)
4130 return false;
4131 } else if (!ST.hasG16())
4132 return false;
4133
4134 if (NumVAddrs > 1) {
4135 SmallVector<Register, 4> PackedRegs;
4136 // Don't compress addresses for G16
4137 const int PackEndIdx =
4138 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4139 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4140 PackEndIdx, NumGradients);
4141
4142 if (!IsA16) {
4143 // Add uncompressed address
4144 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4145 int AddrReg = MI.getOperand(I).getReg();
4146 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4147 PackedRegs.push_back(AddrReg);
4148 }
4149 }
4150
4151 // See also below in the non-a16 branch
4152 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4153
4154 if (!UseNSA && PackedRegs.size() > 1) {
4155 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4156 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4157 PackedRegs[0] = Concat.getReg(0);
4158 PackedRegs.resize(1);
4159 }
4160
4161 const int NumPacked = PackedRegs.size();
4162 for (int I = 0; I != NumVAddrs; ++I) {
4163 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4164 if (!SrcOp.isReg()) {
4165 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4166 continue;
4167 }
4168
4169 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4170
4171 if (I < NumPacked)
4172 SrcOp.setReg(PackedRegs[I]);
4173 else
4174 SrcOp.setReg(AMDGPU::NoRegister);
4175 }
4176 }
4177 } else {
4178 // If the register allocator cannot place the address registers contiguously
4179 // without introducing moves, then using the non-sequential address encoding
4180 // is always preferable, since it saves VALU instructions and is usually a
4181 // wash in terms of code size or even better.
4182 //
4183 // However, we currently have no way of hinting to the register allocator
4184 // that MIMG addresses should be placed contiguously when it is possible to
4185 // do so, so force non-NSA for the common 2-address case as a heuristic.
4186 //
4187 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4188 // allocation when possible.
4189 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4190
4191 if (!UseNSA && NumVAddrs > 1)
4192 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4193 }
4194
4195 int Flags = 0;
4196 if (IsA16)
4197 Flags |= 1;
4198 if (IsG16)
4199 Flags |= 2;
4200 MI.addOperand(MachineOperand::CreateImm(Flags));
4201
4202 if (BaseOpcode->Store) { // No TFE for stores?
4203 // TODO: Handle dmask trim
4204 Register VData = MI.getOperand(1).getReg();
4205 LLT Ty = MRI->getType(VData);
4206 if (!Ty.isVector() || Ty.getElementType() != S16)
4207 return true;
4208
4209 Register RepackedReg = handleD16VData(B, *MRI, VData);
4210 if (RepackedReg != VData) {
4211 MI.getOperand(1).setReg(RepackedReg);
4212 }
4213
4214 return true;
4215 }
4216
4217 Register DstReg = MI.getOperand(0).getReg();
4218 LLT Ty = MRI->getType(DstReg);
4219 const LLT EltTy = Ty.getScalarType();
4220 const bool IsD16 = Ty.getScalarType() == S16;
4221 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4222
4223 // Confirm that the return type is large enough for the dmask specified
4224 if (NumElts < DMaskLanes)
4225 return false;
4226
4227 if (NumElts > 4 || DMaskLanes > 4)
4228 return false;
4229
4230 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4231 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4232
4233 // The raw dword aligned data component of the load. The only legal cases
4234 // where this matters should be when using the packed D16 format, for
4235 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4236 LLT RoundedTy;
4237
4238 // S32 vector to cover all data, plus TFE result element.
4239 LLT TFETy;
4240
4241 // Register type to use for each loaded component. Will be S32 or V2S16.
4242 LLT RegTy;
4243
4244 if (IsD16 && ST.hasUnpackedD16VMem()) {
4245 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4246 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4247 RegTy = S32;
4248 } else {
4249 unsigned EltSize = EltTy.getSizeInBits();
4250 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4251 unsigned RoundedSize = 32 * RoundedElts;
4252 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4253 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4254 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4255 }
4256
4257 // The return type does not need adjustment.
4258 // TODO: Should we change s16 case to s32 or <2 x s16>?
4259 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4260 return true;
4261
4262 Register Dst1Reg;
4263
4264 // Insert after the instruction.
4265 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4266
4267 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4268 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4269 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4270 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4271
4272 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4273
4274 MI.getOperand(0).setReg(NewResultReg);
4275
4276 // In the IR, TFE is supposed to be used with a 2 element struct return
4277 // type. The instruction really returns these two values in one contiguous
4278 // register, with one additional dword beyond the loaded data. Rewrite the
4279 // return type to use a single register result.
4280
4281 if (IsTFE) {
4282 Dst1Reg = MI.getOperand(1).getReg();
4283 if (MRI->getType(Dst1Reg) != S32)
4284 return false;
4285
4286 // TODO: Make sure the TFE operand bit is set.
4287 MI.RemoveOperand(1);
4288
4289 // Handle the easy case that requires no repack instructions.
4290 if (Ty == S32) {
4291 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4292 return true;
4293 }
4294 }
4295
4296 // Now figure out how to copy the new result register back into the old
4297 // result.
4298 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4299
4300 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4301
4302 if (ResultNumRegs == 1) {
4303 assert(!IsTFE);
4304 ResultRegs[0] = NewResultReg;
4305 } else {
4306 // We have to repack into a new vector of some kind.
4307 for (int I = 0; I != NumDataRegs; ++I)
4308 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4309 B.buildUnmerge(ResultRegs, NewResultReg);
4310
4311 // Drop the final TFE element to get the data part. The TFE result is
4312 // directly written to the right place already.
4313 if (IsTFE)
4314 ResultRegs.resize(NumDataRegs);
4315 }
4316
4317 // For an s16 scalar result, we form an s32 result with a truncate regardless
4318 // of packed vs. unpacked.
4319 if (IsD16 && !Ty.isVector()) {
4320 B.buildTrunc(DstReg, ResultRegs[0]);
4321 return true;
4322 }
4323
4324 // Avoid a build/concat_vector of 1 entry.
4325 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4326 B.buildBitcast(DstReg, ResultRegs[0]);
4327 return true;
4328 }
4329
4330 assert(Ty.isVector());
4331
4332 if (IsD16) {
4333 // For packed D16 results with TFE enabled, all the data components are
4334 // S32. Cast back to the expected type.
4335 //
4336 // TODO: We don't really need to load s32 elements. We would only need one
4337 // cast for the TFE result if a multiple of v2s16 was used.
4338 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4339 for (Register &Reg : ResultRegs)
4340 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4341 } else if (ST.hasUnpackedD16VMem()) {
4342 for (Register &Reg : ResultRegs)
4343 Reg = B.buildTrunc(S16, Reg).getReg(0);
4344 }
4345 }
4346
4347 auto padWithUndef = [&](LLT Ty, int NumElts) {
4348 if (NumElts == 0)
4349 return;
4350 Register Undef = B.buildUndef(Ty).getReg(0);
4351 for (int I = 0; I != NumElts; ++I)
4352 ResultRegs.push_back(Undef);
4353 };
4354
4355 // Pad out any elements eliminated due to the dmask.
4356 LLT ResTy = MRI->getType(ResultRegs[0]);
4357 if (!ResTy.isVector()) {
4358 padWithUndef(ResTy, NumElts - ResultRegs.size());
4359 B.buildBuildVector(DstReg, ResultRegs);
4360 return true;
4361 }
4362
4363 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4364 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4365
4366 // Deal with the one annoying legal case.
4367 const LLT V3S16 = LLT::vector(3, 16);
4368 if (Ty == V3S16) {
4369 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4370 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4371 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4372 return true;
4373 }
4374
4375 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4376 B.buildConcatVectors(DstReg, ResultRegs);
4377 return true;
4378}
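A recurring detail in the function above is how dmask translates into result lanes: gather4 always produces four, otherwise it is the population count of the 4-bit mask, and a zero mask with TFE still needs one lane for the error flag. A compact sketch of that rule; this is an assumption-level model, not code from the file:

#include <bitset>

// Number of data lanes implied by a MIMG dmask.
int dmaskLanes(unsigned DMask, bool IsGather4, bool IsTFE) {
  if (IsGather4)
    return 4;
  int Lanes = (int)std::bitset<4>(DMask & 0xf).count();
  if (Lanes == 0 && IsTFE)
    return 1; // dmask forced to 1 so the TFE result has somewhere to go
  return Lanes;
}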
4379
4380bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4381 LegalizerHelper &Helper, MachineInstr &MI) const {
4382 MachineIRBuilder &B = Helper.MIRBuilder;
4383 GISelChangeObserver &Observer = Helper.Observer;
4384
4385 Register Dst = MI.getOperand(0).getReg();
4386 LLT Ty = B.getMRI()->getType(Dst);
4387 unsigned Size = Ty.getSizeInBits();
4388 MachineFunction &MF = B.getMF();
4389
4390 Observer.changingInstr(MI);
4391
4392 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4393 Ty = getBitcastRegisterType(Ty);
4394 Helper.bitcastDst(MI, Ty, 0);
4395 Dst = MI.getOperand(0).getReg();
4396 B.setInsertPt(B.getMBB(), MI);
4397 }
4398
4399 // FIXME: We don't really need this intermediate instruction. The intrinsic
4400 // should be fixed to have a memory operand. Since it's readnone, we're not
4401 // allowed to add one.
4402 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4403 MI.RemoveOperand(1); // Remove intrinsic ID
4404
4405 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4406 // TODO: Should this use datalayout alignment?
4407 const unsigned MemSize = (Size + 7) / 8;
4408 const Align MemAlign(4);
4409 MachineMemOperand *MMO = MF.getMachineMemOperand(
4410 MachinePointerInfo(),
4411 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4412 MachineMemOperand::MOInvariant,
4413 MemSize, MemAlign);
4414 MI.addMemOperand(MF, MMO);
4415
4416 // There are no 96-bit result scalar loads, but widening to 128-bit should
4417 // always be legal. We may need to restore this to a 96-bit result if it turns
4418 // out this needs to be converted to a vector load during RegBankSelect.
4419 if (!isPowerOf2_32(Size)) {
4420 if (Ty.isVector())
4421 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4422 else
4423 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4424 }
4425
4426 Observer.changedInstr(MI);
4427 return true;
4428}
4429
4430// TODO: Move to selection
4431bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4432 MachineRegisterInfo &MRI,
4433 MachineIRBuilder &B) const {
4434 // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4435 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4436 !ST.isTrapHandlerEnabled()) {
4437 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4438 } else {
4439 // Pass queue pointer to trap handler as input, and insert trap instruction
4440 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4441 MachineRegisterInfo &MRI = *B.getMRI();
4442
4443 Register LiveIn =
4444 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4445 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4446 return false;
4447
4448 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4449 B.buildCopy(SGPR01, LiveIn);
4450 B.buildInstr(AMDGPU::S_TRAP)
4451 .addImm(GCNSubtarget::TrapIDLLVMTrap)
4452 .addReg(SGPR01, RegState::Implicit);
4453 }
4454
4455 MI.eraseFromParent();
4456 return true;
4457}
4458
4459bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4460 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4461 // If this is a non-HSA path or the trap handler is disabled, report a
4462 // warning accordingly.
4463 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4464 !ST.isTrapHandlerEnabled()) {
4465 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4466 "debugtrap handler not supported",
4467 MI.getDebugLoc(), DS_Warning);
4468 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4469 Ctx.diagnose(NoTrap);
4470 } else {
4471 // Insert debug-trap instruction
4472 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4473 }
4474
4475 MI.eraseFromParent();
4476 return true;
4477}
4478
4479bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4480 MachineInstr &MI) const {
4481 MachineIRBuilder &B = Helper.MIRBuilder;
4482 MachineRegisterInfo &MRI = *B.getMRI();
4483
4484 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4485 auto IntrID = MI.getIntrinsicID();
4486 switch (IntrID) {
1
Control jumps to 'case amdgcn_dispatch_id:' at line 4609
4487 case Intrinsic::amdgcn_if:
4488 case Intrinsic::amdgcn_else: {
4489 MachineInstr *Br = nullptr;
4490 MachineBasicBlock *UncondBrTarget = nullptr;
4491 bool Negated = false;
4492 if (MachineInstr *BrCond =
4493 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4494 const SIRegisterInfo *TRI
4495 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4496
4497 Register Def = MI.getOperand(1).getReg();
4498 Register Use = MI.getOperand(3).getReg();
4499
4500 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4501
4502 if (Negated)
4503 std::swap(CondBrTarget, UncondBrTarget);
4504
4505 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4506 if (IntrID == Intrinsic::amdgcn_if) {
4507 B.buildInstr(AMDGPU::SI_IF)
4508 .addDef(Def)
4509 .addUse(Use)
4510 .addMBB(UncondBrTarget);
4511 } else {
4512 B.buildInstr(AMDGPU::SI_ELSE)
4513 .addDef(Def)
4514 .addUse(Use)
4515 .addMBB(UncondBrTarget)
4516 .addImm(0);
4517 }
4518
4519 if (Br) {
4520 Br->getOperand(0).setMBB(CondBrTarget);
4521 } else {
4522 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4523 // since we're swapping branch targets it needs to be reinserted.
4524 // FIXME: IRTranslator should probably not do this
4525 B.buildBr(*CondBrTarget);
4526 }
4527
4528 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4529 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4530 MI.eraseFromParent();
4531 BrCond->eraseFromParent();
4532 return true;
4533 }
4534
4535 return false;
4536 }
4537 case Intrinsic::amdgcn_loop: {
4538 MachineInstr *Br = nullptr;
4539 MachineBasicBlock *UncondBrTarget = nullptr;
4540 bool Negated = false;
4541 if (MachineInstr *BrCond =
4542 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4543 const SIRegisterInfo *TRI
4544 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4545
4546 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4547 Register Reg = MI.getOperand(2).getReg();
4548
4549 if (Negated)
4550 std::swap(CondBrTarget, UncondBrTarget);
4551
4552 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4553 B.buildInstr(AMDGPU::SI_LOOP)
4554 .addUse(Reg)
4555 .addMBB(UncondBrTarget);
4556
4557 if (Br)
4558 Br->getOperand(0).setMBB(CondBrTarget);
4559 else
4560 B.buildBr(*CondBrTarget);
4561
4562 MI.eraseFromParent();
4563 BrCond->eraseFromParent();
4564 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4565 return true;
4566 }
4567
4568 return false;
4569 }
4570 case Intrinsic::amdgcn_kernarg_segment_ptr:
4571 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4572 // This only makes sense to call in a kernel, so just lower to null.
4573 B.buildConstant(MI.getOperand(0).getReg(), 0);
4574 MI.eraseFromParent();
4575 return true;
4576 }
4577
4578 return legalizePreloadedArgIntrin(
4579 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4580 case Intrinsic::amdgcn_implicitarg_ptr:
4581 return legalizeImplicitArgPtr(MI, MRI, B);
4582 case Intrinsic::amdgcn_workitem_id_x:
4583 return legalizePreloadedArgIntrin(MI, MRI, B,
4584 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4585 case Intrinsic::amdgcn_workitem_id_y:
4586 return legalizePreloadedArgIntrin(MI, MRI, B,
4587 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4588 case Intrinsic::amdgcn_workitem_id_z:
4589 return legalizePreloadedArgIntrin(MI, MRI, B,
4590 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4591 case Intrinsic::amdgcn_workgroup_id_x:
4592 return legalizePreloadedArgIntrin(MI, MRI, B,
4593 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4594 case Intrinsic::amdgcn_workgroup_id_y:
4595 return legalizePreloadedArgIntrin(MI, MRI, B,
4596 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4597 case Intrinsic::amdgcn_workgroup_id_z:
4598 return legalizePreloadedArgIntrin(MI, MRI, B,
4599 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4600 case Intrinsic::amdgcn_dispatch_ptr:
4601 return legalizePreloadedArgIntrin(MI, MRI, B,
4602 AMDGPUFunctionArgInfo::DISPATCH_PTR);
4603 case Intrinsic::amdgcn_queue_ptr:
4604 return legalizePreloadedArgIntrin(MI, MRI, B,
4605 AMDGPUFunctionArgInfo::QUEUE_PTR);
4606 case Intrinsic::amdgcn_implicit_buffer_ptr:
4607 return legalizePreloadedArgIntrin(
4608 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4609 case Intrinsic::amdgcn_dispatch_id:
4610 return legalizePreloadedArgIntrin(MI, MRI, B,
2
Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
4611 AMDGPUFunctionArgInfo::DISPATCH_ID);
4612 case Intrinsic::amdgcn_fdiv_fast:
4613 return legalizeFDIVFastIntrin(MI, MRI, B);
4614 case Intrinsic::amdgcn_is_shared:
4615 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4616 case Intrinsic::amdgcn_is_private:
4617 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4618 case Intrinsic::amdgcn_wavefrontsize: {
4619 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4620 MI.eraseFromParent();
4621 return true;
4622 }
4623 case Intrinsic::amdgcn_s_buffer_load:
4624 return legalizeSBufferLoad(Helper, MI);
4625 case Intrinsic::amdgcn_raw_buffer_store:
4626 case Intrinsic::amdgcn_struct_buffer_store:
4627 return legalizeBufferStore(MI, MRI, B, false, false);
4628 case Intrinsic::amdgcn_raw_buffer_store_format:
4629 case Intrinsic::amdgcn_struct_buffer_store_format:
4630 return legalizeBufferStore(MI, MRI, B, false, true);
4631 case Intrinsic::amdgcn_raw_tbuffer_store:
4632 case Intrinsic::amdgcn_struct_tbuffer_store:
4633 return legalizeBufferStore(MI, MRI, B, true, true);
4634 case Intrinsic::amdgcn_raw_buffer_load:
4635 case Intrinsic::amdgcn_struct_buffer_load:
4636 return legalizeBufferLoad(MI, MRI, B, false, false);
4637 case Intrinsic::amdgcn_raw_buffer_load_format:
4638 case Intrinsic::amdgcn_struct_buffer_load_format:
4639 return legalizeBufferLoad(MI, MRI, B, true, false);
4640 case Intrinsic::amdgcn_raw_tbuffer_load:
4641 case Intrinsic::amdgcn_struct_tbuffer_load:
4642 return legalizeBufferLoad(MI, MRI, B, true, true);
4643 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4644 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4645 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4646 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4647 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4648 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4649 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4650 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4651 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4652 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4653 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4654 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4655 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4656 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4657 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4658 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4659 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4660 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4661 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4662 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4663 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4664 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4665 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4666 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4667 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4668 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4669 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4670 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4671 return legalizeBufferAtomic(MI, B, IntrID);
4672 case Intrinsic::amdgcn_atomic_inc:
4673 return legalizeAtomicIncDec(MI, B, true);
4674 case Intrinsic::amdgcn_atomic_dec:
4675 return legalizeAtomicIncDec(MI, B, false);
4676 case Intrinsic::trap:
4677 return legalizeTrapIntrinsic(MI, MRI, B);
4678 case Intrinsic::debugtrap:
4679 return legalizeDebugTrapIntrinsic(MI, MRI, B);
4680 case Intrinsic::amdgcn_rsq_clamp:
4681 return legalizeRsqClampIntrinsic(MI, MRI, B);
4682 case Intrinsic::amdgcn_ds_fadd:
4683 case Intrinsic::amdgcn_ds_fmin:
4684 case Intrinsic::amdgcn_ds_fmax:
4685 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4686 default: {
4687 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4688 AMDGPU::getImageDimIntrinsicInfo(IntrID))
4689 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4690 return true;
4691 }
4692 }
4693
4694 return true;
4695}

/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/Pass.h"
15#include "llvm/Support/LowLevelTypeImpl.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class TargetRegisterClass;
22class TargetRegisterInfo;
23
24struct ArgDescriptor {
25private:
26 friend struct AMDGPUFunctionArgInfo;
27 friend class AMDGPUArgumentUsageInfo;
28
29 union {
30 MCRegister Reg;
31 unsigned StackOffset;
32 };
33
34 // Bitmask to locate argument within the register.
35 unsigned Mask;
36
37 bool IsStack : 1;
38 bool IsSet : 1;
39
40public:
41 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
42 bool IsStack = false, bool IsSet = false)
43 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
44
45 static constexpr ArgDescriptor createRegister(Register Reg,
46 unsigned Mask = ~0u) {
47 return ArgDescriptor(Reg, Mask, false, true);
48 }
49
50 static constexpr ArgDescriptor createStack(unsigned Offset,
51 unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
56 unsigned Mask) {
57 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
58 }
59
60 bool isSet() const {
61 return IsSet;
62 }
63
64 explicit operator bool() const {
65 return isSet();
66 }
67
68 bool isRegister() const {
69 return !IsStack;
70 }
71
72 MCRegister getRegister() const {
73 assert(!IsStack);
74 return Reg;
75 }
76
77 unsigned getStackOffset() const {
78 assert(IsStack);
79 return StackOffset;
80 }
81
82 unsigned getMask() const {
83 return Mask;
84 }
85
86 bool isMasked() const {
87 return Mask != ~0u;
10
Assuming the condition is true
11
Returning the value 1, which participates in a condition later
88 }
89
90 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
91};
92
93inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
94 Arg.print(OS);
95 return OS;
96}
97
98struct AMDGPUFunctionArgInfo {
99 enum PreloadedValue {
100 // SGPRS:
101 PRIVATE_SEGMENT_BUFFER = 0,
102 DISPATCH_PTR = 1,
103 QUEUE_PTR = 2,
104 KERNARG_SEGMENT_PTR = 3,
105 DISPATCH_ID = 4,
106 FLAT_SCRATCH_INIT = 5,
107 WORKGROUP_ID_X = 10,
108 WORKGROUP_ID_Y = 11,
109 WORKGROUP_ID_Z = 12,
110 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
111 IMPLICIT_BUFFER_PTR = 15,
112 IMPLICIT_ARG_PTR = 16,
113
114 // VGPRS:
115 WORKITEM_ID_X = 17,
116 WORKITEM_ID_Y = 18,
117 WORKITEM_ID_Z = 19,
118 FIRST_VGPR_VALUE = WORKITEM_ID_X
119 };
120
121 // Kernel input registers setup for the HSA ABI in allocation order.
122
123 // User SGPRs in kernels
124 // XXX - Can these require argument spills?
125 ArgDescriptor PrivateSegmentBuffer;
126 ArgDescriptor DispatchPtr;
127 ArgDescriptor QueuePtr;
128 ArgDescriptor KernargSegmentPtr;
129 ArgDescriptor DispatchID;
130 ArgDescriptor FlatScratchInit;
131 ArgDescriptor PrivateSegmentSize;
132
133 // System SGPRs in kernels.
134 ArgDescriptor WorkGroupIDX;
135 ArgDescriptor WorkGroupIDY;
136 ArgDescriptor WorkGroupIDZ;
137 ArgDescriptor WorkGroupInfo;
138 ArgDescriptor PrivateSegmentWaveByteOffset;
139
140 // Pointer with offset from kernargsegmentptr to where special ABI arguments
141 // are passed to callable functions.
142 ArgDescriptor ImplicitArgPtr;
143
144 // Input registers for non-HSA ABI
145 ArgDescriptor ImplicitBufferPtr;
146
147 // VGPR inputs. These are always v0, v1 and v2 for entry functions.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <algorithm>
18#include <cassert>
19#include <climits>
20#include <cmath>
21#include <cstdint>
22#include <cstring>
23#include <limits>
24#include <type_traits>
25
26#ifdef __ANDROID_NDK__
27#include <android/api-level.h>
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and MathExtras.h is popular.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44/// The behavior an operation has on an input of 0.
45enum ZeroBehavior {
46 /// The returned value is undefined.
47 ZB_Undefined,
48 /// The returned value is numeric_limits<T>::max()
49 ZB_Max,
50 /// The returned value is numeric_limits<T>::digits
51 ZB_Width
52};
53
54/// Mathematical constants.
55namespace numbers {
56// TODO: Track C++20 std::numbers.
57// TODO: Favor using the hexadecimal FP constants (requires C++17).
58constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
59 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
60 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
61 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
62 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
63 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
64 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
65 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
66 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
67 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
68 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
69 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
70 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
71 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
72 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
73constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
74 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
75 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
76 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
77 log2ef = 1.44269504F, // (0x1.715476P+0)
78 log10ef = .434294482F, // (0x1.bcb7b2P-2)
79 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
80 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
81 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
82 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
83 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
84 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
85 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
86 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
87 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
88} // namespace numbers
89
90namespace detail {
91template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
92 static unsigned count(T Val, ZeroBehavior) {
93 if (!Val)
94 return std::numeric_limits<T>::digits;
95 if (Val & 0x1)
96 return 0;
97
98 // Bisection method.
99 unsigned ZeroBits = 0;
100 T Shift = std::numeric_limits<T>::digits >> 1;
101 T Mask = std::numeric_limits<T>::max() >> Shift;
102 while (Shift) {
103 if ((Val & Mask) == 0) {
104 Val >>= Shift;
105 ZeroBits |= Shift;
106 }
107 Shift >>= 1;
108 Mask >>= Shift;
109 }
110 return ZeroBits;
111 }
112};
113
114#if defined(__GNUC__) || defined(_MSC_VER)
115template <typename T> struct TrailingZerosCounter<T, 4> {
116 static unsigned count(T Val, ZeroBehavior ZB) {
117 if (ZB
15.1
'ZB' is not equal to ZB_Undefined
!= ZB_Undefined && Val == 0)
16
Assuming 'Val' is equal to 0
17
Taking true branch
118 return 32;
18
Returning the value 32
119
120#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
121 return __builtin_ctz(Val);
122#elif defined(_MSC_VER)
123 unsigned long Index;
124 _BitScanForward(&Index, Val);
125 return Index;
126#endif
127 }
128};
129
130#if !defined(_MSC_VER) || defined(_M_X64)
131template <typename T> struct TrailingZerosCounter<T, 8> {
132 static unsigned count(T Val, ZeroBehavior ZB) {
133 if (ZB != ZB_Undefined && Val == 0)
134 return 64;
135
136#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
137 return __builtin_ctzll(Val);
138#elif defined(_MSC_VER)
139 unsigned long Index;
140 _BitScanForward64(&Index, Val);
141 return Index;
142#endif
143 }
144};
145#endif
146#endif
147} // namespace detail
148
149/// Count number of 0's from the least significant bit to the most
150/// stopping at the first 1.
151///
152/// Only unsigned integral types are allowed.
153///
154/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
155/// valid arguments.
156template <typename T>
157unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
158 static_assert(std::numeric_limits<T>::is_integer &&
159 !std::numeric_limits<T>::is_signed,
160 "Only unsigned integral types are allowed.");
161 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
15
Calling 'TrailingZerosCounter::count'
19
Returning from 'TrailingZerosCounter::count'
20
Returning the value 32
162}
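
The path notes above ("Returning the value 32") are the crux of the warning in the bug summary: with the default ZB_Width behavior, a zero 32-bit input makes countTrailingZeros return 32, and using that value directly as the shift amount of an unsigned int (as the right shift flagged at line 2704 of AMDGPULegalizerInfo.cpp does) is undefined behavior. A hedged illustration of the pattern and the usual guard - the variable names below are made up, not the ones from the legalizer:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Illustrative only: extract a field selected by Mask, guarding the
// Mask == 0 case so the shift amount can never reach 32.
static uint32_t extractMaskedField(uint32_t Value, uint32_t Mask) {
  const unsigned Shift = llvm::countTrailingZeros(Mask); // 32 when Mask == 0
  // Unguarded '(Value & Mask) >> Shift' would shift a 32-bit value by 32.
  return Mask == 0 ? 0u : ((Value & Mask) >> Shift);
}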
163
164namespace detail {
165template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
166 static unsigned count(T Val, ZeroBehavior) {
167 if (!Val)
168 return std::numeric_limits<T>::digits;
169
170 // Bisection method.
171 unsigned ZeroBits = 0;
172 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
173 T Tmp = Val >> Shift;
174 if (Tmp)
175 Val = Tmp;
176 else
177 ZeroBits |= Shift;
178 }
179 return ZeroBits;
180 }
181};
182
183#if defined(__GNUC__) || defined(_MSC_VER)
184template <typename T> struct LeadingZerosCounter<T, 4> {
185 static unsigned count(T Val, ZeroBehavior ZB) {
186 if (ZB != ZB_Undefined && Val == 0)
187 return 32;
188
189#if __has_builtin(__builtin_clz) || defined(__GNUC__)
190 return __builtin_clz(Val);
191#elif defined(_MSC_VER)
192 unsigned long Index;
193 _BitScanReverse(&Index, Val);
194 return Index ^ 31;
195#endif
196 }
197};
198
199#if !defined(_MSC_VER) || defined(_M_X64)
200template <typename T> struct LeadingZerosCounter<T, 8> {
201 static unsigned count(T Val, ZeroBehavior ZB) {
202 if (ZB != ZB_Undefined && Val == 0)
203 return 64;
204
205#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
206 return __builtin_clzll(Val);
207#elif defined(_MSC_VER)
208 unsigned long Index;
209 _BitScanReverse64(&Index, Val);
210 return Index ^ 63;
211#endif
212 }
213};
214#endif
215#endif
216} // namespace detail
217
218/// Count number of 0's from the most significant bit to the least
219/// stopping at the first 1.
220///
221/// Only unsigned integral types are allowed.
222///
223/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
224/// valid arguments.
225template <typename T>
226unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
227 static_assert(std::numeric_limits<T>::is_integer &&
228 !std::numeric_limits<T>::is_signed,
229 "Only unsigned integral types are allowed.");
230 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
231}
232
233/// Get the index of the first set bit starting from the least
234/// significant bit.
235///
236/// Only unsigned integral types are allowed.
237///
238/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
239/// valid arguments.
240template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
241 if (ZB == ZB_Max && Val == 0)
242 return std::numeric_limits<T>::max();
243
244 return countTrailingZeros(Val, ZB_Undefined);
245}
246
247/// Create a bitmask with the N right-most bits set to 1, and all other
248/// bits set to 0. Only unsigned types are allowed.
249template <typename T> T maskTrailingOnes(unsigned N) {
250 static_assert(std::is_unsigned<T>::value, "Invalid type!");
251 const unsigned Bits = CHAR_BIT * sizeof(T);
252 assert(N <= Bits && "Invalid bit index");
253 return N == 0 ? 0 : (T(-1) >> (Bits - N));
254}
255
256/// Create a bitmask with the N left-most bits set to 1, and all other
257/// bits set to 0. Only unsigned types are allowed.
258template <typename T> T maskLeadingOnes(unsigned N) {
259 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
260}
261
262/// Create a bitmask with the N right-most bits set to 0, and all other
263/// bits set to 1. Only unsigned types are allowed.
264template <typename T> T maskTrailingZeros(unsigned N) {
265 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
266}
267
268/// Create a bitmask with the N left-most bits set to 0, and all other
269/// bits set to 1. Only unsigned types are allowed.
270template <typename T> T maskLeadingZeros(unsigned N) {
271 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
272}
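
A minimal usage sketch of the four mask helpers above, assuming this header is available; the N == 0 branch in maskTrailingOnes is what keeps the undefined T(-1) >> Bits shift from ever being evaluated:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void maskHelpersExample() {
  assert(llvm::maskTrailingOnes<uint32_t>(8)  == 0x000000FFu);
  assert(llvm::maskLeadingOnes<uint32_t>(8)   == 0xFF000000u);
  assert(llvm::maskTrailingZeros<uint32_t>(8) == 0xFFFFFF00u);
  assert(llvm::maskLeadingZeros<uint32_t>(8)  == 0x00FFFFFFu);
  assert(llvm::maskTrailingOnes<uint32_t>(0)  == 0u); // N == 0 special case
}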
273
274/// Get the index of the last set bit starting from the least
275/// significant bit.
276///
277/// Only unsigned integral types are allowed.
278///
279/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
280/// valid arguments.
281template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
282 if (ZB == ZB_Max && Val == 0)
283 return std::numeric_limits<T>::max();
284
285 // Use ^ instead of - because both gcc and llvm can remove the associated ^
286 // in the __builtin_clz intrinsic on x86.
287 return countLeadingZeros(Val, ZB_Undefined) ^
288 (std::numeric_limits<T>::digits - 1);
289}
290
291/// Macro compressed bit reversal table for 256 bits.
292///
293/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
294static const unsigned char BitReverseTable256[256] = {
295#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
296#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
297#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
298 R6(0), R6(2), R6(1), R6(3)
299#undef R2
300#undef R4
301#undef R6
302};
303
304/// Reverse the bits in \p Val.
305template <typename T>
306T reverseBits(T Val) {
307 unsigned char in[sizeof(Val)];
308 unsigned char out[sizeof(Val)];
309 std::memcpy(in, &Val, sizeof(Val));
310 for (unsigned i = 0; i < sizeof(Val); ++i)
311 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
312 std::memcpy(&Val, out, sizeof(Val));
313 return Val;
314}
315
316#if __has_builtin(__builtin_bitreverse8)
317template<>
318inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
319 return __builtin_bitreverse8(Val);
320}
321#endif
322
323#if __has_builtin(__builtin_bitreverse16)
324template<>
325inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
326 return __builtin_bitreverse16(Val);
327}
328#endif
329
330#if __has_builtin(__builtin_bitreverse32)
331template<>
332inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
333 return __builtin_bitreverse32(Val);
334}
335#endif
336
337#if __has_builtin(__builtin_bitreverse64)
338template<>
339inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
340 return __builtin_bitreverse64(Val);
341}
342#endif
343
344// NOTE: The following support functions use the _32/_64 extensions instead of
345// type overloading so that signed and unsigned integers can be used without
346// ambiguity.
347
348/// Return the high 32 bits of a 64 bit value.
349constexpr inline uint32_t Hi_32(uint64_t Value) {
350 return static_cast<uint32_t>(Value >> 32);
351}
352
353/// Return the low 32 bits of a 64 bit value.
354constexpr inline uint32_t Lo_32(uint64_t Value) {
355 return static_cast<uint32_t>(Value);
356}
357
358/// Make a 64-bit integer from a high / low pair of 32-bit integers.
359constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
360 return ((uint64_t)High << 32) | (uint64_t)Low;
361}
362
363/// Checks if an integer fits into the given bit width.
364template <unsigned N> constexpr inline bool isInt(int64_t x) {
365 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
366}
367// Template specializations to get better code for common cases.
368template <> constexpr inline bool isInt<8>(int64_t x) {
369 return static_cast<int8_t>(x) == x;
370}
371template <> constexpr inline bool isInt<16>(int64_t x) {
372 return static_cast<int16_t>(x) == x;
373}
374template <> constexpr inline bool isInt<32>(int64_t x) {
375 return static_cast<int32_t>(x) == x;
376}
377
378/// Checks if a signed integer is an N bit number shifted left by S.
379template <unsigned N, unsigned S>
380constexpr inline bool isShiftedInt(int64_t x) {
381 static_assert(
382 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
383 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
384 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
385}
386
387/// Checks if an unsigned integer fits into the given bit width.
388///
389/// This is written as two functions rather than as simply
390///
391/// return N >= 64 || X < (UINT64_C(1) << N);
392///
393/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
394/// left too many places.
395template <unsigned N>
396constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
397 static_assert(N > 0, "isUInt<0> doesn't make sense");
398 return X < (UINT64_C(1) << (N));
399}
400template <unsigned N>
401constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t X) {
402 return true;
403}
404
405// Template specializations to get better code for common cases.
406template <> constexpr inline bool isUInt<8>(uint64_t x) {
407 return static_cast<uint8_t>(x) == x;
408}
409template <> constexpr inline bool isUInt<16>(uint64_t x) {
410 return static_cast<uint16_t>(x) == x;
411}
412template <> constexpr inline bool isUInt<32>(uint64_t x) {
413 return static_cast<uint32_t>(x) == x;
414}
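
The split into a (N < 64) and a (N >= 64) overload above ensures the UINT64_C(1) << N shift is never instantiated with a shift amount of 64 or more. A short usage sketch, assuming this header:

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void isUIntExample() {
  assert(llvm::isUInt<8>(255));    // fits in 8 bits
  assert(!llvm::isUInt<8>(256));   // needs 9 bits
  assert(llvm::isUInt<64>(~0ULL)); // N >= 64 overload: always true
}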
415
416/// Checks if an unsigned integer is an N bit number shifted left by S.
417template <unsigned N, unsigned S>
418constexpr inline bool isShiftedUInt(uint64_t x) {
419 static_assert(
420 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
421 static_assert(N + S <= 64,
422 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
423 // Per the two static_asserts above, S must be strictly less than 64. So
424 // 1 << S is not undefined behavior.
425 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
426}
427
428/// Gets the maximum value for a N-bit unsigned integer.
429inline uint64_t maxUIntN(uint64_t N) {
430 assert(N > 0 && N <= 64 && "integer width out of range");
431
432 // uint64_t(1) << 64 is undefined behavior, so we can't do
433 // (uint64_t(1) << N) - 1
434 // without checking first that N != 64. But this works and doesn't have a
435 // branch.
436 return UINT64_MAX >> (64 - N);
437}
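
A worked check of the branch-free formula above, including the N == 64 case that the in-body comment singles out (the shift amount simply becomes zero, so the uint64_t(1) << 64 problem never arises); assumes this header:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void maxUIntNExample() {
  assert(llvm::maxUIntN(8)  == 0xFFu);       // UINT64_MAX >> 56
  assert(llvm::maxUIntN(32) == 0xFFFFFFFFu); // UINT64_MAX >> 32
  assert(llvm::maxUIntN(64) == UINT64_MAX);  // UINT64_MAX >> 0, no branch
}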
438
439/// Gets the minimum value for a N-bit signed integer.
440inline int64_t minIntN(int64_t N) {
441 assert(N > 0 && N <= 64 && "integer width out of range");
442
443 return -(UINT64_C(1)<<(N-1));
444}
445
446/// Gets the maximum value for a N-bit signed integer.
447inline int64_t maxIntN(int64_t N) {
448 assert(N > 0 && N <= 64 && "integer width out of range");
449
450 // This relies on two's complement wraparound when N == 64, so we convert to
451 // int64_t only at the very end to avoid UB.
452 return (UINT64_C(1) << (N - 1)) - 1;
453}
454
455/// Checks if an unsigned integer fits into the given (dynamic) bit width.
456inline bool isUIntN(unsigned N, uint64_t x) {
457 return N >= 64 || x <= maxUIntN(N);
458}
459
461/// Checks if a signed integer fits into the given (dynamic) bit width.
461inline bool isIntN(unsigned N, int64_t x) {
462 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
463}
464
465/// Return true if the argument is a non-empty sequence of ones starting at the
466/// least significant bit with the remainder zero (32 bit version).
467/// Ex. isMask_32(0x0000FFFFU) == true.
468constexpr inline bool isMask_32(uint32_t Value) {
469 return Value && ((Value + 1) & Value) == 0;
470}
471
472/// Return true if the argument is a non-empty sequence of ones starting at the
473/// least significant bit with the remainder zero (64 bit version).
474constexpr inline bool isMask_64(uint64_t Value) {
475 return Value && ((Value + 1) & Value) == 0;
476}
477
478/// Return true if the argument contains a non-empty sequence of ones with the
479/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
480constexpr inline bool isShiftedMask_32(uint32_t Value) {
481 return Value && isMask_32((Value - 1) | Value);
482}
483
484/// Return true if the argument contains a non-empty sequence of ones with the
485/// remainder zero (64 bit version.)
486constexpr inline bool isShiftedMask_64(uint64_t Value) {
487 return Value && isMask_64((Value - 1) | Value);
488}
489
490/// Return true if the argument is a power of two > 0.
491/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
492constexpr inline bool isPowerOf2_32(uint32_t Value) {
493 return Value && !(Value & (Value - 1));
494}
495
496/// Return true if the argument is a power of two > 0 (64 bit edition.)
497constexpr inline bool isPowerOf2_64(uint64_t Value) {
498 return Value && !(Value & (Value - 1));
499}
500
501/// Count the number of ones from the most significant bit to the first
502/// zero bit.
503///
504/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
505/// Only unsigned integral types are allowed.
506///
507/// \param ZB the behavior on an input of all ones. Only ZB_Width and
508/// ZB_Undefined are valid arguments.
509template <typename T>
510unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
511 static_assert(std::numeric_limits<T>::is_integer &&
512 !std::numeric_limits<T>::is_signed,
513 "Only unsigned integral types are allowed.");
514 return countLeadingZeros<T>(~Value, ZB);
515}
516
517/// Count the number of ones from the least significant bit to the first
518/// zero bit.
519///
520/// Ex. countTrailingOnes(0x00FF00FF) == 8.
521/// Only unsigned integral types are allowed.
522///
523/// \param ZB the behavior on an input of all ones. Only ZB_Width and
524/// ZB_Undefined are valid arguments.
525template <typename T>
526unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
527 static_assert(std::numeric_limits<T>::is_integer &&
528 !std::numeric_limits<T>::is_signed,
529 "Only unsigned integral types are allowed.");
530 return countTrailingZeros<T>(~Value, ZB);
531}
532
533namespace detail {
534template <typename T, std::size_t SizeOfT> struct PopulationCounter {
535 static unsigned count(T Value) {
536 // Generic version, forward to 32 bits.
537 static_assert(SizeOfT <= 4, "Not implemented!");
538#if defined(__GNUC__)
539 return __builtin_popcount(Value);
540#else
541 uint32_t v = Value;
542 v = v - ((v >> 1) & 0x55555555);
543 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
544 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
545#endif
546 }
547};
548
549template <typename T> struct PopulationCounter<T, 8> {
550 static unsigned count(T Value) {
551#if defined(__GNUC__)
552 return __builtin_popcountll(Value);
553#else
554 uint64_t v = Value;
555 v = v - ((v >> 1) & 0x5555555555555555ULL);
556 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
557 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
558 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
559#endif
560 }
561};
562} // namespace detail
563
564/// Count the number of set bits in a value.
565/// Ex. countPopulation(0xF000F000) = 8
566/// Returns 0 if the word is zero.
567template <typename T>
568inline unsigned countPopulation(T Value) {
569 static_assert(std::numeric_limits<T>::is_integer &&
570 !std::numeric_limits<T>::is_signed,
571 "Only unsigned integral types are allowed.");
572 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
573}
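
A quick usage sketch matching the documented example above, assuming this header:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void countPopulationExample() {
  assert(llvm::countPopulation(0xF000F000u) == 8);
  assert(llvm::countPopulation(0u) == 0);    // zero word
  assert(llvm::countPopulation(~0ull) == 64); // 64-bit specialization
}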
574
575/// Compile time Log2.
576/// Valid only for positive powers of two.
577template <size_t kValue> constexpr inline size_t CTLog2() {
578 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
579 "Value is not a valid power of 2");
580 return 1 + CTLog2<kValue / 2>();
581}
582
583template <> constexpr inline size_t CTLog2<1>() { return 0; }
584
585/// Return the log base 2 of the specified value.
586inline double Log2(double Value) {
587#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
588 return __builtin_log(Value) / __builtin_log(2.0);
589#else
590 return log2(Value);
591#endif
592}
593
594/// Return the floor log base 2 of the specified value, -1 if the value is zero.
595/// (32 bit edition.)
596/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
597inline unsigned Log2_32(uint32_t Value) {
598 return 31 - countLeadingZeros(Value);
599}
600
601/// Return the floor log base 2 of the specified value, -1 if the value is zero.
602/// (64 bit edition.)
603inline unsigned Log2_64(uint64_t Value) {
604 return 63 - countLeadingZeros(Value);
605}
606
607/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
608/// (32 bit edition).
609/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
610inline unsigned Log2_32_Ceil(uint32_t Value) {
611 return 32 - countLeadingZeros(Value - 1);
612}
613
614/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
615/// (64 bit edition.)
616inline unsigned Log2_64_Ceil(uint64_t Value) {
617 return 64 - countLeadingZeros(Value - 1);
618}
619
620/// Return the greatest common divisor of the values using Euclid's algorithm.
621template <typename T>
622inline T greatestCommonDivisor(T A, T B) {
623 while (B) {
624 T Tmp = B;
625 B = A % B;
626 A = Tmp;
627 }
628 return A;
629}
630
631inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
632 return greatestCommonDivisor<uint64_t>(A, B);
633}
634
635/// This function takes a 64-bit integer and returns the bit equivalent double.
636inline double BitsToDouble(uint64_t Bits) {
637 double D;
638 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
639 memcpy(&D, &Bits, sizeof(Bits));
640 return D;
641}
642
643/// This function takes a 32-bit integer and returns the bit equivalent float.
644inline float BitsToFloat(uint32_t Bits) {
645 float F;
646 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
647 memcpy(&F, &Bits, sizeof(Bits));
648 return F;
649}
650
651/// This function takes a double and returns the bit equivalent 64-bit integer.
652/// Note that copying doubles around changes the bits of NaNs on some hosts,
653/// notably x86, so this routine cannot be used if these bits are needed.
654inline uint64_t DoubleToBits(double Double) {
655 uint64_t Bits;
656 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
657 memcpy(&Bits, &Double, sizeof(Double));
658 return Bits;
659}
660
661/// This function takes a float and returns the bit equivalent 32-bit integer.
662/// Note that copying floats around changes the bits of NaNs on some hosts,
663/// notably x86, so this routine cannot be used if these bits are needed.
664inline uint32_t FloatToBits(float Float) {
665 uint32_t Bits;
666 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
667 memcpy(&Bits, &Float, sizeof(Float));
668 return Bits;
669}
670
671/// A and B are either alignments or offsets. Return the minimum alignment that
672/// may be assumed after adding the two together.
673constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
674 // The largest power of 2 that divides both A and B.
675 //
676 // Replace "-Value" by "1+~Value" in the following commented code to avoid
677 // MSVC warning C4146
678 // return (A | B) & -(A | B);
679 return (A | B) & (1 + ~(A | B));
680}
681
682/// Returns the next power of two (in 64-bits) that is strictly greater than A.
683/// Returns zero on overflow.
684inline uint64_t NextPowerOf2(uint64_t A) {
685 A |= (A >> 1);
686 A |= (A >> 2);
687 A |= (A >> 4);
688 A |= (A >> 8);
689 A |= (A >> 16);
690 A |= (A >> 32);
691 return A + 1;
692}
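
The OR-shift cascade above smears the highest set bit of A into every lower bit position, so A + 1 is the next power of two; a small worked check, assuming this header:

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void nextPowerOf2Example() {
  // 96 = 0b1100000 smears to 0b1111111 (127); 127 + 1 == 128.
  assert(llvm::NextPowerOf2(96) == 128);
  assert(llvm::NextPowerOf2(64) == 128);  // strictly greater, even for powers of two
  assert(llvm::NextPowerOf2(~0ULL) == 0); // documented overflow case
}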
693
694/// Returns the power of two which is less than or equal to the given value.
695/// Essentially, it is a floor operation across the domain of powers of two.
696inline uint64_t PowerOf2Floor(uint64_t A) {
697 if (!A) return 0;
698 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
699}
700
701/// Returns the power of two which is greater than or equal to the given value.
702/// Essentially, it is a ceil operation across the domain of powers of two.
703inline uint64_t PowerOf2Ceil(uint64_t A) {
704 if (!A)
705 return 0;
706 return NextPowerOf2(A - 1);
707}
708
709/// Returns the next integer (mod 2**64) that is greater than or equal to
710/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
711///
712/// If non-zero \p Skew is specified, the return value will be a minimal
713/// integer that is greater than or equal to \p Value and equal to
714/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
715/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
716///
717/// Examples:
718/// \code
719/// alignTo(5, 8) = 8
720/// alignTo(17, 8) = 24
721/// alignTo(~0LL, 8) = 0
722/// alignTo(321, 255) = 510
723///
724/// alignTo(5, 8, 7) = 7
725/// alignTo(17, 8, 1) = 17
726/// alignTo(~0LL, 8, 3) = 3
727/// alignTo(321, 255, 42) = 552
728/// \endcode
729inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
730 assert(Align != 0u && "Align can't be 0.");
731 Skew %= Align;
732 return (Value + Align - 1 - Skew) / Align * Align + Skew;
733}
734
735/// Returns the next integer (mod 2**64) that is greater than or equal to
736/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
737template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
738 static_assert(Align != 0u, "Align must be non-zero");
739 return (Value + Align - 1) / Align * Align;
740}
741
742/// Returns the integer ceil(Numerator / Denominator).
743inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
744 return alignTo(Numerator, Denominator) / Denominator;
745}
746
747/// Returns the integer nearest(Numerator / Denominator).
748inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
749 return (Numerator + (Denominator / 2)) / Denominator;
750}
751
752/// Returns the largest uint64_t less than or equal to \p Value and is
753/// \p Skew mod \p Align. \p Align must be non-zero
754inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
755 assert(Align != 0u && "Align can't be 0.");
756 Skew %= Align;
757 return (Value - Skew) / Align * Align + Skew;
758}
759
760/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
761/// Requires 0 < B <= 32.
762template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
763 static_assert(B > 0, "Bit width can't be 0.");
764 static_assert(B <= 32, "Bit width out of range.");
765 return int32_t(X << (32 - B)) >> (32 - B);
766}
767
768/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
769/// Requires 0 < B <= 32.
770inline int32_t SignExtend32(uint32_t X, unsigned B) {
771 assert(B > 0 && "Bit width can't be 0.");
772 assert(B <= 32 && "Bit width out of range.");
773 return int32_t(X << (32 - B)) >> (32 - B);
774}
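
A worked example of the shift-left-then-arithmetic-shift-right sign-extension idiom used by both SignExtend32 variants above, assuming this header:

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void signExtendExample() {
  assert(llvm::SignExtend32<8>(0xFFu) == -1);      // 0xFF as a signed 8-bit value
  assert(llvm::SignExtend32<8>(0x7Fu) == 127);     // sign bit clear: unchanged
  assert(llvm::SignExtend32(0x800u, 12) == -2048); // runtime bit-width variant
}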
775
776/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
777/// Requires 0 < B <= 64.
778template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
779 static_assert(B > 0, "Bit width can't be 0.");
780 static_assert(B <= 64, "Bit width out of range.");
781 return int64_t(x << (64 - B)) >> (64 - B);
782}
783
784/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
785/// Requires 0 < B <= 64.
786inline int64_t SignExtend64(uint64_t X, unsigned B) {
787 assert(B > 0 && "Bit width can't be 0.");
788 assert(B <= 64 && "Bit width out of range.");
789 return int64_t(X << (64 - B)) >> (64 - B);
790}
791
792/// Subtract two unsigned integers, X and Y, of type T and return the absolute
793/// value of the result.
794template <typename T>
795std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
796 return std::max(X, Y) - std::min(X, Y);
797}
798
799/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
800/// maximum representable value of T on overflow. ResultOverflowed indicates if
801/// the result is larger than the maximum representable value of type T.
802template <typename T>
803std::enable_if_t<std::is_unsigned<T>::value, T>
804SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
805 bool Dummy;
806 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
807 // Hacker's Delight, p. 29
808 T Z = X + Y;
809 Overflowed = (Z < X || Z < Y);
810 if (Overflowed)
811 return std::numeric_limits<T>::max();
812 else
813 return Z;
814}
815
816/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
817/// maximum representable value of T on overflow. ResultOverflowed indicates if
818/// the result is larger than the maximum representable value of type T.
819template <typename T>
820std::enable_if_t<std::is_unsigned<T>::value, T>
821SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
822 bool Dummy;
823 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
824
825 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
826 // because it fails for uint16_t (where multiplication can have undefined
827 // behavior due to promotion to int), and requires a division in addition
828 // to the multiplication.
829
830 Overflowed = false;
831
832 // Log2(Z) would be either Log2Z or Log2Z + 1.
833 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
834 // will necessarily be less than Log2Max as desired.
835 int Log2Z = Log2_64(X) + Log2_64(Y);
836 const T Max = std::numeric_limits<T>::max();
837 int Log2Max = Log2_64(Max);
838 if (Log2Z < Log2Max) {
839 return X * Y;
840 }
841 if (Log2Z > Log2Max) {
842 Overflowed = true;
843 return Max;
844 }
845
846 // We're going to use the top bit, and maybe overflow one
847 // bit past it. Multiply all but the bottom bit then add
848 // that on at the end.
849 T Z = (X >> 1) * Y;
850 if (Z & ~(Max >> 1)) {
851 Overflowed = true;
852 return Max;
853 }
854 Z <<= 1;
855 if (X & 1)
856 return SaturatingAdd(Z, Y, ResultOverflowed);
857
858 return Z;
859}
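
A usage sketch of the saturating helpers above, showing both the clamped result and the overflow flag; assumes this header:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void saturatingExample() {
  bool Overflowed = false;
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255);
  assert(Overflowed);
  assert(llvm::SaturatingMultiply<uint16_t>(300, 300, &Overflowed) == 65535);
  assert(Overflowed);
  assert(llvm::SaturatingMultiply<uint16_t>(300, 3, &Overflowed) == 900);
  assert(!Overflowed);
}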
860
861/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
862/// the product. Clamp the result to the maximum representable value of T on
863/// overflow. ResultOverflowed indicates if the result is larger than the
864/// maximum representable value of type T.
865template <typename T>
866std::enable_if_t<std::is_unsigned<T>::value, T>
867SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
868 bool Dummy;
869 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
870
871 T Product = SaturatingMultiply(X, Y, &Overflowed);
872 if (Overflowed)
873 return Product;
874
875 return SaturatingAdd(A, Product, &Overflowed);
876}
877
878/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
879extern const float huge_valf;
880
881
882/// Add two signed integers, computing the two's complement truncated result,
883/// returning true if overflow occurred.
884template <typename T>
885std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
886#if __has_builtin(__builtin_add_overflow)
887 return __builtin_add_overflow(X, Y, &Result);
888#else
889 // Perform the unsigned addition.
890 using U = std::make_unsigned_t<T>;
891 const U UX = static_cast<U>(X);
892 const U UY = static_cast<U>(Y);
893 const U UResult = UX + UY;
894
895 // Convert to signed.
896 Result = static_cast<T>(UResult);
897
898 // Adding two positive numbers should result in a positive number.
899 if (X > 0 && Y > 0)
900 return Result <= 0;
901 // Adding two negatives should result in a negative number.
902 if (X < 0 && Y < 0)
903 return Result >= 0;
904 return false;
905#endif
906}
907
908/// Subtract two signed integers, computing the two's complement truncated
909/// result, returning true if an overflow occurred.
910template <typename T>
911std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
912#if __has_builtin(__builtin_sub_overflow)
913 return __builtin_sub_overflow(X, Y, &Result);
914#else
915 // Perform the unsigned subtraction.
916 using U = std::make_unsigned_t<T>;
917 const U UX = static_cast<U>(X);
918 const U UY = static_cast<U>(Y);
919 const U UResult = UX - UY;
920
921 // Convert to signed.
922 Result = static_cast<T>(UResult);
923
924 // Subtracting a positive number from a negative results in a negative number.
925 if (X <= 0 && Y > 0)
926 return Result >= 0;
927 // Subtracting a negative number from a positive results in a positive number.
928 if (X >= 0 && Y < 0)
929 return Result <= 0;
930 return false;
931#endif
932}
933
934/// Multiply two signed integers, computing the two's complement truncated
935/// result, returning true if an overflow occurred.
936template <typename T>
937std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
938 // Perform the unsigned multiplication on absolute values.
939 using U = std::make_unsigned_t<T>;
940 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
941 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
942 const U UResult = UX * UY;
943
944 // Convert to signed.
945 const bool IsNegative = (X < 0) ^ (Y < 0);
946 Result = IsNegative ? (0 - UResult) : UResult;
947
948 // If any of the args was 0, result is 0 and no overflow occurs.
949 if (UX == 0 || UY == 0)
950 return false;
951
952 // UX and UY are in [1, 2^n], where n is the number of digits.
953 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
954 // positive) divided by an argument compares to the other.
955 if (IsNegative)
956 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
957 else
958 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
959}
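
A usage sketch of the checked-arithmetic helpers above; each one writes the two's complement truncated result and returns true exactly when the mathematically exact result does not fit. Assumes this header:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void overflowHelpersExample() {
  int32_t Result;
  assert(!llvm::AddOverflow<int32_t>(2, 3, Result) && Result == 5);
  assert(llvm::AddOverflow<int32_t>(INT32_MAX, 1, Result)); // wraps
  assert(llvm::SubOverflow<int32_t>(INT32_MIN, 1, Result)); // wraps
  assert(!llvm::MulOverflow<int32_t>(-4, 5, Result) && Result == -20);
}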
960
961} // End llvm namespace
962
963#endif