Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2741, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
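
The diagnostic refers to the C++ rule that shifting by an amount greater than or equal to the bit width of the (promoted) left operand is undefined behavior. The expression flagged at line 2741 is not shown in this excerpt; the standalone sketch below only illustrates the general pattern the checker reports and one common way to guard it. The names safeLshr and Amount are hypothetical and are not taken from AMDGPULegalizerInfo.cpp.

#include <cassert>
#include <cstdint>

// Shifting a 32-bit value right by 32 (or more) is undefined behavior in C++;
// the analyzer flags exactly this situation when it can prove the shift
// amount reaches the operand's bit width.
static uint32_t safeLshr(uint32_t Value, unsigned Amount) {
  assert(Amount <= 32 && "shift amount out of range");
  // Handle the full-width case explicitly instead of evaluating `Value >> 32`.
  return Amount == 32 ? 0u : Value >> Amount;
}

int main() {
  (void)safeLshr(0xFFFFFFFFu, 31); // fine: amount < width of uint32_t
  (void)safeLshr(0xFFFFFFFFu, 32); // would be UB as a raw `>>`; returns 0 here
  return 0;
}
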

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-04-14-063029-18377-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use"
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeNumElements(Pow2NElts);
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
61
62/// \returns true if this is an odd sized vector which should widen by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99 };
100}
101
102static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 unsigned Size = Ty.getSizeInBits();
107 unsigned Pieces = (Size + 63) / 64;
108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110 };
111}
112
113// Increase the number of vector elements to reach the next multiple of 32-bit
114// type.
115static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116 return [=](const LegalityQuery &Query) {
117 const LLT Ty = Query.Types[TypeIdx];
118
119 const LLT EltTy = Ty.getElementType();
120 const int Size = Ty.getSizeInBits();
121 const int EltSize = EltTy.getSizeInBits();
122 const int NextMul32 = (Size + 31) / 32;
123
124 assert(EltSize < 32);
125
126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128 };
129}
130
131static LLT getBitcastRegisterType(const LLT Ty) {
132 const unsigned Size = Ty.getSizeInBits();
133
134 LLT CoercedTy;
135 if (Size <= 32) {
136 // <2 x s8> -> s16
137 // <4 x s8> -> s32
138 return LLT::scalar(Size);
139 }
140
141 return LLT::scalarOrVector(Size / 32, 32);
142}
143
144static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 const LLT Ty = Query.Types[TypeIdx];
147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148 };
149}
150
151static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152 return [=](const LegalityQuery &Query) {
153 const LLT Ty = Query.Types[TypeIdx];
154 unsigned Size = Ty.getSizeInBits();
155 assert(Size % 32 == 0);
156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157 };
158}
159
160static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161 return [=](const LegalityQuery &Query) {
162 const LLT QueryTy = Query.Types[TypeIdx];
163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164 };
165}
166
167static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168 return [=](const LegalityQuery &Query) {
169 const LLT QueryTy = Query.Types[TypeIdx];
170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171 };
172}
173
174static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175 return [=](const LegalityQuery &Query) {
176 const LLT QueryTy = Query.Types[TypeIdx];
177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178 };
179}
180
181static bool isRegisterSize(unsigned Size) {
182 return Size % 32 == 0 && Size <= MaxRegisterSize;
183}
184
185static bool isRegisterVectorElementType(LLT EltTy) {
186 const int EltSize = EltTy.getSizeInBits();
187 return EltSize == 16 || EltSize % 32 == 0;
188}
189
190static bool isRegisterVectorType(LLT Ty) {
191 const int EltSize = Ty.getElementType().getSizeInBits();
192 return EltSize == 32 || EltSize == 64 ||
193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194 EltSize == 128 || EltSize == 256;
195}
196
197static bool isRegisterType(LLT Ty) {
198 if (!isRegisterSize(Ty.getSizeInBits()))
199 return false;
200
201 if (Ty.isVector())
202 return isRegisterVectorType(Ty);
203
204 return true;
205}
206
207// Any combination of 32 or 64-bit elements up to the maximum register size, and
208// multiples of v2s16.
209static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210 return [=](const LegalityQuery &Query) {
211 return isRegisterType(Query.Types[TypeIdx]);
212 };
213}
214
215static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216 return [=](const LegalityQuery &Query) {
217 const LLT QueryTy = Query.Types[TypeIdx];
218 if (!QueryTy.isVector())
219 return false;
220 const LLT EltTy = QueryTy.getElementType();
221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222 };
223}
224
225static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226 return [=](const LegalityQuery &Query) {
227 const LLT Ty = Query.Types[TypeIdx];
228 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230 };
231}
232
233// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234// handle some operations by just promoting the register during
235// selection. There are also d16 loads on GFX9+ which preserve the high bits.
236static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237 bool IsLoad) {
238 switch (AS) {
239 case AMDGPUAS::PRIVATE_ADDRESS:
240 // FIXME: Private element size.
241 return ST.enableFlatScratch() ? 128 : 32;
242 case AMDGPUAS::LOCAL_ADDRESS:
243 return ST.useDS128() ? 128 : 64;
244 case AMDGPUAS::GLOBAL_ADDRESS:
245 case AMDGPUAS::CONSTANT_ADDRESS:
246 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247 // Treat constant and global as identical. SMRD loads are sometimes usable for
248 // global loads (ideally constant address space should be eliminated)
249 // depending on the context. Legality cannot be context dependent, but
250 // RegBankSelect can split the load as necessary depending on the pointer
251 // register bank/uniformity and if the memory is invariant or not written in a
252 // kernel.
253 return IsLoad ? 512 : 128;
254 default:
255 // Flat addresses may contextually need to be split to 32-bit parts if they
256 // may alias scratch depending on the subtarget.
257 return 128;
258 }
259}
260
261static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262 const LegalityQuery &Query,
263 unsigned Opcode) {
264 const LLT Ty = Query.Types[0];
265
266 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267 const bool IsLoad = Opcode != AMDGPU::G_STORE;
268
269 unsigned RegSize = Ty.getSizeInBits();
270 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272 unsigned AS = Query.Types[1].getAddressSpace();
273
274 // All of these need to be custom lowered to cast the pointer operand.
275 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276 return false;
277
278 // TODO: We should be able to widen loads if the alignment is high enough, but
279 // we also need to modify the memory access size.
280#if 0
281 // Accept widening loads based on alignment.
282 if (IsLoad && MemSize < Size)
283 MemSize = std::max(MemSize, Align);
284#endif
285
286 // Only 1-byte and 2-byte to 32-bit extloads are valid.
287 if (MemSize != RegSize && RegSize != 32)
288 return false;
289
290 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291 return false;
292
293 switch (MemSize) {
294 case 8:
295 case 16:
296 case 32:
297 case 64:
298 case 128:
299 break;
300 case 96:
301 if (!ST.hasDwordx3LoadStores())
302 return false;
303 break;
304 case 256:
305 case 512:
306 // These may contextually need to be broken down.
307 break;
308 default:
309 return false;
310 }
311
312 assert(RegSize >= MemSize);
313
314 if (AlignBits < MemSize) {
315 const SITargetLowering *TLI = ST.getTargetLowering();
316 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317 Align(AlignBits / 8)))
318 return false;
319 }
320
321 return true;
322}
323
324// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
325// workaround this. Eventually it should ignore the type for loads and only care
326// about the size. Return true in cases where we will workaround this for now by
327// bitcasting.
328static bool loadStoreBitcastWorkaround(const LLT Ty) {
329 if (EnableNewLegality)
330 return false;
331
332 const unsigned Size = Ty.getSizeInBits();
333 if (Size <= 64)
334 return false;
335 if (!Ty.isVector())
336 return true;
337
338 LLT EltTy = Ty.getElementType();
339 if (EltTy.isPointer())
340 return true;
341
342 unsigned EltSize = EltTy.getSizeInBits();
343 return EltSize != 32 && EltSize != 64;
344}
345
346static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347 unsigned Opcode) {
348 const LLT Ty = Query.Types[0];
349 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350 !loadStoreBitcastWorkaround(Ty);
351}
352
353/// Return true if a load or store of the type should be lowered with a bitcast
354/// to a different type.
355static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356 const unsigned MemSizeInBits) {
357 const unsigned Size = Ty.getSizeInBits();
358 if (Size != MemSizeInBits)
359 return Size <= 32 && Ty.isVector();
360
361 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362 return true;
363 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364 !isRegisterVectorElementType(Ty.getElementType());
365}
366
367/// Return true if we should legalize a load by widening an odd sized memory
368/// access up to the alignment. Note this case when the memory access itself
369/// changes, not the size of the result register.
370static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371 unsigned AlignInBits, unsigned AddrSpace,
372 unsigned Opcode) {
373 // We don't want to widen cases that are naturally legal.
374 if (isPowerOf2_32(SizeInBits))
375 return false;
376
377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378 // end up widening these for a scalar load during RegBankSelect, since there
379 // aren't 96-bit scalar loads.
380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381 return false;
382
383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384 return false;
385
386 // A load is known dereferenceable up to the alignment, so it's legal to widen
387 // to it.
388 //
389 // TODO: Could check dereferenceable for less aligned cases.
390 unsigned RoundedSize = NextPowerOf2(SizeInBits);
391 if (AlignInBits < RoundedSize)
392 return false;
393
394 // Do not widen if it would introduce a slow unaligned load.
395 const SITargetLowering *TLI = ST.getTargetLowering();
396 bool Fast = false;
397 return TLI->allowsMisalignedMemoryAccessesImpl(
398 RoundedSize, AddrSpace, Align(AlignInBits / 8),
399 MachineMemOperand::MOLoad, &Fast) &&
400 Fast;
401}
402
403static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404 unsigned Opcode) {
405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406 return false;
407
408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409 Query.MMODescrs[0].AlignInBits,
410 Query.Types[1].getAddressSpace(), Opcode);
411}
412
413AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414 const GCNTargetMachine &TM)
415 : ST(ST_) {
416 using namespace TargetOpcode;
417
418 auto GetAddrSpacePtr = [&TM](unsigned AS) {
419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420 };
421
422 const LLT S1 = LLT::scalar(1);
423 const LLT S8 = LLT::scalar(8);
424 const LLT S16 = LLT::scalar(16);
425 const LLT S32 = LLT::scalar(32);
426 const LLT S64 = LLT::scalar(64);
427 const LLT S128 = LLT::scalar(128);
428 const LLT S256 = LLT::scalar(256);
429 const LLT S512 = LLT::scalar(512);
430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431
432 const LLT V2S8 = LLT::vector(2, 8);
433 const LLT V2S16 = LLT::vector(2, 16);
434 const LLT V4S16 = LLT::vector(4, 16);
435
436 const LLT V2S32 = LLT::vector(2, 32);
437 const LLT V3S32 = LLT::vector(3, 32);
438 const LLT V4S32 = LLT::vector(4, 32);
439 const LLT V5S32 = LLT::vector(5, 32);
440 const LLT V6S32 = LLT::vector(6, 32);
441 const LLT V7S32 = LLT::vector(7, 32);
442 const LLT V8S32 = LLT::vector(8, 32);
443 const LLT V9S32 = LLT::vector(9, 32);
444 const LLT V10S32 = LLT::vector(10, 32);
445 const LLT V11S32 = LLT::vector(11, 32);
446 const LLT V12S32 = LLT::vector(12, 32);
447 const LLT V13S32 = LLT::vector(13, 32);
448 const LLT V14S32 = LLT::vector(14, 32);
449 const LLT V15S32 = LLT::vector(15, 32);
450 const LLT V16S32 = LLT::vector(16, 32);
451 const LLT V32S32 = LLT::vector(32, 32);
452
453 const LLT V2S64 = LLT::vector(2, 64);
454 const LLT V3S64 = LLT::vector(3, 64);
455 const LLT V4S64 = LLT::vector(4, 64);
456 const LLT V5S64 = LLT::vector(5, 64);
457 const LLT V6S64 = LLT::vector(6, 64);
458 const LLT V7S64 = LLT::vector(7, 64);
459 const LLT V8S64 = LLT::vector(8, 64);
460 const LLT V16S64 = LLT::vector(16, 64);
461
462 std::initializer_list<LLT> AllS32Vectors =
463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465 std::initializer_list<LLT> AllS64Vectors =
466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467
468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475
476 const LLT CodePtr = FlatPtr;
477
478 const std::initializer_list<LLT> AddrSpaces64 = {
479 GlobalPtr, ConstantPtr, FlatPtr
480 };
481
482 const std::initializer_list<LLT> AddrSpaces32 = {
483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484 };
485
486 const std::initializer_list<LLT> FPTypesBase = {
487 S32, S64
488 };
489
490 const std::initializer_list<LLT> FPTypes16 = {
491 S32, S64, S16
492 };
493
494 const std::initializer_list<LLT> FPTypesPK16 = {
495 S32, S64, S16, V2S16
496 };
497
498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499
500 setAction({G_BRCOND, S1}, Legal); // VCC branches
501 setAction({G_BRCOND, S32}, Legal); // SCC branches
502
503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504 // elements for v3s16
505 getActionDefinitionsBuilder(G_PHI)
506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507 .legalFor(AllS32Vectors)
508 .legalFor(AllS64Vectors)
509 .legalFor(AddrSpaces64)
510 .legalFor(AddrSpaces32)
511 .legalIf(isPointer(0))
512 .clampScalar(0, S16, S256)
513 .widenScalarToNextPow2(0, 32)
514 .clampMaxNumElements(0, S32, 16)
515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516 .scalarize(0);
517
518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519 // Full set of gfx9 features.
520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521 .legalFor({S32, S16, V2S16})
522 .clampScalar(0, S16, S32)
523 .clampMaxNumElements(0, S16, 2)
524 .scalarize(0)
525 .widenScalarToNextPow2(0, 32);
526
527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528 .legalFor({S32, S16, V2S16}) // Clamp modifier
529 .minScalarOrElt(0, S16)
530 .clampMaxNumElements(0, S16, 2)
531 .scalarize(0)
532 .widenScalarToNextPow2(0, 32)
533 .lower();
534 } else if (ST.has16BitInsts()) {
535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536 .legalFor({S32, S16})
537 .clampScalar(0, S16, S32)
538 .scalarize(0)
539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540
541 // Technically the saturating operations require clamp bit support, but this
542 // was introduced at the same time as 16-bit operations.
543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544 .legalFor({S32, S16}) // Clamp modifier
545 .minScalar(0, S16)
546 .scalarize(0)
547 .widenScalarToNextPow2(0, 16)
548 .lower();
549
550 // We're just lowering this, but it helps get a better result to try to
551 // coerce to the desired type first.
552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553 .minScalar(0, S16)
554 .scalarize(0)
555 .lower();
556 } else {
557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558 .legalFor({S32})
559 .clampScalar(0, S32, S32)
560 .scalarize(0);
561
562 if (ST.hasIntClamp()) {
563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564 .legalFor({S32}) // Clamp modifier.
565 .scalarize(0)
566 .minScalarOrElt(0, S32)
567 .lower();
568 } else {
569 // Clamp bit support was added in VI, along with 16-bit operations.
570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571 .minScalar(0, S32)
572 .scalarize(0)
573 .lower();
574 }
575
576 // FIXME: DAG expansion gets better results. The widening uses the smaller
577 // range values and goes for the min/max lowering directly.
578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579 .minScalar(0, S32)
580 .scalarize(0)
581 .lower();
582 }
583
584 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585 .customFor({S32, S64})
586 .clampScalar(0, S32, S64)
587 .widenScalarToNextPow2(0, 32)
588 .scalarize(0);
589
590 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591 .legalFor({S32})
592 .maxScalarOrElt(0, S32);
593
594 if (ST.hasVOP3PInsts()) {
595 Mulh
596 .clampMaxNumElements(0, S8, 2)
597 .lowerFor({V2S8});
598 }
599
600 Mulh
601 .scalarize(0)
602 .lower();
603
604 // Report legal for any types we can handle anywhere. For the cases only legal
605 // on the SALU, RegBankSelect will be able to re-legalize.
606 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608 .clampScalar(0, S32, S64)
609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611 .widenScalarToNextPow2(0)
612 .scalarize(0);
613
614 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616 .legalFor({{S32, S1}, {S32, S32}})
617 .minScalar(0, S32)
618 // TODO: .scalarize(0)
619 .lower();
620
621 getActionDefinitionsBuilder(G_BITCAST)
622 // Don't worry about the size constraint.
623 .legalIf(all(isRegisterType(0), isRegisterType(1)))
624 .lower();
625
626
627 getActionDefinitionsBuilder(G_CONSTANT)
628 .legalFor({S1, S32, S64, S16, GlobalPtr,
629 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630 .legalIf(isPointer(0))
631 .clampScalar(0, S32, S64)
632 .widenScalarToNextPow2(0);
633
634 getActionDefinitionsBuilder(G_FCONSTANT)
635 .legalFor({S32, S64, S16})
636 .clampScalar(0, S16, S64);
637
638 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639 .legalIf(isRegisterType(0))
640 // s1 and s16 are special cases because they have legal operations on
641 // them, but don't really occupy registers in the normal way.
642 .legalFor({S1, S16})
643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644 .clampScalarOrElt(0, S32, MaxScalar)
645 .widenScalarToNextPow2(0, 32)
646 .clampMaxNumElements(0, S32, 16);
647
648 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649
650 // If the amount is divergent, we have to do a wave reduction to get the
651 // maximum value, so this is expanded during RegBankSelect.
652 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653 .legalFor({{PrivatePtr, S32}});
654
655 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656 .customIf(typeIsNot(0, PrivatePtr));
657
658 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659
660 auto &FPOpActions = getActionDefinitionsBuilder(
661 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662 .legalFor({S32, S64});
663 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664 .customFor({S32, S64});
665 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666 .customFor({S32, S64});
667
668 if (ST.has16BitInsts()) {
669 if (ST.hasVOP3PInsts())
670 FPOpActions.legalFor({S16, V2S16});
671 else
672 FPOpActions.legalFor({S16});
673
674 TrigActions.customFor({S16});
675 FDIVActions.customFor({S16});
676 }
677
678 auto &MinNumMaxNum = getActionDefinitionsBuilder({
679 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680
681 if (ST.hasVOP3PInsts()) {
682 MinNumMaxNum.customFor(FPTypesPK16)
683 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684 .clampMaxNumElements(0, S16, 2)
685 .clampScalar(0, S16, S64)
686 .scalarize(0);
687 } else if (ST.has16BitInsts()) {
688 MinNumMaxNum.customFor(FPTypes16)
689 .clampScalar(0, S16, S64)
690 .scalarize(0);
691 } else {
692 MinNumMaxNum.customFor(FPTypesBase)
693 .clampScalar(0, S32, S64)
694 .scalarize(0);
695 }
696
697 if (ST.hasVOP3PInsts())
698 FPOpActions.clampMaxNumElements(0, S16, 2);
699
700 FPOpActions
701 .scalarize(0)
702 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703
704 TrigActions
705 .scalarize(0)
706 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707
708 FDIVActions
709 .scalarize(0)
710 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711
712 getActionDefinitionsBuilder({G_FNEG, G_FABS})
713 .legalFor(FPTypesPK16)
714 .clampMaxNumElements(0, S16, 2)
715 .scalarize(0)
716 .clampScalar(0, S16, S64);
717
718 if (ST.has16BitInsts()) {
719 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720 .legalFor({S32, S64, S16})
721 .scalarize(0)
722 .clampScalar(0, S16, S64);
723 } else {
724 getActionDefinitionsBuilder(G_FSQRT)
725 .legalFor({S32, S64})
726 .scalarize(0)
727 .clampScalar(0, S32, S64);
728
729 if (ST.hasFractBug()) {
730 getActionDefinitionsBuilder(G_FFLOOR)
731 .customFor({S64})
732 .legalFor({S32, S64})
733 .scalarize(0)
734 .clampScalar(0, S32, S64);
735 } else {
736 getActionDefinitionsBuilder(G_FFLOOR)
737 .legalFor({S32, S64})
738 .scalarize(0)
739 .clampScalar(0, S32, S64);
740 }
741 }
742
743 getActionDefinitionsBuilder(G_FPTRUNC)
744 .legalFor({{S32, S64}, {S16, S32}})
745 .scalarize(0)
746 .lower();
747
748 getActionDefinitionsBuilder(G_FPEXT)
749 .legalFor({{S64, S32}, {S32, S16}})
750 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751 .scalarize(0);
752
753 getActionDefinitionsBuilder(G_FSUB)
754 // Use actual fsub instruction
755 .legalFor({S32})
756 // Must use fadd + fneg
757 .lowerFor({S64, S16, V2S16})
758 .scalarize(0)
759 .clampScalar(0, S32, S64);
760
761 // Whether this is legal depends on the floating point mode for the function.
762 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764 FMad.customFor({S32, S16});
765 else if (ST.hasMadMacF32Insts())
766 FMad.customFor({S32});
767 else if (ST.hasMadF16())
768 FMad.customFor({S16});
769 FMad.scalarize(0)
770 .lower();
771
772 auto &FRem = getActionDefinitionsBuilder(G_FREM);
773 if (ST.has16BitInsts()) {
774 FRem.customFor({S16, S32, S64});
775 } else {
776 FRem.minScalar(0, S32)
777 .customFor({S32, S64});
778 }
779 FRem.scalarize(0);
780
781 // TODO: Do we need to clamp maximum bitwidth?
782 getActionDefinitionsBuilder(G_TRUNC)
783 .legalIf(isScalar(0))
784 .legalFor({{V2S16, V2S32}})
785 .clampMaxNumElements(0, S16, 2)
786 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787 // situations (like an invalid implicit use), we don't want to infinite loop
788 // in the legalizer.
789 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790 .alwaysLegal();
791
792 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794 {S32, S1}, {S64, S1}, {S16, S1}})
795 .scalarize(0)
796 .clampScalar(0, S32, S64)
797 .widenScalarToNextPow2(1, 32);
798
799 // TODO: Split s1->s64 during regbankselect for VALU.
800 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802 .lowerFor({{S32, S64}})
803 .lowerIf(typeIs(1, S1))
804 .customFor({{S64, S64}});
805 if (ST.has16BitInsts())
806 IToFP.legalFor({{S16, S16}});
807 IToFP.clampScalar(1, S32, S64)
808 .minScalar(0, S32)
809 .scalarize(0)
810 .widenScalarToNextPow2(1);
811
812 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814 .customFor({{S64, S64}})
815 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816 if (ST.has16BitInsts())
817 FPToI.legalFor({{S16, S16}});
818 else
819 FPToI.minScalar(1, S32);
820
821 FPToI.minScalar(0, S32)
822 .scalarize(0)
823 .lower();
824
825 // Lower roundeven into G_FRINT
826 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
827 .scalarize(0)
828 .lower();
829
830 if (ST.has16BitInsts()) {
831 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832 .legalFor({S16, S32, S64})
833 .clampScalar(0, S16, S64)
834 .scalarize(0);
835 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
836 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
837 .legalFor({S32, S64})
838 .clampScalar(0, S32, S64)
839 .scalarize(0);
840 } else {
841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842 .legalFor({S32})
843 .customFor({S64})
844 .clampScalar(0, S32, S64)
845 .scalarize(0);
846 }
847
848 getActionDefinitionsBuilder(G_PTR_ADD)
849 .legalIf(all(isPointer(0), sameSize(0, 1)))
850 .scalarize(0)
851 .scalarSameSizeAs(1, 0);
852
853 getActionDefinitionsBuilder(G_PTRMASK)
854 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
855 .scalarSameSizeAs(1, 0)
856 .scalarize(0);
857
858 auto &CmpBuilder =
859 getActionDefinitionsBuilder(G_ICMP)
860 // The compare output type differs based on the register bank of the output,
861 // so make both s1 and s32 legal.
862 //
863 // Scalar compares producing output in scc will be promoted to s32, as that
864 // is the allocatable register type that will be needed for the copy from
865 // scc. This will be promoted during RegBankSelect, and we assume something
866 // before that won't try to use s32 result types.
867 //
868 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
869 // bank.
870 .legalForCartesianProduct(
871 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
872 .legalForCartesianProduct(
873 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
874 if (ST.has16BitInsts()) {
875 CmpBuilder.legalFor({{S1, S16}});
876 }
877
878 CmpBuilder
879 .widenScalarToNextPow2(1)
880 .clampScalar(1, S32, S64)
881 .scalarize(0)
882 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
883
884 getActionDefinitionsBuilder(G_FCMP)
885 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
886 .widenScalarToNextPow2(1)
887 .clampScalar(1, S32, S64)
888 .scalarize(0);
889
890 // FIXME: fpow has a selection pattern that should move to custom lowering.
891 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
892 if (ST.has16BitInsts())
893 Exp2Ops.legalFor({S32, S16});
894 else
895 Exp2Ops.legalFor({S32});
896 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
897 Exp2Ops.scalarize(0);
898
899 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
900 if (ST.has16BitInsts())
901 ExpOps.customFor({{S32}, {S16}});
902 else
903 ExpOps.customFor({S32});
904 ExpOps.clampScalar(0, MinScalarFPTy, S32)
905 .scalarize(0);
906
907 getActionDefinitionsBuilder(G_FPOWI)
908 .clampScalar(0, MinScalarFPTy, S32)
909 .lower();
910
911 // The 64-bit versions produce 32-bit results, but only on the SALU.
912 getActionDefinitionsBuilder(G_CTPOP)
913 .legalFor({{S32, S32}, {S32, S64}})
914 .clampScalar(0, S32, S32)
915 .clampScalar(1, S32, S64)
916 .scalarize(0)
917 .widenScalarToNextPow2(0, 32)
918 .widenScalarToNextPow2(1, 32);
919
920 // The hardware instructions return a different result on 0 than the generic
921 // instructions expect. The hardware produces -1, but these produce the
922 // bitwidth.
923 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
924 .scalarize(0)
925 .clampScalar(0, S32, S32)
926 .clampScalar(1, S32, S64)
927 .widenScalarToNextPow2(0, 32)
928 .widenScalarToNextPow2(1, 32)
929 .lower();
930
931 // The 64-bit versions produce 32-bit results, but only on the SALU.
932 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
933 .legalFor({{S32, S32}, {S32, S64}})
934 .clampScalar(0, S32, S32)
935 .clampScalar(1, S32, S64)
936 .scalarize(0)
937 .widenScalarToNextPow2(0, 32)
938 .widenScalarToNextPow2(1, 32);
939
940 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
941 // RegBankSelect.
942 getActionDefinitionsBuilder(G_BITREVERSE)
943 .legalFor({S32, S64})
944 .clampScalar(0, S32, S64)
945 .scalarize(0)
946 .widenScalarToNextPow2(0);
947
948 if (ST.has16BitInsts()) {
949 getActionDefinitionsBuilder(G_BSWAP)
950 .legalFor({S16, S32, V2S16})
951 .clampMaxNumElements(0, S16, 2)
952 // FIXME: Fixing non-power-of-2 before clamp is workaround for
953 // narrowScalar limitation.
954 .widenScalarToNextPow2(0)
955 .clampScalar(0, S16, S32)
956 .scalarize(0);
957
958 if (ST.hasVOP3PInsts()) {
959 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
960 .legalFor({S32, S16, V2S16})
961 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
962 .clampMaxNumElements(0, S16, 2)
963 .minScalar(0, S16)
964 .widenScalarToNextPow2(0)
965 .scalarize(0)
966 .lower();
967 } else {
968 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
969 .legalFor({S32, S16})
970 .widenScalarToNextPow2(0)
971 .minScalar(0, S16)
972 .scalarize(0)
973 .lower();
974 }
975 } else {
976 // TODO: Should have same legality without v_perm_b32
977 getActionDefinitionsBuilder(G_BSWAP)
978 .legalFor({S32})
979 .lowerIf(scalarNarrowerThan(0, 32))
980 // FIXME: Fixing non-power-of-2 before clamp is workaround for
981 // narrowScalar limitation.
982 .widenScalarToNextPow2(0)
983 .maxScalar(0, S32)
984 .scalarize(0)
985 .lower();
986
987 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
988 .legalFor({S32})
989 .minScalar(0, S32)
990 .widenScalarToNextPow2(0)
991 .scalarize(0)
992 .lower();
993 }
994
995 getActionDefinitionsBuilder(G_INTTOPTR)
996 // List the common cases
997 .legalForCartesianProduct(AddrSpaces64, {S64})
998 .legalForCartesianProduct(AddrSpaces32, {S32})
999 .scalarize(0)
1000 // Accept any address space as long as the size matches
1001 .legalIf(sameSize(0, 1))
1002 .widenScalarIf(smallerThan(1, 0),
1003 [](const LegalityQuery &Query) {
1004 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1005 })
1006 .narrowScalarIf(largerThan(1, 0),
1007 [](const LegalityQuery &Query) {
1008 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1009 });
1010
1011 getActionDefinitionsBuilder(G_PTRTOINT)
1012 // List the common cases
1013 .legalForCartesianProduct(AddrSpaces64, {S64})
1014 .legalForCartesianProduct(AddrSpaces32, {S32})
1015 .scalarize(0)
1016 // Accept any address space as long as the size matches
1017 .legalIf(sameSize(0, 1))
1018 .widenScalarIf(smallerThan(0, 1),
1019 [](const LegalityQuery &Query) {
1020 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021 })
1022 .narrowScalarIf(
1023 largerThan(0, 1),
1024 [](const LegalityQuery &Query) {
1025 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1026 });
1027
1028 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1029 .scalarize(0)
1030 .custom();
1031
1032 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1033 bool IsLoad) -> bool {
1034 const LLT DstTy = Query.Types[0];
1035
1036 // Split vector extloads.
1037 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1038 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1039
1040 if (MemSize < DstTy.getSizeInBits())
1041 MemSize = std::max(MemSize, AlignBits);
1042
1043 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1044 return true;
1045
1046 const LLT PtrTy = Query.Types[1];
1047 unsigned AS = PtrTy.getAddressSpace();
1048 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1049 return true;
1050
1051 // Catch weird sized loads that don't evenly divide into the access sizes
1052 // TODO: May be able to widen depending on alignment etc.
1053 unsigned NumRegs = (MemSize + 31) / 32;
1054 if (NumRegs == 3) {
1055 if (!ST.hasDwordx3LoadStores())
1056 return true;
1057 } else {
1058 // If the alignment allows, these should have been widened.
1059 if (!isPowerOf2_32(NumRegs))
1060 return true;
1061 }
1062
1063 if (AlignBits < MemSize) {
1064 const SITargetLowering *TLI = ST.getTargetLowering();
1065 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1066 Align(AlignBits / 8));
1067 }
1068
1069 return false;
1070 };
1071
1072 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1073 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1074 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1075
1076 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1077 // LDS
1078 // TODO: Unsupported flat for SI.
1079
1080 for (unsigned Op : {G_LOAD, G_STORE}) {
1081 const bool IsStore = Op == G_STORE;
1082
1083 auto &Actions = getActionDefinitionsBuilder(Op);
1084 // Explicitly list some common cases.
1085 // TODO: Does this help compile time at all?
1086 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1087 {V2S32, GlobalPtr, 64, GlobalAlign32},
1088 {V4S32, GlobalPtr, 128, GlobalAlign32},
1089 {S64, GlobalPtr, 64, GlobalAlign32},
1090 {V2S64, GlobalPtr, 128, GlobalAlign32},
1091 {V2S16, GlobalPtr, 32, GlobalAlign32},
1092 {S32, GlobalPtr, 8, GlobalAlign8},
1093 {S32, GlobalPtr, 16, GlobalAlign16},
1094
1095 {S32, LocalPtr, 32, 32},
1096 {S64, LocalPtr, 64, 32},
1097 {V2S32, LocalPtr, 64, 32},
1098 {S32, LocalPtr, 8, 8},
1099 {S32, LocalPtr, 16, 16},
1100 {V2S16, LocalPtr, 32, 32},
1101
1102 {S32, PrivatePtr, 32, 32},
1103 {S32, PrivatePtr, 8, 8},
1104 {S32, PrivatePtr, 16, 16},
1105 {V2S16, PrivatePtr, 32, 32},
1106
1107 {S32, ConstantPtr, 32, GlobalAlign32},
1108 {V2S32, ConstantPtr, 64, GlobalAlign32},
1109 {V4S32, ConstantPtr, 128, GlobalAlign32},
1110 {S64, ConstantPtr, 64, GlobalAlign32},
1111 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1112 Actions.legalIf(
1113 [=](const LegalityQuery &Query) -> bool {
1114 return isLoadStoreLegal(ST, Query, Op);
1115 });
1116
1117 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1118 // 64-bits.
1119 //
1120 // TODO: Should generalize bitcast action into coerce, which will also cover
1121 // inserting addrspacecasts.
1122 Actions.customIf(typeIs(1, Constant32Ptr));
1123
1124 // Turn any illegal element vectors into something easier to deal
1125 // with. These will ultimately produce 32-bit scalar shifts to extract the
1126 // parts anyway.
1127 //
1128 // For odd 16-bit element vectors, prefer to split those into pieces with
1129 // 16-bit vector parts.
1130 Actions.bitcastIf(
1131 [=](const LegalityQuery &Query) -> bool {
1132 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1133 Query.MMODescrs[0].SizeInBits);
1134 }, bitcastToRegisterType(0));
1135
1136 if (!IsStore) {
1137 // Widen suitably aligned loads by loading extra bytes. The standard
1138 // legalization actions can't properly express widening memory operands.
1139 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1140 return shouldWidenLoad(ST, Query, G_LOAD);
1141 });
1142 }
1143
1144 // FIXME: load/store narrowing should be moved to lower action
1145 Actions
1146 .narrowScalarIf(
1147 [=](const LegalityQuery &Query) -> bool {
1148 return !Query.Types[0].isVector() &&
1149 needToSplitMemOp(Query, Op == G_LOAD);
1150 },
1151 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1152 const LLT DstTy = Query.Types[0];
1153 const LLT PtrTy = Query.Types[1];
1154
1155 const unsigned DstSize = DstTy.getSizeInBits();
1156 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1157
1158 // Split extloads.
1159 if (DstSize > MemSize)
1160 return std::make_pair(0, LLT::scalar(MemSize));
1161
1162 if (!isPowerOf2_32(DstSize)) {
1163 // We're probably decomposing an odd sized store. Try to split
1164 // to the widest type. TODO: Account for alignment. As-is it
1165 // should be OK, since the new parts will be further legalized.
1166 unsigned FloorSize = PowerOf2Floor(DstSize);
1167 return std::make_pair(0, LLT::scalar(FloorSize));
1168 }
1169
1170 if (DstSize > 32 && (DstSize % 32 != 0)) {
1171 // FIXME: Need a way to specify non-extload of larger size if
1172 // suitably aligned.
1173 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1174 }
1175
1176 unsigned MaxSize = maxSizeForAddrSpace(ST,
1177 PtrTy.getAddressSpace(),
1178 Op == G_LOAD);
1179 if (MemSize > MaxSize)
1180 return std::make_pair(0, LLT::scalar(MaxSize));
1181
1182 unsigned Align = Query.MMODescrs[0].AlignInBits;
1183 return std::make_pair(0, LLT::scalar(Align));
1184 })
1185 .fewerElementsIf(
1186 [=](const LegalityQuery &Query) -> bool {
1187 return Query.Types[0].isVector() &&
1188 needToSplitMemOp(Query, Op == G_LOAD);
1189 },
1190 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1191 const LLT DstTy = Query.Types[0];
1192 const LLT PtrTy = Query.Types[1];
1193
1194 LLT EltTy = DstTy.getElementType();
1195 unsigned MaxSize = maxSizeForAddrSpace(ST,
1196 PtrTy.getAddressSpace(),
1197 Op == G_LOAD);
1198
1199 // FIXME: Handle widened to power of 2 results better. This ends
1200 // up scalarizing.
1201 // FIXME: 3 element stores scalarized on SI
1202
1203 // Split if it's too large for the address space.
1204 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1205 unsigned NumElts = DstTy.getNumElements();
1206 unsigned EltSize = EltTy.getSizeInBits();
1207
1208 if (MaxSize % EltSize == 0) {
1209 return std::make_pair(
1210 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1211 }
1212
1213 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1214
1215 // FIXME: Refine when odd breakdowns handled
1216 // The scalars will need to be re-legalized.
1217 if (NumPieces == 1 || NumPieces >= NumElts ||
1218 NumElts % NumPieces != 0)
1219 return std::make_pair(0, EltTy);
1220
1221 return std::make_pair(0,
1222 LLT::vector(NumElts / NumPieces, EltTy));
1223 }
1224
1225 // FIXME: We could probably handle weird extending loads better.
1226 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1227 if (DstTy.getSizeInBits() > MemSize)
1228 return std::make_pair(0, EltTy);
1229
1230 unsigned EltSize = EltTy.getSizeInBits();
1231 unsigned DstSize = DstTy.getSizeInBits();
1232 if (!isPowerOf2_32(DstSize)) {
1233 // We're probably decomposing an odd sized store. Try to split
1234 // to the widest type. TODO: Account for alignment. As-is it
1235 // should be OK, since the new parts will be further legalized.
1236 unsigned FloorSize = PowerOf2Floor(DstSize);
1237 return std::make_pair(
1238 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1239 }
1240
1241 // Need to split because of alignment.
1242 unsigned Align = Query.MMODescrs[0].AlignInBits;
1243 if (EltSize > Align &&
1244 (EltSize / Align < DstTy.getNumElements())) {
1245 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1246 }
1247
1248 // May need relegalization for the scalars.
1249 return std::make_pair(0, EltTy);
1250 })
1251 .lowerIfMemSizeNotPow2()
1252 .minScalar(0, S32);
1253
1254 if (IsStore)
1255 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1256
1257 Actions
1258 .widenScalarToNextPow2(0)
1259 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1260 .lower();
1261 }
1262
1263 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1264 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1265 {S32, GlobalPtr, 16, 2 * 8},
1266 {S32, LocalPtr, 8, 8},
1267 {S32, LocalPtr, 16, 16},
1268 {S32, PrivatePtr, 8, 8},
1269 {S32, PrivatePtr, 16, 16},
1270 {S32, ConstantPtr, 8, 8},
1271 {S32, ConstantPtr, 16, 2 * 8}});
1272 if (ST.hasFlatAddressSpace()) {
1273 ExtLoads.legalForTypesWithMemDesc(
1274 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1275 }
1276
1277 ExtLoads.clampScalar(0, S32, S32)
1278 .widenScalarToNextPow2(0)
1279 .unsupportedIfMemSizeNotPow2()
1280 .lower();
1281
1282 auto &Atomics = getActionDefinitionsBuilder(
1283 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1284 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1285 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1286 G_ATOMICRMW_UMIN})
1287 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1288 {S64, GlobalPtr}, {S64, LocalPtr},
1289 {S32, RegionPtr}, {S64, RegionPtr}});
1290 if (ST.hasFlatAddressSpace()) {
1291 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1292 }
1293
1294 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1295 if (ST.hasLDSFPAtomics()) {
1296 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1297 if (ST.hasGFX90AInsts())
1298 Atomic.legalFor({{S64, LocalPtr}});
1299 }
1300 if (ST.hasAtomicFaddInsts())
1301 Atomic.legalFor({{S32, GlobalPtr}});
1302
1303 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1304 // demarshalling
1305 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1306 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1307 {S32, FlatPtr}, {S64, FlatPtr}})
1308 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1309 {S32, RegionPtr}, {S64, RegionPtr}});
1310 // TODO: Pointer types, any 32-bit or 64-bit vector
1311
1312 // Condition should be s32 for scalar, s1 for vector.
1313 getActionDefinitionsBuilder(G_SELECT)
1314 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1315 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1316 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1317 .clampScalar(0, S16, S64)
1318 .scalarize(1)
1319 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1320 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1321 .clampMaxNumElements(0, S32, 2)
1322 .clampMaxNumElements(0, LocalPtr, 2)
1323 .clampMaxNumElements(0, PrivatePtr, 2)
1324 .scalarize(0)
1325 .widenScalarToNextPow2(0)
1326 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1327
1328 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1329 // be more flexible with the shift amount type.
1330 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1331 .legalFor({{S32, S32}, {S64, S32}});
1332 if (ST.has16BitInsts()) {
1333 if (ST.hasVOP3PInsts()) {
1334 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1335 .clampMaxNumElements(0, S16, 2);
1336 } else
1337 Shifts.legalFor({{S16, S16}});
1338
1339 // TODO: Support 16-bit shift amounts for all types
1340 Shifts.widenScalarIf(
1341 [=](const LegalityQuery &Query) {
1342 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1343 // 32-bit amount.
1344 const LLT ValTy = Query.Types[0];
1345 const LLT AmountTy = Query.Types[1];
1346 return ValTy.getSizeInBits() <= 16 &&
1347 AmountTy.getSizeInBits() < 16;
1348 }, changeTo(1, S16));
1349 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1350 Shifts.clampScalar(1, S32, S32);
1351 Shifts.clampScalar(0, S16, S64);
1352 Shifts.widenScalarToNextPow2(0, 16);
1353
1354 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1355 .minScalar(0, S16)
1356 .scalarize(0)
1357 .lower();
1358 } else {
1359 // Make sure we legalize the shift amount type first, as the general
1360 // expansion for the shifted type will produce much worse code if it hasn't
1361 // been truncated already.
1362 Shifts.clampScalar(1, S32, S32);
1363 Shifts.clampScalar(0, S32, S64);
1364 Shifts.widenScalarToNextPow2(0, 32);
1365
1366 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1367 .minScalar(0, S32)
1368 .scalarize(0)
1369 .lower();
1370 }
1371 Shifts.scalarize(0);
1372
1373 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1374 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1375 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1376 unsigned IdxTypeIdx = 2;
1377
1378 getActionDefinitionsBuilder(Op)
1379 .customIf([=](const LegalityQuery &Query) {
1380 const LLT EltTy = Query.Types[EltTypeIdx];
1381 const LLT VecTy = Query.Types[VecTypeIdx];
1382 const LLT IdxTy = Query.Types[IdxTypeIdx];
1383 const unsigned EltSize = EltTy.getSizeInBits();
1384 return (EltSize == 32 || EltSize == 64) &&
1385 VecTy.getSizeInBits() % 32 == 0 &&
1386 VecTy.getSizeInBits() <= MaxRegisterSize &&
1387 IdxTy.getSizeInBits() == 32;
1388 })
1389 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1390 bitcastToVectorElement32(VecTypeIdx))
1391 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1392 .bitcastIf(
1393 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1394 [=](const LegalityQuery &Query) {
1395 // For > 64-bit element types, try to turn this into a 64-bit
1396 // element vector since we may be able to do better indexing
1397 // if this is scalar. If not, fall back to 32.
1398 const LLT EltTy = Query.Types[EltTypeIdx];
1399 const LLT VecTy = Query.Types[VecTypeIdx];
1400 const unsigned DstEltSize = EltTy.getSizeInBits();
1401 const unsigned VecSize = VecTy.getSizeInBits();
1402
1403 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1404 return std::make_pair(
1405 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1406 })
1407 .clampScalar(EltTypeIdx, S32, S64)
1408 .clampScalar(VecTypeIdx, S32, S64)
1409 .clampScalar(IdxTypeIdx, S32, S32)
1410 .clampMaxNumElements(VecTypeIdx, S32, 32)
1411 // TODO: Clamp elements for 64-bit vectors?
1412 // It should only be necessary with variable indexes.
1413 // As a last resort, lower to the stack
1414 .lower();
1415 }
1416
1417 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1418 .unsupportedIf([=](const LegalityQuery &Query) {
1419 const LLT &EltTy = Query.Types[1].getElementType();
1420 return Query.Types[0] != EltTy;
1421 });
1422
1423 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1424 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1425 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1426
1427 // FIXME: Doesn't handle extract of illegal sizes.
1428 getActionDefinitionsBuilder(Op)
1429 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1430 // FIXME: Multiples of 16 should not be legal.
1431 .legalIf([=](const LegalityQuery &Query) {
1432 const LLT BigTy = Query.Types[BigTyIdx];
1433 const LLT LitTy = Query.Types[LitTyIdx];
1434 return (BigTy.getSizeInBits() % 32 == 0) &&
1435 (LitTy.getSizeInBits() % 16 == 0);
1436 })
1437 .widenScalarIf(
1438 [=](const LegalityQuery &Query) {
1439 const LLT BigTy = Query.Types[BigTyIdx];
1440 return (BigTy.getScalarSizeInBits() < 16);
1441 },
1442 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1443 .widenScalarIf(
1444 [=](const LegalityQuery &Query) {
1445 const LLT LitTy = Query.Types[LitTyIdx];
1446 return (LitTy.getScalarSizeInBits() < 16);
1447 },
1448 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1449 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1450 .widenScalarToNextPow2(BigTyIdx, 32);
1451
1452 }
1453
1454 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1455 .legalForCartesianProduct(AllS32Vectors, {S32})
1456 .legalForCartesianProduct(AllS64Vectors, {S64})
1457 .clampNumElements(0, V16S32, V32S32)
1458 .clampNumElements(0, V2S64, V16S64)
1459 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1460
1461 if (ST.hasScalarPackInsts()) {
1462 BuildVector
1463 // FIXME: Should probably widen s1 vectors straight to s32
1464 .minScalarOrElt(0, S16)
1465 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1466 .minScalar(1, S32);
1467
1468 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1469 .legalFor({V2S16, S32})
1470 .lower();
1471 BuildVector.minScalarOrElt(0, S32);
1472 } else {
1473 BuildVector.customFor({V2S16, S16});
1474 BuildVector.minScalarOrElt(0, S32);
1475
1476 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1477 .customFor({V2S16, S32})
1478 .lower();
1479 }
1480
1481 BuildVector.legalIf(isRegisterType(0));
1482
1483 // FIXME: Clamp maximum size
1484 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1485 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1486 .clampMaxNumElements(0, S32, 32)
1487 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1488 .clampMaxNumElements(0, S16, 64);
1489
1490 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1491 // pre-legalize.
1492 if (ST.hasVOP3PInsts()) {
1493 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1494 .customFor({V2S16, V2S16})
1495 .lower();
1496 } else
1497 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1498
1499 // Merge/Unmerge
1500 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1501 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1502 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1503
1504 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1505 const LLT Ty = Query.Types[TypeIdx];
1506 if (Ty.isVector()) {
1507 const LLT &EltTy = Ty.getElementType();
1508 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1509 return true;
1510 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1511 return true;
1512 }
1513 return false;
1514 };
1515
1516 auto &Builder = getActionDefinitionsBuilder(Op)
1517 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1518 .lowerFor({{S16, V2S16}})
1519 .lowerIf([=](const LegalityQuery &Query) {
1520 const LLT BigTy = Query.Types[BigTyIdx];
1521 return BigTy.getSizeInBits() == 32;
1522 })
1523 // Try to widen to s16 first for small types.
1524 // TODO: Only do this on targets with legal s16 shifts
1525 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1526 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1527 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1528 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1529 elementTypeIs(1, S16)),
1530 changeTo(1, V2S16))
1531 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1532 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1533 // valid.
1534 .clampScalar(LitTyIdx, S32, S512)
1535 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1536 // Break up vectors with weird elements into scalars
1537 .fewerElementsIf(
1538 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1539 scalarize(0))
1540 .fewerElementsIf(
1541 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1542 scalarize(1))
1543 .clampScalar(BigTyIdx, S32, MaxScalar);
1544
1545 if (Op == G_MERGE_VALUES) {
1546 Builder.widenScalarIf(
1547 // TODO: Use 16-bit shifts if legal for 8-bit values?
1548 [=](const LegalityQuery &Query) {
1549 const LLT Ty = Query.Types[LitTyIdx];
1550 return Ty.getSizeInBits() < 32;
1551 },
1552 changeTo(LitTyIdx, S32));
1553 }
1554
1555 Builder.widenScalarIf(
1556 [=](const LegalityQuery &Query) {
1557 const LLT Ty = Query.Types[BigTyIdx];
1558 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1559 Ty.getSizeInBits() % 16 != 0;
1560 },
1561 [=](const LegalityQuery &Query) {
1562 // Pick the next power of 2, or a multiple of 64 over 128.
1563 // Whichever is smaller.
1564 const LLT &Ty = Query.Types[BigTyIdx];
1565 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1566 if (NewSizeInBits >= 256) {
1567 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1568 if (RoundedTo < NewSizeInBits)
1569 NewSizeInBits = RoundedTo;
1570 }
1571 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1572 })
1573 // Any vectors left are the wrong size. Scalarize them.
1574 .scalarize(0)
1575 .scalarize(1);
1576 }
1577
1578 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1579 // RegBankSelect.
1580 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1581 .legalFor({{S32}, {S64}});
1582
1583 if (ST.hasVOP3PInsts()) {
1584 SextInReg.lowerFor({{V2S16}})
1585 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1586 // get more vector shift opportunities, since we'll get those when
1587 // expanded.
1588 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1589 } else if (ST.has16BitInsts()) {
1590 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1591 } else {
1592 // Prefer to promote to s32 before lowering if we don't have 16-bit
1593 // shifts. This avoids a lot of intermediate truncate and extend operations.
1594 SextInReg.lowerFor({{S32}, {S64}});
1595 }
1596
1597 SextInReg
1598 .scalarize(0)
1599 .clampScalar(0, S32, S64)
1600 .lower();
1601
1602 // TODO: Only try to form v2s16 with legal packed instructions.
1603 getActionDefinitionsBuilder(G_FSHR)
1604 .legalFor({{S32, S32}})
1605 .lowerFor({{V2S16, V2S16}})
1606 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1607 .scalarize(0)
1608 .lower();
1609
1610 if (ST.hasVOP3PInsts()) {
1611 getActionDefinitionsBuilder(G_FSHL)
1612 .lowerFor({{V2S16, V2S16}})
1613 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1614 .scalarize(0)
1615 .lower();
1616 } else {
1617 getActionDefinitionsBuilder(G_FSHL)
1618 .scalarize(0)
1619 .lower();
1620 }
1621
1622 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1623 .legalFor({S64});
1624
1625 getActionDefinitionsBuilder(G_FENCE)
1626 .alwaysLegal();
1627
1628 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1629 .scalarize(0)
1630 .minScalar(0, S32)
1631 .lower();
1632
1633 getActionDefinitionsBuilder({
1634 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1635 G_FCOPYSIGN,
1636
1637 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1638 G_ATOMICRMW_NAND,
1639 G_ATOMICRMW_FSUB,
1640 G_READ_REGISTER,
1641 G_WRITE_REGISTER,
1642
1643 G_SADDO, G_SSUBO,
1644
1645 // TODO: Implement
1646 G_FMINIMUM, G_FMAXIMUM}).lower();
1647
1648 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1649 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1650 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1651 .unsupported();
1652
1653 computeTables();
1654 verify(*ST.getInstrInfo());
1655}
1656
1657bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1658 MachineInstr &MI) const {
1659 MachineIRBuilder &B = Helper.MIRBuilder;
1660 MachineRegisterInfo &MRI = *B.getMRI();
1661
1662 switch (MI.getOpcode()) {
1663 case TargetOpcode::G_ADDRSPACE_CAST:
1664 return legalizeAddrSpaceCast(MI, MRI, B);
1665 case TargetOpcode::G_FRINT:
1666 return legalizeFrint(MI, MRI, B);
1667 case TargetOpcode::G_FCEIL:
1668 return legalizeFceil(MI, MRI, B);
1669 case TargetOpcode::G_FREM:
1670 return legalizeFrem(MI, MRI, B);
1671 case TargetOpcode::G_INTRINSIC_TRUNC:
1672 return legalizeIntrinsicTrunc(MI, MRI, B);
1673 case TargetOpcode::G_SITOFP:
1674 return legalizeITOFP(MI, MRI, B, true);
1675 case TargetOpcode::G_UITOFP:
1676 return legalizeITOFP(MI, MRI, B, false);
1677 case TargetOpcode::G_FPTOSI:
1678 return legalizeFPTOI(MI, MRI, B, true);
1679 case TargetOpcode::G_FPTOUI:
1680 return legalizeFPTOI(MI, MRI, B, false);
1681 case TargetOpcode::G_FMINNUM:
1682 case TargetOpcode::G_FMAXNUM:
1683 case TargetOpcode::G_FMINNUM_IEEE:
1684 case TargetOpcode::G_FMAXNUM_IEEE:
1685 return legalizeMinNumMaxNum(Helper, MI);
1686 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1687 return legalizeExtractVectorElt(MI, MRI, B);
1688 case TargetOpcode::G_INSERT_VECTOR_ELT:
1689 return legalizeInsertVectorElt(MI, MRI, B);
1690 case TargetOpcode::G_SHUFFLE_VECTOR:
1691 return legalizeShuffleVector(MI, MRI, B);
1692 case TargetOpcode::G_FSIN:
1693 case TargetOpcode::G_FCOS:
1694 return legalizeSinCos(MI, MRI, B);
1695 case TargetOpcode::G_GLOBAL_VALUE:
1696 return legalizeGlobalValue(MI, MRI, B);
1697 case TargetOpcode::G_LOAD:
1698 return legalizeLoad(Helper, MI);
1699 case TargetOpcode::G_FMAD:
1700 return legalizeFMad(MI, MRI, B);
1701 case TargetOpcode::G_FDIV:
1702 return legalizeFDIV(MI, MRI, B);
1703 case TargetOpcode::G_UDIV:
1704 case TargetOpcode::G_UREM:
1705 return legalizeUDIV_UREM(MI, MRI, B);
1706 case TargetOpcode::G_SDIV:
1707 case TargetOpcode::G_SREM:
1708 return legalizeSDIV_SREM(MI, MRI, B);
1709 case TargetOpcode::G_ATOMIC_CMPXCHG:
1710 return legalizeAtomicCmpXChg(MI, MRI, B);
1711 case TargetOpcode::G_FLOG:
1712 return legalizeFlog(MI, B, numbers::ln2f);
1713 case TargetOpcode::G_FLOG10:
1714 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1715 case TargetOpcode::G_FEXP:
1716 return legalizeFExp(MI, B);
1717 case TargetOpcode::G_FPOW:
1718 return legalizeFPow(MI, B);
1719 case TargetOpcode::G_FFLOOR:
1720 return legalizeFFloor(MI, MRI, B);
1721 case TargetOpcode::G_BUILD_VECTOR:
1722 return legalizeBuildVector(MI, MRI, B);
1723 default:
1724 return false;
1725 }
1726
1727 llvm_unreachable("expected switch to return");
1728}
1729
1730Register AMDGPULegalizerInfo::getSegmentAperture(
1731 unsigned AS,
1732 MachineRegisterInfo &MRI,
1733 MachineIRBuilder &B) const {
1734 MachineFunction &MF = B.getMF();
1735 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1736 const LLT S32 = LLT::scalar(32);
1737
1738 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1739
1740 if (ST.hasApertureRegs()) {
1741 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1742 // getreg.
1743 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1744 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1745 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1746 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1747 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1748 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1749 unsigned Encoding =
1750 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1751 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1752 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1753
1754 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1755
1756 B.buildInstr(AMDGPU::S_GETREG_B32)
1757 .addDef(GetReg)
1758 .addImm(Encoding);
1759 MRI.setType(GetReg, S32);
1760
1761 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1762 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1763 }
1764
1765 Register QueuePtr = MRI.createGenericVirtualRegister(
1766 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1767
1768 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1769 return Register();
1770
1771 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1772 // private_segment_aperture_base_hi.
1773 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1774
1775 // TODO: can we be smarter about machine pointer info?
1776 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1777 MachineMemOperand *MMO = MF.getMachineMemOperand(
1778 PtrInfo,
1779 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1780 MachineMemOperand::MOInvariant,
1781 4, commonAlignment(Align(64), StructOffset));
1782
1783 Register LoadAddr;
1784
1785 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1786 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1787}
1788
1789bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1790 MachineInstr &MI, MachineRegisterInfo &MRI,
1791 MachineIRBuilder &B) const {
1792 MachineFunction &MF = B.getMF();
1793
1794 const LLT S32 = LLT::scalar(32);
1795 Register Dst = MI.getOperand(0).getReg();
1796 Register Src = MI.getOperand(1).getReg();
1797
1798 LLT DstTy = MRI.getType(Dst);
1799 LLT SrcTy = MRI.getType(Src);
1800 unsigned DestAS = DstTy.getAddressSpace();
1801 unsigned SrcAS = SrcTy.getAddressSpace();
1802
1803 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1804 // vector element.
1805 assert(!DstTy.isVector());
1806
1807 const AMDGPUTargetMachine &TM
1808 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1809
1810 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1811 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1812 return true;
1813 }
1814
1815 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1816 // Truncate.
1817 B.buildExtract(Dst, Src, 0);
1818 MI.eraseFromParent();
1819 return true;
1820 }
1821
1822 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1823 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1824 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1825
1826 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1827 // another. Merge operands are required to be the same type, but creating an
1828 // extra ptrtoint would be kind of pointless.
1829 auto HighAddr = B.buildConstant(
1830 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1831 B.buildMerge(Dst, {Src, HighAddr});
1832 MI.eraseFromParent();
1833 return true;
1834 }
1835
1836 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1837 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1838        DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1839 unsigned NullVal = TM.getNullPointerValue(DestAS);
1840
1841 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1842 auto FlatNull = B.buildConstant(SrcTy, 0);
1843
1844 // Extract low 32-bits of the pointer.
1845 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1846
1847 auto CmpRes =
1848 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1849 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1850
1851 MI.eraseFromParent();
1852 return true;
1853 }
1854
1855 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1856 return false;
1857
1858 if (!ST.hasFlatAddressSpace())
1859 return false;
1860
1861 auto SegmentNull =
1862 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1863 auto FlatNull =
1864 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1865
1866 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1867 if (!ApertureReg.isValid())
1868 return false;
1869
1870 auto CmpRes =
1871 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1872
1873 // Coerce the type of the low half of the result so we can use merge_values.
1874 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1875
1876 // TODO: Should we allow mismatched types but matching sizes in merges to
1877 // avoid the ptrtoint?
1878 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1879 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1880
1881 MI.eraseFromParent();
1882 return true;
1883}
1884
1885bool AMDGPULegalizerInfo::legalizeFrint(
1886 MachineInstr &MI, MachineRegisterInfo &MRI,
1887 MachineIRBuilder &B) const {
1888 Register Src = MI.getOperand(1).getReg();
1889 LLT Ty = MRI.getType(Src);
1890 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1891
1892 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1893 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
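C1Val is 2^52: adding and then subtracting copysign(2^52, src) in double precision rounds src to the nearest integer, because doubles of that magnitude carry no fraction bits. C2Val is 2^52 - 0.5; the final select returns src unchanged when |src| exceeds it, since such values are already integral.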
1894
1895 auto C1 = B.buildFConstant(Ty, C1Val);
1896 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1897
1898 // TODO: Should this propagate fast-math-flags?
1899 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1900 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1901
1902 auto C2 = B.buildFConstant(Ty, C2Val);
1903 auto Fabs = B.buildFAbs(Ty, Src);
1904
1905 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1906 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1907 MI.eraseFromParent();
1908 return true;
1909}
1910
1911bool AMDGPULegalizerInfo::legalizeFceil(
1912 MachineInstr &MI, MachineRegisterInfo &MRI,
1913 MachineIRBuilder &B) const {
1914
1915 const LLT S1 = LLT::scalar(1);
1916 const LLT S64 = LLT::scalar(64);
1917
1918 Register Src = MI.getOperand(1).getReg();
1919 assert(MRI.getType(Src) == S64);
1920
1921 // result = trunc(src)
1922 // if (src > 0.0 && src != result)
1923 // result += 1.0
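For example, with hypothetical inputs: ceil(1.5) takes trunc = 1.0, and since 1.5 > 0.0 and 1.5 != 1.0 the select adds 1.0, giving 2.0; ceil(-1.5) takes trunc = -1.0 and the condition fails, so the result stays -1.0.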
1924
1925 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1926
1927 const auto Zero = B.buildFConstant(S64, 0.0);
1928 const auto One = B.buildFConstant(S64, 1.0);
1929 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1930 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1931 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1932 auto Add = B.buildSelect(S64, And, One, Zero);
1933
1934 // TODO: Should this propagate fast-math-flags?
1935 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1936 return true;
1937}
1938
1939bool AMDGPULegalizerInfo::legalizeFrem(
1940 MachineInstr &MI, MachineRegisterInfo &MRI,
1941 MachineIRBuilder &B) const {
1942 Register DstReg = MI.getOperand(0).getReg();
1943 Register Src0Reg = MI.getOperand(1).getReg();
1944 Register Src1Reg = MI.getOperand(2).getReg();
1945 auto Flags = MI.getFlags();
1946 LLT Ty = MRI.getType(DstReg);
1947
1948 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1949 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1950 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1951 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1952 MI.eraseFromParent();
1953 return true;
1954}
1955
1956static MachineInstrBuilder extractF64Exponent(Register Hi,
1957 MachineIRBuilder &B) {
1958 const unsigned FractBits = 52;
1959 const unsigned ExpBits = 11;
1960 LLT S32 = LLT::scalar(32);
1961
1962 auto Const0 = B.buildConstant(S32, FractBits - 32);
1963 auto Const1 = B.buildConstant(S32, ExpBits);
1964
1965 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1966 .addUse(Hi)
1967 .addUse(Const0.getReg(0))
1968 .addUse(Const1.getReg(0));
1969
1970 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1971}
1972
1973bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1974 MachineInstr &MI, MachineRegisterInfo &MRI,
1975 MachineIRBuilder &B) const {
1976 const LLT S1 = LLT::scalar(1);
1977 const LLT S32 = LLT::scalar(32);
1978 const LLT S64 = LLT::scalar(64);
1979
1980 Register Src = MI.getOperand(1).getReg();
1981 assert(MRI.getType(Src) == S64);
1982
1983 // TODO: Should this use extract since the low half is unused?
1984 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1985 Register Hi = Unmerge.getReg(1);
1986
1987 // Extract the upper half, since this is where we will find the sign and
1988 // exponent.
1989 auto Exp = extractF64Exponent(Hi, B);
1990
1991 const unsigned FractBits = 52;
1992
1993 // Extract the sign bit.
1994 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1995 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1996
1997 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1998
1999 const auto Zero32 = B.buildConstant(S32, 0);
2000
2001 // Extend back to 64-bits.
2002 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2003
2004 auto Shr = B.buildAShr(S64, FractMask, Exp);
2005 auto Not = B.buildNot(S64, Shr);
2006 auto Tmp0 = B.buildAnd(S64, Src, Not);
2007 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2008
2009 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2010 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2011
2012 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2013 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2014 MI.eraseFromParent();
2015 return true;
2016}
2017
2018bool AMDGPULegalizerInfo::legalizeITOFP(
2019 MachineInstr &MI, MachineRegisterInfo &MRI,
2020 MachineIRBuilder &B, bool Signed) const {
2021
2022 Register Dst = MI.getOperand(0).getReg();
2023 Register Src = MI.getOperand(1).getReg();
2024
2025 const LLT S64 = LLT::scalar(64);
2026 const LLT S32 = LLT::scalar(32);
2027
2028 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2029
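The 64-bit integer is split into 32-bit halves: the high half is converted with the requested signedness and scaled by 2^32 via amdgcn_ldexp, then the always-unsigned low half is converted and added in.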
2030 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2031
2032 auto CvtHi = Signed ?
2033 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2034 B.buildUITOFP(S64, Unmerge.getReg(1));
2035
2036 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2037
2038 auto ThirtyTwo = B.buildConstant(S32, 32);
2039 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2040 .addUse(CvtHi.getReg(0))
2041 .addUse(ThirtyTwo.getReg(0));
2042
2043 // TODO: Should this propagate fast-math-flags?
2044 B.buildFAdd(Dst, LdExp, CvtLo);
2045 MI.eraseFromParent();
2046 return true;
2047}
2048
2049// TODO: Copied from DAG implementation. Verify logic and document how this
2050// actually works.
2051bool AMDGPULegalizerInfo::legalizeFPTOI(
2052 MachineInstr &MI, MachineRegisterInfo &MRI,
2053 MachineIRBuilder &B, bool Signed) const {
2054
2055 Register Dst = MI.getOperand(0).getReg();
2056 Register Src = MI.getOperand(1).getReg();
2057
2058 const LLT S64 = LLT::scalar(64);
2059 const LLT S32 = LLT::scalar(32);
2060
2061 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2062
2063 unsigned Flags = MI.getFlags();
2064
2065 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2066 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2067 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
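The bit patterns above are the doubles 2^-32 (K0) and -(2^32) (K1): FloorMul below is therefore floor(trunc(x) * 2^-32), i.e. the high 32-bit half of the value, and the fma computes trunc(x) - FloorMul * 2^32, the low half; each half is converted to a 32-bit integer and the two are merged into the 64-bit result.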
2068
2069 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2070 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2071 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2072
2073 auto Hi = Signed ?
2074 B.buildFPTOSI(S32, FloorMul) :
2075 B.buildFPTOUI(S32, FloorMul);
2076 auto Lo = B.buildFPTOUI(S32, Fma);
2077
2078 B.buildMerge(Dst, { Lo, Hi });
2079 MI.eraseFromParent();
2080
2081 return true;
2082}
2083
2084bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2085 MachineInstr &MI) const {
2086 MachineFunction &MF = Helper.MIRBuilder.getMF();
2087 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2088
2089 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2090 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2091
2092 // With ieee_mode disabled, the instructions have the correct behavior
2093 // already for G_FMINNUM/G_FMAXNUM
2094 if (!MFI->getMode().IEEE)
2095 return !IsIEEEOp;
2096
2097 if (IsIEEEOp)
2098 return true;
2099
2100 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2101}
2102
2103bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2104 MachineInstr &MI, MachineRegisterInfo &MRI,
2105 MachineIRBuilder &B) const {
2106 // TODO: Should move some of this into LegalizerHelper.
2107
2108 // TODO: Promote dynamic indexing of s16 to s32
2109
2110 // FIXME: Artifact combiner probably should have replaced the truncated
2111 // constant before this, so we shouldn't need
2112 // getConstantVRegValWithLookThrough.
2113 Optional<ValueAndVReg> MaybeIdxVal =
2114 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2115 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2116 return true;
2117 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2118
2119 Register Dst = MI.getOperand(0).getReg();
2120 Register Vec = MI.getOperand(1).getReg();
2121
2122 LLT VecTy = MRI.getType(Vec);
2123 LLT EltTy = VecTy.getElementType();
2124 assert(EltTy == MRI.getType(Dst));
2125
2126 if (IdxVal < VecTy.getNumElements())
2127 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2128 else
2129 B.buildUndef(Dst);
2130
2131 MI.eraseFromParent();
2132 return true;
2133}
2134
2135bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2136 MachineInstr &MI, MachineRegisterInfo &MRI,
2137 MachineIRBuilder &B) const {
2138 // TODO: Should move some of this into LegalizerHelper.
2139
2140 // TODO: Promote dynamic indexing of s16 to s32
2141
2142 // FIXME: Artifact combiner probably should have replaced the truncated
2143 // constant before this, so we shouldn't need
2144 // getConstantVRegValWithLookThrough.
2145 Optional<ValueAndVReg> MaybeIdxVal =
2146 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2147 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2148 return true;
2149
2150 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2151 Register Dst = MI.getOperand(0).getReg();
2152 Register Vec = MI.getOperand(1).getReg();
2153 Register Ins = MI.getOperand(2).getReg();
2154
2155 LLT VecTy = MRI.getType(Vec);
2156 LLT EltTy = VecTy.getElementType();
2157 assert(EltTy == MRI.getType(Ins));
2158
2159 if (IdxVal < VecTy.getNumElements())
2160 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2161 else
2162 B.buildUndef(Dst);
2163
2164 MI.eraseFromParent();
2165 return true;
2166}
2167
2168bool AMDGPULegalizerInfo::legalizeShuffleVector(
2169 MachineInstr &MI, MachineRegisterInfo &MRI,
2170 MachineIRBuilder &B) const {
2171 const LLT V2S16 = LLT::vector(2, 16);
2172
2173 Register Dst = MI.getOperand(0).getReg();
2174 Register Src0 = MI.getOperand(1).getReg();
2175 LLT DstTy = MRI.getType(Dst);
2176 LLT SrcTy = MRI.getType(Src0);
2177
2178 if (SrcTy == V2S16 && DstTy == V2S16 &&
2179 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2180 return true;
2181
2182 MachineIRBuilder HelperBuilder(MI);
2183 GISelObserverWrapper DummyObserver;
2184 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2185 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2186}
2187
2188bool AMDGPULegalizerInfo::legalizeSinCos(
2189 MachineInstr &MI, MachineRegisterInfo &MRI,
2190 MachineIRBuilder &B) const {
2191
2192 Register DstReg = MI.getOperand(0).getReg();
2193 Register SrcReg = MI.getOperand(1).getReg();
2194 LLT Ty = MRI.getType(DstReg);
2195 unsigned Flags = MI.getFlags();
2196
2197 Register TrigVal;
2198 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2199 if (ST.hasTrigReducedRange()) {
2200 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2201 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2202 .addUse(MulVal.getReg(0))
2203 .setMIFlags(Flags).getReg(0);
2204 } else
2205 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2206
2207 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2208 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2209 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2210 .addUse(TrigVal)
2211 .setMIFlags(Flags);
2212 MI.eraseFromParent();
2213 return true;
2214}
2215
2216bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2217 MachineIRBuilder &B,
2218 const GlobalValue *GV,
2219 int64_t Offset,
2220 unsigned GAFlags) const {
2221 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2222 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2223 // to the following code sequence:
2224 //
2225 // For constant address space:
2226 // s_getpc_b64 s[0:1]
2227 // s_add_u32 s0, s0, $symbol
2228 // s_addc_u32 s1, s1, 0
2229 //
2230 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2231 // a fixup or relocation is emitted to replace $symbol with a literal
2232 // constant, which is a pc-relative offset from the encoding of the $symbol
2233 // operand to the global variable.
2234 //
2235 // For global address space:
2236 // s_getpc_b64 s[0:1]
2237 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2238 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2239 //
2240 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2241 // fixups or relocations are emitted to replace $symbol@*@lo and
2242 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2243 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2244 // operand to the global variable.
2245 //
2246 // What we want here is an offset from the value returned by s_getpc
2247 // (which is the address of the s_add_u32 instruction) to the global
2248 // variable, but since the encoding of $symbol starts 4 bytes after the start
2249 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2250 // small. This requires us to add 4 to the global variable offset in order to
2251 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2252 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2253 // instruction.
2254
2255 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2256
2257 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2258 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2259
2260 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2261 .addDef(PCReg);
2262
2263 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2264 if (GAFlags == SIInstrInfo::MO_NONE)
2265 MIB.addImm(0);
2266 else
2267 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2268
2269 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2270
2271 if (PtrTy.getSizeInBits() == 32)
2272 B.buildExtract(DstReg, PCReg, 0);
2273 return true;
2274 }
2275
2276bool AMDGPULegalizerInfo::legalizeGlobalValue(
2277 MachineInstr &MI, MachineRegisterInfo &MRI,
2278 MachineIRBuilder &B) const {
2279 Register DstReg = MI.getOperand(0).getReg();
2280 LLT Ty = MRI.getType(DstReg);
2281 unsigned AS = Ty.getAddressSpace();
2282
2283 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2284 MachineFunction &MF = B.getMF();
2285 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2286
2287 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2288 if (!MFI->isModuleEntryFunction()) {
2289 const Function &Fn = MF.getFunction();
2290 DiagnosticInfoUnsupported BadLDSDecl(
2291 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2292 DS_Warning);
2293 Fn.getContext().diagnose(BadLDSDecl);
2294
2295 // We currently don't have a way to correctly allocate LDS objects that
2296 // aren't directly associated with a kernel. We do force inlining of
2297 // functions that use local objects. However, if these dead functions are
2298 // not eliminated, we don't want a compile time error. Just emit a warning
2299 // and a trap, since there should be no callable path here.
2300 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2301 B.buildUndef(DstReg);
2302 MI.eraseFromParent();
2303 return true;
2304 }
2305
2306 // TODO: We could emit code to handle the initialization somewhere.
2307 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2308 const SITargetLowering *TLI = ST.getTargetLowering();
2309 if (!TLI->shouldUseLDSConstAddress(GV)) {
2310 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2311 return true; // Leave in place;
2312 }
2313
2314 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2315 Type *Ty = GV->getValueType();
2316 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2317 // zero-sized type in other languages to declare dynamic shared memory
2318 // whose size is not known at compile time. These arrays are allocated
2319 // by the runtime and placed directly after the statically allocated
2320 // ones. They all share the same offset.
2321 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2322 // Adjust alignment for that dynamic shared memory array.
2323 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2324 LLT S32 = LLT::scalar(32);
2325 auto Sz =
2326 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2327 B.buildIntToPtr(DstReg, Sz);
2328 MI.eraseFromParent();
2329 return true;
2330 }
2331 }
2332
2333 B.buildConstant(
2334 DstReg,
2335 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2336 MI.eraseFromParent();
2337 return true;
2338 }
2339
2340 const Function &Fn = MF.getFunction();
2341 DiagnosticInfoUnsupported BadInit(
2342 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2343 Fn.getContext().diagnose(BadInit);
2344 return true;
2345 }
2346
2347 const SITargetLowering *TLI = ST.getTargetLowering();
2348
2349 if (TLI->shouldEmitFixup(GV)) {
2350 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2351 MI.eraseFromParent();
2352 return true;
2353 }
2354
2355 if (TLI->shouldEmitPCReloc(GV)) {
2356 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2357 MI.eraseFromParent();
2358 return true;
2359 }
2360
2361 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2362 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2363
2364 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2365 MachinePointerInfo::getGOT(MF),
2366 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2367 MachineMemOperand::MOInvariant,
2368 8 /*Size*/, Align(8));
2369
2370 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2371
2372 if (Ty.getSizeInBits() == 32) {
2373 // Truncate if this is a 32-bit constant address.
2374 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2375 B.buildExtract(DstReg, Load, 0);
2376 } else
2377 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2378
2379 MI.eraseFromParent();
2380 return true;
2381}
2382
2383static LLT widenToNextPowerOf2(LLT Ty) {
2384 if (Ty.isVector())
2385 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2386 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2387}
2388
2389bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2390 MachineInstr &MI) const {
2391 MachineIRBuilder &B = Helper.MIRBuilder;
2392 MachineRegisterInfo &MRI = *B.getMRI();
2393 GISelChangeObserver &Observer = Helper.Observer;
2394
2395 Register PtrReg = MI.getOperand(1).getReg();
2396 LLT PtrTy = MRI.getType(PtrReg);
2397 unsigned AddrSpace = PtrTy.getAddressSpace();
2398
2399 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2400 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2401 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2402 Observer.changingInstr(MI);
2403 MI.getOperand(1).setReg(Cast.getReg(0));
2404 Observer.changedInstr(MI);
2405 return true;
2406 }
2407
2408 Register ValReg = MI.getOperand(0).getReg();
2409 LLT ValTy = MRI.getType(ValReg);
2410
2411 MachineMemOperand *MMO = *MI.memoperands_begin();
2412 const unsigned ValSize = ValTy.getSizeInBits();
2413 const unsigned MemSize = 8 * MMO->getSize();
2414 const Align MemAlign = MMO->getAlign();
2415 const unsigned AlignInBits = 8 * MemAlign.value();
2416
2417 // Widen non-power-of-2 loads to the alignment if needed
2418 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2419 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2420
2421 // This was already the correct extending load result type, so just adjust
2422 // the memory type.
2423 if (WideMemSize == ValSize) {
2424 MachineFunction &MF = B.getMF();
2425
2426 MachineMemOperand *WideMMO =
2427 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2428 Observer.changingInstr(MI);
2429 MI.setMemRefs(MF, {WideMMO});
2430 Observer.changedInstr(MI);
2431 return true;
2432 }
2433
2434 // Don't bother handling edge case that should probably never be produced.
2435 if (ValSize > WideMemSize)
2436 return false;
2437
2438 LLT WideTy = widenToNextPowerOf2(ValTy);
2439
2440 Register WideLoad;
2441 if (!WideTy.isVector()) {
2442 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2443 B.buildTrunc(ValReg, WideLoad).getReg(0);
2444 } else {
2445 // Extract the subvector.
2446
2447 if (isRegisterType(ValTy)) {
2448 // If this a case where G_EXTRACT is legal, use it.
2449 // (e.g. <3 x s32> -> <4 x s32>)
2450 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2451 B.buildExtract(ValReg, WideLoad, 0);
2452 } else {
2453 // For cases where the widened type isn't a nice register value, unmerge
2454 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2455 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2456 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2457 B.setInsertPt(B.getMBB(), MI.getIterator());
2458 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2459 }
2460 }
2461
2462 MI.eraseFromParent();
2463 return true;
2464 }
2465
2466 return false;
2467}
2468
2469bool AMDGPULegalizerInfo::legalizeFMad(
2470 MachineInstr &MI, MachineRegisterInfo &MRI,
2471 MachineIRBuilder &B) const {
2472 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2473 assert(Ty.isScalar());
2474
2475 MachineFunction &MF = B.getMF();
2476 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2477
2478 // TODO: Always legal with future ftz flag.
2479 // FIXME: Do we need just output?
2480 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2481 return true;
2482 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2483 return true;
2484
2485 MachineIRBuilder HelperBuilder(MI);
2486 GISelObserverWrapper DummyObserver;
2487 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2488 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2489}
2490
2491bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2492 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2493 Register DstReg = MI.getOperand(0).getReg();
2494 Register PtrReg = MI.getOperand(1).getReg();
2495 Register CmpVal = MI.getOperand(2).getReg();
2496 Register NewVal = MI.getOperand(3).getReg();
2497
2498 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2499        "this should not have been custom lowered");
2500
2501 LLT ValTy = MRI.getType(CmpVal);
2502 LLT VecTy = LLT::vector(2, ValTy);
2503
2504 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2505
2506 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2507 .addDef(DstReg)
2508 .addUse(PtrReg)
2509 .addUse(PackedVal)
2510 .setMemRefs(MI.memoperands());
2511
2512 MI.eraseFromParent();
2513 return true;
2514}
2515
2516bool AMDGPULegalizerInfo::legalizeFlog(
2517 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2518 Register Dst = MI.getOperand(0).getReg();
2519 Register Src = MI.getOperand(1).getReg();
2520 LLT Ty = B.getMRI()->getType(Dst);
2521 unsigned Flags = MI.getFlags();
2522
2523 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2524 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2525
2526 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2527 MI.eraseFromParent();
2528 return true;
2529}
2530
2531bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2532 MachineIRBuilder &B) const {
2533 Register Dst = MI.getOperand(0).getReg();
2534 Register Src = MI.getOperand(1).getReg();
2535 unsigned Flags = MI.getFlags();
2536 LLT Ty = B.getMRI()->getType(Dst);
2537
2538 auto K = B.buildFConstant(Ty, numbers::log2e);
2539 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2540 B.buildFExp2(Dst, Mul, Flags);
2541 MI.eraseFromParent();
2542 return true;
2543}
2544
2545bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2546 MachineIRBuilder &B) const {
2547 Register Dst = MI.getOperand(0).getReg();
2548 Register Src0 = MI.getOperand(1).getReg();
2549 Register Src1 = MI.getOperand(2).getReg();
2550 unsigned Flags = MI.getFlags();
2551 LLT Ty = B.getMRI()->getType(Dst);
2552 const LLT S16 = LLT::scalar(16);
2553 const LLT S32 = LLT::scalar(32);
2554
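Both branches below expand pow(x, y) as exp2(log2(x) * y), with the multiply done by amdgcn_fmul_legacy; for f16 the operands are extended to f32 around the multiply since there is no 16-bit legacy multiply.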
2555 if (Ty == S32) {
2556 auto Log = B.buildFLog2(S32, Src0, Flags);
2557 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2558 .addUse(Log.getReg(0))
2559 .addUse(Src1)
2560 .setMIFlags(Flags);
2561 B.buildFExp2(Dst, Mul, Flags);
2562 } else if (Ty == S16) {
2563 // There's no f16 fmul_legacy, so we need to convert for it.
2564 auto Log = B.buildFLog2(S16, Src0, Flags);
2565 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2566 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2567 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2568 .addUse(Ext0.getReg(0))
2569 .addUse(Ext1.getReg(0))
2570 .setMIFlags(Flags);
2571
2572 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2573 } else
2574 return false;
2575
2576 MI.eraseFromParent();
2577 return true;
2578}
2579
2580// Find a source register, ignoring any possible source modifiers.
2581static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2582 Register ModSrc = OrigSrc;
2583 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2584 ModSrc = SrcFNeg->getOperand(1).getReg();
2585 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2586 ModSrc = SrcFAbs->getOperand(1).getReg();
2587 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2588 ModSrc = SrcFAbs->getOperand(1).getReg();
2589 return ModSrc;
2590}
2591
2592bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2593 MachineRegisterInfo &MRI,
2594 MachineIRBuilder &B) const {
2595
2596 const LLT S1 = LLT::scalar(1);
2597 const LLT S64 = LLT::scalar(64);
2598 Register Dst = MI.getOperand(0).getReg();
2599 Register OrigSrc = MI.getOperand(1).getReg();
2600 unsigned Flags = MI.getFlags();
2601 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2602        "this should not have been custom lowered");
2603
2604 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2605 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2606 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2607 // V_FRACT bug is:
2608 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2609 //
2610 // Convert floor(x) to (x - fract(x))
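A quick sanity check on that identity with hypothetical inputs: fract(2.75) = 0.75, so 2.75 - 0.75 = 2.0 = floor(2.75); fract(-2.75) = -2.75 - (-3.0) = 0.25, so -2.75 - 0.25 = -3.0 = floor(-2.75).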
2611
2612 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2613 .addUse(OrigSrc)
2614 .setMIFlags(Flags);
2615
2616 // Give source modifier matching some assistance before obscuring a foldable
2617 // pattern.
2618
2619 // TODO: We can avoid the neg on the fract? The input sign to fract
2620 // shouldn't matter?
2621 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2622
2623 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2624
2625 Register Min = MRI.createGenericVirtualRegister(S64);
2626
2627 // We don't need to concern ourselves with the snan handling difference, so
2628 // use the one which will directly select.
2629 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2630 if (MFI->getMode().IEEE)
2631 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2632 else
2633 B.buildFMinNum(Min, Fract, Const, Flags);
2634
2635 Register CorrectedFract = Min;
2636 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2637 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2638 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2639 }
2640
2641 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2642 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2643
2644 MI.eraseFromParent();
2645 return true;
2646}
2647
2648// Turn an illegal packed v2s16 build vector into bit operations.
2649// TODO: This should probably be a bitcast action in LegalizerHelper.
2650bool AMDGPULegalizerInfo::legalizeBuildVector(
2651 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2652 Register Dst = MI.getOperand(0).getReg();
2653 const LLT S32 = LLT::scalar(32);
2654 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2655
2656 Register Src0 = MI.getOperand(1).getReg();
2657 Register Src1 = MI.getOperand(2).getReg();
2658 assert(MRI.getType(Src0) == LLT::scalar(16));
2659
2660 auto Merge = B.buildMerge(S32, {Src0, Src1});
2661 B.buildBitcast(Dst, Merge);
2662
2663 MI.eraseFromParent();
2664 return true;
2665}
2666
2667// Check that this is a G_XOR x, -1
2668static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2669 if (MI.getOpcode() != TargetOpcode::G_XOR)
2670 return false;
2671 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2672 return ConstVal && *ConstVal == -1;
2673}
2674
2675// Return the use branch instruction, otherwise null if the usage is invalid.
2676static MachineInstr *
2677verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2678 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2679 Register CondDef = MI.getOperand(0).getReg();
2680 if (!MRI.hasOneNonDBGUse(CondDef))
2681 return nullptr;
2682
2683 MachineBasicBlock *Parent = MI.getParent();
2684 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2685
2686 if (isNot(MRI, *UseMI)) {
2687 Register NegatedCond = UseMI->getOperand(0).getReg();
2688 if (!MRI.hasOneNonDBGUse(NegatedCond))
2689 return nullptr;
2690
2691 // We're deleting the def of this value, so we need to remove it.
2692 UseMI->eraseFromParent();
2693
2694 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2695 Negated = true;
2696 }
2697
2698 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2699 return nullptr;
2700
2701 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2702 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2703 if (Next == Parent->end()) {
2704 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2705 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2706 return nullptr;
2707 UncondBrTarget = &*NextMBB;
2708 } else {
2709 if (Next->getOpcode() != AMDGPU::G_BR)
2710 return nullptr;
2711 Br = &*Next;
2712 UncondBrTarget = Br->getOperand(0).getMBB();
2713 }
2714
2715 return UseMI;
2716}
2717
2718bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2719 const ArgDescriptor *Arg,
2720 const TargetRegisterClass *ArgRC,
2721 LLT ArgTy) const {
2722 MCRegister SrcReg = Arg->getRegister();
2723 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  (6) '?' condition is true
2724 assert(DstReg.isVirtual() && "Virtual register expected");
  (7) '?' condition is true
2725
2726 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2727 ArgTy);
2728 if (Arg->isMasked()) {
  (8) Calling 'ArgDescriptor::isMasked'
  (11) Returning from 'ArgDescriptor::isMasked'
  (12) Taking true branch
2729 // TODO: Should we try to emit this once in the entry block?
2730 const LLT S32 = LLT::scalar(32);
2731 const unsigned Mask = Arg->getMask();
2732 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
  (13) Calling 'countTrailingZeros<unsigned int>'
  (20) Returning from 'countTrailingZeros<unsigned int>'
  (21) 'Shift' initialized to 32
2733
2734 Register AndMaskSrc = LiveIn;
2735
2736 if (Shift != 0) {
  (21.1) 'Shift' is not equal to 0
  (22) Taking true branch
2737 auto ShiftAmt = B.buildConstant(S32, Shift);
2738 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2739 }
2740
2741 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  (23) The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
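As the path notes above show, countTrailingZeros<unsigned>(0) returns 32 (the type width), so 'Mask >> Shift' becomes a shift by 32, which is undefined for a 32-bit unsigned int. A masked ArgDescriptor is presumably never built with a zero mask, so the flagged path may be infeasible in practice, but making that invariant explicit would also satisfy the checker. A minimal sketch, assuming a zero mask really cannot occur here (not necessarily the fix taken upstream):

    const unsigned Mask = Arg->getMask();
    assert(Mask != 0 && "masked argument descriptor with an empty mask");
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);
    // Shift is now provably in [0, 31], so 'Mask >> Shift' is well defined.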
2742 } else {
2743 B.buildCopy(DstReg, LiveIn);
2744 }
2745
2746 return true;
2747}
2748
2749bool AMDGPULegalizerInfo::loadInputValue(
2750 Register DstReg, MachineIRBuilder &B,
2751 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2752 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2753 const ArgDescriptor *Arg;
2754 const TargetRegisterClass *ArgRC;
2755 LLT ArgTy;
2756 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2757
2758 if (!Arg->isRegister() || !Arg->getRegister().isValid())
  (4) Taking false branch
2759 return false; // TODO: Handle these
2760 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
  (5) Calling 'AMDGPULegalizerInfo::loadInputValue'
2761}
2762
2763bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2764 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2765 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2766 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
  (3) Calling 'AMDGPULegalizerInfo::loadInputValue'
2767 return false;
2768
2769 MI.eraseFromParent();
2770 return true;
2771}
2772
2773bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2774 MachineRegisterInfo &MRI,
2775 MachineIRBuilder &B) const {
2776 Register Dst = MI.getOperand(0).getReg();
2777 LLT DstTy = MRI.getType(Dst);
2778 LLT S16 = LLT::scalar(16);
2779 LLT S32 = LLT::scalar(32);
2780 LLT S64 = LLT::scalar(64);
2781
2782 if (DstTy == S16)
2783 return legalizeFDIV16(MI, MRI, B);
2784 if (DstTy == S32)
2785 return legalizeFDIV32(MI, MRI, B);
2786 if (DstTy == S64)
2787 return legalizeFDIV64(MI, MRI, B);
2788
2789 return false;
2790}
2791
2792void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2793 Register DstReg,
2794 Register X,
2795 Register Y,
2796 bool IsDiv) const {
2797 const LLT S1 = LLT::scalar(1);
2798 const LLT S32 = LLT::scalar(32);
2799
2800 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2801 // algorithm used here.
2802
2803 // Initial estimate of inv(y).
2804 auto FloatY = B.buildUITOFP(S32, Y);
2805 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2806 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2807 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2808 auto Z = B.buildFPTOUI(S32, ScaledY);
2809
2810 // One round of UNR.
2811 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2812 auto NegYZ = B.buildMul(S32, NegY, Z);
2813 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2814
2815 // Quotient/remainder estimate.
2816 auto Q = B.buildUMulH(S32, X, Z);
2817 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2818
2819 // First quotient/remainder refinement.
2820 auto One = B.buildConstant(S32, 1);
2821 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2822 if (IsDiv)
2823 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2824 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2825
2826 // Second quotient/remainder refinement.
2827 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2828 if (IsDiv)
2829 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2830 else
2831 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2832}
2833
2834bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2835 MachineRegisterInfo &MRI,
2836 MachineIRBuilder &B) const {
2837 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2838 Register DstReg = MI.getOperand(0).getReg();
2839 Register Num = MI.getOperand(1).getReg();
2840 Register Den = MI.getOperand(2).getReg();
2841 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2842 MI.eraseFromParent();
2843 return true;
2844}
2845
2846 // Build an integer reciprocal sequence around V_RCP_IFLAG_F32
2847//
2848// Return lo, hi of result
2849//
2850// %cvt.lo = G_UITOFP Val.lo
2851// %cvt.hi = G_UITOFP Val.hi
2852// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2853// %rcp = G_AMDGPU_RCP_IFLAG %mad
2854// %mul1 = G_FMUL %rcp, 0x5f7ffffc
2855// %mul2 = G_FMUL %mul1, 2**(-32)
2856// %trunc = G_INTRINSIC_TRUNC %mul2
2857// %mad2 = G_FMAD %trunc, -(2**32), %mul1
2858// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2859static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2860 Register Val) {
2861 const LLT S32 = LLT::scalar(32);
2862 auto Unmerge = B.buildUnmerge(S32, Val);
2863
2864 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2865 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2866
2867 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2868 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2869
2870 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2871 auto Mul1 =
2872 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2873
2874 // 2**(-32)
2875 auto Mul2 =
2876 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2877 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2878
2879 // -(2**32)
2880 auto Mad2 = B.buildFMAD(S32, Trunc,
2881 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2882
2883 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2884 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2885
2886 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2887}
2888
2889void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2890 Register DstReg,
2891 Register Numer,
2892 Register Denom,
2893 bool IsDiv) const {
2894 const LLT S32 = LLT::scalar(32);
2895 const LLT S64 = LLT::scalar(64);
2896 const LLT S1 = LLT::scalar(1);
2897 Register RcpLo, RcpHi;
2898
2899 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2900
2901 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2902
2903 auto Zero64 = B.buildConstant(S64, 0);
2904 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2905
2906 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2907 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2908
2909 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2910 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2911 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2912
2913 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2914 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2915 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2916 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2917
2918 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2919 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2920 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2921 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2922 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2923
2924 auto Zero32 = B.buildConstant(S32, 0);
2925 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2926 auto Add2_HiC =
2927 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2928 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2929 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2930
2931 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2932 Register NumerLo = UnmergeNumer.getReg(0);
2933 Register NumerHi = UnmergeNumer.getReg(1);
2934
2935 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2936 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2937 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2938 Register Mul3_Lo = UnmergeMul3.getReg(0);
2939 Register Mul3_Hi = UnmergeMul3.getReg(1);
2940 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2941 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2942 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2943 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2944
2945 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2946 Register DenomLo = UnmergeDenom.getReg(0);
2947 Register DenomHi = UnmergeDenom.getReg(1);
2948
2949 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2950 auto C1 = B.buildSExt(S32, CmpHi);
2951
2952 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2953 auto C2 = B.buildSExt(S32, CmpLo);
2954
2955 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2956 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2957
2958 // TODO: Here and below portions of the code can be enclosed into if/endif.
2959 // Currently control flow is unconditional and we have 4 selects after
2960 // potential endif to substitute PHIs.
2961
2962 // if C3 != 0 ...
2963 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2964 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2965 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2966 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2967
2968 auto One64 = B.buildConstant(S64, 1);
2969 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2970
2971 auto C4 =
2972 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2973 auto C5 =
2974 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2975 auto C6 = B.buildSelect(
2976 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2977
2978 // if (C6 != 0)
2979 auto Add4 = B.buildAdd(S64, Add3, One64);
2980 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2981
2982 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2983 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2984 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2985
2986 // endif C6
2987 // endif C3
2988
2989 if (IsDiv) {
2990 auto Sel1 = B.buildSelect(
2991 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2992 B.buildSelect(DstReg,
2993 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2994 } else {
2995 auto Sel2 = B.buildSelect(
2996 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2997 B.buildSelect(DstReg,
2998 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2999 }
3000}
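
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: the correction
// chain the C3/C6 selects above implement, on plain 64-bit scalars. Assumes
// the quotient estimate MulHi3 is at most 2 below the exact quotient, so at
// most two extra subtractions of Denom are needed.
#include <cstdint>

static uint64_t referenceUDivRem64(uint64_t Numer, uint64_t Denom,
                                   uint64_t QuotEstimate, bool IsDiv) {
  uint64_t Q = QuotEstimate;
  uint64_t R = Numer - Q * Denom;       // Sub1
  if (R >= Denom) { ++Q; R -= Denom; }  // "if C3 != 0": Add3 / Sub2
  if (R >= Denom) { ++Q; R -= Denom; }  // "if C6 != 0": Add4 / Sub3
  return IsDiv ? Q : R;
}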
3001
3002bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
3003 MachineRegisterInfo &MRI,
3004 MachineIRBuilder &B) const {
3005 const LLT S64 = LLT::scalar(64);
3006 const LLT S32 = LLT::scalar(32);
3007 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
3008 Register DstReg = MI.getOperand(0).getReg();
3009 Register Num = MI.getOperand(1).getReg();
3010 Register Den = MI.getOperand(2).getReg();
3011 LLT Ty = MRI.getType(DstReg);
3012
3013 if (Ty == S32)
3014 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3015 else if (Ty == S64)
3016 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3017 else
3018 return false;
3019
3020 MI.eraseFromParent();
3021 return true;
3022
3023}
3024
3025bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3026 MachineRegisterInfo &MRI,
3027 MachineIRBuilder &B) const {
3028 const LLT S64 = LLT::scalar(64);
3029 const LLT S32 = LLT::scalar(32);
3030
3031 Register DstReg = MI.getOperand(0).getReg();
3032 const LLT Ty = MRI.getType(DstReg);
3033 if (Ty != S32 && Ty != S64)
3034 return false;
3035
3036 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3037
3038 Register LHS = MI.getOperand(1).getReg();
3039 Register RHS = MI.getOperand(2).getReg();
3040
3041 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3042 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3043 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3044
3045 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3046 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3047
3048 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3049 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3050
3051 Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3052 if (Ty == S32)
3053 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3054 else
3055 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3056
3057 Register Sign;
3058 if (IsDiv)
3059 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3060 else
3061 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3062
3063 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3064 B.buildSub(DstReg, UDivRem, Sign);
3065
3066 MI.eraseFromParent();
3067 return true;
3068}
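
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: the sign-handling
// identity the G_SDIV/G_SREM expansion above relies on. With S = X >> 31
// (arithmetic shift, so 0 or -1), (X + S) ^ S == |X|, and (U ^ S) - S applies
// sign S back onto an unsigned result U.
#include <cstdint>

static int32_t referenceSDivRem32(int32_t X, int32_t Y, bool IsDiv) {
  int32_t XSign = X >> 31;
  int32_t YSign = Y >> 31;
  uint32_t AbsX = ((uint32_t)X + (uint32_t)XSign) ^ (uint32_t)XSign;
  uint32_t AbsY = ((uint32_t)Y + (uint32_t)YSign) ^ (uint32_t)YSign;
  uint32_t U = IsDiv ? AbsX / AbsY : AbsX % AbsY;
  int32_t Sign = IsDiv ? (XSign ^ YSign)  // quotient sign
                       : XSign;           // remainder takes the LHS sign
  return (int32_t)((U ^ (uint32_t)Sign) - (uint32_t)Sign);
}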
3069
3070bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3071 MachineRegisterInfo &MRI,
3072 MachineIRBuilder &B) const {
3073 Register Res = MI.getOperand(0).getReg();
3074 Register LHS = MI.getOperand(1).getReg();
3075 Register RHS = MI.getOperand(2).getReg();
3076 uint16_t Flags = MI.getFlags();
3077 LLT ResTy = MRI.getType(Res);
3078
3079 const MachineFunction &MF = B.getMF();
3080 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3081 MI.getFlag(MachineInstr::FmAfn);
3082
3083 if (!AllowInaccurateRcp)
3084 return false;
3085
3086 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3087 // 1 / x -> RCP(x)
3088 if (CLHS->isExactlyValue(1.0)) {
3089 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3090 .addUse(RHS)
3091 .setMIFlags(Flags);
3092
3093 MI.eraseFromParent();
3094 return true;
3095 }
3096
3097 // -1 / x -> RCP( FNEG(x) )
3098 if (CLHS->isExactlyValue(-1.0)) {
3099 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3100 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3101 .addUse(FNeg.getReg(0))
3102 .setMIFlags(Flags);
3103
3104 MI.eraseFromParent();
3105 return true;
3106 }
3107 }
3108
3109 // x / y -> x * (1.0 / y)
3110 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3111 .addUse(RHS)
3112 .setMIFlags(Flags);
3113 B.buildFMul(Res, LHS, RCP, Flags);
3114
3115 MI.eraseFromParent();
3116 return true;
3117}
3118
3119bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3120 MachineRegisterInfo &MRI,
3121 MachineIRBuilder &B) const {
3122 Register Res = MI.getOperand(0).getReg();
3123 Register X = MI.getOperand(1).getReg();
3124 Register Y = MI.getOperand(2).getReg();
3125 uint16_t Flags = MI.getFlags();
3126 LLT ResTy = MRI.getType(Res);
3127
3128 const MachineFunction &MF = B.getMF();
3129 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3130 MI.getFlag(MachineInstr::FmAfn);
3131
3132 if (!AllowInaccurateRcp)
3133 return false;
3134
3135 auto NegY = B.buildFNeg(ResTy, Y);
3136 auto One = B.buildFConstant(ResTy, 1.0);
3137
3138 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3139 .addUse(Y)
3140 .setMIFlags(Flags);
3141
3142 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3143 R = B.buildFMA(ResTy, Tmp0, R, R);
3144
3145 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3146 R = B.buildFMA(ResTy, Tmp1, R, R);
3147
3148 auto Ret = B.buildFMul(ResTy, X, R);
3149 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3150
3151 B.buildFMA(Res, Tmp2, R, Ret);
3152 MI.eraseFromParent();
3153 return true;
3154}
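
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: the Newton-Raphson
// refinement the FMA chain above performs, written with scalar doubles and
// std::fma. RcpEstimate stands in for the result of the amdgcn.rcp intrinsic.
#include <cmath>

static double referenceFastFDiv64(double X, double Y, double RcpEstimate) {
  double NegY = -Y;
  double R = RcpEstimate;
  R = std::fma(std::fma(NegY, R, 1.0), R, R);   // R += R * (1 - Y*R)
  R = std::fma(std::fma(NegY, R, 1.0), R, R);   // second refinement
  double Ret = X * R;
  double Err = std::fma(NegY, Ret, X);          // X - Y*Ret
  return std::fma(Err, R, Ret);                 // refine the quotient once
}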
3155
3156bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3157 MachineRegisterInfo &MRI,
3158 MachineIRBuilder &B) const {
3159 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3160 return true;
3161
3162 Register Res = MI.getOperand(0).getReg();
3163 Register LHS = MI.getOperand(1).getReg();
3164 Register RHS = MI.getOperand(2).getReg();
3165
3166 uint16_t Flags = MI.getFlags();
3167
3168 LLT S16 = LLT::scalar(16);
3169 LLT S32 = LLT::scalar(32);
3170
3171 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3172 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3173
3174 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3175 .addUse(RHSExt.getReg(0))
3176 .setMIFlags(Flags);
3177
3178 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3179 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3180
3181 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3182 .addUse(RDst.getReg(0))
3183 .addUse(RHS)
3184 .addUse(LHS)
3185 .setMIFlags(Flags);
3186
3187 MI.eraseFromParent();
3188 return true;
3189}
3190
3191// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3192// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3193static void toggleSPDenormMode(bool Enable,
3194 MachineIRBuilder &B,
3195 const GCNSubtarget &ST,
3196 AMDGPU::SIModeRegisterDefaults Mode) {
3197 // Set SP denorm mode to this value.
3198 unsigned SPDenormMode =
3199 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3200
3201 if (ST.hasDenormModeInst()) {
3202 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3203 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3204
3205 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3206 B.buildInstr(AMDGPU::S_DENORM_MODE)
3207 .addImm(NewDenormModeValue);
3208
3209 } else {
3210 // Select FP32 bit field in mode register.
3211 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3212 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3213 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3214
3215 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3216 .addImm(SPDenormMode)
3217 .addImm(SPDenormModeBitField);
3218 }
3219}
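
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: how the immediate
// for S_DENORM_MODE above packs, assuming the field layout implied by the code
// (bits [1:0] hold the FP32 denorm mode, bits [3:2] the FP64/FP16 mode) and
// assuming FP_DENORM_FLUSH_NONE is 3.
#include <cstdint>

static uint32_t denormModeImm(uint32_t SPDenormMode, uint32_t DPDenormDefault) {
  return SPDenormMode | (DPDenormDefault << 2);  // e.g. 3 | (3 << 2) == 0xF
}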
3220
3221bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3222 MachineRegisterInfo &MRI,
3223 MachineIRBuilder &B) const {
3224 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3225 return true;
3226
3227 Register Res = MI.getOperand(0).getReg();
3228 Register LHS = MI.getOperand(1).getReg();
3229 Register RHS = MI.getOperand(2).getReg();
3230 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3231 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3232
3233 uint16_t Flags = MI.getFlags();
3234
3235 LLT S32 = LLT::scalar(32);
3236 LLT S1 = LLT::scalar(1);
3237
3238 auto One = B.buildFConstant(S32, 1.0f);
3239
3240 auto DenominatorScaled =
3241 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3242 .addUse(LHS)
3243 .addUse(RHS)
3244 .addImm(0)
3245 .setMIFlags(Flags);
3246 auto NumeratorScaled =
3247 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3248 .addUse(LHS)
3249 .addUse(RHS)
3250 .addImm(1)
3251 .setMIFlags(Flags);
3252
3253 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3254 .addUse(DenominatorScaled.getReg(0))
3255 .setMIFlags(Flags);
3256 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3257
3258 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3259 // aren't modeled as reading it.
3260 if (!Mode.allFP32Denormals())
3261 toggleSPDenormMode(true, B, ST, Mode);
3262
3263 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3264 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3265 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3266 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3267 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3268 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3269
3270 if (!Mode.allFP32Denormals())
3271 toggleSPDenormMode(false, B, ST, Mode);
3272
3273 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3274 .addUse(Fma4.getReg(0))
3275 .addUse(Fma1.getReg(0))
3276 .addUse(Fma3.getReg(0))
3277 .addUse(NumeratorScaled.getReg(1))
3278 .setMIFlags(Flags);
3279
3280 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3281 .addUse(Fmas.getReg(0))
3282 .addUse(RHS)
3283 .addUse(LHS)
3284 .setMIFlags(Flags);
3285
3286 MI.eraseFromParent();
3287 return true;
3288}
3289
3290bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3291 MachineRegisterInfo &MRI,
3292 MachineIRBuilder &B) const {
3293 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3294 return true;
3295
3296 Register Res = MI.getOperand(0).getReg();
3297 Register LHS = MI.getOperand(1).getReg();
3298 Register RHS = MI.getOperand(2).getReg();
3299
3300 uint16_t Flags = MI.getFlags();
3301
3302 LLT S64 = LLT::scalar(64);
3303 LLT S1 = LLT::scalar(1);
3304
3305 auto One = B.buildFConstant(S64, 1.0);
3306
3307 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3308 .addUse(LHS)
3309 .addUse(RHS)
3310 .addImm(0)
3311 .setMIFlags(Flags);
3312
3313 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3314
3315 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3316 .addUse(DivScale0.getReg(0))
3317 .setMIFlags(Flags);
3318
3319 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3320 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3321 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3322
3323 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3324 .addUse(LHS)
3325 .addUse(RHS)
3326 .addImm(1)
3327 .setMIFlags(Flags);
3328
3329 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3330 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3331 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3332
3333 Register Scale;
3334 if (!ST.hasUsableDivScaleConditionOutput()) {
3335 // Workaround a hardware bug on SI where the condition output from div_scale
3336 // is not usable.
3337
3338 LLT S32 = LLT::scalar(32);
3339
3340 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3341 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3342 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3343 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3344
3345 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3346 Scale1Unmerge.getReg(1));
3347 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3348 Scale0Unmerge.getReg(1));
3349 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3350 } else {
3351 Scale = DivScale1.getReg(1);
3352 }
3353
3354 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3355 .addUse(Fma4.getReg(0))
3356 .addUse(Fma3.getReg(0))
3357 .addUse(Mul.getReg(0))
3358 .addUse(Scale)
3359 .setMIFlags(Flags);
3360
3361 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3362 .addUse(Fmas.getReg(0))
3363 .addUse(RHS)
3364 .addUse(LHS)
3365 .setMIFlags(Flags);
3366
3367 MI.eraseFromParent();
3368 return true;
3369}
3370
3371bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3372 MachineRegisterInfo &MRI,
3373 MachineIRBuilder &B) const {
3374 Register Res = MI.getOperand(0).getReg();
3375 Register LHS = MI.getOperand(2).getReg();
3376 Register RHS = MI.getOperand(3).getReg();
3377 uint16_t Flags = MI.getFlags();
3378
3379 LLT S32 = LLT::scalar(32);
3380 LLT S1 = LLT::scalar(1);
3381
3382 auto Abs = B.buildFAbs(S32, RHS, Flags);
3383 const APFloat C0Val(1.0f);
3384
3385 auto C0 = B.buildConstant(S32, 0x6f800000);
3386 auto C1 = B.buildConstant(S32, 0x2f800000);
3387 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3388
3389 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3390 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3391
3392 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3393
3394 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3395 .addUse(Mul0.getReg(0))
3396 .setMIFlags(Flags);
3397
3398 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3399
3400 B.buildFMul(Res, Sel, Mul1, Flags);
3401
3402 MI.eraseFromParent();
3403 return true;
3404}
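
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: the scaling trick
// used above, on scalar floats. The constants match the bit patterns in the
// code: 0x6f800000 = 2**96 and 0x2f800000 = 2**(-32). If |RHS| is very large,
// it is pre-scaled so the reciprocal does not flush to zero, and the same
// scale is re-applied to the final product.
#include <cmath>

static float referenceFDivFast(float LHS, float RHS) {
  float Sel = (std::fabs(RHS) > 0x1p96f) ? 0x1p-32f : 1.0f;
  float Rcp = 1.0f / (RHS * Sel);   // stands in for the amdgcn.rcp intrinsic
  return Sel * (LHS * Rcp);
}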
3405
3406// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3407// FIXME: Why do we handle this one but not other removed instructions?
3408//
3409// Reciprocal square root. The clamp prevents infinite results, clamping
3410// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3411// +-max_float.
3412bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3413 MachineRegisterInfo &MRI,
3414 MachineIRBuilder &B) const {
3415 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3416 return true;
3417
3418 Register Dst = MI.getOperand(0).getReg();
3419 Register Src = MI.getOperand(2).getReg();
3420 auto Flags = MI.getFlags();
3421
3422 LLT Ty = MRI.getType(Dst);
3423
3424 const fltSemantics *FltSemantics;
3425 if (Ty == LLT::scalar(32))
3426 FltSemantics = &APFloat::IEEEsingle();
3427 else if (Ty == LLT::scalar(64))
3428 FltSemantics = &APFloat::IEEEdouble();
3429 else
3430 return false;
3431
3432 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3433 .addUse(Src)
3434 .setMIFlags(Flags);
3435
3436 // We don't need to concern ourselves with the snan handling difference, since
3437 // the rsq is quieted (or not) either way, so use the one which will directly select.
3438 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3439 const bool UseIEEE = MFI->getMode().IEEE;
3440
3441 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3442 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3443 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3444
3445 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3446
3447 if (UseIEEE)
3448 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3449 else
3450 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3451 MI.eraseFromParent();
3452 return true;
3453}
3454
3455static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3456 switch (IID) {
3457 case Intrinsic::amdgcn_ds_fadd:
3458 return AMDGPU::G_ATOMICRMW_FADD;
3459 case Intrinsic::amdgcn_ds_fmin:
3460 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3461 case Intrinsic::amdgcn_ds_fmax:
3462 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3463 default:
3464 llvm_unreachable("not a DS FP intrinsic");
3465 }
3466}
3467
3468bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3469 MachineInstr &MI,
3470 Intrinsic::ID IID) const {
3471 GISelChangeObserver &Observer = Helper.Observer;
3472 Observer.changingInstr(MI);
3473
3474 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3475
3476 // The remaining operands were used to set fields in the MemOperand on
3477 // construction.
3478 for (int I = 6; I > 3; --I)
3479 MI.RemoveOperand(I);
3480
3481 MI.RemoveOperand(1); // Remove the intrinsic ID.
3482 Observer.changedInstr(MI);
3483 return true;
3484}
3485
3486bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3487 MachineRegisterInfo &MRI,
3488 MachineIRBuilder &B) const {
3489 uint64_t Offset =
3490 ST.getTargetLowering()->getImplicitParameterOffset(
3491 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3492 LLT DstTy = MRI.getType(DstReg);
3493 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3494
3495 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3496 if (!loadInputValue(KernargPtrReg, B,
3497 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3498 return false;
3499
3500 // FIXME: This should be nuw
3501 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3502 return true;
3503}
3504
3505bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3506 MachineRegisterInfo &MRI,
3507 MachineIRBuilder &B) const {
3508 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3509 if (!MFI->isEntryFunction()) {
3510 return legalizePreloadedArgIntrin(MI, MRI, B,
3511 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3512 }
3513
3514 Register DstReg = MI.getOperand(0).getReg();
3515 if (!getImplicitArgPtr(DstReg, MRI, B))
3516 return false;
3517
3518 MI.eraseFromParent();
3519 return true;
3520}
3521
3522bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3523 MachineRegisterInfo &MRI,
3524 MachineIRBuilder &B,
3525 unsigned AddrSpace) const {
3526 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3527 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3528 Register Hi32 = Unmerge.getReg(1);
3529
3530 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3531 MI.eraseFromParent();
3532 return true;
3533}
3534
3535// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3536// offset (the offset that is included in bounds checking and swizzling, to be
3537// split between the instruction's voffset and immoffset fields) and soffset
3538// (the offset that is excluded from bounds checking and swizzling, to go in
3539// the instruction's soffset field). This function takes the first kind of
3540// offset and figures out how to split it between voffset and immoffset.
3541std::tuple<Register, unsigned, unsigned>
3542AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3543 Register OrigOffset) const {
3544 const unsigned MaxImm = 4095;
3545 Register BaseReg;
3546 unsigned TotalConstOffset;
3547 const LLT S32 = LLT::scalar(32);
3548 MachineRegisterInfo &MRI = *B.getMRI();
3549
3550 std::tie(BaseReg, TotalConstOffset) =
3551 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3552
3553 unsigned ImmOffset = TotalConstOffset;
3554
3555 // If BaseReg is a pointer, convert it to int.
3556 if (MRI.getType(BaseReg).isPointer())
3557 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3558
3559 // If the immediate value is too big for the immoffset field, put the value
3560 // and -4096 into the immoffset field so that the value that is copied/added
3561 // for the voffset field is a multiple of 4096, and it stands more chance
3562 // of being CSEd with the copy/add for another similar load/store.
3563 // However, do not do that rounding down to a multiple of 4096 if that is a
3564 // negative number, as it appears to be illegal to have a negative offset
3565 // in the vgpr, even if adding the immediate offset makes it positive.
3566 unsigned Overflow = ImmOffset & ~MaxImm;
3567 ImmOffset -= Overflow;
3568 if ((int32_t)Overflow < 0) {
3569 Overflow += ImmOffset;
3570 ImmOffset = 0;
3571 }
3572
3573 if (Overflow != 0) {
3574 if (!BaseReg) {
3575 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3576 } else {
3577 auto OverflowVal = B.buildConstant(S32, Overflow);
3578 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3579 }
3580 }
3581
3582 if (!BaseReg)
3583 BaseReg = B.buildConstant(S32, 0).getReg(0);
3584
3585 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3586}
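
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: the pure
// offset-splitting arithmetic above, with the IR-building stripped out. For a
// total constant offset of 8200 this returns {8192, 8}: 8192 is added to the
// voffset register and 8 goes into the 12-bit immoffset field.
#include <cstdint>
#include <utility>

static std::pair<unsigned, unsigned> referenceSplitOffset(unsigned TotalConstOffset) {
  const unsigned MaxImm = 4095;
  unsigned ImmOffset = TotalConstOffset;
  unsigned Overflow = ImmOffset & ~MaxImm;  // multiple of 4096 for the VGPR add
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {              // don't leave a negative VGPR offset
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};             // {voffset add, immoffset}
}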
3587
3588/// Handle register layout difference for f16 images for some subtargets.
3589Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3590 MachineRegisterInfo &MRI,
3591 Register Reg,
3592 bool ImageStore) const {
3593 const LLT S16 = LLT::scalar(16);
3594 const LLT S32 = LLT::scalar(32);
3595 LLT StoreVT = MRI.getType(Reg);
3596 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3597
3598 if (ST.hasUnpackedD16VMem()) {
3599 auto Unmerge = B.buildUnmerge(S16, Reg);
3600
3601 SmallVector<Register, 4> WideRegs;
3602 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3603 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3604
3605 int NumElts = StoreVT.getNumElements();
3606
3607 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3608 }
3609
3610 if (ImageStore && ST.hasImageStoreD16Bug()) {
3611 if (StoreVT.getNumElements() == 2) {
3612 SmallVector<Register, 4> PackedRegs;
3613 Reg = B.buildBitcast(S32, Reg).getReg(0);
3614 PackedRegs.push_back(Reg);
3615 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3616 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3617 }
3618
3619 if (StoreVT.getNumElements() == 3) {
3620 SmallVector<Register, 4> PackedRegs;
3621 auto Unmerge = B.buildUnmerge(S16, Reg);
3622 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3623 PackedRegs.push_back(Unmerge.getReg(I));
3624 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3625 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3626 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3627 }
3628
3629 if (StoreVT.getNumElements() == 4) {
3630 SmallVector<Register, 4> PackedRegs;
3631 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3632 auto Unmerge = B.buildUnmerge(S32, Reg);
3633 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3634 PackedRegs.push_back(Unmerge.getReg(I));
3635 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3636 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3637 }
3638
3639 llvm_unreachable("invalid data type");
3640 }
3641
3642 return Reg;
3643}
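
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: the <3 x s16>
// repack in the hasImageStoreD16Bug() path above, shown on plain integers.
// Three 16-bit values are padded to six (undef modeled as 0 here) and then
// viewed as three 32-bit dwords.
#include <array>
#include <cstdint>

static std::array<uint32_t, 3> referencePackV3S16(uint16_t A, uint16_t B,
                                                  uint16_t C) {
  std::array<uint16_t, 6> Padded = {A, B, C, 0, 0, 0};
  std::array<uint32_t, 3> Dwords;
  for (int I = 0; I != 3; ++I)
    Dwords[I] = (uint32_t)Padded[2 * I] | ((uint32_t)Padded[2 * I + 1] << 16);
  return Dwords;
}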
3644
3645Register AMDGPULegalizerInfo::fixStoreSourceType(
3646 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3647 MachineRegisterInfo *MRI = B.getMRI();
3648 LLT Ty = MRI->getType(VData);
3649
3650 const LLT S16 = LLT::scalar(16);
3651
3652 // Fixup illegal register types for i8 stores.
3653 if (Ty == LLT::scalar(8) || Ty == S16) {
3654 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3655 return AnyExt;
3656 }
3657
3658 if (Ty.isVector()) {
3659 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3660 if (IsFormat)
3661 return handleD16VData(B, *MRI, VData);
3662 }
3663 }
3664
3665 return VData;
3666}
3667
3668bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3669 MachineRegisterInfo &MRI,
3670 MachineIRBuilder &B,
3671 bool IsTyped,
3672 bool IsFormat) const {
3673 Register VData = MI.getOperand(1).getReg();
3674 LLT Ty = MRI.getType(VData);
3675 LLT EltTy = Ty.getScalarType();
3676 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3677 const LLT S32 = LLT::scalar(32);
3678
3679 VData = fixStoreSourceType(B, VData, IsFormat);
3680 Register RSrc = MI.getOperand(2).getReg();
3681
3682 MachineMemOperand *MMO = *MI.memoperands_begin();
3683 const int MemSize = MMO->getSize();
3684
3685 unsigned ImmOffset;
3686 unsigned TotalOffset;
3687
3688 // The typed intrinsics add an immediate after the registers.
3689 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3690
3691 // The struct intrinsic variants add one additional operand over raw.
3692 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3693 Register VIndex;
3694 int OpOffset = 0;
3695 if (HasVIndex) {
3696 VIndex = MI.getOperand(3).getReg();
3697 OpOffset = 1;
3698 }
3699
3700 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3701 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3702
3703 unsigned Format = 0;
3704 if (IsTyped) {
3705 Format = MI.getOperand(5 + OpOffset).getImm();
3706 ++OpOffset;
3707 }
3708
3709 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3710
3711 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3712 if (TotalOffset != 0)
3713 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3714
3715 unsigned Opc;
3716 if (IsTyped) {
3717 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3718 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3719 } else if (IsFormat) {
3720 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3721 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3722 } else {
3723 switch (MemSize) {
3724 case 1:
3725 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3726 break;
3727 case 2:
3728 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3729 break;
3730 default:
3731 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3732 break;
3733 }
3734 }
3735
3736 if (!VIndex)
3737 VIndex = B.buildConstant(S32, 0).getReg(0);
3738
3739 auto MIB = B.buildInstr(Opc)
3740 .addUse(VData) // vdata
3741 .addUse(RSrc) // rsrc
3742 .addUse(VIndex) // vindex
3743 .addUse(VOffset) // voffset
3744 .addUse(SOffset) // soffset
3745 .addImm(ImmOffset); // offset(imm)
3746
3747 if (IsTyped)
3748 MIB.addImm(Format);
3749
3750 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3751 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3752 .addMemOperand(MMO);
3753
3754 MI.eraseFromParent();
3755 return true;
3756}
3757
3758bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3759 MachineRegisterInfo &MRI,
3760 MachineIRBuilder &B,
3761 bool IsFormat,
3762 bool IsTyped) const {
3763 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3764 MachineMemOperand *MMO = *MI.memoperands_begin();
3765 const int MemSize = MMO->getSize();
3766 const LLT S32 = LLT::scalar(32);
3767
3768 Register Dst = MI.getOperand(0).getReg();
3769 Register RSrc = MI.getOperand(2).getReg();
3770
3771 // The typed intrinsics add an immediate after the registers.
3772 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3773
3774 // The struct intrinsic variants add one additional operand over raw.
3775 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3776 Register VIndex;
3777 int OpOffset = 0;
3778 if (HasVIndex) {
3779 VIndex = MI.getOperand(3).getReg();
3780 OpOffset = 1;
3781 }
3782
3783 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3784 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3785
3786 unsigned Format = 0;
3787 if (IsTyped) {
3788 Format = MI.getOperand(5 + OpOffset).getImm();
3789 ++OpOffset;
3790 }
3791
3792 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3793 unsigned ImmOffset;
3794 unsigned TotalOffset;
3795
3796 LLT Ty = MRI.getType(Dst);
3797 LLT EltTy = Ty.getScalarType();
3798 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3799 const bool Unpacked = ST.hasUnpackedD16VMem();
3800
3801 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3802 if (TotalOffset != 0)
3803 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3804
3805 unsigned Opc;
3806
3807 if (IsTyped) {
3808 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3809 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3810 } else if (IsFormat) {
3811 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3812 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3813 } else {
3814 switch (MemSize) {
3815 case 1:
3816 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3817 break;
3818 case 2:
3819 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3820 break;
3821 default:
3822 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3823 break;
3824 }
3825 }
3826
3827 Register LoadDstReg;
3828
3829 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3830 LLT UnpackedTy = Ty.changeElementSize(32);
3831
3832 if (IsExtLoad)
3833 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3834 else if (Unpacked && IsD16 && Ty.isVector())
3835 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3836 else
3837 LoadDstReg = Dst;
3838
3839 if (!VIndex)
3840 VIndex = B.buildConstant(S32, 0).getReg(0);
3841
3842 auto MIB = B.buildInstr(Opc)
3843 .addDef(LoadDstReg) // vdata
3844 .addUse(RSrc) // rsrc
3845 .addUse(VIndex) // vindex
3846 .addUse(VOffset) // voffset
3847 .addUse(SOffset) // soffset
3848 .addImm(ImmOffset); // offset(imm)
3849
3850 if (IsTyped)
3851 MIB.addImm(Format);
3852
3853 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3854 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3855 .addMemOperand(MMO);
3856
3857 if (LoadDstReg != Dst) {
3858 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3859
3860 // The result for extending loads was widened; truncate it back.
3861 if (IsExtLoad)
3862 B.buildTrunc(Dst, LoadDstReg);
3863 else {
3864 // Repack to original 16-bit vector result
3865 // FIXME: G_TRUNC should work, but legalization currently fails
3866 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3867 SmallVector<Register, 4> Repack;
3868 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3869 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3870 B.buildMerge(Dst, Repack);
3871 }
3872 }
3873
3874 MI.eraseFromParent();
3875 return true;
3876}
3877
3878bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3879 MachineIRBuilder &B,
3880 bool IsInc) const {
3881 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3882 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3883 B.buildInstr(Opc)
3884 .addDef(MI.getOperand(0).getReg())
3885 .addUse(MI.getOperand(2).getReg())
3886 .addUse(MI.getOperand(3).getReg())
3887 .cloneMemRefs(MI);
3888 MI.eraseFromParent();
3889 return true;
3890}
3891
3892static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3893 switch (IntrID) {
3894 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3895 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3896 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3897 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3898 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3899 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3900 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3901 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3903 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3904 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3906 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3907 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3908 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3909 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3910 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3911 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3912 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3913 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3914 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3915 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3916 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3918 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3919 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3921 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3922 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3923 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3924 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3925 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3926 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3927 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3928 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3930 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3931 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3933 case Intrinsic::amdgcn_buffer_atomic_fadd:
3934 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3935 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3936 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3937 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3938 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3939 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3940 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3941 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3942 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3943 default:
3944 llvm_unreachable("unhandled atomic opcode");
3945 }
3946}
3947
3948bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3949 MachineIRBuilder &B,
3950 Intrinsic::ID IID) const {
3951 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3952 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3953 const bool HasReturn = MI.getNumExplicitDefs() != 0;
3954
3955 Register Dst;
3956
3957 int OpOffset = 0;
3958 if (HasReturn) {
3959 // A few FP atomics do not support return values.
3960 Dst = MI.getOperand(0).getReg();
3961 } else {
3962 OpOffset = -1;
3963 }
3964
3965 Register VData = MI.getOperand(2 + OpOffset).getReg();
3966 Register CmpVal;
3967
3968 if (IsCmpSwap) {
3969 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3970 ++OpOffset;
3971 }
3972
3973 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3974 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3975
3976 // The struct intrinsic variants add one additional operand over raw.
3977 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3978 Register VIndex;
3979 if (HasVIndex) {
3980 VIndex = MI.getOperand(4 + OpOffset).getReg();
3981 ++OpOffset;
3982 }
3983
3984 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3985 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3986 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3987
3988 MachineMemOperand *MMO = *MI.memoperands_begin();
3989
3990 unsigned ImmOffset;
3991 unsigned TotalOffset;
3992 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3993 if (TotalOffset != 0)
3994 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3995
3996 if (!VIndex)
3997 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3998
3999 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4000
4001 if (HasReturn)
4002 MIB.addDef(Dst);
4003
4004 MIB.addUse(VData); // vdata
4005
4006 if (IsCmpSwap)
4007 MIB.addReg(CmpVal);
4008
4009 MIB.addUse(RSrc) // rsrc
4010 .addUse(VIndex) // vindex
4011 .addUse(VOffset) // voffset
4012 .addUse(SOffset) // soffset
4013 .addImm(ImmOffset) // offset(imm)
4014 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4015 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4016 .addMemOperand(MMO);
4017
4018 MI.eraseFromParent();
4019 return true;
4020}
4021
4022/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
4023/// vector with s16 typed elements.
4024static void packImageA16AddressToDwords(
4025 MachineIRBuilder &B, MachineInstr &MI,
4026 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4027 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4028 const LLT S16 = LLT::scalar(16);
4029 const LLT V2S16 = LLT::vector(2, 16);
4030
4031 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4032 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4033 if (!SrcOp.isReg())
4034 continue; // _L to _LZ may have eliminated this.
4035
4036 Register AddrReg = SrcOp.getReg();
4037
4038 if (I < Intr->GradientStart) {
4039 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4040 PackedAddrs.push_back(AddrReg);
4041 } else {
4042 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4043 // derivatives dx/dh and dx/dv are packed with undef.
4044 if (((I + 1) >= EndIdx) ||
4045 ((Intr->NumGradients / 2) % 2 == 1 &&
4046 (I == static_cast<unsigned>(Intr->GradientStart +
4047 (Intr->NumGradients / 2) - 1) ||
4048 I == static_cast<unsigned>(Intr->GradientStart +
4049 Intr->NumGradients - 1))) ||
4050 // Check for _L to _LZ optimization
4051 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4052 PackedAddrs.push_back(
4053 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4054 .getReg(0));
4055 } else {
4056 PackedAddrs.push_back(
4057 B.buildBuildVector(
4058 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4059 .getReg(0));
4060 ++I;
4061 }
4062 }
4063 }
4064}
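
// Illustrative sketch, not part of AMDGPULegalizerInfo.cpp: the pairing
// pattern the helper above produces, shown on plain 16-bit values and ignoring
// the gradient-boundary special cases. Adjacent components share one dword; an
// odd trailing component is padded (undef above, 0 here).
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> referencePackA16(const std::vector<uint16_t> &Coords) {
  std::vector<uint32_t> Packed;
  for (std::size_t I = 0; I < Coords.size(); I += 2) {
    uint16_t Lo = Coords[I];
    uint16_t Hi = (I + 1 < Coords.size()) ? Coords[I + 1] : 0;  // pad last
    Packed.push_back((uint32_t)Lo | ((uint32_t)Hi << 16));
  }
  return Packed;
}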
4065
4066/// Convert from separate vaddr components to a single vector address register,
4067/// and replace the remaining operands with $noreg.
4068static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4069 int DimIdx, int NumVAddrs) {
4070 const LLT S32 = LLT::scalar(32);
4071
4072 SmallVector<Register, 8> AddrRegs;
4073 for (int I = 0; I != NumVAddrs; ++I) {
4074 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4075 if (SrcOp.isReg()) {
4076 AddrRegs.push_back(SrcOp.getReg());
4077 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4078 }
4079 }
4080
4081 int NumAddrRegs = AddrRegs.size();
4082 if (NumAddrRegs != 1) {
4083 // Round up to 8 elements for v5-v7
4084 // FIXME: Missing intermediate sized register classes and instructions.
4085 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4086 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4087 auto Undef = B.buildUndef(S32);
4088 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4089 NumAddrRegs = RoundedNumRegs;
4090 }
4091
4092 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4093 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4094 }
4095
4096 for (int I = 1; I != NumVAddrs; ++I) {
4097 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4098 if (SrcOp.isReg())
4099 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4100 }
4101}
4102
4103/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4104///
4105/// Depending on the subtarget, load/store with 16-bit element data need to be
4106/// rewritten to use the low half of 32-bit registers, or directly use a packed
4107/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4108/// registers.
4109///
4110/// We don't want to directly select image instructions just yet, but also want
4111/// to expose all register repacking to the legalizer/combiners. We also don't
4112/// want a selected instruction entering RegBankSelect. In order to avoid
4113/// defining a multitude of intermediate image instructions, directly hack on
4114/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4115/// now unnecessary arguments with $noreg.
4116bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4117 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4118 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4119
4120 const unsigned NumDefs = MI.getNumExplicitDefs();
4121 const unsigned ArgOffset = NumDefs + 1;
4122 bool IsTFE = NumDefs == 2;
4123 // We are only processing the operands of d16 image operations on subtargets
4124 // that use the unpacked register layout, or need to repack the TFE result.
4125
4126 // TODO: Do we need to guard against already legalized intrinsics?
4127 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4128 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4129
4130 MachineRegisterInfo *MRI = B.getMRI();
4131 const LLT S32 = LLT::scalar(32);
4132 const LLT S16 = LLT::scalar(16);
4133 const LLT V2S16 = LLT::vector(2, 16);
4134
4135 unsigned DMask = 0;
4136
4137 // Check for 16 bit addresses and pack if true.
4138 LLT GradTy =
4139 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4140 LLT AddrTy =
4141 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4142 const bool IsG16 = GradTy == S16;
4143 const bool IsA16 = AddrTy == S16;
4144
4145 int DMaskLanes = 0;
4146 if (!BaseOpcode->Atomic) {
4147 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4148 if (BaseOpcode->Gather4) {
4149 DMaskLanes = 4;
4150 } else if (DMask != 0) {
4151 DMaskLanes = countPopulation(DMask);
4152 } else if (!IsTFE && !BaseOpcode->Store) {
4153 // If dmask is 0, this is a no-op load. This can be eliminated.
4154 B.buildUndef(MI.getOperand(0));
4155 MI.eraseFromParent();
4156 return true;
4157 }
4158 }
4159
4160 Observer.changingInstr(MI);
4161 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4162
4163 unsigned NewOpcode = NumDefs == 0 ?
4164 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4165
4166 // Track that we legalized this
4167 MI.setDesc(B.getTII().get(NewOpcode));
4168
4169 // Expecting to get an error flag since TFC is on - and dmask is 0. Force
4170 // dmask to be at least 1, otherwise the instruction will fail.
4171 if (IsTFE && DMask == 0) {
4172 DMask = 0x1;
4173 DMaskLanes = 1;
4174 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4175 }
4176
4177 if (BaseOpcode->Atomic) {
4178 Register VData0 = MI.getOperand(2).getReg();
4179 LLT Ty = MRI->getType(VData0);
4180
4181 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4182 if (Ty.isVector())
4183 return false;
4184
4185 if (BaseOpcode->AtomicX2) {
4186 Register VData1 = MI.getOperand(3).getReg();
4187 // The two values are packed in one register.
4188 LLT PackedTy = LLT::vector(2, Ty);
4189 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4190 MI.getOperand(2).setReg(Concat.getReg(0));
4191 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4192 }
4193 }
4194
4195 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4196
4197 // Optimize _L to _LZ when _L is zero
4198 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4199 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4200 const ConstantFP *ConstantLod;
4201
4202 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4203 m_GFCst(ConstantLod))) {
4204 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4205 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4206 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4207 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4208 Intr->Dim);
4209
4210 // The starting indexes should remain in the same place.
4211 --CorrectedNumVAddrs;
4212
4213 MI.getOperand(MI.getNumExplicitDefs())
4214 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4215 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4216 Intr = NewImageDimIntr;
4217 }
4218 }
4219 }
4220
4221 // Optimize _mip away, when 'lod' is zero
4222 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4223 int64_t ConstantLod;
4224 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4225 m_ICst(ConstantLod))) {
4226 if (ConstantLod == 0) {
4227 // TODO: Change intrinsic opcode and remove operand instead of replacing
4228 // it with 0, as the _L to _LZ handling is done above.
4229 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4230 --CorrectedNumVAddrs;
4231 }
4232 }
4233 }
4234
4235 // Rewrite the addressing register layout before doing anything else.
4236 if (IsA16 || IsG16) {
4237 if (IsA16) {
4238 // Target must support the feature and gradients need to be 16 bit too
4239 if (!ST.hasA16() || !IsG16)
4240 return false;
4241 } else if (!ST.hasG16())
4242 return false;
4243
4244 if (Intr->NumVAddrs > 1) {
4245 SmallVector<Register, 4> PackedRegs;
4246 // Don't compress addresses for G16
4247 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4248 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4249 PackEndIdx);
4250
4251 if (!IsA16) {
4252 // Add uncompressed address
4253 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4254 int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4255 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4256 PackedRegs.push_back(AddrReg);
4257 }
4258 }
4259
4260 // See also below in the non-a16 branch
4261 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4262
4263 if (!UseNSA && PackedRegs.size() > 1) {
4264 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4265 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4266 PackedRegs[0] = Concat.getReg(0);
4267 PackedRegs.resize(1);
4268 }
4269
4270 const unsigned NumPacked = PackedRegs.size();
4271 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4272 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4273 if (!SrcOp.isReg()) {
4274 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4275 continue;
4276 }
4277
4278 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4279
4280 if (I - Intr->VAddrStart < NumPacked)
4281 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4282 else
4283 SrcOp.setReg(AMDGPU::NoRegister);
4284 }
4285 }
4286 } else {
4287 // If the register allocator cannot place the address registers contiguously
4288 // without introducing moves, then using the non-sequential address encoding
4289 // is always preferable, since it saves VALU instructions and is usually a
4290 // wash in terms of code size or even better.
4291 //
4292 // However, we currently have no way of hinting to the register allocator
4293 // that MIMG addresses should be placed contiguously when it is possible to
4294 // do so, so force non-NSA for the common 2-address case as a heuristic.
4295 //
4296 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4297 // allocation when possible.
4298 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4299
4300 if (!UseNSA && Intr->NumVAddrs > 1)
4301 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4302 Intr->NumVAddrs);
4303 }
4304
4305 int Flags = 0;
4306 if (IsA16)
4307 Flags |= 1;
4308 if (IsG16)
4309 Flags |= 2;
4310 MI.addOperand(MachineOperand::CreateImm(Flags));
4311
4312 if (BaseOpcode->Store) { // No TFE for stores?
4313 // TODO: Handle dmask trim
4314 Register VData = MI.getOperand(1).getReg();
4315 LLT Ty = MRI->getType(VData);
4316 if (!Ty.isVector() || Ty.getElementType() != S16)
4317 return true;
4318
4319 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4320 if (RepackedReg != VData) {
4321 MI.getOperand(1).setReg(RepackedReg);
4322 }
4323
4324 return true;
4325 }
4326
4327 Register DstReg = MI.getOperand(0).getReg();
4328 LLT Ty = MRI->getType(DstReg);
4329 const LLT EltTy = Ty.getScalarType();
4330 const bool IsD16 = Ty.getScalarType() == S16;
4331 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4332
4333 // Confirm that the return type is large enough for the dmask specified
4334 if (NumElts < DMaskLanes)
4335 return false;
4336
4337 if (NumElts > 4 || DMaskLanes > 4)
4338 return false;
4339
4340 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4341 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4342
4343 // The raw dword aligned data component of the load. The only legal cases
4344 // where this matters should be when using the packed D16 format, for
4345 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4346 LLT RoundedTy;
4347
4348 // S32 vector to cover all data, plus TFE result element.
4349 LLT TFETy;
4350
4351 // Register type to use for each loaded component. Will be S32 or V2S16.
4352 LLT RegTy;
4353
4354 if (IsD16 && ST.hasUnpackedD16VMem()) {
4355 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4356 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4357 RegTy = S32;
4358 } else {
4359 unsigned EltSize = EltTy.getSizeInBits();
4360 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4361 unsigned RoundedSize = 32 * RoundedElts;
4362 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4363 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4364 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4365 }
4366
4367 // The return type does not need adjustment.
4368 // TODO: Should we change s16 case to s32 or <2 x s16>?
4369 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4370 return true;
4371
4372 Register Dst1Reg;
4373
4374 // Insert after the instruction.
4375 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4376
4377 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4378 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4379 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4380 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4381
4382 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4383
4384 MI.getOperand(0).setReg(NewResultReg);
4385
4386 // In the IR, TFE is supposed to be used with a 2 element struct return
4387 // type. The instruction really returns these two values in one contiguous
4388 // register, with one additional dword beyond the loaded data. Rewrite the
4389 // return type to use a single register result.
4390
4391 if (IsTFE) {
4392 Dst1Reg = MI.getOperand(1).getReg();
4393 if (MRI->getType(Dst1Reg) != S32)
4394 return false;
4395
4396 // TODO: Make sure the TFE operand bit is set.
4397 MI.RemoveOperand(1);
4398
4399 // Handle the easy case that requires no repack instructions.
4400 if (Ty == S32) {
4401 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4402 return true;
4403 }
4404 }
4405
4406 // Now figure out how to copy the new result register back into the old
4407 // result.
4408 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4409
4410 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4411
4412 if (ResultNumRegs == 1) {
4413 assert(!IsTFE);
4414 ResultRegs[0] = NewResultReg;
4415 } else {
4416 // We have to repack into a new vector of some kind.
4417 for (int I = 0; I != NumDataRegs; ++I)
4418 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4419 B.buildUnmerge(ResultRegs, NewResultReg);
4420
4421 // Drop the final TFE element to get the data part. The TFE result is
4422 // directly written to the right place already.
4423 if (IsTFE)
4424 ResultRegs.resize(NumDataRegs);
4425 }
4426
4427 // For an s16 scalar result, we form an s32 result with a truncate regardless
4428 // of packed vs. unpacked.
4429 if (IsD16 && !Ty.isVector()) {
4430 B.buildTrunc(DstReg, ResultRegs[0]);
4431 return true;
4432 }
4433
4434 // Avoid a build/concat_vector of 1 entry.
4435 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4436 B.buildBitcast(DstReg, ResultRegs[0]);
4437 return true;
4438 }
4439
4440 assert(Ty.isVector());
4441
4442 if (IsD16) {
4443 // For packed D16 results with TFE enabled, all the data components are
4444 // S32. Cast back to the expected type.
4445 //
4446 // TODO: We don't really need to use load s32 elements. We would only need one
4447 // cast for the TFE result if a multiple of v2s16 was used.
4448 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4449 for (Register &Reg : ResultRegs)
4450 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4451 } else if (ST.hasUnpackedD16VMem()) {
4452 for (Register &Reg : ResultRegs)
4453 Reg = B.buildTrunc(S16, Reg).getReg(0);
4454 }
4455 }
4456
4457 auto padWithUndef = [&](LLT Ty, int NumElts) {
4458 if (NumElts == 0)
4459 return;
4460 Register Undef = B.buildUndef(Ty).getReg(0);
4461 for (int I = 0; I != NumElts; ++I)
4462 ResultRegs.push_back(Undef);
4463 };
4464
4465 // Pad out any elements eliminated due to the dmask.
4466 LLT ResTy = MRI->getType(ResultRegs[0]);
4467 if (!ResTy.isVector()) {
4468 padWithUndef(ResTy, NumElts - ResultRegs.size());
4469 B.buildBuildVector(DstReg, ResultRegs);
4470 return true;
4471 }
4472
4473 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4474 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4475
4476 // Deal with the one annoying legal case.
4477 const LLT V3S16 = LLT::vector(3, 16);
4478 if (Ty == V3S16) {
4479 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4480 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4481 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4482 return true;
4483 }
4484
4485 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4486 B.buildConcatVectors(DstReg, ResultRegs);
4487 return true;
4488}
4489
4490bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4491 LegalizerHelper &Helper, MachineInstr &MI) const {
4492 MachineIRBuilder &B = Helper.MIRBuilder;
4493 GISelChangeObserver &Observer = Helper.Observer;
4494
4495 Register Dst = MI.getOperand(0).getReg();
4496 LLT Ty = B.getMRI()->getType(Dst);
4497 unsigned Size = Ty.getSizeInBits();
4498 MachineFunction &MF = B.getMF();
4499
4500 Observer.changingInstr(MI);
4501
4502 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4503 Ty = getBitcastRegisterType(Ty);
4504 Helper.bitcastDst(MI, Ty, 0);
4505 Dst = MI.getOperand(0).getReg();
4506 B.setInsertPt(B.getMBB(), MI);
4507 }
4508
4509 // FIXME: We don't really need this intermediate instruction. The intrinsic
4510 // should be fixed to have a memory operand. Since it's readnone, we're not
4511 // allowed to add one.
4512 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4513 MI.RemoveOperand(1); // Remove intrinsic ID
4514
4515 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4516 // TODO: Should this use datalayout alignment?
4517 const unsigned MemSize = (Size + 7) / 8;
4518 const Align MemAlign(4);
4519 MachineMemOperand *MMO = MF.getMachineMemOperand(
4520 MachinePointerInfo(),
4521 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4522 MachineMemOperand::MOInvariant,
4523 MemSize, MemAlign);
4524 MI.addMemOperand(MF, MMO);
4525
4526 // There are no 96-bit result scalar loads, but widening to 128-bit should
4527 // always be legal. We may need to restore this to a 96-bit result if it turns
4528 // out this needs to be converted to a vector load during RegBankSelect.
4529 if (!isPowerOf2_32(Size)) {
4530 if (Ty.isVector())
4531 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4532 else
4533 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4534 }
4535
4536 Observer.changedInstr(MI);
4537 return true;
4538}
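As a concrete illustration of the widening described in the comment above: non-power-of-two result sizes are widened before register bank selection. The sketch below is illustrative only, assuming getPow2ScalarType rounds the bit width up in the same way as PowerOf2Ceil from MathExtras.h (quoted later in this report); the pow2Bits helper is hypothetical and not part of the sources shown here.

  #include <cassert>
  #include "llvm/Support/MathExtras.h"

  // Hypothetical helper mirroring the rounding the legalizer relies on.
  static unsigned pow2Bits(unsigned SizeInBits) {
    return llvm::PowerOf2Ceil(SizeInBits); // e.g. 96 -> 128, 32 -> 32
  }

  int main() {
    assert(pow2Bits(96) == 128);  // a 96-bit s_buffer_load result widens to 128 bits
    assert(pow2Bits(128) == 128); // already a power of two; left unchanged
  }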
4539
4540// TODO: Move to selection
4541bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4542 MachineRegisterInfo &MRI,
4543 MachineIRBuilder &B) const {
4544 if (!ST.isTrapHandlerEnabled() ||
4545 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4546 return legalizeTrapEndpgm(MI, MRI, B);
4547
4548 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4549 switch (*HsaAbiVer) {
4550 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4551 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4552 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4553 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4554 return ST.supportsGetDoorbellID() ?
4555 legalizeTrapHsa(MI, MRI, B) :
4556 legalizeTrapHsaQueuePtr(MI, MRI, B);
4557 }
4558 }
4559
4560 llvm_unreachable("Unknown trap handler");
4561}
4562
4563bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4564 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4565 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4566 MI.eraseFromParent();
4567 return true;
4568}
4569
4570bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4571 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4572 // Pass queue pointer to trap handler as input, and insert trap instruction
4573 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4574 Register LiveIn =
4575 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4576 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4577 return false;
4578
4579 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4580 B.buildCopy(SGPR01, LiveIn);
4581 B.buildInstr(AMDGPU::S_TRAP)
4582 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4583 .addReg(SGPR01, RegState::Implicit);
4584
4585 MI.eraseFromParent();
4586 return true;
4587}
4588
4589bool AMDGPULegalizerInfo::legalizeTrapHsa(
4590 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4591 B.buildInstr(AMDGPU::S_TRAP)
4592 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4593 MI.eraseFromParent();
4594 return true;
4595}
4596
4597bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4598 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4599 // If this is a non-HSA path or the trap handler is disabled, report a
4600 // warning accordingly.
4601 if (!ST.isTrapHandlerEnabled() ||
4602 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4603 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4604 "debugtrap handler not supported",
4605 MI.getDebugLoc(), DS_Warning);
4606 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4607 Ctx.diagnose(NoTrap);
4608 } else {
4609 // Insert debug-trap instruction
4610 B.buildInstr(AMDGPU::S_TRAP)
4611 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4612 }
4613
4614 MI.eraseFromParent();
4615 return true;
4616}
4617
4618bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4619 MachineIRBuilder &B) const {
4620 MachineRegisterInfo &MRI = *B.getMRI();
4621 const LLT S16 = LLT::scalar(16);
4622 const LLT S32 = LLT::scalar(32);
4623
4624 Register DstReg = MI.getOperand(0).getReg();
4625 Register NodePtr = MI.getOperand(2).getReg();
4626 Register RayExtent = MI.getOperand(3).getReg();
4627 Register RayOrigin = MI.getOperand(4).getReg();
4628 Register RayDir = MI.getOperand(5).getReg();
4629 Register RayInvDir = MI.getOperand(6).getReg();
4630 Register TDescr = MI.getOperand(7).getReg();
4631
4632 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4633 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4634 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4635 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4636 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4637 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4638
4639 SmallVector<Register, 12> Ops;
4640 if (Is64) {
4641 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4642 Ops.push_back(Unmerge.getReg(0));
4643 Ops.push_back(Unmerge.getReg(1));
4644 } else {
4645 Ops.push_back(NodePtr);
4646 }
4647 Ops.push_back(RayExtent);
4648
4649 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4650 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4651 Ops.push_back(Unmerge.getReg(0));
4652 Ops.push_back(Unmerge.getReg(1));
4653 Ops.push_back(Unmerge.getReg(2));
4654 };
4655
4656 packLanes(RayOrigin);
4657 if (IsA16) {
4658 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4659 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4660 Register R1 = MRI.createGenericVirtualRegister(S32);
4661 Register R2 = MRI.createGenericVirtualRegister(S32);
4662 Register R3 = MRI.createGenericVirtualRegister(S32);
4663 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4664 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4665 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4666 Ops.push_back(R1);
4667 Ops.push_back(R2);
4668 Ops.push_back(R3);
4669 } else {
4670 packLanes(RayDir);
4671 packLanes(RayInvDir);
4672 }
4673
4674 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4675 .addDef(DstReg)
4676 .addImm(Opcode);
4677
4678 for (Register R : Ops) {
4679 MIB.addUse(R);
4680 }
4681
4682 MIB.addUse(TDescr)
4683 .addImm(IsA16 ? 1 : 0)
4684 .cloneMemRefs(MI);
4685
4686 MI.eraseFromParent();
4687 return true;
4688}
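The a16 path above merges pairs of 16-bit ray-direction components into 32-bit dwords before emitting the intersect-ray instruction. A minimal sketch of the equivalent bit packing, assuming the usual G_MERGE_VALUES ordering with the first source in the low half; the helper below is illustrative and not part of the file.

  #include <cassert>
  #include <cstdint>

  // Pack two 16-bit lanes into one 32-bit dword, low lane first.
  static uint32_t pack2x16(uint16_t Lo, uint16_t Hi) {
    return uint32_t(Lo) | (uint32_t(Hi) << 16);
  }

  int main() {
    // e.g. two direction components sharing one dword, as R1 does above.
    assert(pack2x16(0x1111, 0x2222) == 0x22221111u);
  }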
4689
4690bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4691 MachineInstr &MI) const {
4692 MachineIRBuilder &B = Helper.MIRBuilder;
4693 MachineRegisterInfo &MRI = *B.getMRI();
4694
4695 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4696 auto IntrID = MI.getIntrinsicID();
4697 switch (IntrID) {
Step 1: Control jumps to 'case amdgcn_dispatch_id:' at line 4819
4698 case Intrinsic::amdgcn_if:
4699 case Intrinsic::amdgcn_else: {
4700 MachineInstr *Br = nullptr;
4701 MachineBasicBlock *UncondBrTarget = nullptr;
4702 bool Negated = false;
4703 if (MachineInstr *BrCond =
4704 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4705 const SIRegisterInfo *TRI
4706 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4707
4708 Register Def = MI.getOperand(1).getReg();
4709 Register Use = MI.getOperand(3).getReg();
4710
4711 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4712
4713 if (Negated)
4714 std::swap(CondBrTarget, UncondBrTarget);
4715
4716 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4717 if (IntrID == Intrinsic::amdgcn_if) {
4718 B.buildInstr(AMDGPU::SI_IF)
4719 .addDef(Def)
4720 .addUse(Use)
4721 .addMBB(UncondBrTarget);
4722 } else {
4723 B.buildInstr(AMDGPU::SI_ELSE)
4724 .addDef(Def)
4725 .addUse(Use)
4726 .addMBB(UncondBrTarget);
4727 }
4728
4729 if (Br) {
4730 Br->getOperand(0).setMBB(CondBrTarget);
4731 } else {
4732 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4733 // since we're swapping branch targets it needs to be reinserted.
4734 // FIXME: IRTranslator should probably not do this
4735 B.buildBr(*CondBrTarget);
4736 }
4737
4738 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4739 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4740 MI.eraseFromParent();
4741 BrCond->eraseFromParent();
4742 return true;
4743 }
4744
4745 return false;
4746 }
4747 case Intrinsic::amdgcn_loop: {
4748 MachineInstr *Br = nullptr;
4749 MachineBasicBlock *UncondBrTarget = nullptr;
4750 bool Negated = false;
4751 if (MachineInstr *BrCond =
4752 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4753 const SIRegisterInfo *TRI
4754 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4755
4756 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4757 Register Reg = MI.getOperand(2).getReg();
4758
4759 if (Negated)
4760 std::swap(CondBrTarget, UncondBrTarget);
4761
4762 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4763 B.buildInstr(AMDGPU::SI_LOOP)
4764 .addUse(Reg)
4765 .addMBB(UncondBrTarget);
4766
4767 if (Br)
4768 Br->getOperand(0).setMBB(CondBrTarget);
4769 else
4770 B.buildBr(*CondBrTarget);
4771
4772 MI.eraseFromParent();
4773 BrCond->eraseFromParent();
4774 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4775 return true;
4776 }
4777
4778 return false;
4779 }
4780 case Intrinsic::amdgcn_kernarg_segment_ptr:
4781 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4782 // This only makes sense to call in a kernel, so just lower to null.
4783 B.buildConstant(MI.getOperand(0).getReg(), 0);
4784 MI.eraseFromParent();
4785 return true;
4786 }
4787
4788 return legalizePreloadedArgIntrin(
4789 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4790 case Intrinsic::amdgcn_implicitarg_ptr:
4791 return legalizeImplicitArgPtr(MI, MRI, B);
4792 case Intrinsic::amdgcn_workitem_id_x:
4793 return legalizePreloadedArgIntrin(MI, MRI, B,
4794 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4795 case Intrinsic::amdgcn_workitem_id_y:
4796 return legalizePreloadedArgIntrin(MI, MRI, B,
4797 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4798 case Intrinsic::amdgcn_workitem_id_z:
4799 return legalizePreloadedArgIntrin(MI, MRI, B,
4800 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4801 case Intrinsic::amdgcn_workgroup_id_x:
4802 return legalizePreloadedArgIntrin(MI, MRI, B,
4803 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4804 case Intrinsic::amdgcn_workgroup_id_y:
4805 return legalizePreloadedArgIntrin(MI, MRI, B,
4806 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4807 case Intrinsic::amdgcn_workgroup_id_z:
4808 return legalizePreloadedArgIntrin(MI, MRI, B,
4809 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4810 case Intrinsic::amdgcn_dispatch_ptr:
4811 return legalizePreloadedArgIntrin(MI, MRI, B,
4812 AMDGPUFunctionArgInfo::DISPATCH_PTR);
4813 case Intrinsic::amdgcn_queue_ptr:
4814 return legalizePreloadedArgIntrin(MI, MRI, B,
4815 AMDGPUFunctionArgInfo::QUEUE_PTR);
4816 case Intrinsic::amdgcn_implicit_buffer_ptr:
4817 return legalizePreloadedArgIntrin(
4818 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4819 case Intrinsic::amdgcn_dispatch_id:
4820 return legalizePreloadedArgIntrin(MI, MRI, B,
Step 2: Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
4821 AMDGPUFunctionArgInfo::DISPATCH_ID);
4822 case Intrinsic::amdgcn_fdiv_fast:
4823 return legalizeFDIVFastIntrin(MI, MRI, B);
4824 case Intrinsic::amdgcn_is_shared:
4825 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4826 case Intrinsic::amdgcn_is_private:
4827 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4828 case Intrinsic::amdgcn_wavefrontsize: {
4829 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4830 MI.eraseFromParent();
4831 return true;
4832 }
4833 case Intrinsic::amdgcn_s_buffer_load:
4834 return legalizeSBufferLoad(Helper, MI);
4835 case Intrinsic::amdgcn_raw_buffer_store:
4836 case Intrinsic::amdgcn_struct_buffer_store:
4837 return legalizeBufferStore(MI, MRI, B, false, false);
4838 case Intrinsic::amdgcn_raw_buffer_store_format:
4839 case Intrinsic::amdgcn_struct_buffer_store_format:
4840 return legalizeBufferStore(MI, MRI, B, false, true);
4841 case Intrinsic::amdgcn_raw_tbuffer_store:
4842 case Intrinsic::amdgcn_struct_tbuffer_store:
4843 return legalizeBufferStore(MI, MRI, B, true, true);
4844 case Intrinsic::amdgcn_raw_buffer_load:
4845 case Intrinsic::amdgcn_struct_buffer_load:
4846 return legalizeBufferLoad(MI, MRI, B, false, false);
4847 case Intrinsic::amdgcn_raw_buffer_load_format:
4848 case Intrinsic::amdgcn_struct_buffer_load_format:
4849 return legalizeBufferLoad(MI, MRI, B, true, false);
4850 case Intrinsic::amdgcn_raw_tbuffer_load:
4851 case Intrinsic::amdgcn_struct_tbuffer_load:
4852 return legalizeBufferLoad(MI, MRI, B, true, true);
4853 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4854 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4855 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4856 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4857 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4858 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4859 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4860 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4861 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4862 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4863 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4864 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4865 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4866 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4867 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4868 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4869 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4870 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4871 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4872 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4873 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4874 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4875 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4876 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4877 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4878 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4879 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4880 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4881 case Intrinsic::amdgcn_buffer_atomic_fadd:
4882 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4883 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4884 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4885 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4886 return legalizeBufferAtomic(MI, B, IntrID);
4887 case Intrinsic::amdgcn_atomic_inc:
4888 return legalizeAtomicIncDec(MI, B, true);
4889 case Intrinsic::amdgcn_atomic_dec:
4890 return legalizeAtomicIncDec(MI, B, false);
4891 case Intrinsic::trap:
4892 return legalizeTrapIntrinsic(MI, MRI, B);
4893 case Intrinsic::debugtrap:
4894 return legalizeDebugTrapIntrinsic(MI, MRI, B);
4895 case Intrinsic::amdgcn_rsq_clamp:
4896 return legalizeRsqClampIntrinsic(MI, MRI, B);
4897 case Intrinsic::amdgcn_ds_fadd:
4898 case Intrinsic::amdgcn_ds_fmin:
4899 case Intrinsic::amdgcn_ds_fmax:
4900 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4901 case Intrinsic::amdgcn_image_bvh_intersect_ray:
4902 return legalizeBVHIntrinsic(MI, B);
4903 default: {
4904 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4905 AMDGPU::getImageDimIntrinsicInfo(IntrID))
4906 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4907 return true;
4908 }
4909 }
4910
4911 return true;
4912}

/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
Step 9: Assuming the condition is true
Step 10: Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
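Relevant to the path the analyzer follows above (Steps 9 and 10): isMasked() only checks that Mask differs from ~0u, so a Mask of 0 is still treated as masked. A minimal standalone check of that condition, written with a plain unsigned rather than the ArgDescriptor type:

  #include <cassert>

  int main() {
    unsigned Mask = 0;             // the value the analyzer assumes on this path
    bool IsMasked = (Mask != ~0u); // same test as ArgDescriptor::isMasked()
    assert(IsMasked);              // 0 != 0xFFFFFFFF, so the masked path is taken
  }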
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
147 // into v0, 10 bits per dimension if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <algorithm>
18#include <cassert>
19#include <climits>
20#include <cmath>
21#include <cstdint>
22#include <cstring>
23#include <limits>
24#include <type_traits>
25
26#ifdef __ANDROID_NDK__
27#include <android/api-level.h>
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and MathExtras.h is popular.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44/// The behavior an operation has on an input of 0.
45enum ZeroBehavior {
46 /// The returned value is undefined.
47 ZB_Undefined,
48 /// The returned value is numeric_limits<T>::max()
49 ZB_Max,
50 /// The returned value is numeric_limits<T>::digits
51 ZB_Width
52};
53
54/// Mathematical constants.
55namespace numbers {
56// TODO: Track C++20 std::numbers.
57// TODO: Favor using the hexadecimal FP constants (requires C++17).
58constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
59 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
60 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
61 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
62 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
63 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
64 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
65 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
66 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
67 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
68 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
69 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
70 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
71 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
72 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
73constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
74 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
75 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
76 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
77 log2ef = 1.44269504F, // (0x1.715476P+0)
78 log10ef = .434294482F, // (0x1.bcb7b2P-2)
79 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
80 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
81 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
82 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
83 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
84 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
85 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
86 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
87 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
88} // namespace numbers
89
90namespace detail {
91template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
92 static unsigned count(T Val, ZeroBehavior) {
93 if (!Val)
94 return std::numeric_limits<T>::digits;
95 if (Val & 0x1)
96 return 0;
97
98 // Bisection method.
99 unsigned ZeroBits = 0;
100 T Shift = std::numeric_limits<T>::digits >> 1;
101 T Mask = std::numeric_limits<T>::max() >> Shift;
102 while (Shift) {
103 if ((Val & Mask) == 0) {
104 Val >>= Shift;
105 ZeroBits |= Shift;
106 }
107 Shift >>= 1;
108 Mask >>= Shift;
109 }
110 return ZeroBits;
111 }
112};
113
114#if defined(__GNUC__) || defined(_MSC_VER)
115template <typename T> struct TrailingZerosCounter<T, 4> {
116 static unsigned count(T Val, ZeroBehavior ZB) {
117 if (ZB != ZB_Undefined && Val == 0)
Step 14.1: 'ZB' is not equal to ZB_Undefined
Step 15: Assuming 'Val' is equal to 0
Step 16: Taking true branch
118 return 32;
Step 17: Returning the value 32
119
120#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
121 return __builtin_ctz(Val);
122#elif defined(_MSC_VER)
123 unsigned long Index;
124 _BitScanForward(&Index, Val);
125 return Index;
126#endif
127 }
128};
129
130#if !defined(_MSC_VER) || defined(_M_X64)
131template <typename T> struct TrailingZerosCounter<T, 8> {
132 static unsigned count(T Val, ZeroBehavior ZB) {
133 if (ZB != ZB_Undefined && Val == 0)
134 return 64;
135
136#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
137 return __builtin_ctzll(Val);
138#elif defined(_MSC_VER)
139 unsigned long Index;
140 _BitScanForward64(&Index, Val);
141 return Index;
142#endif
143 }
144};
145#endif
146#endif
147} // namespace detail
148
149/// Count number of 0's from the least significant bit to the most
150/// stopping at the first 1.
151///
152/// Only unsigned integral types are allowed.
153///
154/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
155/// valid arguments.
156template <typename T>
157unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
158 static_assert(std::numeric_limits<T>::is_integer &&
159 !std::numeric_limits<T>::is_signed,
160 "Only unsigned integral types are allowed.");
161 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
Step 14: Calling 'TrailingZerosCounter::count'
Step 18: Returning from 'TrailingZerosCounter::count'
Step 19: Returning the value 32
162}
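Steps 14 through 19 above are where the value 32 originates: countTrailingZeros on a zero 32-bit value returns the type width under the default ZB_Width behavior. Per the warning at the top of this report, that count is then used at AMDGPULegalizerInfo.cpp:2741 as the amount of a right shift on an unsigned int, which is undefined when the shift amount equals the bit width. The caller itself is not quoted in this report, so the following is only a sketch of the hazard and one possible guard:

  #include <cassert>
  #include "llvm/Support/MathExtras.h"

  int main() {
    unsigned Mask = 0;
    unsigned Shift = llvm::countTrailingZeros(Mask);      // 32 under ZB_Width
    // unsigned Bad = Mask >> Shift;                      // undefined: shift by the full width
    unsigned Guarded = Shift < 32 ? (Mask >> Shift) : 0u; // one way to avoid the UB
    assert(Shift == 32 && Guarded == 0u);
  }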
163
164namespace detail {
165template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
166 static unsigned count(T Val, ZeroBehavior) {
167 if (!Val)
168 return std::numeric_limits<T>::digits;
169
170 // Bisection method.
171 unsigned ZeroBits = 0;
172 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
173 T Tmp = Val >> Shift;
174 if (Tmp)
175 Val = Tmp;
176 else
177 ZeroBits |= Shift;
178 }
179 return ZeroBits;
180 }
181};
182
183#if defined(__GNUC__) || defined(_MSC_VER)
184template <typename T> struct LeadingZerosCounter<T, 4> {
185 static unsigned count(T Val, ZeroBehavior ZB) {
186 if (ZB != ZB_Undefined && Val == 0)
187 return 32;
188
189#if __has_builtin(__builtin_clz) || defined(__GNUC__)
190 return __builtin_clz(Val);
191#elif defined(_MSC_VER)
192 unsigned long Index;
193 _BitScanReverse(&Index, Val);
194 return Index ^ 31;
195#endif
196 }
197};
198
199#if !defined(_MSC_VER) || defined(_M_X64)
200template <typename T> struct LeadingZerosCounter<T, 8> {
201 static unsigned count(T Val, ZeroBehavior ZB) {
202 if (ZB != ZB_Undefined && Val == 0)
203 return 64;
204
205#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
206 return __builtin_clzll(Val);
207#elif defined(_MSC_VER)
208 unsigned long Index;
209 _BitScanReverse64(&Index, Val);
210 return Index ^ 63;
211#endif
212 }
213};
214#endif
215#endif
216} // namespace detail
217
218/// Count number of 0's from the most significant bit to the least
219/// stopping at the first 1.
220///
221/// Only unsigned integral types are allowed.
222///
223/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
224/// valid arguments.
225template <typename T>
226unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
227 static_assert(std::numeric_limits<T>::is_integer &&
228 !std::numeric_limits<T>::is_signed,
229 "Only unsigned integral types are allowed.");
230 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
231}
232
233/// Get the index of the first set bit starting from the least
234/// significant bit.
235///
236/// Only unsigned integral types are allowed.
237///
238/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
239/// valid arguments.
240template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
241 if (ZB == ZB_Max && Val == 0)
242 return std::numeric_limits<T>::max();
243
244 return countTrailingZeros(Val, ZB_Undefined);
245}
246
247/// Create a bitmask with the N right-most bits set to 1, and all other
248/// bits set to 0. Only unsigned types are allowed.
249template <typename T> T maskTrailingOnes(unsigned N) {
250 static_assert(std::is_unsigned<T>::value, "Invalid type!");
251 const unsigned Bits = CHAR_BIT * sizeof(T);
252 assert(N <= Bits && "Invalid bit index");
253 return N == 0 ? 0 : (T(-1) >> (Bits - N));
254}
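Worth noting in connection with the warning this report is about: maskTrailingOnes special-cases N == 0 precisely because T(-1) >> Bits would be the same shift-by-type-width undefined behavior. A quick check of the guarded edge cases, as a sketch using the header quoted here:

  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  int main() {
    assert(llvm::maskTrailingOnes<uint32_t>(0)  == 0u);          // guarded; no shift by 32 happens
    assert(llvm::maskTrailingOnes<uint32_t>(8)  == 0xFFu);
    assert(llvm::maskTrailingOnes<uint32_t>(32) == 0xFFFFFFFFu); // shift amount is 0 here
  }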
255
256/// Create a bitmask with the N left-most bits set to 1, and all other
257/// bits set to 0. Only unsigned types are allowed.
258template <typename T> T maskLeadingOnes(unsigned N) {
259 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
260}
261
262/// Create a bitmask with the N right-most bits set to 0, and all other
263/// bits set to 1. Only unsigned types are allowed.
264template <typename T> T maskTrailingZeros(unsigned N) {
265 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
266}
267
268/// Create a bitmask with the N left-most bits set to 0, and all other
269/// bits set to 1. Only unsigned types are allowed.
270template <typename T> T maskLeadingZeros(unsigned N) {
271 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
272}
273
274/// Get the index of the last set bit starting from the least
275/// significant bit.
276///
277/// Only unsigned integral types are allowed.
278///
279/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
280/// valid arguments.
281template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
282 if (ZB == ZB_Max && Val == 0)
283 return std::numeric_limits<T>::max();
284
285 // Use ^ instead of - because both gcc and llvm can remove the associated ^
286 // in the __builtin_clz intrinsic on x86.
287 return countLeadingZeros(Val, ZB_Undefined) ^
288 (std::numeric_limits<T>::digits - 1);
289}
290
291/// Macro compressed bit reversal table for 256 bits.
292///
293/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
294static const unsigned char BitReverseTable256[256] = {
295#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
296#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
297#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
298 R6(0), R6(2), R6(1), R6(3)
299#undef R2
300#undef R4
301#undef R6
302};
303
304/// Reverse the bits in \p Val.
305template <typename T>
306T reverseBits(T Val) {
307 unsigned char in[sizeof(Val)];
308 unsigned char out[sizeof(Val)];
309 std::memcpy(in, &Val, sizeof(Val));
310 for (unsigned i = 0; i < sizeof(Val); ++i)
311 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
312 std::memcpy(&Val, out, sizeof(Val));
313 return Val;
314}
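The table-driven fallback above reverses the bit order within each byte and the byte order itself. A small sanity check, as a sketch using the header quoted here:

  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  int main() {
    assert(llvm::reverseBits<uint8_t>(0x01) == 0x80);      // 0b00000001 -> 0b10000000
    assert(llvm::reverseBits<uint16_t>(0x0001) == 0x8000); // lowest bit moves to the top
  }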
315
316#if __has_builtin(__builtin_bitreverse8)
317template<>
318inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
319 return __builtin_bitreverse8(Val);
320}
321#endif
322
323#if __has_builtin(__builtin_bitreverse16)
324template<>
325inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
326 return __builtin_bitreverse16(Val);
327}
328#endif
329
330#if __has_builtin(__builtin_bitreverse32)
331template<>
332inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
333 return __builtin_bitreverse32(Val);
334}
335#endif
336
337#if __has_builtin(__builtin_bitreverse64)
338template<>
339inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
340 return __builtin_bitreverse64(Val);
341}
342#endif
343
344// NOTE: The following support functions use the _32/_64 extensions instead of
345// type overloading so that signed and unsigned integers can be used without
346// ambiguity.
347
348/// Return the high 32 bits of a 64 bit value.
349constexpr inline uint32_t Hi_32(uint64_t Value) {
350 return static_cast<uint32_t>(Value >> 32);
351}
352
353/// Return the low 32 bits of a 64 bit value.
354constexpr inline uint32_t Lo_32(uint64_t Value) {
355 return static_cast<uint32_t>(Value);
356}
357
358/// Make a 64-bit integer from a high / low pair of 32-bit integers.
359constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
360 return ((uint64_t)High << 32) | (uint64_t)Low;
361}
362
363/// Checks if an integer fits into the given bit width.
364template <unsigned N> constexpr inline bool isInt(int64_t x) {
365 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
366}
367// Template specializations to get better code for common cases.
368template <> constexpr inline bool isInt<8>(int64_t x) {
369 return static_cast<int8_t>(x) == x;
370}
371template <> constexpr inline bool isInt<16>(int64_t x) {
372 return static_cast<int16_t>(x) == x;
373}
374template <> constexpr inline bool isInt<32>(int64_t x) {
375 return static_cast<int32_t>(x) == x;
376}
377
378/// Checks if a signed integer is an N bit number shifted left by S.
379template <unsigned N, unsigned S>
380constexpr inline bool isShiftedInt(int64_t x) {
381 static_assert(
382 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
383 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
384 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
385}
386
387/// Checks if an unsigned integer fits into the given bit width.
388///
389/// This is written as two functions rather than as simply
390///
391/// return N >= 64 || X < (UINT64_C(1) << N);
392///
393/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
394/// left too many places.
395template <unsigned N>
396constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
397 static_assert(N > 0, "isUInt<0> doesn't make sense");
398 return X < (UINT64_C(1) << (N));
399}
400template <unsigned N>
401constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
402 return true;
403}
404
405// Template specializations to get better code for common cases.
406template <> constexpr inline bool isUInt<8>(uint64_t x) {
407 return static_cast<uint8_t>(x) == x;
408}
409template <> constexpr inline bool isUInt<16>(uint64_t x) {
410 return static_cast<uint16_t>(x) == x;
411}
412template <> constexpr inline bool isUInt<32>(uint64_t x) {
413 return static_cast<uint32_t>(x) == x;
414}
415
416/// Checks if a unsigned integer is an N bit number shifted left by S.
417template <unsigned N, unsigned S>
418constexpr inline bool isShiftedUInt(uint64_t x) {
419 static_assert(
420 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
421 static_assert(N + S <= 64,
422 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
423 // Per the two static_asserts above, S must be strictly less than 64. So
424 // 1 << S is not undefined behavior.
425 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
426}
427
428/// Gets the maximum value for a N-bit unsigned integer.
429inline uint64_t maxUIntN(uint64_t N) {
430 assert(N > 0 && N <= 64 && "integer width out of range");
431
432 // uint64_t(1) << 64 is undefined behavior, so we can't do
433 // (uint64_t(1) << N) - 1
434 // without checking first that N != 64. But this works and doesn't have a
435 // branch.
436 return UINT64_MAX >> (64 - N);
437}
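A quick worked check of the branch-free trick described in the comment above, as a sketch using the header quoted here:

  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  int main() {
    // UINT64_MAX >> (64 - N) sidesteps the (uint64_t(1) << 64) UB mentioned above.
    assert(llvm::maxUIntN(8)  == 0xFF);
    assert(llvm::maxUIntN(32) == 0xFFFFFFFFu);
    assert(llvm::maxUIntN(64) == UINT64_MAX); // shift amount is 0 here, not 64
  }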
438
439/// Gets the minimum value for a N-bit signed integer.
440inline int64_t minIntN(int64_t N) {
441 assert(N > 0 && N <= 64 && "integer width out of range");
442
443 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
444}
445
446/// Gets the maximum value for a N-bit signed integer.
447inline int64_t maxIntN(int64_t N) {
448 assert(N > 0 && N <= 64 && "integer width out of range");
449
450 // This relies on two's complement wraparound when N == 64, so we convert to
451 // int64_t only at the very end to avoid UB.
452 return (UINT64_C(1) << (N - 1)) - 1;
453}
454
455/// Checks if an unsigned integer fits into the given (dynamic) bit width.
456inline bool isUIntN(unsigned N, uint64_t x) {
457 return N >= 64 || x <= maxUIntN(N);
458}
459
460/// Checks if an signed integer fits into the given (dynamic) bit width.
461inline bool isIntN(unsigned N, int64_t x) {
462 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
463}
464
465/// Return true if the argument is a non-empty sequence of ones starting at the
466/// least significant bit with the remainder zero (32 bit version).
467/// Ex. isMask_32(0x0000FFFFU) == true.
468constexpr inline bool isMask_32(uint32_t Value) {
469 return Value && ((Value + 1) & Value) == 0;
470}
471
472/// Return true if the argument is a non-empty sequence of ones starting at the
473/// least significant bit with the remainder zero (64 bit version).
474constexpr inline bool isMask_64(uint64_t Value) {
475 return Value && ((Value + 1) & Value) == 0;
476}
477
478/// Return true if the argument contains a non-empty sequence of ones with the
479/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
480constexpr inline bool isShiftedMask_32(uint32_t Value) {
481 return Value && isMask_32((Value - 1) | Value);
482}
483
484/// Return true if the argument contains a non-empty sequence of ones with the
485/// remainder zero (64 bit version.)
486constexpr inline bool isShiftedMask_64(uint64_t Value) {
487 return Value && isMask_64((Value - 1) | Value);
488}
489
490/// Return true if the argument is a power of two > 0.
491/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
492constexpr inline bool isPowerOf2_32(uint32_t Value) {
493 return Value && !(Value & (Value - 1));
494}
495
496/// Return true if the argument is a power of two > 0 (64 bit edition.)
497constexpr inline bool isPowerOf2_64(uint64_t Value) {
498 return Value && !(Value & (Value - 1));
499}
500
501/// Count the number of ones from the most significant bit to the first
502/// zero bit.
503///
504/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
505/// Only unsigned integral types are allowed.
506///
507/// \param ZB the behavior on an input of all ones. Only ZB_Width and
508/// ZB_Undefined are valid arguments.
509template <typename T>
510unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
511 static_assert(std::numeric_limits<T>::is_integer &&
512 !std::numeric_limits<T>::is_signed,
513 "Only unsigned integral types are allowed.");
514 return countLeadingZeros<T>(~Value, ZB);
515}
516
517/// Count the number of ones from the least significant bit to the first
518/// zero bit.
519///
520/// Ex. countTrailingOnes(0x00FF00FF) == 8.
521/// Only unsigned integral types are allowed.
522///
523/// \param ZB the behavior on an input of all ones. Only ZB_Width and
524/// ZB_Undefined are valid arguments.
525template <typename T>
526unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
527 static_assert(std::numeric_limits<T>::is_integer &&
528 !std::numeric_limits<T>::is_signed,
529 "Only unsigned integral types are allowed.");
530 return countTrailingZeros<T>(~Value, ZB);
531}
532
533namespace detail {
534template <typename T, std::size_t SizeOfT> struct PopulationCounter {
535 static unsigned count(T Value) {
536 // Generic version, forward to 32 bits.
537 static_assert(SizeOfT <= 4, "Not implemented!");
538#if defined(__GNUC__)
539 return __builtin_popcount(Value);
540#else
541 uint32_t v = Value;
542 v = v - ((v >> 1) & 0x55555555);
543 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
544 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
545#endif
546 }
547};
548
549template <typename T> struct PopulationCounter<T, 8> {
550 static unsigned count(T Value) {
551#if defined(__GNUC__)
552 return __builtin_popcountll(Value);
553#else
554 uint64_t v = Value;
555 v = v - ((v >> 1) & 0x5555555555555555ULL);
556 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
557 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
558 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
559#endif
560 }
561};
562} // namespace detail
563
564/// Count the number of set bits in a value.
565/// Ex. countPopulation(0xF000F000) = 8
566/// Returns 0 if the word is zero.
567template <typename T>
568inline unsigned countPopulation(T Value) {
569 static_assert(std::numeric_limits<T>::is_integer &&
570 !std::numeric_limits<T>::is_signed,
571 "Only unsigned integral types are allowed.");
572 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
573}
574
575/// Compile time Log2.
576/// Valid only for positive powers of two.
577template <size_t kValue> constexpr inline size_t CTLog2() {
578 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
579 "Value is not a valid power of 2");
580 return 1 + CTLog2<kValue / 2>();
581}
582
583template <> constexpr inline size_t CTLog2<1>() { return 0; }
584
585/// Return the log base 2 of the specified value.
586inline double Log2(double Value) {
587#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
588 return __builtin_log(Value) / __builtin_log(2.0);
589#else
590 return log2(Value);
591#endif
592}
593
594/// Return the floor log base 2 of the specified value, -1 if the value is zero.
595/// (32 bit edition.)
596/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
597inline unsigned Log2_32(uint32_t Value) {
598 return 31 - countLeadingZeros(Value);
599}
600
601/// Return the floor log base 2 of the specified value, -1 if the value is zero.
602/// (64 bit edition.)
603inline unsigned Log2_64(uint64_t Value) {
604 return 63 - countLeadingZeros(Value);
605}
606
607/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
608/// (32 bit edition).
609/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
610inline unsigned Log2_32_Ceil(uint32_t Value) {
611 return 32 - countLeadingZeros(Value - 1);
612}
613
614/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
615/// (64 bit edition.)
616inline unsigned Log2_64_Ceil(uint64_t Value) {
617 return 64 - countLeadingZeros(Value - 1);
618}
619
620/// Return the greatest common divisor of the values using Euclid's algorithm.
621template <typename T>
622inline T greatestCommonDivisor(T A, T B) {
623 while (B) {
624 T Tmp = B;
625 B = A % B;
626 A = Tmp;
627 }
628 return A;
629}
630
631inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
632 return greatestCommonDivisor<uint64_t>(A, B);
633}
634
635/// This function takes a 64-bit integer and returns the bit equivalent double.
636inline double BitsToDouble(uint64_t Bits) {
637 double D;
638 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
639 memcpy(&D, &Bits, sizeof(Bits));
640 return D;
641}
642
643/// This function takes a 32-bit integer and returns the bit equivalent float.
644inline float BitsToFloat(uint32_t Bits) {
645 float F;
646 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
647 memcpy(&F, &Bits, sizeof(Bits));
648 return F;
649}
650
651/// This function takes a double and returns the bit equivalent 64-bit integer.
652/// Note that copying doubles around changes the bits of NaNs on some hosts,
653/// notably x86, so this routine cannot be used if these bits are needed.
654inline uint64_t DoubleToBits(double Double) {
655 uint64_t Bits;
656 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
657 memcpy(&Bits, &Double, sizeof(Double));
658 return Bits;
659}
660
661/// This function takes a float and returns the bit equivalent 32-bit integer.
662/// Note that copying floats around changes the bits of NaNs on some hosts,
663/// notably x86, so this routine cannot be used if these bits are needed.
664inline uint32_t FloatToBits(float Float) {
665 uint32_t Bits;
666 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
667 memcpy(&Bits, &Float, sizeof(Float));
668 return Bits;
669}
670
671/// A and B are either alignments or offsets. Return the minimum alignment that
672/// may be assumed after adding the two together.
673constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
674 // The largest power of 2 that divides both A and B.
675 //
676 // Replace "-Value" by "1+~Value" in the following commented code to avoid
677 // MSVC warning C4146
678 // return (A | B) & -(A | B);
679 return (A | B) & (1 + ~(A | B));
680}
681
682/// Returns the next power of two (in 64-bits) that is strictly greater than A.
683/// Returns zero on overflow.
684inline uint64_t NextPowerOf2(uint64_t A) {
685 A |= (A >> 1);
686 A |= (A >> 2);
687 A |= (A >> 4);
688 A |= (A >> 8);
689 A |= (A >> 16);
690 A |= (A >> 32);
691 return A + 1;
692}
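A worked example of the bit-smearing cascade above: the shifted ORs set every bit below the highest set bit, so A ends up as 2^k - 1 for the smallest power of two 2^k strictly greater than the input, and A + 1 is that power of two. A sketch using the header quoted here:

  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  int main() {
    assert(llvm::NextPowerOf2(0)  == 1);
    assert(llvm::NextPowerOf2(32) == 64);        // strictly greater, so 32 -> 64
    assert(llvm::NextPowerOf2(33) == 64);
    assert(llvm::NextPowerOf2(UINT64_MAX) == 0); // overflow wraps to zero, as documented
  }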
693
694/// Returns the power of two which is less than or equal to the given value.
695/// Essentially, it is a floor operation across the domain of powers of two.
696inline uint64_t PowerOf2Floor(uint64_t A) {
697 if (!A) return 0;
698 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
699}
700
701/// Returns the power of two which is greater than or equal to the given value.
702/// Essentially, it is a ceil operation across the domain of powers of two.
703inline uint64_t PowerOf2Ceil(uint64_t A) {
704 if (!A)
705 return 0;
706 return NextPowerOf2(A - 1);
707}
708
709/// Returns the next integer (mod 2**64) that is greater than or equal to
710/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
711///
712/// If non-zero \p Skew is specified, the return value will be a minimal
713/// integer that is greater than or equal to \p Value and equal to
714/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
715/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
716///
717/// Examples:
718/// \code
719/// alignTo(5, 8) = 8
720/// alignTo(17, 8) = 24
721/// alignTo(~0LL, 8) = 0
722/// alignTo(321, 255) = 510
723///
724/// alignTo(5, 8, 7) = 7
725/// alignTo(17, 8, 1) = 17
726/// alignTo(~0LL, 8, 3) = 3
727/// alignTo(321, 255, 42) = 552
728/// \endcode
729inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
730 assert(Align != 0u && "Align can't be 0.");
731 Skew %= Align;
732 return (Value + Align - 1 - Skew) / Align * Align + Skew;
733}
734
735/// Returns the next integer (mod 2**64) that is greater than or equal to
736/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
737template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
738 static_assert(Align != 0u, "Align must be non-zero");
739 return (Value + Align - 1) / Align * Align;
740}
741
742/// Returns the integer ceil(Numerator / Denominator).
743inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
744 return alignTo(Numerator, Denominator) / Denominator;
745}
746
747/// Returns the integer nearest(Numerator / Denominator).
748inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
749 return (Numerator + (Denominator / 2)) / Denominator;
750}
751
752/// Returns the largest uint64_t less than or equal to \p Value and is
753/// \p Skew mod \p Align. \p Align must be non-zero
754inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
755 assert(Align != 0u && "Align can't be 0.");
756 Skew %= Align;
757 return (Value - Skew) / Align * Align + Skew;
758}
759
760/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
761/// Requires 0 < B <= 32.
762template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
763 static_assert(B > 0, "Bit width can't be 0.");
764 static_assert(B <= 32, "Bit width out of range.");
765 return int32_t(X << (32 - B)) >> (32 - B);
766}
767
768/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
769/// Requires 0 < B <= 32.
770inline int32_t SignExtend32(uint32_t X, unsigned B) {
771 assert(B > 0 && "Bit width can't be 0.");
772 assert(B <= 32 && "Bit width out of range.");
773 return int32_t(X << (32 - B)) >> (32 - B);
774}
775
776/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
777/// Requires 0 < B <= 64.
778template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
779 static_assert(B > 0, "Bit width can't be 0.");
780 static_assert(B <= 64, "Bit width out of range.");
781 return int64_t(x << (64 - B)) >> (64 - B);
782}
783
784/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
785/// Requires 0 < B <= 64.
786inline int64_t SignExtend64(uint64_t X, unsigned B) {
787 assert(B > 0 && "Bit width can't be 0.");
788 assert(B <= 64 && "Bit width out of range.");
789 return int64_t(X << (64 - B)) >> (64 - B);
790}
791
792/// Subtract two unsigned integers, X and Y, of type T and return the absolute
793/// value of the result.
794template <typename T>
795std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
796 return std::max(X, Y) - std::min(X, Y);
797}
798
799/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
800/// maximum representable value of T on overflow. ResultOverflowed indicates if
801/// the result is larger than the maximum representable value of type T.
802template <typename T>
803std::enable_if_t<std::is_unsigned<T>::value, T>
804SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
805 bool Dummy;
806 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
807 // Hacker's Delight, p. 29
808 T Z = X + Y;
809 Overflowed = (Z < X || Z < Y);
810 if (Overflowed)
811 return std::numeric_limits<T>::max();
812 else
813 return Z;
814}
815
816/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
817/// maximum representable value of T on overflow. ResultOverflowed indicates if
818/// the result is larger than the maximum representable value of type T.
819template <typename T>
820std::enable_if_t<std::is_unsigned<T>::value, T>
821SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
822 bool Dummy;
823 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
824
825 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
826 // because it fails for uint16_t (where multiplication can have undefined
827 // behavior due to promotion to int), and requires a division in addition
828 // to the multiplication.
829
830 Overflowed = false;
831
832 // Log2(Z) would be either Log2Z or Log2Z + 1.
833 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
834 // will necessarily be less than Log2Max as desired.
835 int Log2Z = Log2_64(X) + Log2_64(Y);
836 const T Max = std::numeric_limits<T>::max();
837 int Log2Max = Log2_64(Max);
838 if (Log2Z < Log2Max) {
839 return X * Y;
840 }
841 if (Log2Z > Log2Max) {
842 Overflowed = true;
843 return Max;
844 }
845
846 // We're going to use the top bit, and maybe overflow one
847 // bit past it. Multiply all but the bottom bit then add
848 // that on at the end.
849 T Z = (X >> 1) * Y;
850 if (Z & ~(Max >> 1)) {
851 Overflowed = true;
852 return Max;
853 }
854 Z <<= 1;
855 if (X & 1)
856 return SaturatingAdd(Z, Y, ResultOverflowed);
857
858 return Z;
859}
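The Log2 comparison above cheaply classifies most multiplications; only the case Log2Z == Log2Max needs the slower top-bit handling. A worked illustration with 16-bit values (assumed, not part of the analyzed source):

#include <cstdint>
#include <cstdio>

// Floor of log2, returning -1 for 0, matching the contract of llvm::Log2_64.
static int floorLog2(uint64_t V) {
  int L = -1;
  while (V) {
    V >>= 1;
    ++L;
  }
  return L;
}

int main() {
  uint16_t X = 300, Y = 200;                // product 60000, fits in uint16_t
  int Log2Z = floorLog2(X) + floorLog2(Y);  // 8 + 7 = 15
  int Log2Max = floorLog2(UINT16_MAX);      // 15
  // Log2Z < Log2Max  -> product certainly fits.
  // Log2Z > Log2Max  -> product certainly overflows.
  // Log2Z == Log2Max -> ambiguous (this example): fall through to the
  //                     "multiply all but the bottom bit" path above.
  printf("Log2Z=%d Log2Max=%d product=%u\n", Log2Z, Log2Max,
         unsigned(X) * unsigned(Y));
  return 0;
}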
860
861/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
862/// the product. Clamp the result to the maximum representable value of T on
863/// overflow. ResultOverflowed indicates if the result is larger than the
864/// maximum representable value of type T.
865template <typename T>
866std::enable_if_t<std::is_unsigned<T>::value, T>
867SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
868 bool Dummy;
869 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
870
871 T Product = SaturatingMultiply(X, Y, &Overflowed);
872 if (Overflowed)
873 return Product;
874
875 return SaturatingAdd(A, Product, &Overflowed);
876}
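For reference, a brief sketch of the clamping behaviour described above; it uses a wider intermediate type instead of composing the two helpers, and the values are assumed, not part of the analyzed source:

#include <cstdint>
#include <cstdio>
#include <limits>

// Same observable result as a saturating multiply-add on uint16_t: the exact
// value is computed in 32 bits and then clamped to UINT16_MAX.
static uint16_t satMulAdd16(uint16_t X, uint16_t Y, uint16_t A) {
  const uint32_t Wide = uint32_t(X) * uint32_t(Y) + A;
  const uint32_t Max = std::numeric_limits<uint16_t>::max();
  return uint16_t(Wide > Max ? Max : Wide);
}

int main() {
  printf("%u\n", unsigned(satMulAdd16(100, 100, 5)));   // 10005
  printf("%u\n", unsigned(satMulAdd16(300, 300, 5)));   // 65535 (clamped)
  return 0;
}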
877
878/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
879extern const float huge_valf;
880
881
882/// Add two signed integers, computing the two's complement truncated result,
883/// returning true if overflow occurred.
884template <typename T>
885std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
886#if __has_builtin(__builtin_add_overflow)
887 return __builtin_add_overflow(X, Y, &Result);
888#else
889 // Perform the unsigned addition.
890 using U = std::make_unsigned_t<T>;
891 const U UX = static_cast<U>(X);
892 const U UY = static_cast<U>(Y);
893 const U UResult = UX + UY;
894
895 // Convert to signed.
896 Result = static_cast<T>(UResult);
897
898 // Adding two positive numbers should result in a positive number.
899 if (X > 0 && Y > 0)
900 return Result <= 0;
901 // Adding two negatives should result in a negative number.
902 if (X < 0 && Y < 0)
903 return Result >= 0;
904 return false;
905#endif
906}
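The portable branch above relies on two facts: unsigned addition is well defined on wrap, and a signed overflow always flips the expected sign of the result. A compact sketch with assumed values, not part of the analyzed source:

#include <cstdint>
#include <cstdio>
#include <limits>

// Fallback idea from above: add as unsigned, then check whether the sign of
// the truncated result contradicts the signs of the operands.
static bool addOverflow32(int32_t X, int32_t Y, int32_t &Result) {
  uint32_t UResult = uint32_t(X) + uint32_t(Y);
  Result = int32_t(UResult);   // two's-complement truncation, as in the code above
  if (X > 0 && Y > 0)
    return Result <= 0;        // positive + positive must stay positive
  if (X < 0 && Y < 0)
    return Result >= 0;        // negative + negative must stay negative
  return false;                // mixed signs can never overflow
}

int main() {
  int32_t R;
  printf("%d\n", addOverflow32(std::numeric_limits<int32_t>::max(), 1, R)); // 1
  printf("%d\n", addOverflow32(40, 2, R));                                  // 0
  return 0;
}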
907
908/// Subtract two signed integers, computing the two's complement truncated
909/// result, returning true if an overflow occurred.
910template <typename T>
911std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
912#if __has_builtin(__builtin_sub_overflow)
913 return __builtin_sub_overflow(X, Y, &Result);
914#else
915 // Perform the unsigned subtraction.
916 using U = std::make_unsigned_t<T>;
917 const U UX = static_cast<U>(X);
918 const U UY = static_cast<U>(Y);
919 const U UResult = UX - UY;
920
921 // Convert to signed.
922 Result = static_cast<T>(UResult);
923
924 // Subtracting a positive number from a negative results in a negative number.
925 if (X <= 0 && Y > 0)
926 return Result >= 0;
927 // Subtracting a negative number from a positive results in a positive number.
928 if (X >= 0 && Y < 0)
929 return Result <= 0;
930 return false;
931#endif
932}
933
934/// Multiply two signed integers, computing the two's complement truncated
935/// result, returning true if an overflow occurred.
936template <typename T>
937std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
938 // Perform the unsigned multiplication on absolute values.
939 using U = std::make_unsigned_t<T>;
940 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
941 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
942 const U UResult = UX * UY;
943
944 // Convert to signed.
945 const bool IsNegative = (X < 0) ^ (Y < 0);
946 Result = IsNegative ? (0 - UResult) : UResult;
947
948 // If any of the args was 0, result is 0 and no overflow occurs.
949 if (UX == 0 || UY == 0)
950 return false;
951
952 // UX and UY are in [1, 2^n], where n is the number of digits.
953 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
954 // positive) divided by an argument compares to the other.
955 if (IsNegative)
956 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
957 else
958 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
959}
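The final comparison above accounts for the asymmetry of two's complement: a negative product may be one unit larger in magnitude than a positive one. A worked example using int8_t so the numbers stay small (assumed values, not part of the analyzed source):

#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // Candidate product (-16) * 8 = -128: representable in int8_t, while
  // +128 is not, because INT8_MAX is 127 and INT8_MIN is -128.
  unsigned UX = 16, UY = 8;                                // absolute values
  unsigned PosLimit = std::numeric_limits<int8_t>::max();  // 127
  unsigned NegLimit = PosLimit + 1;                        // 128 == |INT8_MIN|
  printf("negative product overflows: %d\n", UX > NegLimit / UY);  // 0
  printf("positive product overflows: %d\n", UX > PosLimit / UY);  // 1
  return 0;
}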
960
961} // End llvm namespace
962
963#endif