Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2874, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem 
/usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE"amdgpu-legalinfo" "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use"
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
61
62/// \returs true if this is an odd sized vector which should widen by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx,
99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100 };
101}
102
103static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 unsigned Size = Ty.getSizeInBits();
108 unsigned Pieces = (Size + 63) / 64;
109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110 return std::make_pair(
111 TypeIdx,
112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113 };
114}
115
116// Increase the number of vector elements to reach the next multiple of 32-bit
117// type.
118static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32)(static_cast<void> (0));
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132}
133
134static LLT getBitcastRegisterType(const LLT Ty) {
135 const unsigned Size = Ty.getSizeInBits();
136
137 LLT CoercedTy;
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
142 }
143
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145}
146
147static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152}
153
154static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0)(static_cast<void> (0));
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162}
163
164static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169}
170
171static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176}
177
178static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183}
184
185static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187}
188
189static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192}
193
194static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199}
200
201static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209}
210
211// Any combination of 32 or 64-bit elements up the maximum register size, and
212// multiples of v2s16.
213static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217}
218
219static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227}
228
229// If we have a truncating store or an extending load with a data size larger
230// than 32-bits, we need to reduce to a 32-bit type.
231static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237}
238
239// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240// handle some operations by just promoting the register during
241// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265}
266
// Returns true when a load/store with the queried register type, memory type,
// alignment and address space can be selected without changing the size of
// the memory access. The checks below are order-dependent filters.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space supports.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  // Only memory sizes with a hardware load/store encoding are acceptable.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // Dwordx3 accesses only exist on some subtargets.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  // The extending-load filters above guarantee this.
  assert(RegSize >= MemSize);

  // Under-aligned accesses are legal only if the subtarget tolerates the
  // misalignment for this size and address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
332
333// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334// workaround this. Eventually it should ignore the type for loads and only care
335// about the size. Return true in cases where we will workaround this for now by
336// bitcasting.
337static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353}
354
355static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359}
360
361/// Return true if a load or store of the type should be lowered with a bitcast
362/// to a different type.
363static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377}
378
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): `Opcode` (an unsigned opcode value) is passed where
  // maxSizeForAddrSpace() takes `bool IsLoad`, so any nonzero opcode converts
  // to true. Presumably fine since this helper is only used for loads —
  // confirm against callers.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
415
416static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424}
425
426AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534 .legalFor({S32, S16, V2S16})
535 .clampScalar(0, S16, S32)
536 .clampMaxNumElements(0, S16, 2)
537 .scalarize(0)
538 .widenScalarToNextPow2(0, 32);
539
540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541 .legalFor({S32, S16, V2S16}) // Clamp modifier
542 .minScalarOrElt(0, S16)
543 .clampMaxNumElements(0, S16, 2)
544 .scalarize(0)
545 .widenScalarToNextPow2(0, 32)
546 .lower();
547 } else if (ST.has16BitInsts()) {
548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549 .legalFor({S32, S16})
550 .clampScalar(0, S16, S32)
551 .scalarize(0)
552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553
554 // Technically the saturating operations require clamp bit support, but this
555 // was introduced at the same time as 16-bit operations.
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557 .legalFor({S32, S16}) // Clamp modifier
558 .minScalar(0, S16)
559 .scalarize(0)
560 .widenScalarToNextPow2(0, 16)
561 .lower();
562
563 // We're just lowering this, but it helps get a better result to try to
564 // coerce to the desired type first.
565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566 .minScalar(0, S16)
567 .scalarize(0)
568 .lower();
569 } else {
570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571 .legalFor({S32})
572 .clampScalar(0, S32, S32)
573 .scalarize(0);
574
575 if (ST.hasIntClamp()) {
576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577 .legalFor({S32}) // Clamp modifier.
578 .scalarize(0)
579 .minScalarOrElt(0, S32)
580 .lower();
581 } else {
582 // Clamp bit support was added in VI, along with 16-bit operations.
583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584 .minScalar(0, S32)
585 .scalarize(0)
586 .lower();
587 }
588
589 // FIXME: DAG expansion gets better results. The widening uses the smaller
590 // range values and goes for the min/max lowering directly.
591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592 .minScalar(0, S32)
593 .scalarize(0)
594 .lower();
595 }
596
597 getActionDefinitionsBuilder(
598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599 .customFor({S32, S64})
600 .clampScalar(0, S32, S64)
601 .widenScalarToNextPow2(0, 32)
602 .scalarize(0);
603
604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605 .legalFor({S32})
606 .maxScalarOrElt(0, S32);
607
608 if (ST.hasVOP3PInsts()) {
609 Mulh
610 .clampMaxNumElements(0, S8, 2)
611 .lowerFor({V2S8});
612 }
613
614 Mulh
615 .scalarize(0)
616 .lower();
617
618 // Report legal for any types we can handle anywhere. For the cases only legal
619 // on the SALU, RegBankSelect will be able to re-legalize.
620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622 .clampScalar(0, S32, S64)
623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625 .widenScalarToNextPow2(0)
626 .scalarize(0);
627
628 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630 .legalFor({{S32, S1}, {S32, S32}})
631 .minScalar(0, S32)
632 // TODO: .scalarize(0)
633 .lower();
634
635 getActionDefinitionsBuilder(G_BITCAST)
636 // Don't worry about the size constraint.
637 .legalIf(all(isRegisterType(0), isRegisterType(1)))
638 .lower();
639
640
641 getActionDefinitionsBuilder(G_CONSTANT)
642 .legalFor({S1, S32, S64, S16, GlobalPtr,
643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644 .legalIf(isPointer(0))
645 .clampScalar(0, S32, S64)
646 .widenScalarToNextPow2(0);
647
648 getActionDefinitionsBuilder(G_FCONSTANT)
649 .legalFor({S32, S64, S16})
650 .clampScalar(0, S16, S64);
651
652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653 .legalIf(isRegisterType(0))
654 // s1 and s16 are special cases because they have legal operations on
655 // them, but don't really occupy registers in the normal way.
656 .legalFor({S1, S16})
657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658 .clampScalarOrElt(0, S32, MaxScalar)
659 .widenScalarToNextPow2(0, 32)
660 .clampMaxNumElements(0, S32, 16);
661
662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663
664 // If the amount is divergent, we have to do a wave reduction to get the
665 // maximum value, so this is expanded during RegBankSelect.
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667 .legalFor({{PrivatePtr, S32}});
668
669 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670 .customIf(typeIsNot(0, PrivatePtr));
671
672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673
674 auto &FPOpActions = getActionDefinitionsBuilder(
675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676 .legalFor({S32, S64});
677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678 .customFor({S32, S64});
679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680 .customFor({S32, S64});
681
682 if (ST.has16BitInsts()) {
683 if (ST.hasVOP3PInsts())
684 FPOpActions.legalFor({S16, V2S16});
685 else
686 FPOpActions.legalFor({S16});
687
688 TrigActions.customFor({S16});
689 FDIVActions.customFor({S16});
690 }
691
692 auto &MinNumMaxNum = getActionDefinitionsBuilder({
693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694
695 if (ST.hasVOP3PInsts()) {
696 MinNumMaxNum.customFor(FPTypesPK16)
697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698 .clampMaxNumElements(0, S16, 2)
699 .clampScalar(0, S16, S64)
700 .scalarize(0);
701 } else if (ST.has16BitInsts()) {
702 MinNumMaxNum.customFor(FPTypes16)
703 .clampScalar(0, S16, S64)
704 .scalarize(0);
705 } else {
706 MinNumMaxNum.customFor(FPTypesBase)
707 .clampScalar(0, S32, S64)
708 .scalarize(0);
709 }
710
711 if (ST.hasVOP3PInsts())
712 FPOpActions.clampMaxNumElements(0, S16, 2);
713
714 FPOpActions
715 .scalarize(0)
716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717
718 TrigActions
719 .scalarize(0)
720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721
722 FDIVActions
723 .scalarize(0)
724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725
726 getActionDefinitionsBuilder({G_FNEG, G_FABS})
727 .legalFor(FPTypesPK16)
728 .clampMaxNumElements(0, S16, 2)
729 .scalarize(0)
730 .clampScalar(0, S16, S64);
731
732 if (ST.has16BitInsts()) {
733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734 .legalFor({S32, S64, S16})
735 .scalarize(0)
736 .clampScalar(0, S16, S64);
737 } else {
738 getActionDefinitionsBuilder(G_FSQRT)
739 .legalFor({S32, S64})
740 .scalarize(0)
741 .clampScalar(0, S32, S64);
742
743 if (ST.hasFractBug()) {
744 getActionDefinitionsBuilder(G_FFLOOR)
745 .customFor({S64})
746 .legalFor({S32, S64})
747 .scalarize(0)
748 .clampScalar(0, S32, S64);
749 } else {
750 getActionDefinitionsBuilder(G_FFLOOR)
751 .legalFor({S32, S64})
752 .scalarize(0)
753 .clampScalar(0, S32, S64);
754 }
755 }
756
757 getActionDefinitionsBuilder(G_FPTRUNC)
758 .legalFor({{S32, S64}, {S16, S32}})
759 .scalarize(0)
760 .lower();
761
762 getActionDefinitionsBuilder(G_FPEXT)
763 .legalFor({{S64, S32}, {S32, S16}})
764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765 .scalarize(0);
766
767 getActionDefinitionsBuilder(G_FSUB)
768 // Use actual fsub instruction
769 .legalFor({S32})
770 // Must use fadd + fneg
771 .lowerFor({S64, S16, V2S16})
772 .scalarize(0)
773 .clampScalar(0, S32, S64);
774
775 // Whether this is legal depends on the floating point mode for the function.
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778 FMad.customFor({S32, S16});
779 else if (ST.hasMadMacF32Insts())
780 FMad.customFor({S32});
781 else if (ST.hasMadF16())
782 FMad.customFor({S16});
783 FMad.scalarize(0)
784 .lower();
785
786 auto &FRem = getActionDefinitionsBuilder(G_FREM);
787 if (ST.has16BitInsts()) {
788 FRem.customFor({S16, S32, S64});
789 } else {
790 FRem.minScalar(0, S32)
791 .customFor({S32, S64});
792 }
793 FRem.scalarize(0);
794
795 // TODO: Do we need to clamp maximum bitwidth?
796 getActionDefinitionsBuilder(G_TRUNC)
797 .legalIf(isScalar(0))
798 .legalFor({{V2S16, V2S32}})
799 .clampMaxNumElements(0, S16, 2)
800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801 // situations (like an invalid implicit use), we don't want to infinite loop
802 // in the legalizer.
803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804 .alwaysLegal();
805
806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808 {S32, S1}, {S64, S1}, {S16, S1}})
809 .scalarize(0)
810 .clampScalar(0, S32, S64)
811 .widenScalarToNextPow2(1, 32);
812
813 // TODO: Split s1->s64 during regbankselect for VALU.
814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816 .lowerIf(typeIs(1, S1))
817 .customFor({{S32, S64}, {S64, S64}});
818 if (ST.has16BitInsts())
819 IToFP.legalFor({{S16, S16}});
820 IToFP.clampScalar(1, S32, S64)
821 .minScalar(0, S32)
822 .scalarize(0)
823 .widenScalarToNextPow2(1);
824
825 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
826 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
827 .customFor({{S64, S32}, {S64, S64}})
828 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
829 if (ST.has16BitInsts())
830 FPToI.legalFor({{S16, S16}});
831 else
832 FPToI.minScalar(1, S32);
833
834 FPToI.minScalar(0, S32)
835 .widenScalarToNextPow2(0, 32)
836 .scalarize(0)
837 .lower();
838
839 // Lower roundeven into G_FRINT
840 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
841 .scalarize(0)
842 .lower();
843
844 if (ST.has16BitInsts()) {
845 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
846 .legalFor({S16, S32, S64})
847 .clampScalar(0, S16, S64)
848 .scalarize(0);
849 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
850 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
851 .legalFor({S32, S64})
852 .clampScalar(0, S32, S64)
853 .scalarize(0);
854 } else {
855 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
856 .legalFor({S32})
857 .customFor({S64})
858 .clampScalar(0, S32, S64)
859 .scalarize(0);
860 }
861
862 getActionDefinitionsBuilder(G_PTR_ADD)
863 .legalIf(all(isPointer(0), sameSize(0, 1)))
864 .scalarize(0)
865 .scalarSameSizeAs(1, 0);
866
867 getActionDefinitionsBuilder(G_PTRMASK)
868 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
869 .scalarSameSizeAs(1, 0)
870 .scalarize(0);
871
872 auto &CmpBuilder =
873 getActionDefinitionsBuilder(G_ICMP)
874 // The compare output type differs based on the register bank of the output,
875 // so make both s1 and s32 legal.
876 //
877 // Scalar compares producing output in scc will be promoted to s32, as that
878 // is the allocatable register type that will be needed for the copy from
879 // scc. This will be promoted during RegBankSelect, and we assume something
880 // before that won't try to use s32 result types.
881 //
882 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
883 // bank.
884 .legalForCartesianProduct(
885 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
886 .legalForCartesianProduct(
887 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
888 if (ST.has16BitInsts()) {
889 CmpBuilder.legalFor({{S1, S16}});
890 }
891
892 CmpBuilder
893 .widenScalarToNextPow2(1)
894 .clampScalar(1, S32, S64)
895 .scalarize(0)
896 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
897
898 getActionDefinitionsBuilder(G_FCMP)
899 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
900 .widenScalarToNextPow2(1)
901 .clampScalar(1, S32, S64)
902 .scalarize(0);
903
904 // FIXME: fpow has a selection pattern that should move to custom lowering.
905 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
906 if (ST.has16BitInsts())
907 Exp2Ops.legalFor({S32, S16});
908 else
909 Exp2Ops.legalFor({S32});
910 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
911 Exp2Ops.scalarize(0);
912
913 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
914 if (ST.has16BitInsts())
915 ExpOps.customFor({{S32}, {S16}});
916 else
917 ExpOps.customFor({S32});
918 ExpOps.clampScalar(0, MinScalarFPTy, S32)
919 .scalarize(0);
920
921 getActionDefinitionsBuilder(G_FPOWI)
922 .clampScalar(0, MinScalarFPTy, S32)
923 .lower();
924
925 // The 64-bit versions produce 32-bit results, but only on the SALU.
926 getActionDefinitionsBuilder(G_CTPOP)
927 .legalFor({{S32, S32}, {S32, S64}})
928 .clampScalar(0, S32, S32)
929 .clampScalar(1, S32, S64)
930 .scalarize(0)
931 .widenScalarToNextPow2(0, 32)
932 .widenScalarToNextPow2(1, 32);
933
934 // The hardware instructions return a different result on 0 than the generic
935 // instructions expect. The hardware produces -1, but these produce the
936 // bitwidth.
937 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
938 .scalarize(0)
939 .clampScalar(0, S32, S32)
940 .clampScalar(1, S32, S64)
941 .widenScalarToNextPow2(0, 32)
942 .widenScalarToNextPow2(1, 32)
943 .custom();
944
945 // The 64-bit versions produce 32-bit results, but only on the SALU.
946 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
947 .legalFor({{S32, S32}, {S32, S64}})
948 .clampScalar(0, S32, S32)
949 .clampScalar(1, S32, S64)
950 .scalarize(0)
951 .widenScalarToNextPow2(0, 32)
952 .widenScalarToNextPow2(1, 32);
953
954 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
955 // RegBankSelect.
956 getActionDefinitionsBuilder(G_BITREVERSE)
957 .legalFor({S32, S64})
958 .clampScalar(0, S32, S64)
959 .scalarize(0)
960 .widenScalarToNextPow2(0);
961
962 if (ST.has16BitInsts()) {
963 getActionDefinitionsBuilder(G_BSWAP)
964 .legalFor({S16, S32, V2S16})
965 .clampMaxNumElements(0, S16, 2)
966 // FIXME: Fixing non-power-of-2 before clamp is workaround for
967 // narrowScalar limitation.
968 .widenScalarToNextPow2(0)
969 .clampScalar(0, S16, S32)
970 .scalarize(0);
971
972 if (ST.hasVOP3PInsts()) {
973 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
974 .legalFor({S32, S16, V2S16})
975 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
976 .clampMaxNumElements(0, S16, 2)
977 .minScalar(0, S16)
978 .widenScalarToNextPow2(0)
979 .scalarize(0)
980 .lower();
981 } else {
982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983 .legalFor({S32, S16})
984 .widenScalarToNextPow2(0)
985 .minScalar(0, S16)
986 .scalarize(0)
987 .lower();
988 }
989 } else {
990 // TODO: Should have same legality without v_perm_b32
991 getActionDefinitionsBuilder(G_BSWAP)
992 .legalFor({S32})
993 .lowerIf(scalarNarrowerThan(0, 32))
994 // FIXME: Fixing non-power-of-2 before clamp is workaround for
995 // narrowScalar limitation.
996 .widenScalarToNextPow2(0)
997 .maxScalar(0, S32)
998 .scalarize(0)
999 .lower();
1000
1001 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1002 .legalFor({S32})
1003 .minScalar(0, S32)
1004 .widenScalarToNextPow2(0)
1005 .scalarize(0)
1006 .lower();
1007 }
1008
1009 getActionDefinitionsBuilder(G_INTTOPTR)
1010 // List the common cases
1011 .legalForCartesianProduct(AddrSpaces64, {S64})
1012 .legalForCartesianProduct(AddrSpaces32, {S32})
1013 .scalarize(0)
1014 // Accept any address space as long as the size matches
1015 .legalIf(sameSize(0, 1))
1016 .widenScalarIf(smallerThan(1, 0),
1017 [](const LegalityQuery &Query) {
1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019 })
1020 .narrowScalarIf(largerThan(1, 0),
1021 [](const LegalityQuery &Query) {
1022 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023 });
1024
1025 getActionDefinitionsBuilder(G_PTRTOINT)
1026 // List the common cases
1027 .legalForCartesianProduct(AddrSpaces64, {S64})
1028 .legalForCartesianProduct(AddrSpaces32, {S32})
1029 .scalarize(0)
1030 // Accept any address space as long as the size matches
1031 .legalIf(sameSize(0, 1))
1032 .widenScalarIf(smallerThan(0, 1),
1033 [](const LegalityQuery &Query) {
1034 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1035 })
1036 .narrowScalarIf(
1037 largerThan(0, 1),
1038 [](const LegalityQuery &Query) {
1039 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1040 });
1041
1042 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1043 .scalarize(0)
1044 .custom();
1045
1046 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1047 bool IsLoad) -> bool {
1048 const LLT DstTy = Query.Types[0];
1049
1050 // Split vector extloads.
1051 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1052 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1053
1054 if (MemSize < DstTy.getSizeInBits())
1055 MemSize = std::max(MemSize, AlignBits);
1056
1057 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058 return true;
1059
1060 const LLT PtrTy = Query.Types[1];
1061 unsigned AS = PtrTy.getAddressSpace();
1062 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063 return true;
1064
1065 // Catch weird sized loads that don't evenly divide into the access sizes
1066 // TODO: May be able to widen depending on alignment etc.
1067 unsigned NumRegs = (MemSize + 31) / 32;
1068 if (NumRegs == 3) {
1069 if (!ST.hasDwordx3LoadStores())
1070 return true;
1071 } else {
1072 // If the alignment allows, these should have been widened.
1073 if (!isPowerOf2_32(NumRegs))
1074 return true;
1075 }
1076
1077 if (AlignBits < MemSize) {
1078 const SITargetLowering *TLI = ST.getTargetLowering();
1079 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1080 Align(AlignBits / 8));
1081 }
1082
1083 return false;
1084 };
1085
1086 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1087 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1088 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1089
1090 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1091 // LDS
1092 // TODO: Unsupported flat for SI.
1093
1094 for (unsigned Op : {G_LOAD, G_STORE}) {
1095 const bool IsStore = Op == G_STORE;
1096
1097 auto &Actions = getActionDefinitionsBuilder(Op);
1098 // Explicitly list some common cases.
1099 // TODO: Does this help compile time at all?
1100 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1101 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1102 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1103 {S64, GlobalPtr, S64, GlobalAlign32},
1104 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1105 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1106 {S32, GlobalPtr, S8, GlobalAlign8},
1107 {S32, GlobalPtr, S16, GlobalAlign16},
1108
1109 {S32, LocalPtr, S32, 32},
1110 {S64, LocalPtr, S64, 32},
1111 {V2S32, LocalPtr, V2S32, 32},
1112 {S32, LocalPtr, S8, 8},
1113 {S32, LocalPtr, S16, 16},
1114 {V2S16, LocalPtr, S32, 32},
1115
1116 {S32, PrivatePtr, S32, 32},
1117 {S32, PrivatePtr, S8, 8},
1118 {S32, PrivatePtr, S16, 16},
1119 {V2S16, PrivatePtr, S32, 32},
1120
1121 {S32, ConstantPtr, S32, GlobalAlign32},
1122 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1123 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1124 {S64, ConstantPtr, S64, GlobalAlign32},
1125 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1126 Actions.legalIf(
1127 [=](const LegalityQuery &Query) -> bool {
1128 return isLoadStoreLegal(ST, Query);
1129 });
1130
1131 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1132 // 64-bits.
1133 //
1134 // TODO: Should generalize bitcast action into coerce, which will also cover
1135 // inserting addrspacecasts.
1136 Actions.customIf(typeIs(1, Constant32Ptr));
1137
1138 // Turn any illegal element vectors into something easier to deal
1139 // with. These will ultimately produce 32-bit scalar shifts to extract the
1140 // parts anyway.
1141 //
1142 // For odd 16-bit element vectors, prefer to split those into pieces with
1143 // 16-bit vector parts.
1144 Actions.bitcastIf(
1145 [=](const LegalityQuery &Query) -> bool {
1146 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1147 Query.MMODescrs[0].MemoryTy);
1148 }, bitcastToRegisterType(0));
1149
1150 if (!IsStore) {
1151 // Widen suitably aligned loads by loading extra bytes. The standard
1152 // legalization actions can't properly express widening memory operands.
1153 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1154 return shouldWidenLoad(ST, Query, G_LOAD);
1155 });
1156 }
1157
1158 // FIXME: load/store narrowing should be moved to lower action
1159 Actions
1160 .narrowScalarIf(
1161 [=](const LegalityQuery &Query) -> bool {
1162 return !Query.Types[0].isVector() &&
1163 needToSplitMemOp(Query, Op == G_LOAD);
1164 },
1165 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1166 const LLT DstTy = Query.Types[0];
1167 const LLT PtrTy = Query.Types[1];
1168
1169 const unsigned DstSize = DstTy.getSizeInBits();
1170 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1171
1172 // Split extloads.
1173 if (DstSize > MemSize)
1174 return std::make_pair(0, LLT::scalar(MemSize));
1175
1176 if (!isPowerOf2_32(DstSize)) {
1177 // We're probably decomposing an odd sized store. Try to split
1178 // to the widest type. TODO: Account for alignment. As-is it
1179 // should be OK, since the new parts will be further legalized.
1180 unsigned FloorSize = PowerOf2Floor(DstSize);
1181 return std::make_pair(0, LLT::scalar(FloorSize));
1182 }
1183
1184 if (DstSize > 32 && (DstSize % 32 != 0)) {
1185 // FIXME: Need a way to specify non-extload of larger size if
1186 // suitably aligned.
1187 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1188 }
1189
1190 unsigned MaxSize = maxSizeForAddrSpace(ST,
1191 PtrTy.getAddressSpace(),
1192 Op == G_LOAD);
1193 if (MemSize > MaxSize)
1194 return std::make_pair(0, LLT::scalar(MaxSize));
1195
1196 unsigned Align = Query.MMODescrs[0].AlignInBits;
1197 return std::make_pair(0, LLT::scalar(Align));
1198 })
1199 .fewerElementsIf(
1200 [=](const LegalityQuery &Query) -> bool {
1201 return Query.Types[0].isVector() &&
1202 needToSplitMemOp(Query, Op == G_LOAD);
1203 },
1204 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1205 const LLT DstTy = Query.Types[0];
1206 const LLT PtrTy = Query.Types[1];
1207
1208 LLT EltTy = DstTy.getElementType();
1209 unsigned MaxSize = maxSizeForAddrSpace(ST,
1210 PtrTy.getAddressSpace(),
1211 Op == G_LOAD);
1212
1213 // FIXME: Handle widened to power of 2 results better. This ends
1214 // up scalarizing.
1215 // FIXME: 3 element stores scalarized on SI
1216
1217 // Split if it's too large for the address space.
1218 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1219 if (MemSize > MaxSize) {
1220 unsigned NumElts = DstTy.getNumElements();
1221 unsigned EltSize = EltTy.getSizeInBits();
1222
1223 if (MaxSize % EltSize == 0) {
1224 return std::make_pair(
1225 0, LLT::scalarOrVector(
1226 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1227 }
1228
1229 unsigned NumPieces = MemSize / MaxSize;
1230
1231 // FIXME: Refine when odd breakdowns handled
1232 // The scalars will need to be re-legalized.
1233 if (NumPieces == 1 || NumPieces >= NumElts ||
1234 NumElts % NumPieces != 0)
1235 return std::make_pair(0, EltTy);
1236
1237 return std::make_pair(
1238 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1239 }
1240
1241 // FIXME: We could probably handle weird extending loads better.
1242 if (DstTy.getSizeInBits() > MemSize)
1243 return std::make_pair(0, EltTy);
1244
1245 unsigned EltSize = EltTy.getSizeInBits();
1246 unsigned DstSize = DstTy.getSizeInBits();
1247 if (!isPowerOf2_32(DstSize)) {
1248 // We're probably decomposing an odd sized store. Try to split
1249 // to the widest type. TODO: Account for alignment. As-is it
1250 // should be OK, since the new parts will be further legalized.
1251 unsigned FloorSize = PowerOf2Floor(DstSize);
1252 return std::make_pair(
1253 0, LLT::scalarOrVector(
1254 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1255 }
1256
1257 // Need to split because of alignment.
1258 unsigned Align = Query.MMODescrs[0].AlignInBits;
1259 if (EltSize > Align &&
1260 (EltSize / Align < DstTy.getNumElements())) {
1261 return std::make_pair(
1262 0, LLT::fixed_vector(EltSize / Align, EltTy));
1263 }
1264
1265 // May need relegalization for the scalars.
1266 return std::make_pair(0, EltTy);
1267 })
1268 .minScalar(0, S32)
1269 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1270 .widenScalarToNextPow2(0)
1271 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1272 .lower();
1273 }
1274
1275 // FIXME: Unaligned accesses not lowered.
1276 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1277 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1278 {S32, GlobalPtr, S16, 2 * 8},
1279 {S32, LocalPtr, S8, 8},
1280 {S32, LocalPtr, S16, 16},
1281 {S32, PrivatePtr, S8, 8},
1282 {S32, PrivatePtr, S16, 16},
1283 {S32, ConstantPtr, S8, 8},
1284 {S32, ConstantPtr, S16, 2 * 8}})
1285 .legalIf(
1286 [=](const LegalityQuery &Query) -> bool {
1287 return isLoadStoreLegal(ST, Query);
1288 });
1289
1290 if (ST.hasFlatAddressSpace()) {
1291 ExtLoads.legalForTypesWithMemDesc(
1292 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1293 }
1294
1295 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1296 // 64-bits.
1297 //
1298 // TODO: Should generalize bitcast action into coerce, which will also cover
1299 // inserting addrspacecasts.
1300 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1301
1302 ExtLoads.clampScalar(0, S32, S32)
1303 .widenScalarToNextPow2(0)
1304 .lower();
1305
1306 auto &Atomics = getActionDefinitionsBuilder(
1307 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1308 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1309 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1310 G_ATOMICRMW_UMIN})
1311 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1312 {S64, GlobalPtr}, {S64, LocalPtr},
1313 {S32, RegionPtr}, {S64, RegionPtr}});
1314 if (ST.hasFlatAddressSpace()) {
1315 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1316 }
1317
1318 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1319 if (ST.hasLDSFPAtomicAdd()) {
1320 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1321 if (ST.hasGFX90AInsts())
1322 Atomic.legalFor({{S64, LocalPtr}});
1323 }
1324 if (ST.hasAtomicFaddInsts())
1325 Atomic.legalFor({{S32, GlobalPtr}});
1326
1327 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1328 // demarshalling
1329 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1330 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1331 {S32, FlatPtr}, {S64, FlatPtr}})
1332 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1333 {S32, RegionPtr}, {S64, RegionPtr}});
1334 // TODO: Pointer types, any 32-bit or 64-bit vector
1335
1336 // Condition should be s32 for scalar, s1 for vector.
1337 getActionDefinitionsBuilder(G_SELECT)
1338 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1339 LocalPtr, FlatPtr, PrivatePtr,
1340 LLT::fixed_vector(2, LocalPtr),
1341 LLT::fixed_vector(2, PrivatePtr)},
1342 {S1, S32})
1343 .clampScalar(0, S16, S64)
1344 .scalarize(1)
1345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1346 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1347 .clampMaxNumElements(0, S32, 2)
1348 .clampMaxNumElements(0, LocalPtr, 2)
1349 .clampMaxNumElements(0, PrivatePtr, 2)
1350 .scalarize(0)
1351 .widenScalarToNextPow2(0)
1352 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1353
1354 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1355 // be more flexible with the shift amount type.
1356 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1357 .legalFor({{S32, S32}, {S64, S32}});
1358 if (ST.has16BitInsts()) {
1359 if (ST.hasVOP3PInsts()) {
1360 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1361 .clampMaxNumElements(0, S16, 2);
1362 } else
1363 Shifts.legalFor({{S16, S16}});
1364
1365 // TODO: Support 16-bit shift amounts for all types
1366 Shifts.widenScalarIf(
1367 [=](const LegalityQuery &Query) {
1368 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1369 // 32-bit amount.
1370 const LLT ValTy = Query.Types[0];
1371 const LLT AmountTy = Query.Types[1];
1372 return ValTy.getSizeInBits() <= 16 &&
1373 AmountTy.getSizeInBits() < 16;
1374 }, changeTo(1, S16));
1375 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1376 Shifts.clampScalar(1, S32, S32);
1377 Shifts.clampScalar(0, S16, S64);
1378 Shifts.widenScalarToNextPow2(0, 16);
1379
1380 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381 .minScalar(0, S16)
1382 .scalarize(0)
1383 .lower();
1384 } else {
1385 // Make sure we legalize the shift amount type first, as the general
1386 // expansion for the shifted type will produce much worse code if it hasn't
1387 // been truncated already.
1388 Shifts.clampScalar(1, S32, S32);
1389 Shifts.clampScalar(0, S32, S64);
1390 Shifts.widenScalarToNextPow2(0, 32);
1391
1392 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1393 .minScalar(0, S32)
1394 .scalarize(0)
1395 .lower();
1396 }
1397 Shifts.scalarize(0);
1398
1399 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1400 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1401 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1402 unsigned IdxTypeIdx = 2;
1403
1404 getActionDefinitionsBuilder(Op)
1405 .customIf([=](const LegalityQuery &Query) {
1406 const LLT EltTy = Query.Types[EltTypeIdx];
1407 const LLT VecTy = Query.Types[VecTypeIdx];
1408 const LLT IdxTy = Query.Types[IdxTypeIdx];
1409 const unsigned EltSize = EltTy.getSizeInBits();
1410 return (EltSize == 32 || EltSize == 64) &&
1411 VecTy.getSizeInBits() % 32 == 0 &&
1412 VecTy.getSizeInBits() <= MaxRegisterSize &&
1413 IdxTy.getSizeInBits() == 32;
1414 })
1415 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1416 bitcastToVectorElement32(VecTypeIdx))
1417 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1418 .bitcastIf(
1419 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1420 [=](const LegalityQuery &Query) {
1421 // For > 64-bit element types, try to turn this into a 64-bit
1422 // element vector since we may be able to do better indexing
1423 // if this is scalar. If not, fall back to 32.
1424 const LLT EltTy = Query.Types[EltTypeIdx];
1425 const LLT VecTy = Query.Types[VecTypeIdx];
1426 const unsigned DstEltSize = EltTy.getSizeInBits();
1427 const unsigned VecSize = VecTy.getSizeInBits();
1428
1429 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430 return std::make_pair(
1431 VecTypeIdx,
1432 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1433 })
1434 .clampScalar(EltTypeIdx, S32, S64)
1435 .clampScalar(VecTypeIdx, S32, S64)
1436 .clampScalar(IdxTypeIdx, S32, S32)
1437 .clampMaxNumElements(VecTypeIdx, S32, 32)
1438 // TODO: Clamp elements for 64-bit vectors?
1439 // It should only be necessary with variable indexes.
1440 // As a last resort, lower to the stack
1441 .lower();
1442 }
1443
1444 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445 .unsupportedIf([=](const LegalityQuery &Query) {
1446 const LLT &EltTy = Query.Types[1].getElementType();
1447 return Query.Types[0] != EltTy;
1448 });
1449
1450 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1453
1454 // FIXME: Doesn't handle extract of illegal sizes.
1455 getActionDefinitionsBuilder(Op)
1456 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457 // FIXME: Multiples of 16 should not be legal.
1458 .legalIf([=](const LegalityQuery &Query) {
1459 const LLT BigTy = Query.Types[BigTyIdx];
1460 const LLT LitTy = Query.Types[LitTyIdx];
1461 return (BigTy.getSizeInBits() % 32 == 0) &&
1462 (LitTy.getSizeInBits() % 16 == 0);
1463 })
1464 .widenScalarIf(
1465 [=](const LegalityQuery &Query) {
1466 const LLT BigTy = Query.Types[BigTyIdx];
1467 return (BigTy.getScalarSizeInBits() < 16);
1468 },
1469 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470 .widenScalarIf(
1471 [=](const LegalityQuery &Query) {
1472 const LLT LitTy = Query.Types[LitTyIdx];
1473 return (LitTy.getScalarSizeInBits() < 16);
1474 },
1475 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477 .widenScalarToNextPow2(BigTyIdx, 32);
1478
1479 }
1480
1481 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482 .legalForCartesianProduct(AllS32Vectors, {S32})
1483 .legalForCartesianProduct(AllS64Vectors, {S64})
1484 .clampNumElements(0, V16S32, V32S32)
1485 .clampNumElements(0, V2S64, V16S64)
1486 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1487
1488 if (ST.hasScalarPackInsts()) {
1489 BuildVector
1490 // FIXME: Should probably widen s1 vectors straight to s32
1491 .minScalarOrElt(0, S16)
1492 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493 .minScalar(1, S32);
1494
1495 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496 .legalFor({V2S16, S32})
1497 .lower();
1498 BuildVector.minScalarOrElt(0, S32);
1499 } else {
1500 BuildVector.customFor({V2S16, S16});
1501 BuildVector.minScalarOrElt(0, S32);
1502
1503 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504 .customFor({V2S16, S32})
1505 .lower();
1506 }
1507
1508 BuildVector.legalIf(isRegisterType(0));
1509
1510 // FIXME: Clamp maximum size
1511 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513 .clampMaxNumElements(0, S32, 32)
1514 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515 .clampMaxNumElements(0, S16, 64);
1516
1517 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1518 // pre-legalize.
1519 if (ST.hasVOP3PInsts()) {
1520 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1521 .customFor({V2S16, V2S16})
1522 .lower();
1523 } else
1524 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1525
1526 // Merge/Unmerge
1527 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1528 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1529 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1530
1531 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1532 const LLT Ty = Query.Types[TypeIdx];
1533 if (Ty.isVector()) {
1534 const LLT &EltTy = Ty.getElementType();
1535 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1536 return true;
1537 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1538 return true;
1539 }
1540 return false;
1541 };
1542
1543 auto &Builder = getActionDefinitionsBuilder(Op)
1544 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1545 .lowerFor({{S16, V2S16}})
1546 .lowerIf([=](const LegalityQuery &Query) {
1547 const LLT BigTy = Query.Types[BigTyIdx];
1548 return BigTy.getSizeInBits() == 32;
1549 })
1550 // Try to widen to s16 first for small types.
1551 // TODO: Only do this on targets with legal s16 shifts
1552 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1554 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1555 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1556 elementTypeIs(1, S16)),
1557 changeTo(1, V2S16))
1558 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1559 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1560 // valid.
1561 .clampScalar(LitTyIdx, S32, S512)
1562 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1563 // Break up vectors with weird elements into scalars
1564 .fewerElementsIf(
1565 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1566 scalarize(0))
1567 .fewerElementsIf(
1568 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1569 scalarize(1))
1570 .clampScalar(BigTyIdx, S32, MaxScalar);
1571
1572 if (Op == G_MERGE_VALUES) {
1573 Builder.widenScalarIf(
1574 // TODO: Use 16-bit shifts if legal for 8-bit values?
1575 [=](const LegalityQuery &Query) {
1576 const LLT Ty = Query.Types[LitTyIdx];
1577 return Ty.getSizeInBits() < 32;
1578 },
1579 changeTo(LitTyIdx, S32));
1580 }
1581
1582 Builder.widenScalarIf(
1583 [=](const LegalityQuery &Query) {
1584 const LLT Ty = Query.Types[BigTyIdx];
1585 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1586 Ty.getSizeInBits() % 16 != 0;
1587 },
1588 [=](const LegalityQuery &Query) {
1589 // Pick the next power of 2, or a multiple of 64 over 128.
1590 // Whichever is smaller.
1591 const LLT &Ty = Query.Types[BigTyIdx];
1592 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1593 if (NewSizeInBits >= 256) {
1594 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1595 if (RoundedTo < NewSizeInBits)
1596 NewSizeInBits = RoundedTo;
1597 }
1598 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1599 })
1600 // Any vectors left are the wrong size. Scalarize them.
1601 .scalarize(0)
1602 .scalarize(1);
1603 }
1604
1605 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1606 // RegBankSelect.
1607 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608 .legalFor({{S32}, {S64}});
1609
1610 if (ST.hasVOP3PInsts()) {
1611 SextInReg.lowerFor({{V2S16}})
1612 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613 // get more vector shift opportunities, since we'll get those when
1614 // expanded.
1615 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616 } else if (ST.has16BitInsts()) {
1617 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618 } else {
1619 // Prefer to promote to s32 before lowering if we don't have 16-bit
1620 // shifts. This avoid a lot of intermediate truncate and extend operations.
1621 SextInReg.lowerFor({{S32}, {S64}});
1622 }
1623
1624 SextInReg
1625 .scalarize(0)
1626 .clampScalar(0, S32, S64)
1627 .lower();
1628
1629 // TODO: Only Try to form v2s16 with legal packed instructions.
1630 getActionDefinitionsBuilder(G_FSHR)
1631 .legalFor({{S32, S32}})
1632 .lowerFor({{V2S16, V2S16}})
1633 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634 .scalarize(0)
1635 .lower();
1636
1637 if (ST.hasVOP3PInsts()) {
1638 getActionDefinitionsBuilder(G_FSHL)
1639 .lowerFor({{V2S16, V2S16}})
1640 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641 .scalarize(0)
1642 .lower();
1643 } else {
1644 getActionDefinitionsBuilder(G_FSHL)
1645 .scalarize(0)
1646 .lower();
1647 }
1648
1649 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650 .legalFor({S64});
1651
1652 getActionDefinitionsBuilder(G_FENCE)
1653 .alwaysLegal();
1654
1655 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656 .scalarize(0)
1657 .minScalar(0, S32)
1658 .lower();
1659
1660 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661 .legalFor({{S32, S32}, {S64, S32}})
1662 .clampScalar(1, S32, S32)
1663 .clampScalar(0, S32, S64)
1664 .widenScalarToNextPow2(0)
1665 .scalarize(0);
1666
1667 getActionDefinitionsBuilder({
1668 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669 G_FCOPYSIGN,
1670
1671 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672 G_ATOMICRMW_NAND,
1673 G_ATOMICRMW_FSUB,
1674 G_READ_REGISTER,
1675 G_WRITE_REGISTER,
1676
1677 G_SADDO, G_SSUBO,
1678
1679 // TODO: Implement
1680 G_FMINIMUM, G_FMAXIMUM}).lower();
1681
1682 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685 .unsupported();
1686
1687 getLegacyLegalizerInfo().computeTables();
1688 verify(*ST.getInstrInfo());
1689}
1690
// Entry point for all opcodes marked .custom()/.customFor()/.customIf() in the
// legalization rules built above: dispatches the instruction to the matching
// AMDGPU-specific legalize* helper. Returns true if the instruction was
// (or will be) legalized by the helper; returns false for any opcode that was
// not expected to reach custom legalization.
1691bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692 MachineInstr &MI) const {
// Pull the builder and register info out of the helper once; every legalize*
// callee takes these (or the helper itself) as parameters.
1693 MachineIRBuilder &B = Helper.MIRBuilder;
1694 MachineRegisterInfo &MRI = *B.getMRI();
1695
1696 switch (MI.getOpcode()) {
1697 case TargetOpcode::G_ADDRSPACE_CAST:
1698 return legalizeAddrSpaceCast(MI, MRI, B);
1699 case TargetOpcode::G_FRINT:
1700 return legalizeFrint(MI, MRI, B);
1701 case TargetOpcode::G_FCEIL:
1702 return legalizeFceil(MI, MRI, B);
1703 case TargetOpcode::G_FREM:
1704 return legalizeFrem(MI, MRI, B);
1705 case TargetOpcode::G_INTRINSIC_TRUNC:
1706 return legalizeIntrinsicTrunc(MI, MRI, B);
// Signed and unsigned int<->fp conversions share a helper; the trailing bool
// selects the variant (true for the signed opcode here and below — presumably
// a 'Signed' flag; confirm against the helper's declaration).
1707 case TargetOpcode::G_SITOFP:
1708 return legalizeITOFP(MI, MRI, B, true);
1709 case TargetOpcode::G_UITOFP:
1710 return legalizeITOFP(MI, MRI, B, false);
1711 case TargetOpcode::G_FPTOSI:
1712 return legalizeFPTOI(MI, MRI, B, true);
1713 case TargetOpcode::G_FPTOUI:
1714 return legalizeFPTOI(MI, MRI, B, false);
// All four min/max flavors funnel into one helper, which gets the full
// Helper (not just B/MRI) unlike most cases in this switch.
1715 case TargetOpcode::G_FMINNUM:
1716 case TargetOpcode::G_FMAXNUM:
1717 case TargetOpcode::G_FMINNUM_IEEE:
1718 case TargetOpcode::G_FMAXNUM_IEEE:
1719 return legalizeMinNumMaxNum(Helper, MI);
1720 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1721 return legalizeExtractVectorElt(MI, MRI, B);
1722 case TargetOpcode::G_INSERT_VECTOR_ELT:
1723 return legalizeInsertVectorElt(MI, MRI, B);
1724 case TargetOpcode::G_SHUFFLE_VECTOR:
1725 return legalizeShuffleVector(MI, MRI, B);
1726 case TargetOpcode::G_FSIN:
1727 case TargetOpcode::G_FCOS:
1728 return legalizeSinCos(MI, MRI, B);
1729 case TargetOpcode::G_GLOBAL_VALUE:
1730 return legalizeGlobalValue(MI, MRI, B);
// Plain and extending loads share one helper (also takes the full Helper).
1731 case TargetOpcode::G_LOAD:
1732 case TargetOpcode::G_SEXTLOAD:
1733 case TargetOpcode::G_ZEXTLOAD:
1734 return legalizeLoad(Helper, MI);
1735 case TargetOpcode::G_FMAD:
1736 return legalizeFMad(MI, MRI, B);
1737 case TargetOpcode::G_FDIV:
1738 return legalizeFDIV(MI, MRI, B);
// div/rem/divrem are grouped by signedness into two helpers.
1739 case TargetOpcode::G_UDIV:
1740 case TargetOpcode::G_UREM:
1741 case TargetOpcode::G_UDIVREM:
1742 return legalizeUnsignedDIV_REM(MI, MRI, B);
1743 case TargetOpcode::G_SDIV:
1744 case TargetOpcode::G_SREM:
1745 case TargetOpcode::G_SDIVREM:
1746 return legalizeSignedDIV_REM(MI, MRI, B);
1747 case TargetOpcode::G_ATOMIC_CMPXCHG:
1748 return legalizeAtomicCmpXChg(MI, MRI, B);
// log/log10 are both expanded via log2, scaled by a constant factor:
// log(x) = log2(x) * ln(2), log10(x) = log2(x) * (ln(2)/ln(10)).
1749 case TargetOpcode::G_FLOG:
1750 return legalizeFlog(MI, B, numbers::ln2f);
1751 case TargetOpcode::G_FLOG10:
1752 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1753 case TargetOpcode::G_FEXP:
1754 return legalizeFExp(MI, B);
1755 case TargetOpcode::G_FPOW:
1756 return legalizeFPow(MI, B);
1757 case TargetOpcode::G_FFLOOR:
1758 return legalizeFFloor(MI, MRI, B);
1759 case TargetOpcode::G_BUILD_VECTOR:
1760 return legalizeBuildVector(MI, MRI, B);
1761 case TargetOpcode::G_CTLZ:
1762 case TargetOpcode::G_CTTZ:
1763 return legalizeCTLZ_CTTZ(MI, MRI, B);
1764 default:
// Unexpected opcode: report "unable to legalize" rather than asserting.
1765 return false;
1766 }
1767
// Every case above returns, so this point is unreachable. (The doubled
// spelling is an artifact of the static-analyzer preprocessed view of the
// llvm_unreachable macro; in the real source this is a single
// llvm_unreachable statement.)
1768 llvm_unreachable("expected switch to return")__builtin_unreachable();
1769}
1770
// Compute the 32-bit aperture value (the high half of a 64-bit flat address)
// for the given segment address space (LOCAL or PRIVATE). Returns an invalid
// Register if the queue pointer input argument could not be materialized.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, bit offset and width-minus-one into the s_getreg
    // immediate encoding.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // Left-shift by the field width (WidthM1 + 1) to place the extracted base
    // in its final position in the 32-bit aperture value.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers: load the aperture from the queue descriptor
  // instead. This requires the QUEUE_PTR input argument to be available.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1829
// Lower G_ADDRSPACE_CAST between the AMDGPU address spaces. Handles no-op
// casts, 32-bit constant-address truncation/extension, and the flat <->
// local/private conversions which must map null in one space to null in the
// other.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts between address spaces with the same representation are just
  // bitcasts; mutate the instruction in place.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A null flat pointer must become the segment null value, so select
    // between the truncated pointer and segment null.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  // local/private -> flat: combine the 32-bit segment offset with the segment
  // aperture as the high half, mapping segment null to flat null.
  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1925
1926bool AMDGPULegalizerInfo::legalizeFrint(
1927 MachineInstr &MI, MachineRegisterInfo &MRI,
1928 MachineIRBuilder &B) const {
1929 Register Src = MI.getOperand(1).getReg();
1930 LLT Ty = MRI.getType(Src);
1931 assert(Ty.isScalar() && Ty.getSizeInBits() == 64)(static_cast<void> (0));
1932
1933 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1934 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1935
1936 auto C1 = B.buildFConstant(Ty, C1Val);
1937 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1938
1939 // TODO: Should this propagate fast-math-flags?
1940 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1941 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1942
1943 auto C2 = B.buildFConstant(Ty, C2Val);
1944 auto Fabs = B.buildFAbs(Ty, Src);
1945
1946 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1947 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1948 MI.eraseFromParent();
1949 return true;
1950}
1951
1952bool AMDGPULegalizerInfo::legalizeFceil(
1953 MachineInstr &MI, MachineRegisterInfo &MRI,
1954 MachineIRBuilder &B) const {
1955
1956 const LLT S1 = LLT::scalar(1);
1957 const LLT S64 = LLT::scalar(64);
1958
1959 Register Src = MI.getOperand(1).getReg();
1960 assert(MRI.getType(Src) == S64)(static_cast<void> (0));
1961
1962 // result = trunc(src)
1963 // if (src > 0.0 && src != result)
1964 // result += 1.0
1965
1966 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1967
1968 const auto Zero = B.buildFConstant(S64, 0.0);
1969 const auto One = B.buildFConstant(S64, 1.0);
1970 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1971 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1972 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1973 auto Add = B.buildSelect(S64, And, One, Zero);
1974
1975 // TODO: Should this propagate fast-math-flags?
1976 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1977 return true;
1978}
1979
1980bool AMDGPULegalizerInfo::legalizeFrem(
1981 MachineInstr &MI, MachineRegisterInfo &MRI,
1982 MachineIRBuilder &B) const {
1983 Register DstReg = MI.getOperand(0).getReg();
1984 Register Src0Reg = MI.getOperand(1).getReg();
1985 Register Src1Reg = MI.getOperand(2).getReg();
1986 auto Flags = MI.getFlags();
1987 LLT Ty = MRI.getType(DstReg);
1988
1989 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1990 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1991 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1992 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1993 MI.eraseFromParent();
1994 return true;
1995}
1996
// Extract the unbiased exponent from the high 32 bits of an IEEE double.
// Uses amdgcn.ubfe to pull the 11 exponent bits starting at bit 20 of the
// high word (bit 52 of the full value), then subtracts the bias (1023).
static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  // Bit offset of the exponent within the high word: 52 - 32 = 20.
  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
2013
// Expand s64 G_INTRINSIC_TRUNC by masking off the fraction bits that lie
// below 1.0 for the value's exponent, with special cases for exponents
// outside [0, 51].
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // ashr of the fraction mask by the exponent leaves set bits exactly where
  // the fractional part lives; clear those bits in the source.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  // Exponent < 0: |x| < 1.0, so the result is +/-0 (sign bit only).
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  // Exponent > 51: no fraction bits remain, x is already an integer.
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
2058
// Custom expansion of s64 G_SITOFP / G_UITOFP producing either an s64 or
// s32 floating point result.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    // i64 -> f64: convert the halves separately and combine as
    // ldexp(fp(hi), 32) + uitofp(lo). Only the high half carries the sign.
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
      .addUse(CvtHi.getReg(0))
      .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  // i64 -> f32: shift the value left so its significant bits sit in the high
  // word, convert that word, then scale the result back with ldexp.
  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    // Signed case: use the count of redundant sign bits (sffbh), capped so
    // the sign bit is preserved. OppositeSign is -1 when the halves have
    // differing sign bits, making the cap 31 instead of 32.
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  // Fold any nonzero low bits into bit 0 of the high word (umin(1, lo) is
  // 1 iff lo != 0) so rounding still observes them — a sticky bit.
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  // Undo the normalization shift: scale by 2^(32 - ShAmt).
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
    .addUse(FVal.getReg(0))
    .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}
2119
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
// Lower G_FPTOSI/G_FPTOUI from s32/s64 float to s64 integer by splitting the
// truncated value into high/low 32-bit halves via multiplication by 2^-32.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //  hif := floor(tf * 2^-32);
  //  lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  // fma(floor(tf * 2^-32), -2^32, tf) computes lof = tf - hif * 2^32.
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2188
2189bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2190 MachineInstr &MI) const {
2191 MachineFunction &MF = Helper.MIRBuilder.getMF();
2192 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2193
2194 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2195 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2196
2197 // With ieee_mode disabled, the instructions have the correct behavior
2198 // already for G_FMINNUM/G_FMAXNUM
2199 if (!MFI->getMode().IEEE)
2200 return !IsIEEEOp;
2201
2202 if (IsIEEEOp)
2203 return true;
2204
2205 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2206}
2207
2208bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2209 MachineInstr &MI, MachineRegisterInfo &MRI,
2210 MachineIRBuilder &B) const {
2211 // TODO: Should move some of this into LegalizerHelper.
2212
2213 // TODO: Promote dynamic indexing of s16 to s32
2214
2215 // FIXME: Artifact combiner probably should have replaced the truncated
2216 // constant before this, so we shouldn't need
2217 // getConstantVRegValWithLookThrough.
2218 Optional<ValueAndVReg> MaybeIdxVal =
2219 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2220 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2221 return true;
2222 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2223
2224 Register Dst = MI.getOperand(0).getReg();
2225 Register Vec = MI.getOperand(1).getReg();
2226
2227 LLT VecTy = MRI.getType(Vec);
2228 LLT EltTy = VecTy.getElementType();
2229 assert(EltTy == MRI.getType(Dst))(static_cast<void> (0));
2230
2231 if (IdxVal < VecTy.getNumElements())
2232 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2233 else
2234 B.buildUndef(Dst);
2235
2236 MI.eraseFromParent();
2237 return true;
2238}
2239
2240bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2241 MachineInstr &MI, MachineRegisterInfo &MRI,
2242 MachineIRBuilder &B) const {
2243 // TODO: Should move some of this into LegalizerHelper.
2244
2245 // TODO: Promote dynamic indexing of s16 to s32
2246
2247 // FIXME: Artifact combiner probably should have replaced the truncated
2248 // constant before this, so we shouldn't need
2249 // getConstantVRegValWithLookThrough.
2250 Optional<ValueAndVReg> MaybeIdxVal =
2251 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2252 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2253 return true;
2254
2255 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2256 Register Dst = MI.getOperand(0).getReg();
2257 Register Vec = MI.getOperand(1).getReg();
2258 Register Ins = MI.getOperand(2).getReg();
2259
2260 LLT VecTy = MRI.getType(Vec);
2261 LLT EltTy = VecTy.getElementType();
2262 assert(EltTy == MRI.getType(Ins))(static_cast<void> (0));
2263
2264 if (IdxVal < VecTy.getNumElements())
2265 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2266 else
2267 B.buildUndef(Dst);
2268
2269 MI.eraseFromParent();
2270 return true;
2271}
2272
2273bool AMDGPULegalizerInfo::legalizeShuffleVector(
2274 MachineInstr &MI, MachineRegisterInfo &MRI,
2275 MachineIRBuilder &B) const {
2276 const LLT V2S16 = LLT::fixed_vector(2, 16);
2277
2278 Register Dst = MI.getOperand(0).getReg();
2279 Register Src0 = MI.getOperand(1).getReg();
2280 LLT DstTy = MRI.getType(Dst);
2281 LLT SrcTy = MRI.getType(Src0);
2282
2283 if (SrcTy == V2S16 && DstTy == V2S16 &&
2284 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2285 return true;
2286
2287 MachineIRBuilder HelperBuilder(MI);
2288 GISelObserverWrapper DummyObserver;
2289 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2290 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2291}
2292
2293bool AMDGPULegalizerInfo::legalizeSinCos(
2294 MachineInstr &MI, MachineRegisterInfo &MRI,
2295 MachineIRBuilder &B) const {
2296
2297 Register DstReg = MI.getOperand(0).getReg();
2298 Register SrcReg = MI.getOperand(1).getReg();
2299 LLT Ty = MRI.getType(DstReg);
2300 unsigned Flags = MI.getFlags();
2301
2302 Register TrigVal;
2303 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2304 if (ST.hasTrigReducedRange()) {
2305 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2306 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2307 .addUse(MulVal.getReg(0))
2308 .setMIFlags(Flags).getReg(0);
2309 } else
2310 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2311
2312 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2313 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2314 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2315 .addUse(TrigVal)
2316 .setMIFlags(Flags);
2317 MI.eraseFromParent();
2318 return true;
2319}
2320
// Emit a SI_PC_ADD_REL_OFFSET-based pc-relative address for GV into DstReg,
// truncating to 32 bits when PtrTy is a 32-bit pointer. GAFlags selects the
// relocation variant (none / rel32 / gotpcrel32).
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // For a 32-bit destination, build the full 64-bit address in a scratch
  // register and extract the low half at the end.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    // GAFlags + 1 is the matching @hi relocation variant.
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
2380
// Lower G_GLOBAL_VALUE. LDS (local/region) globals become frame-relative
// constants allocated by the function info; other globals become pc-relative
// addresses, either direct or via a GOT load, depending on what the target
// lowering says the linker can resolve.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory which size is not known at the compile time. They will be
        // allocated by the runtime and placed directly after the static
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          // The dynamic region starts right after the statically-allocated
          // LDS, whose size groupstaticsize reports.
          auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      // Statically allocate the global within the kernel's LDS frame and use
      // the resulting constant offset as the address.
      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Symbol fully resolvable at link time: direct pc-relative fixup.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  // Symbol needs a REL32 relocation but no GOT indirection.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise go through the GOT: compute the GOT slot address pc-relatively
  // and load the final address from it.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2489
2490static LLT widenToNextPowerOf2(LLT Ty) {
2491 if (Ty.isVector())
2492 return Ty.changeElementCount(
2493 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2494 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2495}
2496
// Custom-legalize loads: rewrite 32-bit constant-address loads as casts to
// the 64-bit constant space, and widen non-power-of-2 G_LOADs up to the
// alignment when profitable.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Rewrite the pointer as a full 64-bit constant address; the addrspace
    // cast supplies the known high bits.
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, then truncate to the requested width.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        // The insert point is moved after MI so widenWithUnmerge's artifacts
        // land after the load, then restored to emit the load itself.
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2580
2581bool AMDGPULegalizerInfo::legalizeFMad(
2582 MachineInstr &MI, MachineRegisterInfo &MRI,
2583 MachineIRBuilder &B) const {
2584 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2585 assert(Ty.isScalar())(static_cast<void> (0));
2586
2587 MachineFunction &MF = B.getMF();
2588 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2589
2590 // TODO: Always legal with future ftz flag.
2591 // FIXME: Do we need just output?
2592 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2593 return true;
2594 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2595 return true;
2596
2597 MachineIRBuilder HelperBuilder(MI);
2598 GISelObserverWrapper DummyObserver;
2599 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2600 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2601}
2602
2603bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2604 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2605 Register DstReg = MI.getOperand(0).getReg();
2606 Register PtrReg = MI.getOperand(1).getReg();
2607 Register CmpVal = MI.getOperand(2).getReg();
2608 Register NewVal = MI.getOperand(3).getReg();
2609
2610 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&(static_cast<void> (0))
2611 "this should not have been custom lowered")(static_cast<void> (0));
2612
2613 LLT ValTy = MRI.getType(CmpVal);
2614 LLT VecTy = LLT::fixed_vector(2, ValTy);
2615
2616 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2617
2618 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2619 .addDef(DstReg)
2620 .addUse(PtrReg)
2621 .addUse(PackedVal)
2622 .setMemRefs(MI.memoperands());
2623
2624 MI.eraseFromParent();
2625 return true;
2626}
2627
2628bool AMDGPULegalizerInfo::legalizeFlog(
2629 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2630 Register Dst = MI.getOperand(0).getReg();
2631 Register Src = MI.getOperand(1).getReg();
2632 LLT Ty = B.getMRI()->getType(Dst);
2633 unsigned Flags = MI.getFlags();
2634
2635 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2636 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2637
2638 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2639 MI.eraseFromParent();
2640 return true;
2641}
2642
2643bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2644 MachineIRBuilder &B) const {
2645 Register Dst = MI.getOperand(0).getReg();
2646 Register Src = MI.getOperand(1).getReg();
2647 unsigned Flags = MI.getFlags();
2648 LLT Ty = B.getMRI()->getType(Dst);
2649
2650 auto K = B.buildFConstant(Ty, numbers::log2e);
2651 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2652 B.buildFExp2(Dst, Mul, Flags);
2653 MI.eraseFromParent();
2654 return true;
2655}
2656
2657bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2658 MachineIRBuilder &B) const {
2659 Register Dst = MI.getOperand(0).getReg();
2660 Register Src0 = MI.getOperand(1).getReg();
2661 Register Src1 = MI.getOperand(2).getReg();
2662 unsigned Flags = MI.getFlags();
2663 LLT Ty = B.getMRI()->getType(Dst);
2664 const LLT S16 = LLT::scalar(16);
2665 const LLT S32 = LLT::scalar(32);
2666
2667 if (Ty == S32) {
2668 auto Log = B.buildFLog2(S32, Src0, Flags);
2669 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2670 .addUse(Log.getReg(0))
2671 .addUse(Src1)
2672 .setMIFlags(Flags);
2673 B.buildFExp2(Dst, Mul, Flags);
2674 } else if (Ty == S16) {
2675 // There's no f16 fmul_legacy, so we need to convert for it.
2676 auto Log = B.buildFLog2(S16, Src0, Flags);
2677 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2678 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2679 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2680 .addUse(Ext0.getReg(0))
2681 .addUse(Ext1.getReg(0))
2682 .setMIFlags(Flags);
2683
2684 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2685 } else
2686 return false;
2687
2688 MI.eraseFromParent();
2689 return true;
2690}
2691
2692// Find a source register, ignoring any possible source modifiers.
2693static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2694 Register ModSrc = OrigSrc;
2695 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2696 ModSrc = SrcFNeg->getOperand(1).getReg();
2697 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2698 ModSrc = SrcFAbs->getOperand(1).getReg();
2699 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2700 ModSrc = SrcFAbs->getOperand(1).getReg();
2701 return ModSrc;
2702}
2703
2704bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2705 MachineRegisterInfo &MRI,
2706 MachineIRBuilder &B) const {
2707
2708 const LLT S1 = LLT::scalar(1);
2709 const LLT S64 = LLT::scalar(64);
2710 Register Dst = MI.getOperand(0).getReg();
2711 Register OrigSrc = MI.getOperand(1).getReg();
2712 unsigned Flags = MI.getFlags();
2713 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&(static_cast<void> (0))
2714 "this should not have been custom lowered")(static_cast<void> (0));
2715
2716 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2717 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2718 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2719 // V_FRACT bug is:
2720 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2721 //
2722 // Convert floor(x) to (x - fract(x))
2723
2724 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2725 .addUse(OrigSrc)
2726 .setMIFlags(Flags);
2727
2728 // Give source modifier matching some assistance before obscuring a foldable
2729 // pattern.
2730
2731 // TODO: We can avoid the neg on the fract? The input sign to fract
2732 // shouldn't matter?
2733 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2734
2735 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2736
2737 Register Min = MRI.createGenericVirtualRegister(S64);
2738
2739 // We don't need to concern ourselves with the snan handling difference, so
2740 // use the one which will directly select.
2741 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2742 if (MFI->getMode().IEEE)
2743 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2744 else
2745 B.buildFMinNum(Min, Fract, Const, Flags);
2746
2747 Register CorrectedFract = Min;
2748 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2749 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2750 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2751 }
2752
2753 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2754 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2755
2756 MI.eraseFromParent();
2757 return true;
2758}
2759
2760// Turn an illegal packed v2s16 build vector into bit operations.
2761// TODO: This should probably be a bitcast action in LegalizerHelper.
2762bool AMDGPULegalizerInfo::legalizeBuildVector(
2763 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2764 Register Dst = MI.getOperand(0).getReg();
2765 const LLT S32 = LLT::scalar(32);
2766 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16))(static_cast<void> (0));
2767
2768 Register Src0 = MI.getOperand(1).getReg();
2769 Register Src1 = MI.getOperand(2).getReg();
2770 assert(MRI.getType(Src0) == LLT::scalar(16))(static_cast<void> (0));
2771
2772 auto Merge = B.buildMerge(S32, {Src0, Src1});
2773 B.buildBitcast(Dst, Merge);
2774
2775 MI.eraseFromParent();
2776 return true;
2777}
2778
2779// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2780// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2781// case with a single min instruction instead of a compare+select.
2782bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2783 MachineRegisterInfo &MRI,
2784 MachineIRBuilder &B) const {
2785 Register Dst = MI.getOperand(0).getReg();
2786 Register Src = MI.getOperand(1).getReg();
2787 LLT DstTy = MRI.getType(Dst);
2788 LLT SrcTy = MRI.getType(Src);
2789
2790 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2791 ? AMDGPU::G_AMDGPU_FFBH_U32
2792 : AMDGPU::G_AMDGPU_FFBL_B32;
2793 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2794 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2795
2796 MI.eraseFromParent();
2797 return true;
2798}
2799
2800// Check that this is a G_XOR x, -1
2801static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2802 if (MI.getOpcode() != TargetOpcode::G_XOR)
2803 return false;
2804 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2805 return ConstVal && *ConstVal == -1;
2806}
2807
2808// Return the use branch instruction, otherwise null if the usage is invalid.
2809static MachineInstr *
2810verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2811 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2812 Register CondDef = MI.getOperand(0).getReg();
2813 if (!MRI.hasOneNonDBGUse(CondDef))
2814 return nullptr;
2815
2816 MachineBasicBlock *Parent = MI.getParent();
2817 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2818
2819 if (isNot(MRI, *UseMI)) {
2820 Register NegatedCond = UseMI->getOperand(0).getReg();
2821 if (!MRI.hasOneNonDBGUse(NegatedCond))
2822 return nullptr;
2823
2824 // We're deleting the def of this value, so we need to remove it.
2825 UseMI->eraseFromParent();
2826
2827 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2828 Negated = true;
2829 }
2830
2831 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2832 return nullptr;
2833
2834 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2835 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2836 if (Next == Parent->end()) {
2837 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2838 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2839 return nullptr;
2840 UncondBrTarget = &*NextMBB;
2841 } else {
2842 if (Next->getOpcode() != AMDGPU::G_BR)
2843 return nullptr;
2844 Br = &*Next;
2845 UncondBrTarget = Br->getOperand(0).getMBB();
2846 }
2847
2848 return UseMI;
2849}
2850
2851bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2852 const ArgDescriptor *Arg,
2853 const TargetRegisterClass *ArgRC,
2854 LLT ArgTy) const {
2855 MCRegister SrcReg = Arg->getRegister();
2856 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected")(static_cast<void> (0));
2857 assert(DstReg.isVirtual() && "Virtual register expected")(static_cast<void> (0));
2858
2859 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2860 ArgTy);
2861 if (Arg->isMasked()) {
7
Calling 'ArgDescriptor::isMasked'
10
Returning from 'ArgDescriptor::isMasked'
11
Taking true branch
2862 // TODO: Should we try to emit this once in the entry block?
2863 const LLT S32 = LLT::scalar(32);
2864 const unsigned Mask = Arg->getMask();
2865 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
12
Calling 'countTrailingZeros<unsigned int>'
19
Returning from 'countTrailingZeros<unsigned int>'
20
'Shift' initialized to 32
2866
2867 Register AndMaskSrc = LiveIn;
2868
2869 if (Shift
20.1
'Shift' is not equal to 0
20.1
'Shift' is not equal to 0
20.1
'Shift' is not equal to 0
!= 0) {
21
Taking true branch
2870 auto ShiftAmt = B.buildConstant(S32, Shift);
2871 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2872 }
2873
2874 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
22
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
2875 } else {
2876 B.buildCopy(DstReg, LiveIn);
2877 }
2878
2879 return true;
2880}
2881
2882bool AMDGPULegalizerInfo::loadInputValue(
2883 Register DstReg, MachineIRBuilder &B,
2884 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2885 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2886 const ArgDescriptor *Arg;
2887 const TargetRegisterClass *ArgRC;
2888 LLT ArgTy;
2889 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2890
2891 if (!Arg) {
4
Assuming 'Arg' is non-null
2892 assert(ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)(static_cast<void> (0));
2893 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2894 // case the pointer argument may be missing and we use null.
2895 B.buildConstant(DstReg, 0);
2896 return true;
2897 }
2898
2899 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5
Taking false branch
2900 return false; // TODO: Handle these
2901 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
6
Calling 'AMDGPULegalizerInfo::loadInputValue'
2902}
2903
2904bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2905 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2906 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2907 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3
Calling 'AMDGPULegalizerInfo::loadInputValue'
2908 return false;
2909
2910 MI.eraseFromParent();
2911 return true;
2912}
2913
2914bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2915 MachineRegisterInfo &MRI,
2916 MachineIRBuilder &B) const {
2917 Register Dst = MI.getOperand(0).getReg();
2918 LLT DstTy = MRI.getType(Dst);
2919 LLT S16 = LLT::scalar(16);
2920 LLT S32 = LLT::scalar(32);
2921 LLT S64 = LLT::scalar(64);
2922
2923 if (DstTy == S16)
2924 return legalizeFDIV16(MI, MRI, B);
2925 if (DstTy == S32)
2926 return legalizeFDIV32(MI, MRI, B);
2927 if (DstTy == S64)
2928 return legalizeFDIV64(MI, MRI, B);
2929
2930 return false;
2931}
2932
2933void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2934 Register DstDivReg,
2935 Register DstRemReg,
2936 Register X,
2937 Register Y) const {
2938 const LLT S1 = LLT::scalar(1);
2939 const LLT S32 = LLT::scalar(32);
2940
2941 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2942 // algorithm used here.
2943
2944 // Initial estimate of inv(y).
2945 auto FloatY = B.buildUITOFP(S32, Y);
2946 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2947 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2948 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2949 auto Z = B.buildFPTOUI(S32, ScaledY);
2950
2951 // One round of UNR.
2952 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2953 auto NegYZ = B.buildMul(S32, NegY, Z);
2954 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2955
2956 // Quotient/remainder estimate.
2957 auto Q = B.buildUMulH(S32, X, Z);
2958 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2959
2960 // First quotient/remainder refinement.
2961 auto One = B.buildConstant(S32, 1);
2962 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2963 if (DstDivReg)
2964 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2965 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2966
2967 // Second quotient/remainder refinement.
2968 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2969 if (DstDivReg)
2970 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2971
2972 if (DstRemReg)
2973 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2974}
2975
2976// Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2977//
2978// Return lo, hi of result
2979//
2980// %cvt.lo = G_UITOFP Val.lo
2981// %cvt.hi = G_UITOFP Val.hi
2982// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2983// %rcp = G_AMDGPU_RCP_IFLAG %mad
2984// %mul1 = G_FMUL %rcp, 0x5f7ffffc
2985// %mul2 = G_FMUL %mul1, 2**(-32)
2986// %trunc = G_INTRINSIC_TRUNC %mul2
2987// %mad2 = G_FMAD %trunc, -(2**32), %mul1
2988// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2989static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2990 Register Val) {
2991 const LLT S32 = LLT::scalar(32);
2992 auto Unmerge = B.buildUnmerge(S32, Val);
2993
2994 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2995 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2996
2997 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2998 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2999
3000 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3001 auto Mul1 =
3002 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3003
3004 // 2**(-32)
3005 auto Mul2 =
3006 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3007 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3008
3009 // -(2**32)
3010 auto Mad2 = B.buildFMAD(S32, Trunc,
3011 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3012
3013 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3014 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3015
3016 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3017}
3018
3019void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3020 Register DstDivReg,
3021 Register DstRemReg,
3022 Register Numer,
3023 Register Denom) const {
3024 const LLT S32 = LLT::scalar(32);
3025 const LLT S64 = LLT::scalar(64);
3026 const LLT S1 = LLT::scalar(1);
3027 Register RcpLo, RcpHi;
3028
3029 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3030
3031 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3032
3033 auto Zero64 = B.buildConstant(S64, 0);
3034 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3035
3036 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3037 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3038
3039 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3040 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3041 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3042
3043 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3044 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3045 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3046 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3047
3048 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3049 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3050 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3051 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3052 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3053
3054 auto Zero32 = B.buildConstant(S32, 0);
3055 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3056 auto Add2_HiC =
3057 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3058 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3059 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3060
3061 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3062 Register NumerLo = UnmergeNumer.getReg(0);
3063 Register NumerHi = UnmergeNumer.getReg(1);
3064
3065 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3066 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3067 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3068 Register Mul3_Lo = UnmergeMul3.getReg(0);
3069 Register Mul3_Hi = UnmergeMul3.getReg(1);
3070 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3071 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3072 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3073 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3074
3075 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3076 Register DenomLo = UnmergeDenom.getReg(0);
3077 Register DenomHi = UnmergeDenom.getReg(1);
3078
3079 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3080 auto C1 = B.buildSExt(S32, CmpHi);
3081
3082 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3083 auto C2 = B.buildSExt(S32, CmpLo);
3084
3085 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3086 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3087
3088 // TODO: Here and below portions of the code can be enclosed into if/endif.
3089 // Currently control flow is unconditional and we have 4 selects after
3090 // potential endif to substitute PHIs.
3091
3092 // if C3 != 0 ...
3093 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3094 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3095 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3096 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3097
3098 auto One64 = B.buildConstant(S64, 1);
3099 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3100
3101 auto C4 =
3102 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3103 auto C5 =
3104 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3105 auto C6 = B.buildSelect(
3106 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3107
3108 // if (C6 != 0)
3109 auto Add4 = B.buildAdd(S64, Add3, One64);
3110 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3111
3112 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3113 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3114 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3115
3116 // endif C6
3117 // endif C3
3118
3119 if (DstDivReg) {
3120 auto Sel1 = B.buildSelect(
3121 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3122 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3123 Sel1, MulHi3);
3124 }
3125
3126 if (DstRemReg) {
3127 auto Sel2 = B.buildSelect(
3128 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3129 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3130 Sel2, Sub1);
3131 }
3132}
3133
3134bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3135 MachineRegisterInfo &MRI,
3136 MachineIRBuilder &B) const {
3137 Register DstDivReg, DstRemReg;
3138 switch (MI.getOpcode()) {
3139 default:
3140 llvm_unreachable("Unexpected opcode!")__builtin_unreachable();
3141 case AMDGPU::G_UDIV: {
3142 DstDivReg = MI.getOperand(0).getReg();
3143 break;
3144 }
3145 case AMDGPU::G_UREM: {
3146 DstRemReg = MI.getOperand(0).getReg();
3147 break;
3148 }
3149 case AMDGPU::G_UDIVREM: {
3150 DstDivReg = MI.getOperand(0).getReg();
3151 DstRemReg = MI.getOperand(1).getReg();
3152 break;
3153 }
3154 }
3155
3156 const LLT S64 = LLT::scalar(64);
3157 const LLT S32 = LLT::scalar(32);
3158 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3159 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3160 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3161 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3162
3163 if (Ty == S32)
3164 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3165 else if (Ty == S64)
3166 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3167 else
3168 return false;
3169
3170 MI.eraseFromParent();
3171 return true;
3172}
3173
3174bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3175 MachineRegisterInfo &MRI,
3176 MachineIRBuilder &B) const {
3177 const LLT S64 = LLT::scalar(64);
3178 const LLT S32 = LLT::scalar(32);
3179
3180 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3181 if (Ty != S32 && Ty != S64)
3182 return false;
3183
3184 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3185 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3186 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3187
3188 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3189 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3190 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3191
3192 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3193 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3194
3195 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3196 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3197
3198 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3199 switch (MI.getOpcode()) {
3200 default:
3201 llvm_unreachable("Unexpected opcode!")__builtin_unreachable();
3202 case AMDGPU::G_SDIV: {
3203 DstDivReg = MI.getOperand(0).getReg();
3204 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3205 break;
3206 }
3207 case AMDGPU::G_SREM: {
3208 DstRemReg = MI.getOperand(0).getReg();
3209 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3210 break;
3211 }
3212 case AMDGPU::G_SDIVREM: {
3213 DstDivReg = MI.getOperand(0).getReg();
3214 DstRemReg = MI.getOperand(1).getReg();
3215 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3216 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3217 break;
3218 }
3219 }
3220
3221 if (Ty == S32)
3222 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3223 else
3224 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3225
3226 if (DstDivReg) {
3227 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3228 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3229 B.buildSub(DstDivReg, SignXor, Sign);
3230 }
3231
3232 if (DstRemReg) {
3233 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3234 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3235 B.buildSub(DstRemReg, SignXor, Sign);
3236 }
3237
3238 MI.eraseFromParent();
3239 return true;
3240}
3241
3242bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3243 MachineRegisterInfo &MRI,
3244 MachineIRBuilder &B) const {
3245 Register Res = MI.getOperand(0).getReg();
3246 Register LHS = MI.getOperand(1).getReg();
3247 Register RHS = MI.getOperand(2).getReg();
3248 uint16_t Flags = MI.getFlags();
3249 LLT ResTy = MRI.getType(Res);
3250
3251 const MachineFunction &MF = B.getMF();
3252 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3253 MI.getFlag(MachineInstr::FmAfn);
3254
3255 if (!AllowInaccurateRcp)
3256 return false;
3257
3258 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3259 // 1 / x -> RCP(x)
3260 if (CLHS->isExactlyValue(1.0)) {
3261 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3262 .addUse(RHS)
3263 .setMIFlags(Flags);
3264
3265 MI.eraseFromParent();
3266 return true;
3267 }
3268
3269 // -1 / x -> RCP( FNEG(x) )
3270 if (CLHS->isExactlyValue(-1.0)) {
3271 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3272 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3273 .addUse(FNeg.getReg(0))
3274 .setMIFlags(Flags);
3275
3276 MI.eraseFromParent();
3277 return true;
3278 }
3279 }
3280
3281 // x / y -> x * (1.0 / y)
3282 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3283 .addUse(RHS)
3284 .setMIFlags(Flags);
3285 B.buildFMul(Res, LHS, RCP, Flags);
3286
3287 MI.eraseFromParent();
3288 return true;
3289}
3290
3291bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3292 MachineRegisterInfo &MRI,
3293 MachineIRBuilder &B) const {
3294 Register Res = MI.getOperand(0).getReg();
3295 Register X = MI.getOperand(1).getReg();
3296 Register Y = MI.getOperand(2).getReg();
3297 uint16_t Flags = MI.getFlags();
3298 LLT ResTy = MRI.getType(Res);
3299
3300 const MachineFunction &MF = B.getMF();
3301 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3302 MI.getFlag(MachineInstr::FmAfn);
3303
3304 if (!AllowInaccurateRcp)
3305 return false;
3306
3307 auto NegY = B.buildFNeg(ResTy, Y);
3308 auto One = B.buildFConstant(ResTy, 1.0);
3309
3310 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3311 .addUse(Y)
3312 .setMIFlags(Flags);
3313
3314 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3315 R = B.buildFMA(ResTy, Tmp0, R, R);
3316
3317 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3318 R = B.buildFMA(ResTy, Tmp1, R, R);
3319
3320 auto Ret = B.buildFMul(ResTy, X, R);
3321 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3322
3323 B.buildFMA(Res, Tmp2, R, Ret);
3324 MI.eraseFromParent();
3325 return true;
3326}
3327
3328bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3329 MachineRegisterInfo &MRI,
3330 MachineIRBuilder &B) const {
3331 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3332 return true;
3333
3334 Register Res = MI.getOperand(0).getReg();
3335 Register LHS = MI.getOperand(1).getReg();
3336 Register RHS = MI.getOperand(2).getReg();
3337
3338 uint16_t Flags = MI.getFlags();
3339
3340 LLT S16 = LLT::scalar(16);
3341 LLT S32 = LLT::scalar(32);
3342
3343 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3344 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3345
3346 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3347 .addUse(RHSExt.getReg(0))
3348 .setMIFlags(Flags);
3349
3350 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3351 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3352
3353 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3354 .addUse(RDst.getReg(0))
3355 .addUse(RHS)
3356 .addUse(LHS)
3357 .setMIFlags(Flags);
3358
3359 MI.eraseFromParent();
3360 return true;
3361}
3362
3363// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3364// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3365static void toggleSPDenormMode(bool Enable,
3366 MachineIRBuilder &B,
3367 const GCNSubtarget &ST,
3368 AMDGPU::SIModeRegisterDefaults Mode) {
3369 // Set SP denorm mode to this value.
3370 unsigned SPDenormMode =
3371 Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();
3372
3373 if (ST.hasDenormModeInst()) {
3374 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3375 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3376
3377 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3378 B.buildInstr(AMDGPU::S_DENORM_MODE)
3379 .addImm(NewDenormModeValue);
3380
3381 } else {
3382 // Select FP32 bit field in mode register.
3383 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3384 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3385 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3386
3387 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3388 .addImm(SPDenormMode)
3389 .addImm(SPDenormModeBitField);
3390 }
3391}
3392
3393bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3394 MachineRegisterInfo &MRI,
3395 MachineIRBuilder &B) const {
3396 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3397 return true;
3398
3399 Register Res = MI.getOperand(0).getReg();
3400 Register LHS = MI.getOperand(1).getReg();
3401 Register RHS = MI.getOperand(2).getReg();
3402 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3403 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3404
3405 uint16_t Flags = MI.getFlags();
3406
3407 LLT S32 = LLT::scalar(32);
3408 LLT S1 = LLT::scalar(1);
3409
3410 auto One = B.buildFConstant(S32, 1.0f);
3411
3412 auto DenominatorScaled =
3413 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3414 .addUse(LHS)
3415 .addUse(RHS)
3416 .addImm(0)
3417 .setMIFlags(Flags);
3418 auto NumeratorScaled =
3419 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3420 .addUse(LHS)
3421 .addUse(RHS)
3422 .addImm(1)
3423 .setMIFlags(Flags);
3424
3425 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3426 .addUse(DenominatorScaled.getReg(0))
3427 .setMIFlags(Flags);
3428 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3429
3430 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3431 // aren't modeled as reading it.
3432 if (!Mode.allFP32Denormals())
3433 toggleSPDenormMode(true, B, ST, Mode);
3434
3435 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3436 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3437 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3438 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3439 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3440 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3441
3442 if (!Mode.allFP32Denormals())
3443 toggleSPDenormMode(false, B, ST, Mode);
3444
3445 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3446 .addUse(Fma4.getReg(0))
3447 .addUse(Fma1.getReg(0))
3448 .addUse(Fma3.getReg(0))
3449 .addUse(NumeratorScaled.getReg(1))
3450 .setMIFlags(Flags);
3451
3452 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3453 .addUse(Fmas.getReg(0))
3454 .addUse(RHS)
3455 .addUse(LHS)
3456 .setMIFlags(Flags);
3457
3458 MI.eraseFromParent();
3459 return true;
3460}
3461
// Expand a 64-bit G_FDIV into the full-precision sequence built from
// amdgcn.div.scale / amdgcn.rcp / amdgcn.div.fmas / amdgcn.div.fixup with
// fused multiply-add refinement steps. Used when the fast, unsafe expansion
// does not apply.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheaper expansion when FP math flags permit it.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Scale the denominator (operand selector 0) into a range where the
  // hardware reciprocal is accurate.
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // FMA-based refinement of the approximate reciprocal.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Scale the numerator (operand selector 1) to match.
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
    // Reconstruct the predicate by comparing the high 32 bits of the inputs
    // against the high 32 bits of the div_scale results.

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // Final fixup undoes the scaling and handles special values.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3542
3543bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3544 MachineRegisterInfo &MRI,
3545 MachineIRBuilder &B) const {
3546 Register Res = MI.getOperand(0).getReg();
3547 Register LHS = MI.getOperand(2).getReg();
3548 Register RHS = MI.getOperand(3).getReg();
3549 uint16_t Flags = MI.getFlags();
3550
3551 LLT S32 = LLT::scalar(32);
3552 LLT S1 = LLT::scalar(1);
3553
3554 auto Abs = B.buildFAbs(S32, RHS, Flags);
3555 const APFloat C0Val(1.0f);
3556
3557 auto C0 = B.buildConstant(S32, 0x6f800000);
3558 auto C1 = B.buildConstant(S32, 0x2f800000);
3559 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3560
3561 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3562 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3563
3564 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3565
3566 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3567 .addUse(Mul0.getReg(0))
3568 .setMIFlags(Flags);
3569
3570 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3571
3572 B.buildFMul(Res, Sel, Mul1, Flags);
3573
3574 MI.eraseFromParent();
3575 return true;
3576}
3577
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Pre-VI subtargets keep the intrinsic as-is (no expansion performed).
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  // Only f32 and f64 results are handled.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
    .addUse(Src)
    .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp the rsq result into [-max_float, +max_float] with min then max.
  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
3626
3627static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3628 switch (IID) {
3629 case Intrinsic::amdgcn_ds_fadd:
3630 return AMDGPU::G_ATOMICRMW_FADD;
3631 case Intrinsic::amdgcn_ds_fmin:
3632 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3633 case Intrinsic::amdgcn_ds_fmax:
3634 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3635 default:
3636 llvm_unreachable("not a DS FP intrinsic")__builtin_unreachable();
3637 }
3638}
3639
3640bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3641 MachineInstr &MI,
3642 Intrinsic::ID IID) const {
3643 GISelChangeObserver &Observer = Helper.Observer;
3644 Observer.changingInstr(MI);
3645
3646 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3647
3648 // The remaining operands were used to set fields in the MemOperand on
3649 // construction.
3650 for (int I = 6; I > 3; --I)
3651 MI.RemoveOperand(I);
3652
3653 MI.RemoveOperand(1); // Remove the intrinsic ID.
3654 Observer.changedInstr(MI);
3655 return true;
3656}
3657
3658bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3659 MachineRegisterInfo &MRI,
3660 MachineIRBuilder &B) const {
3661 uint64_t Offset =
3662 ST.getTargetLowering()->getImplicitParameterOffset(
3663 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3664 LLT DstTy = MRI.getType(DstReg);
3665 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3666
3667 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3668 if (!loadInputValue(KernargPtrReg, B,
3669 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3670 return false;
3671
3672 // FIXME: This should be nuw
3673 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3674 return true;
3675}
3676
3677bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3678 MachineRegisterInfo &MRI,
3679 MachineIRBuilder &B) const {
3680 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3681 if (!MFI->isEntryFunction()) {
3682 return legalizePreloadedArgIntrin(MI, MRI, B,
3683 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3684 }
3685
3686 Register DstReg = MI.getOperand(0).getReg();
3687 if (!getImplicitArgPtr(DstReg, MRI, B))
3688 return false;
3689
3690 MI.eraseFromParent();
3691 return true;
3692}
3693
3694bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3695 MachineRegisterInfo &MRI,
3696 MachineIRBuilder &B,
3697 unsigned AddrSpace) const {
3698 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3699 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3700 Register Hi32 = Unmerge.getReg(1);
3701
3702 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3703 MI.eraseFromParent();
3704 return true;
3705}
3706
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value representable in the 12-bit immoffset field (2^12 - 1).
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // Peel any known-constant addend off the offset computation.
  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Any overflow goes back into the register (voffset) part.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // A register for voffset is always returned; default to zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}
3757
3758/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3759void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3760 Register VOffset, Register SOffset,
3761 unsigned ImmOffset, Register VIndex,
3762 MachineRegisterInfo &MRI) const {
3763 Optional<ValueAndVReg> MaybeVOffsetVal =
3764 getConstantVRegValWithLookThrough(VOffset, MRI);
3765 Optional<ValueAndVReg> MaybeSOffsetVal =
3766 getConstantVRegValWithLookThrough(SOffset, MRI);
3767 Optional<ValueAndVReg> MaybeVIndexVal =
3768 getConstantVRegValWithLookThrough(VIndex, MRI);
3769 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3770 // update the MMO with that offset. The stride is unknown so we can only do
3771 // this if VIndex is constant 0.
3772 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3773 MaybeVIndexVal->Value == 0) {
3774 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3775 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3776 MMO->setOffset(TotalOffset);
3777 } else {
3778 // We don't have a constant combined offset to use in the MMO. Give up.
3779 MMO->setValue((Value *)nullptr);
3780 }
3781}
3782
/// Handle register layout difference for f16 images for some subtargets.
/// Returns a register holding \p Reg repacked into the layout the subtarget
/// expects for 16-bit image/buffer data.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    // Unpacked layout: each s16 element is any-extended into its own s32
    // register (low 16 bits hold the value).
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // Bug workaround: pad the packed data out to whole dwords with undef.
    if (StoreVT.getNumElements() == 2) {
      // <2 x s16> fits in one dword; pad with an undef dword.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      // <3 x s16> -> pad with undef s16s to <6 x s16>, view as <3 x s32>.
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      // <4 x s16> occupies two dwords; pad with two undef dwords.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // Packed layout and no workaround needed: value can be used directly.
  return Reg;
}
3842
3843Register AMDGPULegalizerInfo::fixStoreSourceType(
3844 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3845 MachineRegisterInfo *MRI = B.getMRI();
3846 LLT Ty = MRI->getType(VData);
3847
3848 const LLT S16 = LLT::scalar(16);
3849
3850 // Fixup illegal register types for i8 stores.
3851 if (Ty == LLT::scalar(8) || Ty == S16) {
3852 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3853 return AnyExt;
3854 }
3855
3856 if (Ty.isVector()) {
3857 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3858 if (IsFormat)
3859 return handleD16VData(B, *MRI, VData);
3860 }
3861 }
3862
3863 return VData;
3864}
3865
// Legalize a raw/struct buffer store intrinsic into a
// G_AMDGPU_BUFFER_STORE* pseudo: fix up illegal source types, normalize the
// raw/struct operand layouts, and split the offset into register and
// immediate parts.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants behave like struct variants with a zero vindex.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the bounds-checked offset into voffset register + immediate field,
  // and reflect the constant parts (if any) in the memory operand.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  // Select the pseudo; plain (non-format, non-typed) stores pick the
  // byte/short variant from the memory access size.
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3952
// Legalize a raw/struct buffer load intrinsic into a G_AMDGPU_BUFFER_LOAD*
// pseudo. Extending loads and unpacked-D16 results are loaded into a wider
// temporary register and converted back to the requested type afterwards.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants behave like struct variants with a zero vindex.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the bounds-checked offset into voffset register + immediate field,
  // and reflect the constant parts (if any) in the memory operand.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  // Select the pseudo; plain (non-format, non-typed) loads pick the
  // byte/short variant from the memory type size.
  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar loads and unpacked D16 vector loads produce a wider
  // result than the requested destination type.
  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // Insert the conversion after the load so it sees the loaded value.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The load result was widened; truncate it back to the requested type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
4070
4071bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4072 MachineIRBuilder &B,
4073 bool IsInc) const {
4074 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4075 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4076 B.buildInstr(Opc)
4077 .addDef(MI.getOperand(0).getReg())
4078 .addUse(MI.getOperand(2).getReg())
4079 .addUse(MI.getOperand(3).getReg())
4080 .cloneMemRefs(MI);
4081 MI.eraseFromParent();
4082 return true;
4083}
4084
/// Map a raw/struct buffer atomic intrinsic ID to the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode used during legalization.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
4140
// Legalize a raw/struct buffer atomic intrinsic into the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo. Operand positions shift depending on
// whether the intrinsic returns a value, is a compare-swap, and is the
// struct (vindex-carrying) variant; OpOffset tracks the accumulated shift.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    // No def, so every source operand index is one lower.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    // Compare-swap carries an extra comparison-value operand.
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants behave like struct variants with a zero vindex.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the bounds-checked offset into voffset register + immediate field,
  // and reflect the constant parts (if any) in the memory operand.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
4211
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
/// Operands that must stay 32-bit are bitcast to v2s16 unchanged; genuinely
/// 16-bit operands are packed pairwise into v2s16, padding the leftover lane
/// with undef where a pair cannot be formed.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // Operand ranges: [VAddrStart, GradientStart) are non-address operands,
    // [GradientStart, CoordStart) are gradients, [CoordStart, VAddrEnd) are
    // coordinates.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand together with the next one and skip it.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
4260
/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  // Collect the registers of all still-present address operands.
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    // The first address operand carries the whole packed vector.
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  // Clear the now-redundant trailing address operands.
  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}
4297
4298/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4299///
4300/// Depending on the subtarget, load/store with 16-bit element data need to be
4301/// rewritten to use the low half of 32-bit registers, or directly use a packed
4302/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4303/// registers.
4304///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
4311bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4312 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4313 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4314
4315 const unsigned NumDefs = MI.getNumExplicitDefs();
4316 const unsigned ArgOffset = NumDefs + 1;
4317 bool IsTFE = NumDefs == 2;
4318 // We are only processing the operands of d16 image operations on subtargets
4319 // that use the unpacked register layout, or need to repack the TFE result.
4320
4321 // TODO: Do we need to guard against already legalized intrinsics?
4322 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4323 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4324
4325 MachineRegisterInfo *MRI = B.getMRI();
4326 const LLT S32 = LLT::scalar(32);
4327 const LLT S16 = LLT::scalar(16);
4328 const LLT V2S16 = LLT::fixed_vector(2, 16);
4329
4330 unsigned DMask = 0;
4331
4332 // Check for 16 bit addresses and pack if true.
4333 LLT GradTy =
4334 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4335 LLT AddrTy =
4336 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4337 const bool IsG16 = GradTy == S16;
4338 const bool IsA16 = AddrTy == S16;
4339
4340 int DMaskLanes = 0;
4341 if (!BaseOpcode->Atomic) {
4342 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4343 if (BaseOpcode->Gather4) {
4344 DMaskLanes = 4;
4345 } else if (DMask != 0) {
4346 DMaskLanes = countPopulation(DMask);
4347 } else if (!IsTFE && !BaseOpcode->Store) {
4348 // If dmask is 0, this is a no-op load. This can be eliminated.
4349 B.buildUndef(MI.getOperand(0));
4350 MI.eraseFromParent();
4351 return true;
4352 }
4353 }
4354
4355 Observer.changingInstr(MI);
4356 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4357
4358 unsigned NewOpcode = NumDefs == 0 ?
4359 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4360
4361 // Track that we legalized this
4362 MI.setDesc(B.getTII().get(NewOpcode));
4363
4364 // Expecting to get an error flag since TFC is on - and dmask is 0 Force
4365 // dmask to be at least 1 otherwise the instruction will fail
4366 if (IsTFE && DMask == 0) {
4367 DMask = 0x1;
4368 DMaskLanes = 1;
4369 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4370 }
4371
4372 if (BaseOpcode->Atomic) {
4373 Register VData0 = MI.getOperand(2).getReg();
4374 LLT Ty = MRI->getType(VData0);
4375
4376 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4377 if (Ty.isVector())
4378 return false;
4379
4380 if (BaseOpcode->AtomicX2) {
4381 Register VData1 = MI.getOperand(3).getReg();
4382 // The two values are packed in one register.
4383 LLT PackedTy = LLT::fixed_vector(2, Ty);
4384 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4385 MI.getOperand(2).setReg(Concat.getReg(0));
4386 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4387 }
4388 }
4389
4390 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4391
4392 // Optimize _L to _LZ when _L is zero
4393 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4394 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4395 const ConstantFP *ConstantLod;
4396
4397 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4398 m_GFCst(ConstantLod))) {
4399 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4400 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4401 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4402 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4403 Intr->Dim);
4404
4405 // The starting indexes should remain in the same place.
4406 --CorrectedNumVAddrs;
4407
4408 MI.getOperand(MI.getNumExplicitDefs())
4409 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4410 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4411 Intr = NewImageDimIntr;
4412 }
4413 }
4414 }
4415
4416 // Optimize _mip away, when 'lod' is zero
4417 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4418 int64_t ConstantLod;
4419 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4420 m_ICst(ConstantLod))) {
4421 if (ConstantLod == 0) {
4422 // TODO: Change intrinsic opcode and remove operand instead or replacing
4423 // it with 0, as the _L to _LZ handling is done above.
4424 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4425 --CorrectedNumVAddrs;
4426 }
4427 }
4428 }
4429
4430 // Rewrite the addressing register layout before doing anything else.
4431 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4432 // 16 bit gradients are supported, but are tied to the A16 control
4433 // so both gradients and addresses must be 16 bit
4434 return false;
4435 }
4436
4437 if (IsA16 && !ST.hasA16()) {
4438 // A16 not supported
4439 return false;
4440 }
4441
4442 if (IsA16 || IsG16) {
4443 if (Intr->NumVAddrs > 1) {
4444 SmallVector<Register, 4> PackedRegs;
4445
4446 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4447 IsG16);
4448
4449 // See also below in the non-a16 branch
4450 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4451 PackedRegs.size() <= ST.getNSAMaxSize();
4452
4453 if (!UseNSA && PackedRegs.size() > 1) {
4454 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4455 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4456 PackedRegs[0] = Concat.getReg(0);
4457 PackedRegs.resize(1);
4458 }
4459
4460 const unsigned NumPacked = PackedRegs.size();
4461 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4462 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4463 if (!SrcOp.isReg()) {
4464 assert(SrcOp.isImm() && SrcOp.getImm() == 0)(static_cast<void> (0));
4465 continue;
4466 }
4467
4468 assert(SrcOp.getReg() != AMDGPU::NoRegister)(static_cast<void> (0));
4469
4470 if (I - Intr->VAddrStart < NumPacked)
4471 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4472 else
4473 SrcOp.setReg(AMDGPU::NoRegister);
4474 }
4475 }
4476 } else {
4477 // If the register allocator cannot place the address registers contiguously
4478 // without introducing moves, then using the non-sequential address encoding
4479 // is always preferable, since it saves VALU instructions and is usually a
4480 // wash in terms of code size or even better.
4481 //
4482 // However, we currently have no way of hinting to the register allocator
4483 // that MIMG addresses should be placed contiguously when it is possible to
4484 // do so, so force non-NSA for the common 2-address case as a heuristic.
4485 //
4486 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4487 // allocation when possible.
4488 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4489 CorrectedNumVAddrs <= ST.getNSAMaxSize();
4490
4491 if (!UseNSA && Intr->NumVAddrs > 1)
4492 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4493 Intr->NumVAddrs);
4494 }
4495
4496 int Flags = 0;
4497 if (IsA16)
4498 Flags |= 1;
4499 if (IsG16)
4500 Flags |= 2;
4501 MI.addOperand(MachineOperand::CreateImm(Flags));
4502
4503 if (BaseOpcode->Store) { // No TFE for stores?
4504 // TODO: Handle dmask trim
4505 Register VData = MI.getOperand(1).getReg();
4506 LLT Ty = MRI->getType(VData);
4507 if (!Ty.isVector() || Ty.getElementType() != S16)
4508 return true;
4509
4510 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4511 if (RepackedReg != VData) {
4512 MI.getOperand(1).setReg(RepackedReg);
4513 }
4514
4515 return true;
4516 }
4517
4518 Register DstReg = MI.getOperand(0).getReg();
4519 LLT Ty = MRI->getType(DstReg);
4520 const LLT EltTy = Ty.getScalarType();
4521 const bool IsD16 = Ty.getScalarType() == S16;
4522 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4523
4524 // Confirm that the return type is large enough for the dmask specified
4525 if (NumElts < DMaskLanes)
4526 return false;
4527
4528 if (NumElts > 4 || DMaskLanes > 4)
4529 return false;
4530
4531 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4532 const LLT AdjustedTy =
4533 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4534
4535 // The raw dword aligned data component of the load. The only legal cases
4536 // where this matters should be when using the packed D16 format, for
4537 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4538 LLT RoundedTy;
4539
4540 // S32 vector to to cover all data, plus TFE result element.
4541 LLT TFETy;
4542
4543 // Register type to use for each loaded component. Will be S32 or V2S16.
4544 LLT RegTy;
4545
4546 if (IsD16 && ST.hasUnpackedD16VMem()) {
4547 RoundedTy =
4548 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4549 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4550 RegTy = S32;
4551 } else {
4552 unsigned EltSize = EltTy.getSizeInBits();
4553 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4554 unsigned RoundedSize = 32 * RoundedElts;
4555 RoundedTy = LLT::scalarOrVector(
4556 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4557 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4558 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4559 }
4560
4561 // The return type does not need adjustment.
4562 // TODO: Should we change s16 case to s32 or <2 x s16>?
4563 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4564 return true;
4565
4566 Register Dst1Reg;
4567
4568 // Insert after the instruction.
4569 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4570
4571 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4572 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4573 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4574 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4575
4576 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4577
4578 MI.getOperand(0).setReg(NewResultReg);
4579
4580 // In the IR, TFE is supposed to be used with a 2 element struct return
4581 // type. The intruction really returns these two values in one contiguous
4582 // register, with one additional dword beyond the loaded data. Rewrite the
4583 // return type to use a single register result.
4584
4585 if (IsTFE) {
4586 Dst1Reg = MI.getOperand(1).getReg();
4587 if (MRI->getType(Dst1Reg) != S32)
4588 return false;
4589
4590 // TODO: Make sure the TFE operand bit is set.
4591 MI.RemoveOperand(1);
4592
4593 // Handle the easy case that requires no repack instructions.
4594 if (Ty == S32) {
4595 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4596 return true;
4597 }
4598 }
4599
4600 // Now figure out how to copy the new result register back into the old
4601 // result.
4602 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4603
4604 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4605
4606 if (ResultNumRegs == 1) {
4607 assert(!IsTFE)(static_cast<void> (0));
4608 ResultRegs[0] = NewResultReg;
4609 } else {
4610 // We have to repack into a new vector of some kind.
4611 for (int I = 0; I != NumDataRegs; ++I)
4612 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4613 B.buildUnmerge(ResultRegs, NewResultReg);
4614
4615 // Drop the final TFE element to get the data part. The TFE result is
4616 // directly written to the right place already.
4617 if (IsTFE)
4618 ResultRegs.resize(NumDataRegs);
4619 }
4620
4621 // For an s16 scalar result, we form an s32 result with a truncate regardless
4622 // of packed vs. unpacked.
4623 if (IsD16 && !Ty.isVector()) {
4624 B.buildTrunc(DstReg, ResultRegs[0]);
4625 return true;
4626 }
4627
4628 // Avoid a build/concat_vector of 1 entry.
4629 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4630 B.buildBitcast(DstReg, ResultRegs[0]);
4631 return true;
4632 }
4633
4634 assert(Ty.isVector())(static_cast<void> (0));
4635
4636 if (IsD16) {
4637 // For packed D16 results with TFE enabled, all the data components are
4638 // S32. Cast back to the expected type.
4639 //
4640 // TODO: We don't really need to use load s32 elements. We would only need one
4641 // cast for the TFE result if a multiple of v2s16 was used.
4642 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4643 for (Register &Reg : ResultRegs)
4644 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4645 } else if (ST.hasUnpackedD16VMem()) {
4646 for (Register &Reg : ResultRegs)
4647 Reg = B.buildTrunc(S16, Reg).getReg(0);
4648 }
4649 }
4650
4651 auto padWithUndef = [&](LLT Ty, int NumElts) {
4652 if (NumElts == 0)
4653 return;
4654 Register Undef = B.buildUndef(Ty).getReg(0);
4655 for (int I = 0; I != NumElts; ++I)
4656 ResultRegs.push_back(Undef);
4657 };
4658
4659 // Pad out any elements eliminated due to the dmask.
4660 LLT ResTy = MRI->getType(ResultRegs[0]);
4661 if (!ResTy.isVector()) {
4662 padWithUndef(ResTy, NumElts - ResultRegs.size());
4663 B.buildBuildVector(DstReg, ResultRegs);
4664 return true;
4665 }
4666
4667 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16)(static_cast<void> (0));
4668 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4669
4670 // Deal with the one annoying legal case.
4671 const LLT V3S16 = LLT::fixed_vector(3, 16);
4672 if (Ty == V3S16) {
4673 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4674 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4675 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4676 return true;
4677 }
4678
4679 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4680 B.buildConcatVectors(DstReg, ResultRegs);
4681 return true;
4682}
4683
4684bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4685 LegalizerHelper &Helper, MachineInstr &MI) const {
4686 MachineIRBuilder &B = Helper.MIRBuilder;
4687 GISelChangeObserver &Observer = Helper.Observer;
4688
4689 Register Dst = MI.getOperand(0).getReg();
4690 LLT Ty = B.getMRI()->getType(Dst);
4691 unsigned Size = Ty.getSizeInBits();
4692 MachineFunction &MF = B.getMF();
4693
4694 Observer.changingInstr(MI);
4695
4696 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4697 Ty = getBitcastRegisterType(Ty);
4698 Helper.bitcastDst(MI, Ty, 0);
4699 Dst = MI.getOperand(0).getReg();
4700 B.setInsertPt(B.getMBB(), MI);
4701 }
4702
4703 // FIXME: We don't really need this intermediate instruction. The intrinsic
4704 // should be fixed to have a memory operand. Since it's readnone, we're not
4705 // allowed to add one.
4706 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4707 MI.RemoveOperand(1); // Remove intrinsic ID
4708
4709 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4710 // TODO: Should this use datalayout alignment?
4711 const unsigned MemSize = (Size + 7) / 8;
4712 const Align MemAlign(4);
4713 MachineMemOperand *MMO = MF.getMachineMemOperand(
4714 MachinePointerInfo(),
4715 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4716 MachineMemOperand::MOInvariant,
4717 MemSize, MemAlign);
4718 MI.addMemOperand(MF, MMO);
4719
4720 // There are no 96-bit result scalar loads, but widening to 128-bit should
4721 // always be legal. We may need to restore this to a 96-bit result if it turns
4722 // out this needs to be converted to a vector load during RegBankSelect.
4723 if (!isPowerOf2_32(Size)) {
4724 if (Ty.isVector())
4725 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4726 else
4727 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4728 }
4729
4730 Observer.changedInstr(MI);
4731 return true;
4732}
4733
4734// TODO: Move to selection
4735bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4736 MachineRegisterInfo &MRI,
4737 MachineIRBuilder &B) const {
4738 if (!ST.isTrapHandlerEnabled() ||
4739 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4740 return legalizeTrapEndpgm(MI, MRI, B);
4741
4742 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4743 switch (*HsaAbiVer) {
4744 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4745 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4746 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4747 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4748 return ST.supportsGetDoorbellID() ?
4749 legalizeTrapHsa(MI, MRI, B) :
4750 legalizeTrapHsaQueuePtr(MI, MRI, B);
4751 }
4752 }
4753
4754 llvm_unreachable("Unknown trap handler")__builtin_unreachable();
4755}
4756
4757bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4758 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4759 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4760 MI.eraseFromParent();
4761 return true;
4762}
4763
4764bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4765 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4766 // Pass queue pointer to trap handler as input, and insert trap instruction
4767 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4768 Register LiveIn =
4769 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4770 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4771 return false;
4772
4773 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4774 B.buildCopy(SGPR01, LiveIn);
4775 B.buildInstr(AMDGPU::S_TRAP)
4776 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4777 .addReg(SGPR01, RegState::Implicit);
4778
4779 MI.eraseFromParent();
4780 return true;
4781}
4782
4783bool AMDGPULegalizerInfo::legalizeTrapHsa(
4784 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4785 B.buildInstr(AMDGPU::S_TRAP)
4786 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4787 MI.eraseFromParent();
4788 return true;
4789}
4790
4791bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4792 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4793 // Is non-HSA path or trap-handler disabled? then, report a warning
4794 // accordingly
4795 if (!ST.isTrapHandlerEnabled() ||
4796 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4797 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4798 "debugtrap handler not supported",
4799 MI.getDebugLoc(), DS_Warning);
4800 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4801 Ctx.diagnose(NoTrap);
4802 } else {
4803 // Insert debug-trap instruction
4804 B.buildInstr(AMDGPU::S_TRAP)
4805 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4806 }
4807
4808 MI.eraseFromParent();
4809 return true;
4810}
4811
4812bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4813 MachineIRBuilder &B) const {
4814 MachineRegisterInfo &MRI = *B.getMRI();
4815 const LLT S16 = LLT::scalar(16);
4816 const LLT S32 = LLT::scalar(32);
4817
4818 Register DstReg = MI.getOperand(0).getReg();
4819 Register NodePtr = MI.getOperand(2).getReg();
4820 Register RayExtent = MI.getOperand(3).getReg();
4821 Register RayOrigin = MI.getOperand(4).getReg();
4822 Register RayDir = MI.getOperand(5).getReg();
4823 Register RayInvDir = MI.getOperand(6).getReg();
4824 Register TDescr = MI.getOperand(7).getReg();
4825
4826 if (!ST.hasGFX10_AEncoding()) {
4827 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4828 "intrinsic not supported on subtarget",
4829 MI.getDebugLoc());
4830 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4831 return false;
4832 }
4833
4834 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4835 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4836 const unsigned NumVDataDwords = 4;
4837 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4838 const bool UseNSA =
4839 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4840 const unsigned BaseOpcodes[2][2] = {
4841 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4842 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4843 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4844 int Opcode;
4845 if (UseNSA) {
4846 Opcode =
4847 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4848 NumVDataDwords, NumVAddrDwords);
4849 } else {
4850 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4851 AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4852 PowerOf2Ceil(NumVAddrDwords));
4853 }
4854 assert(Opcode != -1)(static_cast<void> (0));
4855
4856 SmallVector<Register, 12> Ops;
4857 if (Is64) {
4858 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4859 Ops.push_back(Unmerge.getReg(0));
4860 Ops.push_back(Unmerge.getReg(1));
4861 } else {
4862 Ops.push_back(NodePtr);
4863 }
4864 Ops.push_back(RayExtent);
4865
4866 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4867 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4868 Ops.push_back(Unmerge.getReg(0));
4869 Ops.push_back(Unmerge.getReg(1));
4870 Ops.push_back(Unmerge.getReg(2));
4871 };
4872
4873 packLanes(RayOrigin);
4874 if (IsA16) {
4875 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4876 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4877 Register R1 = MRI.createGenericVirtualRegister(S32);
4878 Register R2 = MRI.createGenericVirtualRegister(S32);
4879 Register R3 = MRI.createGenericVirtualRegister(S32);
4880 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4881 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4882 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4883 Ops.push_back(R1);
4884 Ops.push_back(R2);
4885 Ops.push_back(R3);
4886 } else {
4887 packLanes(RayDir);
4888 packLanes(RayInvDir);
4889 }
4890
4891 if (!UseNSA) {
4892 // Build a single vector containing all the operands so far prepared.
4893 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4894 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4895 Ops.clear();
4896 Ops.push_back(MergedOps);
4897 }
4898
4899 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4900 .addDef(DstReg)
4901 .addImm(Opcode);
4902
4903 for (Register R : Ops) {
4904 MIB.addUse(R);
4905 }
4906
4907 MIB.addUse(TDescr)
4908 .addImm(IsA16 ? 1 : 0)
4909 .cloneMemRefs(MI);
4910
4911 MI.eraseFromParent();
4912 return true;
4913}
4914
4915bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4916 MachineInstr &MI) const {
4917 MachineIRBuilder &B = Helper.MIRBuilder;
4918 MachineRegisterInfo &MRI = *B.getMRI();
4919
4920 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4921 auto IntrID = MI.getIntrinsicID();
4922 switch (IntrID) {
1
Control jumps to 'case amdgcn_dispatch_id:' at line 5044
4923 case Intrinsic::amdgcn_if:
4924 case Intrinsic::amdgcn_else: {
4925 MachineInstr *Br = nullptr;
4926 MachineBasicBlock *UncondBrTarget = nullptr;
4927 bool Negated = false;
4928 if (MachineInstr *BrCond =
4929 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4930 const SIRegisterInfo *TRI
4931 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4932
4933 Register Def = MI.getOperand(1).getReg();
4934 Register Use = MI.getOperand(3).getReg();
4935
4936 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4937
4938 if (Negated)
4939 std::swap(CondBrTarget, UncondBrTarget);
4940
4941 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4942 if (IntrID == Intrinsic::amdgcn_if) {
4943 B.buildInstr(AMDGPU::SI_IF)
4944 .addDef(Def)
4945 .addUse(Use)
4946 .addMBB(UncondBrTarget);
4947 } else {
4948 B.buildInstr(AMDGPU::SI_ELSE)
4949 .addDef(Def)
4950 .addUse(Use)
4951 .addMBB(UncondBrTarget);
4952 }
4953
4954 if (Br) {
4955 Br->getOperand(0).setMBB(CondBrTarget);
4956 } else {
4957 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4958 // since we're swapping branch targets it needs to be reinserted.
4959 // FIXME: IRTranslator should probably not do this
4960 B.buildBr(*CondBrTarget);
4961 }
4962
4963 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4964 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4965 MI.eraseFromParent();
4966 BrCond->eraseFromParent();
4967 return true;
4968 }
4969
4970 return false;
4971 }
4972 case Intrinsic::amdgcn_loop: {
4973 MachineInstr *Br = nullptr;
4974 MachineBasicBlock *UncondBrTarget = nullptr;
4975 bool Negated = false;
4976 if (MachineInstr *BrCond =
4977 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4978 const SIRegisterInfo *TRI
4979 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4980
4981 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4982 Register Reg = MI.getOperand(2).getReg();
4983
4984 if (Negated)
4985 std::swap(CondBrTarget, UncondBrTarget);
4986
4987 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4988 B.buildInstr(AMDGPU::SI_LOOP)
4989 .addUse(Reg)
4990 .addMBB(UncondBrTarget);
4991
4992 if (Br)
4993 Br->getOperand(0).setMBB(CondBrTarget);
4994 else
4995 B.buildBr(*CondBrTarget);
4996
4997 MI.eraseFromParent();
4998 BrCond->eraseFromParent();
4999 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5000 return true;
5001 }
5002
5003 return false;
5004 }
5005 case Intrinsic::amdgcn_kernarg_segment_ptr:
5006 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5007 // This only makes sense to call in a kernel, so just lower to null.
5008 B.buildConstant(MI.getOperand(0).getReg(), 0);
5009 MI.eraseFromParent();
5010 return true;
5011 }
5012
5013 return legalizePreloadedArgIntrin(
5014 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5015 case Intrinsic::amdgcn_implicitarg_ptr:
5016 return legalizeImplicitArgPtr(MI, MRI, B);
5017 case Intrinsic::amdgcn_workitem_id_x:
5018 return legalizePreloadedArgIntrin(MI, MRI, B,
5019 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5020 case Intrinsic::amdgcn_workitem_id_y:
5021 return legalizePreloadedArgIntrin(MI, MRI, B,
5022 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5023 case Intrinsic::amdgcn_workitem_id_z:
5024 return legalizePreloadedArgIntrin(MI, MRI, B,
5025 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5026 case Intrinsic::amdgcn_workgroup_id_x:
5027 return legalizePreloadedArgIntrin(MI, MRI, B,
5028 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5029 case Intrinsic::amdgcn_workgroup_id_y:
5030 return legalizePreloadedArgIntrin(MI, MRI, B,
5031 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5032 case Intrinsic::amdgcn_workgroup_id_z:
5033 return legalizePreloadedArgIntrin(MI, MRI, B,
5034 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5035 case Intrinsic::amdgcn_dispatch_ptr:
5036 return legalizePreloadedArgIntrin(MI, MRI, B,
5037 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5038 case Intrinsic::amdgcn_queue_ptr:
5039 return legalizePreloadedArgIntrin(MI, MRI, B,
5040 AMDGPUFunctionArgInfo::QUEUE_PTR);
5041 case Intrinsic::amdgcn_implicit_buffer_ptr:
5042 return legalizePreloadedArgIntrin(
5043 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5044 case Intrinsic::amdgcn_dispatch_id:
5045 return legalizePreloadedArgIntrin(MI, MRI, B,
2
Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
5046 AMDGPUFunctionArgInfo::DISPATCH_ID);
5047 case Intrinsic::amdgcn_fdiv_fast:
5048 return legalizeFDIVFastIntrin(MI, MRI, B);
5049 case Intrinsic::amdgcn_is_shared:
5050 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5051 case Intrinsic::amdgcn_is_private:
5052 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5053 case Intrinsic::amdgcn_wavefrontsize: {
5054 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5055 MI.eraseFromParent();
5056 return true;
5057 }
5058 case Intrinsic::amdgcn_s_buffer_load:
5059 return legalizeSBufferLoad(Helper, MI);
5060 case Intrinsic::amdgcn_raw_buffer_store:
5061 case Intrinsic::amdgcn_struct_buffer_store:
5062 return legalizeBufferStore(MI, MRI, B, false, false);
5063 case Intrinsic::amdgcn_raw_buffer_store_format:
5064 case Intrinsic::amdgcn_struct_buffer_store_format:
5065 return legalizeBufferStore(MI, MRI, B, false, true);
5066 case Intrinsic::amdgcn_raw_tbuffer_store:
5067 case Intrinsic::amdgcn_struct_tbuffer_store:
5068 return legalizeBufferStore(MI, MRI, B, true, true);
5069 case Intrinsic::amdgcn_raw_buffer_load:
5070 case Intrinsic::amdgcn_struct_buffer_load:
5071 return legalizeBufferLoad(MI, MRI, B, false, false);
5072 case Intrinsic::amdgcn_raw_buffer_load_format:
5073 case Intrinsic::amdgcn_struct_buffer_load_format:
5074 return legalizeBufferLoad(MI, MRI, B, true, false);
5075 case Intrinsic::amdgcn_raw_tbuffer_load:
5076 case Intrinsic::amdgcn_struct_tbuffer_load:
5077 return legalizeBufferLoad(MI, MRI, B, true, true);
5078 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5079 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5080 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5081 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5082 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5083 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5084 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5085 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5086 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5087 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5088 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5089 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5090 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5091 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5092 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5093 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5094 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5095 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5096 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5097 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5098 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5099 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5100 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5101 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5102 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5103 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5104 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5105 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5106 case Intrinsic::amdgcn_buffer_atomic_fadd:
5107 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5108 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5109 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5110 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5111 return legalizeBufferAtomic(MI, B, IntrID);
5112 case Intrinsic::amdgcn_atomic_inc:
5113 return legalizeAtomicIncDec(MI, B, true);
5114 case Intrinsic::amdgcn_atomic_dec:
5115 return legalizeAtomicIncDec(MI, B, false);
5116 case Intrinsic::trap:
5117 return legalizeTrapIntrinsic(MI, MRI, B);
5118 case Intrinsic::debugtrap:
5119 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5120 case Intrinsic::amdgcn_rsq_clamp:
5121 return legalizeRsqClampIntrinsic(MI, MRI, B);
5122 case Intrinsic::amdgcn_ds_fadd:
5123 case Intrinsic::amdgcn_ds_fmin:
5124 case Intrinsic::amdgcn_ds_fmax:
5125 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5126 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5127 return legalizeBVHIntrinsic(MI, B);
5128 default: {
5129 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5130 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5131 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5132 return true;
5133 }
5134 }
5135
5136 return true;
5137}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack)(static_cast<void> (0));
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack)(static_cast<void> (0));
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
8
Assuming the condition is true
9
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
147 // into v0, 10 bits per dimension if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__4) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB
13.1
'ZB' is not equal to ZB_Undefined
13.1
'ZB' is not equal to ZB_Undefined
13.1
'ZB' is not equal to ZB_Undefined
!= ZB_Undefined && Val == 0)
14
Assuming 'Val' is equal to 0
15
Taking true branch
117 return 32;
16
Returning the value 32
118
119#if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
13
Calling 'TrailingZerosCounter::count'
17
Returning from 'TrailingZerosCounter::count'
18
Returning the value 32
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__4) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz)1 || defined(__GNUC__4)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT8 * sizeof(T);
251 assert(N <= Bits && "Invalid bit index")(static_cast<void> (0));
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)1
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)1
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)1
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)1
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)1L<<(N-1)) <= x && x < (INT64_C(1)1L<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1)1UL << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if a unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX(18446744073709551615UL) >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
441
442 return UINT64_C(1)1UL + ~(UINT64_C(1)1UL << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1)1UL << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if an signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__4)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__4)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.")(static_cast<void> (0));
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value and is
752/// \p Skew mod \p Align. \p Align must be non-zero
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.")(static_cast<void> (0));
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.")(static_cast<void> (0));
771 assert(B <= 32 && "Bit width out of range.")(static_cast<void> (0));
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.")(static_cast<void> (0));
787 assert(B <= 64 && "Bit width out of range.")(static_cast<void> (0));
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.