Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 1812, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
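The flagged expression is at source line 1812, which falls beyond the excerpt shown below. The general problem is that right-shifting a 32-bit unsigned value by 32 or more is undefined behavior in C++. A minimal sketch of the pattern and one possible guard (illustrative only, with made-up names; this is not the actual code at line 1812):

    // Illustrative only: build a mask of the low NumBits bits of a 32-bit value.
    unsigned lowMask(unsigned NumBits) {
      return ~0u >> (32 - NumBits); // undefined when NumBits == 0 (shift by 32)
    }

    // One possible guard: handle the boundary case explicitly (assumes NumBits <= 32).
    unsigned lowMaskSafe(unsigned NumBits) {
      return NumBits == 0 ? 0u : ~0u >> (32 - NumBits);
    }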

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/include -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2019-12-07-102640-14763-1 -x c++ /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#if defined(_MSC_VER) || defined(__MINGW32__)
15// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16// from the Visual C++ cmath / math.h headers:
17// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18#define _USE_MATH_DEFINES
19#endif
20
21#include "AMDGPU.h"
22#include "AMDGPULegalizerInfo.h"
23#include "AMDGPUTargetMachine.h"
24#include "SIMachineFunctionInfo.h"
25#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/TargetOpcodes.h"
28#include "llvm/CodeGen/ValueTypes.h"
29#include "llvm/IR/DerivedTypes.h"
30#include "llvm/IR/DiagnosticInfo.h"
31#include "llvm/IR/Type.h"
32#include "llvm/Support/Debug.h"
33
34#define DEBUG_TYPE "amdgpu-legalinfo"
35
36using namespace llvm;
37using namespace LegalizeActions;
38using namespace LegalizeMutations;
39using namespace LegalityPredicates;
40
41
42static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 1024) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48 };
49}
50
51static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52 return [=](const LegalityQuery &Query) {
53 return Query.Types[TypeIdx].getSizeInBits() == Size;
54 };
55}
56
57static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58 return [=](const LegalityQuery &Query) {
59 const LLT Ty = Query.Types[TypeIdx];
60 return Ty.isVector() &&
61 Ty.getNumElements() % 2 != 0 &&
62 Ty.getElementType().getSizeInBits() < 32 &&
63 Ty.getSizeInBits() % 32 != 0;
64 };
65}
66
67static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 const LLT EltTy = Ty.getScalarType();
71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72 };
73}
74
75static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 const LLT EltTy = Ty.getElementType();
79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80 };
81}
82
83static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84 return [=](const LegalityQuery &Query) {
85 const LLT Ty = Query.Types[TypeIdx];
86 const LLT EltTy = Ty.getElementType();
87 unsigned Size = Ty.getSizeInBits();
88 unsigned Pieces = (Size + 63) / 64;
89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91 };
92}
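// Editorial note (not part of the analyzed source): a worked example of the
// computation above, for v4s32 (Size = 128, 4 elements):
//   Pieces     = (128 + 63) / 64 = 2
//   NewNumElts = (4 + 1) / 2     = 2, i.e. the type is split into 64-bit v2s32 pieces.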
93
94// Increase the number of vector elements to reach the next multiple of 32-bit
95// type.
96static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99
100 const LLT EltTy = Ty.getElementType();
101 const int Size = Ty.getSizeInBits();
102 const int EltSize = EltTy.getSizeInBits();
103 const int NextMul32 = (Size + 31) / 32;
104
105 assert(EltSize < 32);
106
107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109 };
110}
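// Editorial note (not part of the analyzed source): a worked example of the
// rounding above, for a v3s16 input (Size = 48, EltSize = 16):
//   NextMul32  = (48 + 31) / 32         = 2   (round the total size up to 64 bits)
//   NewNumElts = (32 * 2 + 16 - 1) / 16 = 4   (so v3s16 widens to v4s16)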
111
112static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113 return [=](const LegalityQuery &Query) {
114 const LLT QueryTy = Query.Types[TypeIdx];
115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116 };
117}
118
119static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120 return [=](const LegalityQuery &Query) {
121 const LLT QueryTy = Query.Types[TypeIdx];
122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123 };
124}
125
126static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT QueryTy = Query.Types[TypeIdx];
129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130 };
131}
132
133// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134// v2s16.
135static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
138 if (Ty.isVector()) {
139 const int EltSize = Ty.getElementType().getSizeInBits();
140 return EltSize == 32 || EltSize == 64 ||
141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142 EltSize == 128 || EltSize == 256;
143 }
144
145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146 };
147}
148
149static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150 return [=](const LegalityQuery &Query) {
151 return Query.Types[TypeIdx].getElementType() == Type;
152 };
153}
154
155static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156 return [=](const LegalityQuery &Query) {
157 const LLT Ty = Query.Types[TypeIdx];
158 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160 };
161}
162
163AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164 const GCNTargetMachine &TM)
165 : ST(ST_) {
166 using namespace TargetOpcode;
167
168 auto GetAddrSpacePtr = [&TM](unsigned AS) {
169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170 };
171
172 const LLT S1 = LLT::scalar(1);
173 const LLT S8 = LLT::scalar(8);
174 const LLT S16 = LLT::scalar(16);
175 const LLT S32 = LLT::scalar(32);
176 const LLT S64 = LLT::scalar(64);
177 const LLT S96 = LLT::scalar(96);
178 const LLT S128 = LLT::scalar(128);
179 const LLT S256 = LLT::scalar(256);
180 const LLT S1024 = LLT::scalar(1024);
181
182 const LLT V2S16 = LLT::vector(2, 16);
183 const LLT V4S16 = LLT::vector(4, 16);
184
185 const LLT V2S32 = LLT::vector(2, 32);
186 const LLT V3S32 = LLT::vector(3, 32);
187 const LLT V4S32 = LLT::vector(4, 32);
188 const LLT V5S32 = LLT::vector(5, 32);
189 const LLT V6S32 = LLT::vector(6, 32);
190 const LLT V7S32 = LLT::vector(7, 32);
191 const LLT V8S32 = LLT::vector(8, 32);
192 const LLT V9S32 = LLT::vector(9, 32);
193 const LLT V10S32 = LLT::vector(10, 32);
194 const LLT V11S32 = LLT::vector(11, 32);
195 const LLT V12S32 = LLT::vector(12, 32);
196 const LLT V13S32 = LLT::vector(13, 32);
197 const LLT V14S32 = LLT::vector(14, 32);
198 const LLT V15S32 = LLT::vector(15, 32);
199 const LLT V16S32 = LLT::vector(16, 32);
200 const LLT V32S32 = LLT::vector(32, 32);
201
202 const LLT V2S64 = LLT::vector(2, 64);
203 const LLT V3S64 = LLT::vector(3, 64);
204 const LLT V4S64 = LLT::vector(4, 64);
205 const LLT V5S64 = LLT::vector(5, 64);
206 const LLT V6S64 = LLT::vector(6, 64);
207 const LLT V7S64 = LLT::vector(7, 64);
208 const LLT V8S64 = LLT::vector(8, 64);
209 const LLT V16S64 = LLT::vector(16, 64);
210
211 std::initializer_list<LLT> AllS32Vectors =
212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214 std::initializer_list<LLT> AllS64Vectors =
215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216
217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224
225 const LLT CodePtr = FlatPtr;
226
227 const std::initializer_list<LLT> AddrSpaces64 = {
228 GlobalPtr, ConstantPtr, FlatPtr
229 };
230
231 const std::initializer_list<LLT> AddrSpaces32 = {
232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233 };
234
235 const std::initializer_list<LLT> FPTypesBase = {
236 S32, S64
237 };
238
239 const std::initializer_list<LLT> FPTypes16 = {
240 S32, S64, S16
241 };
242
243 const std::initializer_list<LLT> FPTypesPK16 = {
244 S32, S64, S16, V2S16
245 };
246
247 setAction({G_BRCOND, S1}, Legal);
248
249 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250 // elements for v3s16
251 getActionDefinitionsBuilder(G_PHI)
252 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253 .legalFor(AllS32Vectors)
254 .legalFor(AllS64Vectors)
255 .legalFor(AddrSpaces64)
256 .legalFor(AddrSpaces32)
257 .clampScalar(0, S32, S256)
258 .widenScalarToNextPow2(0, 32)
259 .clampMaxNumElements(0, S32, 16)
260 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261 .legalIf(isPointer(0));
262
263 if (ST.has16BitInsts()) {
264 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265 .legalFor({S32, S16})
266 .clampScalar(0, S16, S32)
267 .scalarize(0);
268 } else {
269 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270 .legalFor({S32})
271 .clampScalar(0, S32, S32)
272 .scalarize(0);
273 }
274
275 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276 .legalFor({S32})
277 .clampScalar(0, S32, S32)
278 .scalarize(0);
279
280 // Report legal for any types we can handle anywhere. For the cases only legal
281 // on the SALU, RegBankSelect will be able to re-legalize.
282 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284 .clampScalar(0, S32, S64)
285 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287 .widenScalarToNextPow2(0)
288 .scalarize(0);
289
290 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292 .legalFor({{S32, S1}})
293 .clampScalar(0, S32, S32)
294 .scalarize(0); // TODO: Implement.
295
296 getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297 .lower();
298
299 getActionDefinitionsBuilder(G_BITCAST)
300 // Don't worry about the size constraint.
301 .legalIf(all(isRegisterType(0), isRegisterType(1)))
302 // FIXME: Testing hack
303 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
304
305 getActionDefinitionsBuilder(G_FCONSTANT)
306 .legalFor({S32, S64, S16})
307 .clampScalar(0, S16, S64);
308
309 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313 .clampScalarOrElt(0, S32, S1024)
314 .legalIf(isMultiple32(0))
315 .widenScalarToNextPow2(0, 32)
316 .clampMaxNumElements(0, S32, 16);
317
318
319 // FIXME: i1 operands to intrinsics should always be legal, but other i1
320 // values may not be legal. We need to figure out how to distinguish
321 // between these two scenarios.
322 getActionDefinitionsBuilder(G_CONSTANT)
323 .legalFor({S1, S32, S64, S16, GlobalPtr,
324 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325 .clampScalar(0, S32, S64)
326 .widenScalarToNextPow2(0)
327 .legalIf(isPointer(0));
328
329 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332
333
334 auto &FPOpActions = getActionDefinitionsBuilder(
335 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336 .legalFor({S32, S64});
337 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338 .customFor({S32, S64});
339 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340 .customFor({S32, S64});
341
342 if (ST.has16BitInsts()) {
343 if (ST.hasVOP3PInsts())
344 FPOpActions.legalFor({S16, V2S16});
345 else
346 FPOpActions.legalFor({S16});
347
348 TrigActions.customFor({S16});
349 FDIVActions.customFor({S16});
350 }
351
352 auto &MinNumMaxNum = getActionDefinitionsBuilder({
353 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
354
355 if (ST.hasVOP3PInsts()) {
356 MinNumMaxNum.customFor(FPTypesPK16)
357 .clampMaxNumElements(0, S16, 2)
358 .clampScalar(0, S16, S64)
359 .scalarize(0);
360 } else if (ST.has16BitInsts()) {
361 MinNumMaxNum.customFor(FPTypes16)
362 .clampScalar(0, S16, S64)
363 .scalarize(0);
364 } else {
365 MinNumMaxNum.customFor(FPTypesBase)
366 .clampScalar(0, S32, S64)
367 .scalarize(0);
368 }
369
370 if (ST.hasVOP3PInsts())
371 FPOpActions.clampMaxNumElements(0, S16, 2);
372
373 FPOpActions
374 .scalarize(0)
375 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
376
377 TrigActions
378 .scalarize(0)
379 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
380
381 FDIVActions
382 .scalarize(0)
383 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
384
385 getActionDefinitionsBuilder({G_FNEG, G_FABS})
386 .legalFor(FPTypesPK16)
387 .clampMaxNumElements(0, S16, 2)
388 .scalarize(0)
389 .clampScalar(0, S16, S64);
390
391 // TODO: Implement
392 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
393
394 if (ST.has16BitInsts()) {
395 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
396 .legalFor({S32, S64, S16})
397 .scalarize(0)
398 .clampScalar(0, S16, S64);
399 } else {
400 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
401 .legalFor({S32, S64})
402 .scalarize(0)
403 .clampScalar(0, S32, S64);
404 }
405
406 getActionDefinitionsBuilder(G_FPTRUNC)
407 .legalFor({{S32, S64}, {S16, S32}})
408 .scalarize(0);
409
410 getActionDefinitionsBuilder(G_FPEXT)
411 .legalFor({{S64, S32}, {S32, S16}})
412 .lowerFor({{S64, S16}}) // FIXME: Implement
413 .scalarize(0);
414
415 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
416 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
417
418 getActionDefinitionsBuilder(G_FSUB)
419 // Use actual fsub instruction
420 .legalFor({S32})
421 // Must use fadd + fneg
422 .lowerFor({S64, S16, V2S16})
423 .scalarize(0)
424 .clampScalar(0, S32, S64);
425
426 // Whether this is legal depends on the floating point mode for the function.
427 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428 if (ST.hasMadF16())
429 FMad.customFor({S32, S16});
430 else
431 FMad.customFor({S32});
432 FMad.scalarize(0)
433 .lower();
434
435 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437 {S32, S1}, {S64, S1}, {S16, S1},
438 {S96, S32},
439 // FIXME: Hack
440 {S64, LLT::scalar(33)},
441 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
442 .scalarize(0);
443
444 // TODO: Split s1->s64 during regbankselect for VALU.
445 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
447 .lowerFor({{S32, S64}})
448 .customFor({{S64, S64}});
449 if (ST.has16BitInsts())
450 IToFP.legalFor({{S16, S16}});
451 IToFP.clampScalar(1, S32, S64)
452 .scalarize(0);
453
454 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
455 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
456 if (ST.has16BitInsts())
457 FPToI.legalFor({{S16, S16}});
458 else
459 FPToI.minScalar(1, S32);
460
461 FPToI.minScalar(0, S32)
462 .scalarize(0);
463
464 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
465 .legalFor({S32, S64})
466 .scalarize(0);
467
468 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
469 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
470 .legalFor({S32, S64})
471 .clampScalar(0, S32, S64)
472 .scalarize(0);
473 } else {
474 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
475 .legalFor({S32})
476 .customFor({S64})
477 .clampScalar(0, S32, S64)
478 .scalarize(0);
479 }
480
481 getActionDefinitionsBuilder(G_PTR_ADD)
482 .legalForCartesianProduct(AddrSpaces64, {S64})
483 .legalForCartesianProduct(AddrSpaces32, {S32})
484 .scalarize(0);
485
486 getActionDefinitionsBuilder(G_PTR_MASK)
487 .scalarize(0)
488 .alwaysLegal();
489
490 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
491
492 auto &CmpBuilder =
493 getActionDefinitionsBuilder(G_ICMP)
494 .legalForCartesianProduct(
495 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
496 .legalFor({{S1, S32}, {S1, S64}});
497 if (ST.has16BitInsts()) {
498 CmpBuilder.legalFor({{S1, S16}});
499 }
500
501 CmpBuilder
502 .widenScalarToNextPow2(1)
503 .clampScalar(1, S32, S64)
504 .scalarize(0)
505 .legalIf(all(typeIs(0, S1), isPointer(1)));
506
507 getActionDefinitionsBuilder(G_FCMP)
508 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
509 .widenScalarToNextPow2(1)
510 .clampScalar(1, S32, S64)
511 .scalarize(0);
512
513 // FIXME: fexp, flog2, flog10 needs to be custom lowered.
514 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
515 G_FLOG, G_FLOG2, G_FLOG10})
516 .legalFor({S32})
517 .scalarize(0);
518
519 // The 64-bit versions produce 32-bit results, but only on the SALU.
520 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
521 G_CTTZ, G_CTTZ_ZERO_UNDEF,
522 G_CTPOP})
523 .legalFor({{S32, S32}, {S32, S64}})
524 .clampScalar(0, S32, S32)
525 .clampScalar(1, S32, S64)
526 .scalarize(0)
527 .widenScalarToNextPow2(0, 32)
528 .widenScalarToNextPow2(1, 32);
529
530 // TODO: Expand for > s32
531 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
532 .legalFor({S32})
533 .clampScalar(0, S32, S32)
534 .scalarize(0);
535
536 if (ST.has16BitInsts()) {
537 if (ST.hasVOP3PInsts()) {
538 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
539 .legalFor({S32, S16, V2S16})
540 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541 .clampMaxNumElements(0, S16, 2)
542 .clampScalar(0, S16, S32)
543 .widenScalarToNextPow2(0)
544 .scalarize(0);
545 } else {
546 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
547 .legalFor({S32, S16})
548 .widenScalarToNextPow2(0)
549 .clampScalar(0, S16, S32)
550 .scalarize(0);
551 }
552 } else {
553 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
554 .legalFor({S32})
555 .clampScalar(0, S32, S32)
556 .widenScalarToNextPow2(0)
557 .scalarize(0);
558 }
559
560 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
561 return [=](const LegalityQuery &Query) {
562 return Query.Types[TypeIdx0].getSizeInBits() <
563 Query.Types[TypeIdx1].getSizeInBits();
564 };
565 };
566
567 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
568 return [=](const LegalityQuery &Query) {
569 return Query.Types[TypeIdx0].getSizeInBits() >
570 Query.Types[TypeIdx1].getSizeInBits();
571 };
572 };
573
574 getActionDefinitionsBuilder(G_INTTOPTR)
575 // List the common cases
576 .legalForCartesianProduct(AddrSpaces64, {S64})
577 .legalForCartesianProduct(AddrSpaces32, {S32})
578 .scalarize(0)
579 // Accept any address space as long as the size matches
580 .legalIf(sameSize(0, 1))
581 .widenScalarIf(smallerThan(1, 0),
582 [](const LegalityQuery &Query) {
583 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
584 })
585 .narrowScalarIf(greaterThan(1, 0),
586 [](const LegalityQuery &Query) {
587 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
588 });
589
590 getActionDefinitionsBuilder(G_PTRTOINT)
591 // List the common cases
592 .legalForCartesianProduct(AddrSpaces64, {S64})
593 .legalForCartesianProduct(AddrSpaces32, {S32})
594 .scalarize(0)
595 // Accept any address space as long as the size matches
596 .legalIf(sameSize(0, 1))
597 .widenScalarIf(smallerThan(0, 1),
598 [](const LegalityQuery &Query) {
599 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
600 })
601 .narrowScalarIf(
602 greaterThan(0, 1),
603 [](const LegalityQuery &Query) {
604 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
605 });
606
607 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
608 .scalarize(0)
609 .custom();
610
611 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
612 // handle some operations by just promoting the register during
613 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
614 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
615 switch (AS) {
616 // FIXME: Private element size.
617 case AMDGPUAS::PRIVATE_ADDRESS:
618 return 32;
619 // FIXME: Check subtarget
620 case AMDGPUAS::LOCAL_ADDRESS:
621 return ST.useDS128() ? 128 : 64;
622
623 // Treat constant and global as identical. SMRD loads are sometimes usable
624 // for global loads (ideally constant address space should be eliminated)
625 // depending on the context. Legality cannot be context dependent, but
626 // RegBankSelect can split the load as necessary depending on the pointer
627 // register bank/uniformity and if the memory is invariant or not written in
628 // a kernel.
629 case AMDGPUAS::CONSTANT_ADDRESS:
630 case AMDGPUAS::GLOBAL_ADDRESS:
631 return 512;
632 default:
633 return 128;
634 }
635 };
636
637 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
638 const LLT DstTy = Query.Types[0];
639
640 // Split vector extloads.
641 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
642 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
643 return true;
644
645 const LLT PtrTy = Query.Types[1];
646 unsigned AS = PtrTy.getAddressSpace();
647 if (MemSize > maxSizeForAddrSpace(AS))
648 return true;
649
650 // Catch weird sized loads that don't evenly divide into the access sizes
651 // TODO: May be able to widen depending on alignment etc.
652 unsigned NumRegs = MemSize / 32;
653 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
654 return true;
655
656 unsigned Align = Query.MMODescrs[0].AlignInBits;
657 if (Align < MemSize) {
658 const SITargetLowering *TLI = ST.getTargetLowering();
659 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
660 }
661
662 return false;
663 };
664
665 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
666 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
667 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
668
669 // TODO: Refine based on subtargets which support unaligned access or 128-bit
670 // LDS
671 // TODO: Unsupported flat for SI.
672
673 for (unsigned Op : {G_LOAD, G_STORE}) {
674 const bool IsStore = Op == G_STORE;
675
676 auto &Actions = getActionDefinitionsBuilder(Op);
677 // Whitelist the common cases.
678 // TODO: Pointer loads
679 // TODO: Wide constant loads
680 // TODO: Only CI+ has 3x loads
681 // TODO: Loads to s16 on gfx9
682 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
683 {V2S32, GlobalPtr, 64, GlobalAlign32},
684 {V3S32, GlobalPtr, 96, GlobalAlign32},
685 {S96, GlobalPtr, 96, GlobalAlign32},
686 {V4S32, GlobalPtr, 128, GlobalAlign32},
687 {S128, GlobalPtr, 128, GlobalAlign32},
688 {S64, GlobalPtr, 64, GlobalAlign32},
689 {V2S64, GlobalPtr, 128, GlobalAlign32},
690 {V2S16, GlobalPtr, 32, GlobalAlign32},
691 {S32, GlobalPtr, 8, GlobalAlign8},
692 {S32, GlobalPtr, 16, GlobalAlign16},
693
694 {S32, LocalPtr, 32, 32},
695 {S64, LocalPtr, 64, 32},
696 {V2S32, LocalPtr, 64, 32},
697 {S32, LocalPtr, 8, 8},
698 {S32, LocalPtr, 16, 16},
699 {V2S16, LocalPtr, 32, 32},
700
701 {S32, PrivatePtr, 32, 32},
702 {S32, PrivatePtr, 8, 8},
703 {S32, PrivatePtr, 16, 16},
704 {V2S16, PrivatePtr, 32, 32},
705
706 {S32, FlatPtr, 32, GlobalAlign32},
707 {S32, FlatPtr, 16, GlobalAlign16},
708 {S32, FlatPtr, 8, GlobalAlign8},
709 {V2S16, FlatPtr, 32, GlobalAlign32},
710
711 {S32, ConstantPtr, 32, GlobalAlign32},
712 {V2S32, ConstantPtr, 64, GlobalAlign32},
713 {V3S32, ConstantPtr, 96, GlobalAlign32},
714 {V4S32, ConstantPtr, 128, GlobalAlign32},
715 {S64, ConstantPtr, 64, GlobalAlign32},
716 {S128, ConstantPtr, 128, GlobalAlign32},
717 {V2S32, ConstantPtr, 32, GlobalAlign32}});
718 Actions
719 .customIf(typeIs(1, Constant32Ptr))
720 .narrowScalarIf(
721 [=](const LegalityQuery &Query) -> bool {
722 return !Query.Types[0].isVector() && needToSplitLoad(Query);
723 },
724 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
725 const LLT DstTy = Query.Types[0];
726 const LLT PtrTy = Query.Types[1];
727
728 const unsigned DstSize = DstTy.getSizeInBits();
729 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
730
731 // Split extloads.
732 if (DstSize > MemSize)
733 return std::make_pair(0, LLT::scalar(MemSize));
734
735 if (DstSize > 32 && (DstSize % 32 != 0)) {
736 // FIXME: Need a way to specify non-extload of larger size if
737 // suitably aligned.
738 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
739 }
740
741 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
742 if (MemSize > MaxSize)
743 return std::make_pair(0, LLT::scalar(MaxSize));
744
745 unsigned Align = Query.MMODescrs[0].AlignInBits;
746 return std::make_pair(0, LLT::scalar(Align));
747 })
748 .fewerElementsIf(
749 [=](const LegalityQuery &Query) -> bool {
750 return Query.Types[0].isVector() && needToSplitLoad(Query);
751 },
752 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
753 const LLT DstTy = Query.Types[0];
754 const LLT PtrTy = Query.Types[1];
755
756 LLT EltTy = DstTy.getElementType();
757 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
758
759 // Split if it's too large for the address space.
760 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
761 unsigned NumElts = DstTy.getNumElements();
762 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
763
764 // FIXME: Refine when odd breakdowns handled
765 // The scalars will need to be re-legalized.
766 if (NumPieces == 1 || NumPieces >= NumElts ||
767 NumElts % NumPieces != 0)
768 return std::make_pair(0, EltTy);
769
770 return std::make_pair(0,
771 LLT::vector(NumElts / NumPieces, EltTy));
772 }
773
774 // Need to split because of alignment.
775 unsigned Align = Query.MMODescrs[0].AlignInBits;
776 unsigned EltSize = EltTy.getSizeInBits();
777 if (EltSize > Align &&
778 (EltSize / Align < DstTy.getNumElements())) {
779 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
780 }
781
782 // May need relegalization for the scalars.
783 return std::make_pair(0, EltTy);
784 })
785 .minScalar(0, S32);
786
787 if (IsStore)
788 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
789
790 // TODO: Need a bitcast lower option?
791 Actions
792 .legalIf([=](const LegalityQuery &Query) {
793 const LLT Ty0 = Query.Types[0];
794 unsigned Size = Ty0.getSizeInBits();
795 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
796 unsigned Align = Query.MMODescrs[0].AlignInBits;
797
798 // No extending vector loads.
799 if (Size > MemSize && Ty0.isVector())
800 return false;
801
802 // FIXME: Widening store from alignment not valid.
803 if (MemSize < Size)
804 MemSize = std::max(MemSize, Align);
805
806 switch (MemSize) {
807 case 8:
808 case 16:
809 return Size == 32;
810 case 32:
811 case 64:
812 case 128:
813 return true;
814 case 96:
815 return ST.hasDwordx3LoadStores();
816 case 256:
817 case 512:
818 return true;
819 default:
820 return false;
821 }
822 })
823 .widenScalarToNextPow2(0)
824 // TODO: v3s32->v4s32 with alignment
825 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
826 }
827
828 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
829 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
830 {S32, GlobalPtr, 16, 2 * 8},
831 {S32, LocalPtr, 8, 8},
832 {S32, LocalPtr, 16, 16},
833 {S32, PrivatePtr, 8, 8},
834 {S32, PrivatePtr, 16, 16},
835 {S32, ConstantPtr, 8, 8},
836 {S32, ConstantPtr, 16, 2 * 8}});
837 if (ST.hasFlatAddressSpace()) {
838 ExtLoads.legalForTypesWithMemDesc(
839 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
840 }
841
842 ExtLoads.clampScalar(0, S32, S32)
843 .widenScalarToNextPow2(0)
844 .unsupportedIfMemSizeNotPow2()
845 .lower();
846
847 auto &Atomics = getActionDefinitionsBuilder(
848 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
849 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
850 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
851 G_ATOMICRMW_UMIN})
852 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
853 {S64, GlobalPtr}, {S64, LocalPtr}});
854 if (ST.hasFlatAddressSpace()) {
855 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
856 }
857
858 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
859 .legalFor({{S32, LocalPtr}});
860
861 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
862 // demarshalling
863 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
864 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
865 {S32, FlatPtr}, {S64, FlatPtr}})
866 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
867 {S32, RegionPtr}, {S64, RegionPtr}});
868
869 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
870 .lower();
871
872 // TODO: Pointer types, any 32-bit or 64-bit vector
873 getActionDefinitionsBuilder(G_SELECT)
874 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
875 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
876 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
877 .clampScalar(0, S16, S64)
878 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
879 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
880 .scalarize(1)
881 .clampMaxNumElements(0, S32, 2)
882 .clampMaxNumElements(0, LocalPtr, 2)
883 .clampMaxNumElements(0, PrivatePtr, 2)
884 .scalarize(0)
885 .widenScalarToNextPow2(0)
886 .legalIf(all(isPointer(0), typeIs(1, S1)));
887
888 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
889 // be more flexible with the shift amount type.
890 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
891 .legalFor({{S32, S32}, {S64, S32}});
892 if (ST.has16BitInsts()) {
893 if (ST.hasVOP3PInsts()) {
894 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
895 .clampMaxNumElements(0, S16, 2);
896 } else
897 Shifts.legalFor({{S16, S32}, {S16, S16}});
898
899 Shifts.clampScalar(1, S16, S32);
900 Shifts.clampScalar(0, S16, S64);
901 Shifts.widenScalarToNextPow2(0, 16);
902 } else {
903 // Make sure we legalize the shift amount type first, as the general
904 // expansion for the shifted type will produce much worse code if it hasn't
905 // been truncated already.
906 Shifts.clampScalar(1, S32, S32);
907 Shifts.clampScalar(0, S32, S64);
908 Shifts.widenScalarToNextPow2(0, 32);
909 }
910 Shifts.scalarize(0);
911
912 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
913 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
914 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
915 unsigned IdxTypeIdx = 2;
916
917 getActionDefinitionsBuilder(Op)
918 .customIf([=](const LegalityQuery &Query) {
919 const LLT EltTy = Query.Types[EltTypeIdx];
920 const LLT VecTy = Query.Types[VecTypeIdx];
921 const LLT IdxTy = Query.Types[IdxTypeIdx];
922 return (EltTy.getSizeInBits() == 16 ||
923 EltTy.getSizeInBits() % 32 == 0) &&
924 VecTy.getSizeInBits() % 32 == 0 &&
925 VecTy.getSizeInBits() <= 1024 &&
926 IdxTy.getSizeInBits() == 32;
927 })
928 .clampScalar(EltTypeIdx, S32, S64)
929 .clampScalar(VecTypeIdx, S32, S64)
930 .clampScalar(IdxTypeIdx, S32, S32);
931 }
932
933 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
934 .unsupportedIf([=](const LegalityQuery &Query) {
935 const LLT &EltTy = Query.Types[1].getElementType();
936 return Query.Types[0] != EltTy;
937 });
938
939 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
940 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
941 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
942
943 // FIXME: Doesn't handle extract of illegal sizes.
944 getActionDefinitionsBuilder(Op)
945 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
946 // FIXME: Multiples of 16 should not be legal.
947 .legalIf([=](const LegalityQuery &Query) {
948 const LLT BigTy = Query.Types[BigTyIdx];
949 const LLT LitTy = Query.Types[LitTyIdx];
950 return (BigTy.getSizeInBits() % 32 == 0) &&
951 (LitTy.getSizeInBits() % 16 == 0);
952 })
953 .widenScalarIf(
954 [=](const LegalityQuery &Query) {
955 const LLT BigTy = Query.Types[BigTyIdx];
956 return (BigTy.getScalarSizeInBits() < 16);
957 },
958 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
959 .widenScalarIf(
960 [=](const LegalityQuery &Query) {
961 const LLT LitTy = Query.Types[LitTyIdx];
962 return (LitTy.getScalarSizeInBits() < 16);
963 },
964 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
965 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
966 .widenScalarToNextPow2(BigTyIdx, 32);
967
968 }
969
970 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
971 .legalForCartesianProduct(AllS32Vectors, {S32})
972 .legalForCartesianProduct(AllS64Vectors, {S64})
973 .clampNumElements(0, V16S32, V32S32)
974 .clampNumElements(0, V2S64, V16S64)
975 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
976
977 if (ST.hasScalarPackInsts())
978 BuildVector.legalFor({V2S16, S32});
979
980 BuildVector
981 .minScalarSameAs(1, 0)
982 .legalIf(isRegisterType(0))
983 .minScalarOrElt(0, S32);
984
985 if (ST.hasScalarPackInsts()) {
986 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
987 .legalFor({V2S16, S32})
988 .lower();
989 } else {
990 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
991 .lower();
992 }
993
994 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
995 .legalIf(isRegisterType(0));
996
997 // TODO: Don't fully scalarize v2s16 pieces
998 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
999
1000 // Merge/Unmerge
1001 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1002 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1003 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1004
1005 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1006 const LLT &Ty = Query.Types[TypeIdx];
1007 if (Ty.isVector()) {
1008 const LLT &EltTy = Ty.getElementType();
1009 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1010 return true;
1011 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1012 return true;
1013 }
1014 return false;
1015 };
1016
1017 auto &Builder = getActionDefinitionsBuilder(Op)
1018 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1019 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1020 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1021 // valid.
1022 .clampScalar(LitTyIdx, S16, S256)
1023 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1024 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1025 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1026 elementTypeIs(1, S16)),
1027 changeTo(1, V2S16))
1028 // Break up vectors with weird elements into scalars
1029 .fewerElementsIf(
1030 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1031 scalarize(0))
1032 .fewerElementsIf(
1033 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1034 scalarize(1))
1035 .clampScalar(BigTyIdx, S32, S1024)
1036 .lowerFor({{S16, V2S16}});
1037
1038 if (Op == G_MERGE_VALUES) {
1039 Builder.widenScalarIf(
1040 // TODO: Use 16-bit shifts if legal for 8-bit values?
1041 [=](const LegalityQuery &Query) {
1042 const LLT Ty = Query.Types[LitTyIdx];
1043 return Ty.getSizeInBits() < 32;
1044 },
1045 changeTo(LitTyIdx, S32));
1046 }
1047
1048 Builder.widenScalarIf(
1049 [=](const LegalityQuery &Query) {
1050 const LLT Ty = Query.Types[BigTyIdx];
1051 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1052 Ty.getSizeInBits() % 16 != 0;
1053 },
1054 [=](const LegalityQuery &Query) {
1055 // Pick the next power of 2, or a multiple of 64 over 128.
1056 // Whichever is smaller.
1057 const LLT &Ty = Query.Types[BigTyIdx];
1058 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1059 if (NewSizeInBits >= 256) {
1060 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1061 if (RoundedTo < NewSizeInBits)
1062 NewSizeInBits = RoundedTo;
1063 }
1064 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1065 })
1066 .legalIf([=](const LegalityQuery &Query) {
1067 const LLT &BigTy = Query.Types[BigTyIdx];
1068 const LLT &LitTy = Query.Types[LitTyIdx];
1069
1070 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1071 return false;
1072 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1073 return false;
1074
1075 return BigTy.getSizeInBits() % 16 == 0 &&
1076 LitTy.getSizeInBits() % 16 == 0 &&
1077 BigTy.getSizeInBits() <= 1024;
1078 })
1079 // Any vectors left are the wrong size. Scalarize them.
1080 .scalarize(0)
1081 .scalarize(1);
1082 }
1083
1084 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1085
1086 computeTables();
1087 verify(*ST.getInstrInfo());
1088}
1089
1090bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1091 MachineRegisterInfo &MRI,
1092 MachineIRBuilder &B,
1093 GISelChangeObserver &Observer) const {
1094 switch (MI.getOpcode()) {
1095 case TargetOpcode::G_ADDRSPACE_CAST:
1096 return legalizeAddrSpaceCast(MI, MRI, B);
1097 case TargetOpcode::G_FRINT:
1098 return legalizeFrint(MI, MRI, B);
1099 case TargetOpcode::G_FCEIL:
1100 return legalizeFceil(MI, MRI, B);
1101 case TargetOpcode::G_INTRINSIC_TRUNC:
1102 return legalizeIntrinsicTrunc(MI, MRI, B);
1103 case TargetOpcode::G_SITOFP:
1104 return legalizeITOFP(MI, MRI, B, true);
1105 case TargetOpcode::G_UITOFP:
1106 return legalizeITOFP(MI, MRI, B, false);
1107 case TargetOpcode::G_FMINNUM:
1108 case TargetOpcode::G_FMAXNUM:
1109 case TargetOpcode::G_FMINNUM_IEEE:
1110 case TargetOpcode::G_FMAXNUM_IEEE:
1111 return legalizeMinNumMaxNum(MI, MRI, B);
1112 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1113 return legalizeExtractVectorElt(MI, MRI, B);
1114 case TargetOpcode::G_INSERT_VECTOR_ELT:
1115 return legalizeInsertVectorElt(MI, MRI, B);
1116 case TargetOpcode::G_FSIN:
1117 case TargetOpcode::G_FCOS:
1118 return legalizeSinCos(MI, MRI, B);
1119 case TargetOpcode::G_GLOBAL_VALUE:
1120 return legalizeGlobalValue(MI, MRI, B);
1121 case TargetOpcode::G_LOAD:
1122 return legalizeLoad(MI, MRI, B, Observer);
1123 case TargetOpcode::G_FMAD:
1124 return legalizeFMad(MI, MRI, B);
1125 case TargetOpcode::G_FDIV:
1126 return legalizeFDIV(MI, MRI, B);
1127 case TargetOpcode::G_ATOMIC_CMPXCHG:
1128 return legalizeAtomicCmpXChg(MI, MRI, B);
1129 default:
1130 return false;
1131 }
1132
1133 llvm_unreachable("expected switch to return");
1134}
1135
1136Register AMDGPULegalizerInfo::getSegmentAperture(
1137 unsigned AS,
1138 MachineRegisterInfo &MRI,
1139 MachineIRBuilder &B) const {
1140 MachineFunction &MF = B.getMF();
1141 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1142 const LLT S32 = LLT::scalar(32);
1143
1144 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1145
1146 if (ST.hasApertureRegs()) {
1147 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1148 // getreg.
1149 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1150 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1151 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1152 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1153 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1154 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1155 unsigned Encoding =
1156 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1157 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1158 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1159
1160 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1161 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1162
1163 B.buildInstr(AMDGPU::S_GETREG_B32)
1164 .addDef(GetReg)
1165 .addImm(Encoding);
1166 MRI.setType(GetReg, S32);
1167
1168 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1169 B.buildInstr(TargetOpcode::G_SHL)
1170 .addDef(ApertureReg)
1171 .addUse(GetReg)
1172 .addUse(ShiftAmt.getReg(0));
1173
1174 return ApertureReg;
1175 }
1176
1177 Register QueuePtr = MRI.createGenericVirtualRegister(
1178 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1179
1180 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1181 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1182 return Register();
1183
1184 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1185 // private_segment_aperture_base_hi.
1186 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1187
1188 // FIXME: Don't use undef
1189 Value *V = UndefValue::get(PointerType::get(
1190 Type::getInt8Ty(MF.getFunction().getContext()),
1191 AMDGPUAS::CONSTANT_ADDRESS));
1192
1193 MachinePointerInfo PtrInfo(V, StructOffset);
1194 MachineMemOperand *MMO = MF.getMachineMemOperand(
1195 PtrInfo,
1196 MachineMemOperand::MOLoad |
1197 MachineMemOperand::MODereferenceable |
1198 MachineMemOperand::MOInvariant,
1199 4,
1200 MinAlign(64, StructOffset));
1201
1202 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1203 Register LoadAddr;
1204
1205 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1206 B.buildLoad(LoadResult, LoadAddr, *MMO);
1207 return LoadResult;
1208}
1209
1210bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1211 MachineInstr &MI, MachineRegisterInfo &MRI,
1212 MachineIRBuilder &B) const {
1213 MachineFunction &MF = B.getMF();
1214
1215 B.setInstr(MI);
1216
1217 const LLT S32 = LLT::scalar(32);
1218 Register Dst = MI.getOperand(0).getReg();
1219 Register Src = MI.getOperand(1).getReg();
1220
1221 LLT DstTy = MRI.getType(Dst);
1222 LLT SrcTy = MRI.getType(Src);
1223 unsigned DestAS = DstTy.getAddressSpace();
1224 unsigned SrcAS = SrcTy.getAddressSpace();
1225
1226 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1227 // vector element.
1228 assert(!DstTy.isVector());
1229
1230 const AMDGPUTargetMachine &TM
1231 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1232
1233 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1234 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1235 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1236 return true;
1237 }
1238
1239 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1240 // Truncate.
1241 B.buildExtract(Dst, Src, 0);
1242 MI.eraseFromParent();
1243 return true;
1244 }
1245
1246 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1247 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1248 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1249
1250 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1251 // another. Merge operands are required to be the same type, but creating an
1252 // extra ptrtoint would be kind of pointless.
1253 auto HighAddr = B.buildConstant(
1254 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1255 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1256 MI.eraseFromParent();
1257 return true;
1258 }
1259
1260 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1261 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1262 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1263 unsigned NullVal = TM.getNullPointerValue(DestAS);
1264
1265 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1266 auto FlatNull = B.buildConstant(SrcTy, 0);
1267
1268 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1269
1270 // Extract low 32-bits of the pointer.
1271 B.buildExtract(PtrLo32, Src, 0);
1272
1273 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1274 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1275 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1276
1277 MI.eraseFromParent();
1278 return true;
1279 }
1280
1281 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1282 return false;
1283
1284 if (!ST.hasFlatAddressSpace())
1285 return false;
1286
1287 auto SegmentNull =
1288 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1289 auto FlatNull =
1290 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1291
1292 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1293 if (!ApertureReg.isValid())
1294 return false;
1295
1296 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1297 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1298
1299 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1300
1301 // Coerce the type of the low half of the result so we can use merge_values.
1302 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1303 B.buildInstr(TargetOpcode::G_PTRTOINT)
1304 .addDef(SrcAsInt)
1305 .addUse(Src);
1306
1307 // TODO: Should we allow mismatched types but matching sizes in merges to
1308 // avoid the ptrtoint?
1309 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1310 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1311
1312 MI.eraseFromParent();
1313 return true;
1314}
1315
1316bool AMDGPULegalizerInfo::legalizeFrint(
1317 MachineInstr &MI, MachineRegisterInfo &MRI,
1318 MachineIRBuilder &B) const {
1319 B.setInstr(MI);
1320
1321 Register Src = MI.getOperand(1).getReg();
1322 LLT Ty = MRI.getType(Src);
1323 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1324
1325 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1326 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1327
1328 auto C1 = B.buildFConstant(Ty, C1Val);
1329 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1330
1331 // TODO: Should this propagate fast-math-flags?
1332 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1333 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1334
1335 auto C2 = B.buildFConstant(Ty, C2Val);
1336 auto Fabs = B.buildFAbs(Ty, Src);
1337
1338 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1339 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1340 return true;
1341}
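// Editorial note (not part of the analyzed source): the constants above implement
// the usual add/subtract-2^52 rounding trick for f64. Since the ulp of any double
// with magnitude >= 2^52 is 1.0, adding copysign(2^52, Src) and subtracting it back
// rounds Src to an integer under the current rounding mode; inputs with
// |Src| > 0x1.fffffffffffffp+51 are already integral, so the final select returns
// them unchanged.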
1342
1343bool AMDGPULegalizerInfo::legalizeFceil(
1344 MachineInstr &MI, MachineRegisterInfo &MRI,
1345 MachineIRBuilder &B) const {
1346 B.setInstr(MI);
1347
1348 const LLT S1 = LLT::scalar(1);
1349 const LLT S64 = LLT::scalar(64);
1350
1351 Register Src = MI.getOperand(1).getReg();
1352 assert(MRI.getType(Src) == S64);
1353
1354 // result = trunc(src)
1355 // if (src > 0.0 && src != result)
1356 // result += 1.0
1357
1358 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1359
1360 const auto Zero = B.buildFConstant(S64, 0.0);
1361 const auto One = B.buildFConstant(S64, 1.0);
1362 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1363 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1364 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1365 auto Add = B.buildSelect(S64, And, One, Zero);
1366
1367 // TODO: Should this propagate fast-math-flags?
1368 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1369 return true;
1370}
1371
1372static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1373 MachineIRBuilder &B) {
1374 const unsigned FractBits = 52;
1375 const unsigned ExpBits = 11;
1376 LLT S32 = LLT::scalar(32);
1377
1378 auto Const0 = B.buildConstant(S32, FractBits - 32);
1379 auto Const1 = B.buildConstant(S32, ExpBits);
1380
1381 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1382 .addUse(Const0.getReg(0))
1383 .addUse(Const1.getReg(0));
1384
1385 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1386}
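// Editorial note (not part of the analyzed source): in an IEEE f64 the exponent
// field occupies bits 62..52, i.e. bits 30..20 of the high 32-bit word, so an
// 11-bit field starting at bit offset FractBits - 32 = 20 is extracted and the
// IEEE bias of 1023 is subtracted to obtain the unbiased exponent.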
1387
1388bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1389 MachineInstr &MI, MachineRegisterInfo &MRI,
1390 MachineIRBuilder &B) const {
1391 B.setInstr(MI);
1392
1393 const LLT S1 = LLT::scalar(1);
1394 const LLT S32 = LLT::scalar(32);
1395 const LLT S64 = LLT::scalar(64);
1396
1397 Register Src = MI.getOperand(1).getReg();
1398 assert(MRI.getType(Src) == S64);
1399
1400 // TODO: Should this use extract since the low half is unused?
1401 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1402 Register Hi = Unmerge.getReg(1);
1403
1404 // Extract the upper half, since this is where we will find the sign and
1405 // exponent.
1406 auto Exp = extractF64Exponent(Hi, B);
1407
1408 const unsigned FractBits = 52;
1409
1410 // Extract the sign bit.
1411 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1412 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1413
1414 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1415
1416 const auto Zero32 = B.buildConstant(S32, 0);
1417
1418 // Extend back to 64-bits.
1419 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1420
1421 auto Shr = B.buildAShr(S64, FractMask, Exp);
1422 auto Not = B.buildNot(S64, Shr);
1423 auto Tmp0 = B.buildAnd(S64, Src, Not);
1424 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1425
1426 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1427 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1428
1429 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1430 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1431 return true;
1432}
1433
1434bool AMDGPULegalizerInfo::legalizeITOFP(
1435 MachineInstr &MI, MachineRegisterInfo &MRI,
1436 MachineIRBuilder &B, bool Signed) const {
1437 B.setInstr(MI);
1438
1439 Register Dst = MI.getOperand(0).getReg();
1440 Register Src = MI.getOperand(1).getReg();
1441
1442 const LLT S64 = LLT::scalar(64);
1443 const LLT S32 = LLT::scalar(32);
1444
1445 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1446
1447 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1448
1449 auto CvtHi = Signed ?
1450 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1451 B.buildUITOFP(S64, Unmerge.getReg(1));
1452
1453 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1454
1455 auto ThirtyTwo = B.buildConstant(S32, 32);
1456 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1457 .addUse(CvtHi.getReg(0))
1458 .addUse(ThirtyTwo.getReg(0));
1459
1460 // TODO: Should this propagate fast-math-flags?
1461 B.buildFAdd(Dst, LdExp, CvtLo);
1462 MI.eraseFromParent();
1463 return true;
1464}
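// Editorial note (not part of the analyzed source): the 64-bit integer is converted
// in 32-bit halves: the high half is converted and scaled by 2^32 via amdgcn_ldexp,
// then the (always unsigned) low half is added. For example, Src = 0x100000005
// gives hi = 1, lo = 5, and ldexp(1.0, 32) + 5.0 = 4294967301.0 == double(Src).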
1465
1466bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1467 MachineInstr &MI, MachineRegisterInfo &MRI,
1468 MachineIRBuilder &B) const {
1469 MachineFunction &MF = B.getMF();
1470 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1471
1472 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1473 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1474
1475 // With ieee_mode disabled, the instructions have the correct behavior
1476 // already for G_FMINNUM/G_FMAXNUM
1477 if (!MFI->getMode().IEEE)
1478 return !IsIEEEOp;
1479
1480 if (IsIEEEOp)
1481 return true;
1482
1483 MachineIRBuilder HelperBuilder(MI);
1484 GISelObserverWrapper DummyObserver;
1485 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1486 HelperBuilder.setInstr(MI);
1487 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1488}
1489
1490bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1491 MachineInstr &MI, MachineRegisterInfo &MRI,
1492 MachineIRBuilder &B) const {
1493 // TODO: Should move some of this into LegalizerHelper.
1494
1495 // TODO: Promote dynamic indexing of s16 to s32
1496 // TODO: Dynamic s64 indexing is only legal for SGPR.
1497 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1498 if (!IdxVal) // Dynamic case will be selected to register indexing.
1499 return true;
1500
1501 Register Dst = MI.getOperand(0).getReg();
1502 Register Vec = MI.getOperand(1).getReg();
1503
1504 LLT VecTy = MRI.getType(Vec);
1505 LLT EltTy = VecTy.getElementType();
1506 assert(EltTy == MRI.getType(Dst));
1507
1508 B.setInstr(MI);
1509
1510 if (IdxVal.getValue() < VecTy.getNumElements())
1511 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1512 else
1513 B.buildUndef(Dst);
1514
1515 MI.eraseFromParent();
1516 return true;
1517}
1518
1519bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1520 MachineInstr &MI, MachineRegisterInfo &MRI,
1521 MachineIRBuilder &B) const {
1522 // TODO: Should move some of this into LegalizerHelper.
1523
1524 // TODO: Promote dynamic indexing of s16 to s32
1525 // TODO: Dynamic s64 indexing is only legal for SGPR.
1526 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1527 if (!IdxVal) // Dynamic case will be selected to register indexing.
1528 return true;
1529
1530 Register Dst = MI.getOperand(0).getReg();
1531 Register Vec = MI.getOperand(1).getReg();
1532 Register Ins = MI.getOperand(2).getReg();
1533
1534 LLT VecTy = MRI.getType(Vec);
1535 LLT EltTy = VecTy.getElementType();
1536 assert(EltTy == MRI.getType(Ins));
1537
1538 B.setInstr(MI);
1539
1540 if (IdxVal.getValue() < VecTy.getNumElements())
1541 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1542 else
1543 B.buildUndef(Dst);
1544
1545 MI.eraseFromParent();
1546 return true;
1547}
1548
1549bool AMDGPULegalizerInfo::legalizeSinCos(
1550 MachineInstr &MI, MachineRegisterInfo &MRI,
1551 MachineIRBuilder &B) const {
1552 B.setInstr(MI);
1553
1554 Register DstReg = MI.getOperand(0).getReg();
1555 Register SrcReg = MI.getOperand(1).getReg();
1556 LLT Ty = MRI.getType(DstReg);
1557 unsigned Flags = MI.getFlags();
1558
1559 Register TrigVal;
1560 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1561 if (ST.hasTrigReducedRange()) {
1562 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1563 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1564 .addUse(MulVal.getReg(0))
1565 .setMIFlags(Flags).getReg(0);
1566 } else
1567 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1568
1569 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1570 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1571 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1572 .addUse(TrigVal)
1573 .setMIFlags(Flags);
1574 MI.eraseFromParent();
1575 return true;
1576}
1577
1578bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1579 Register DstReg, LLT PtrTy,
1580 MachineIRBuilder &B, const GlobalValue *GV,
1581 unsigned Offset, unsigned GAFlags) const {
1582 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1583 // to the following code sequence:
1584 //
1585 // For constant address space:
1586 // s_getpc_b64 s[0:1]
1587 // s_add_u32 s0, s0, $symbol
1588 // s_addc_u32 s1, s1, 0
1589 //
1590 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1591 // a fixup or relocation is emitted to replace $symbol with a literal
1592 // constant, which is a pc-relative offset from the encoding of the $symbol
1593 // operand to the global variable.
1594 //
1595 // For global address space:
1596 // s_getpc_b64 s[0:1]
1597 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1598 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1599 //
1600 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1601 // fixups or relocations are emitted to replace $symbol@*@lo and
1602 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1603 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1604 // operand to the global variable.
1605 //
1606 // What we want here is an offset from the value returned by s_getpc
1607 // (which is the address of the s_add_u32 instruction) to the global
1608 // variable, but since the encoding of $symbol starts 4 bytes after the start
1609 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1610 // small. This requires us to add 4 to the global variable offset in order to
1611 // compute the correct address.
1612
1613 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1614
1615 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1616 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1617
1618 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1619 .addDef(PCReg);
1620
1621 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1622 if (GAFlags == SIInstrInfo::MO_NONE)
1623 MIB.addImm(0);
1624 else
1625 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1626
1627 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1628
1629 if (PtrTy.getSizeInBits() == 32)
1630 B.buildExtract(DstReg, PCReg, 0);
1631 return true;
1632 }
1633
1634bool AMDGPULegalizerInfo::legalizeGlobalValue(
1635 MachineInstr &MI, MachineRegisterInfo &MRI,
1636 MachineIRBuilder &B) const {
1637 Register DstReg = MI.getOperand(0).getReg();
1638 LLT Ty = MRI.getType(DstReg);
1639 unsigned AS = Ty.getAddressSpace();
1640
1641 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1642 MachineFunction &MF = B.getMF();
1643 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1644 B.setInstr(MI);
1645
1646 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1647 if (!MFI->isEntryFunction()) {
1648 const Function &Fn = MF.getFunction();
1649 DiagnosticInfoUnsupported BadLDSDecl(
1650 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1651 Fn.getContext().diagnose(BadLDSDecl);
1652 }
1653
1654 // TODO: We could emit code to handle the initialization somewhere.
1655 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1656 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1657 MI.eraseFromParent();
1658 return true;
1659 }
1660
1661 const Function &Fn = MF.getFunction();
1662 DiagnosticInfoUnsupported BadInit(
1663 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1664 Fn.getContext().diagnose(BadInit);
1665 return true;
1666 }
1667
1668 const SITargetLowering *TLI = ST.getTargetLowering();
1669
1670 if (TLI->shouldEmitFixup(GV)) {
1671 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1672 MI.eraseFromParent();
1673 return true;
1674 }
1675
1676 if (TLI->shouldEmitPCReloc(GV)) {
1677 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1678 MI.eraseFromParent();
1679 return true;
1680 }
1681
1682 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1683 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1684
1685 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1686 MachinePointerInfo::getGOT(MF),
1687 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1688 MachineMemOperand::MOInvariant,
1689 8 /*Size*/, 8 /*Align*/);
1690
1691 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1692
1693 if (Ty.getSizeInBits() == 32) {
1694 // Truncate if this is a 32-bit constant address.
1695 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1696 B.buildExtract(DstReg, Load, 0);
1697 } else
1698 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1699
1700 MI.eraseFromParent();
1701 return true;
1702}
1703
1704bool AMDGPULegalizerInfo::legalizeLoad(
1705 MachineInstr &MI, MachineRegisterInfo &MRI,
1706 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1707 B.setInstr(MI);
1708 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1709 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1710 Observer.changingInstr(MI);
1711 MI.getOperand(1).setReg(Cast.getReg(0));
1712 Observer.changedInstr(MI);
1713 return true;
1714}
1715
1716bool AMDGPULegalizerInfo::legalizeFMad(
1717 MachineInstr &MI, MachineRegisterInfo &MRI,
1718 MachineIRBuilder &B) const {
1719 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1720 assert(Ty.isScalar());
1721
1722 // TODO: Always legal with future ftz flag.
1723 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1724 return true;
1725 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1726 return true;
1727
1728 MachineFunction &MF = B.getMF();
1729
1730 MachineIRBuilder HelperBuilder(MI);
1731 GISelObserverWrapper DummyObserver;
1732 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1733 HelperBuilder.setMBB(*MI.getParent());
1734 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1735}
1736
1737bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1738 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1739 Register DstReg = MI.getOperand(0).getReg();
1740 Register PtrReg = MI.getOperand(1).getReg();
1741 Register CmpVal = MI.getOperand(2).getReg();
1742 Register NewVal = MI.getOperand(3).getReg();
1743
1744 assert(SITargetLowering::isFlatGlobalAddrSpace(
1745 MRI.getType(PtrReg).getAddressSpace()) &&
1746 "this should not have been custom lowered");
1747
1748 LLT ValTy = MRI.getType(CmpVal);
1749 LLT VecTy = LLT::vector(2, ValTy);
1750
1751 B.setInstr(MI);
1752 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1753
1754 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1755 .addDef(DstReg)
1756 .addUse(PtrReg)
1757 .addUse(PackedVal)
1758 .setMemRefs(MI.memoperands());
1759
1760 MI.eraseFromParent();
1761 return true;
1762}
1763
1764// Return the use branch instruction, otherwise null if the usage is invalid.
1765static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1766 MachineRegisterInfo &MRI) {
1767 Register CondDef = MI.getOperand(0).getReg();
1768 if (!MRI.hasOneNonDBGUse(CondDef))
1769 return nullptr;
1770
1771 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1772 return UseMI.getParent() == MI.getParent() &&
1773 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1774}
1775
1776Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1777 Register Reg, LLT Ty) const {
1778 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1779 if (LiveIn)
1780 return LiveIn;
1781
1782 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1783 MRI.addLiveIn(Reg, NewReg);
1784 return NewReg;
1785}
1786
1787bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1788 const ArgDescriptor *Arg) const {
1789 if (!Arg->isRegister() || !Arg->getRegister().isValid())
9
Taking false branch
1790 return false; // TODO: Handle these
1791
1792 assert(Arg->getRegister().isPhysical());
10
'?' condition is true
1793
1794 MachineRegisterInfo &MRI = *B.getMRI();
1795
1796 LLT Ty = MRI.getType(DstReg);
1797 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1798
1799 if (Arg->isMasked()) {
11
Calling 'ArgDescriptor::isMasked'
14
Returning from 'ArgDescriptor::isMasked'
15
Taking true branch
1800 // TODO: Should we try to emit this once in the entry block?
1801 const LLT S32 = LLT::scalar(32);
1802 const unsigned Mask = Arg->getMask();
1803 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
16
Calling 'countTrailingZeros<unsigned int>'
23
Returning from 'countTrailingZeros<unsigned int>'
24
'Shift' initialized to 32
1804
1805 Register AndMaskSrc = LiveIn;
1806
1807 if (Shift != 0) {
24.1
'Shift' is not equal to 0
25
Taking true branch
1808 auto ShiftAmt = B.buildConstant(S32, Shift);
1809 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1810 }
1811
1812 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
26
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
1813 } else
1814 B.buildCopy(DstReg, LiveIn);
1815
1816 // Insert the argument copy if it doesn't already exist.
1817 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1818 if (!MRI.getVRegDef(LiveIn)) {
1819 // FIXME: Should have scoped insert pt
1820 MachineBasicBlock &OrigInsBB = B.getMBB();
1821 auto OrigInsPt = B.getInsertPt();
1822
1823 MachineBasicBlock &EntryMBB = B.getMF().front();
1824 EntryMBB.addLiveIn(Arg->getRegister());
1825 B.setInsertPt(EntryMBB, EntryMBB.begin());
1826 B.buildCopy(LiveIn, Arg->getRegister());
1827
1828 B.setInsertPt(OrigInsBB, OrigInsPt);
1829 }
1830
1831 return true;
1832}
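One way to sidestep the undefined 'Mask >> Shift' reported at line 1812 is to special-case an all-zero mask before deriving the shift amount, since countTrailingZeros with the default ZB_Width policy returns 32 for a zero 32-bit input. The standalone C++ sketch below is illustrative only; ctzWidth32 and extractMaskedField are hypothetical helpers, not part of the LLVM sources in this report, and this is not the upstream fix.

#include <cstdint>
#include <cstdio>

// Stand-in for llvm::countTrailingZeros<unsigned>(Val) with ZB_Width:
// returns 32 for a zero input, which is exactly what the analyzer's path assumes.
// (Uses the GCC/Clang builtin, matching the branch MathExtras.h takes below.)
static unsigned ctzWidth32(uint32_t Val) {
  return Val == 0 ? 32u : static_cast<unsigned>(__builtin_ctz(Val));
}

// Extracts the field selected by Mask, guarding the degenerate Mask == 0 case
// so that neither 'Value >> Shift' nor 'Mask >> Shift' ever shifts by 32.
static uint32_t extractMaskedField(uint32_t Value, uint32_t Mask) {
  if (Mask == 0)
    return 0;                              // nothing to extract; avoids the undefined shift
  const unsigned Shift = ctzWidth32(Mask); // guaranteed to be in [0, 31] here
  return (Value >> Shift) & (Mask >> Shift);
}

int main() {
  std::printf("%u\n", extractMaskedField(0xABCDu << 10, 0x3FFu << 10)); // prints 973
  std::printf("%u\n", extractMaskedField(0x1234u, 0u));                 // prints 0
  return 0;
}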
1833
1834bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1835 MachineInstr &MI,
1836 MachineRegisterInfo &MRI,
1837 MachineIRBuilder &B,
1838 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1839 B.setInstr(MI);
1840
1841 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1842
1843 const ArgDescriptor *Arg;
1844 const TargetRegisterClass *RC;
1845 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1846 if (!Arg) {
6
Assuming 'Arg' is non-null
7
Taking false branch
1847 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1848 return false;
1849 }
1850
1851 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
8
Calling 'AMDGPULegalizerInfo::loadInputValue'
1852 MI.eraseFromParent();
1853 return true;
1854 }
1855
1856 return false;
1857}
1858
1859bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1860 MachineRegisterInfo &MRI,
1861 MachineIRBuilder &B) const {
1862 B.setInstr(MI);
1863 Register Dst = MI.getOperand(0).getReg();
1864 LLT DstTy = MRI.getType(Dst);
1865 LLT S16 = LLT::scalar(16);
1866 LLT S32 = LLT::scalar(32);
1867
1868 if (legalizeFastUnsafeFDIV(MI, MRI, B))
1869 return true;
1870
1871 if (DstTy == S16)
1872 return legalizeFDIV16(MI, MRI, B);
1873 if (DstTy == S32)
1874 return legalizeFDIV32(MI, MRI, B);
1875
1876 return false;
1877}
1878
1879bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1880 MachineRegisterInfo &MRI,
1881 MachineIRBuilder &B) const {
1882 Register Res = MI.getOperand(0).getReg();
1883 Register LHS = MI.getOperand(1).getReg();
1884 Register RHS = MI.getOperand(2).getReg();
1885
1886 uint16_t Flags = MI.getFlags();
1887
1888 LLT ResTy = MRI.getType(Res);
1889 LLT S32 = LLT::scalar(32);
1890 LLT S64 = LLT::scalar(64);
1891
1892 const MachineFunction &MF = B.getMF();
1893 bool Unsafe =
1894 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1895
1896 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1897 return false;
1898
1899 if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
1900 return false;
1901
1902 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1903 // 1 / x -> RCP(x)
1904 if (CLHS->isExactlyValue(1.0)) {
1905 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1906 .addUse(RHS)
1907 .setMIFlags(Flags);
1908
1909 MI.eraseFromParent();
1910 return true;
1911 }
1912
1913 // -1 / x -> RCP( FNEG(x) )
1914 if (CLHS->isExactlyValue(-1.0)) {
1915 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1916 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1917 .addUse(FNeg.getReg(0))
1918 .setMIFlags(Flags);
1919
1920 MI.eraseFromParent();
1921 return true;
1922 }
1923 }
1924
1925 // x / y -> x * (1.0 / y)
1926 if (Unsafe) {
1927 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1928 .addUse(RHS)
1929 .setMIFlags(Flags);
1930 B.buildFMul(Res, LHS, RCP, Flags);
1931
1932 MI.eraseFromParent();
1933 return true;
1934 }
1935
1936 return false;
1937}
1938
1939bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1940 MachineRegisterInfo &MRI,
1941 MachineIRBuilder &B) const {
1942 B.setInstr(MI);
1943 Register Res = MI.getOperand(0).getReg();
1944 Register LHS = MI.getOperand(1).getReg();
1945 Register RHS = MI.getOperand(2).getReg();
1946
1947 uint16_t Flags = MI.getFlags();
1948
1949 LLT S16 = LLT::scalar(16);
1950 LLT S32 = LLT::scalar(32);
1951
1952 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1953 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1954
1955 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1956 .addUse(RHSExt.getReg(0))
1957 .setMIFlags(Flags);
1958
1959 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
1960 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
1961
1962 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
1963 .addUse(RDst.getReg(0))
1964 .addUse(RHS)
1965 .addUse(LHS)
1966 .setMIFlags(Flags);
1967
1968 MI.eraseFromParent();
1969 return true;
1970}
1971
1972// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
1973// to enable denorm mode. When 'Enable' is false, disable denorm mode.
1974static void toggleSPDenormMode(bool Enable,
1975 const GCNSubtarget &ST,
1976 MachineIRBuilder &B) {
1977 // Set SP denorm mode to this value.
1978 unsigned SPDenormMode =
1979 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1980
1981 if (ST.hasDenormModeInst()) {
1982 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
1983 unsigned DPDenormModeDefault = ST.hasFP64Denormals()
1984 ? FP_DENORM_FLUSH_NONE
1985 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1986
1987 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
1988 B.buildInstr(AMDGPU::S_DENORM_MODE)
1989 .addImm(NewDenormModeValue);
1990
1991 } else {
1992 // Select FP32 bit field in mode register.
1993 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
1994 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
1995 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
1996
1997 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
1998 .addImm(SPDenormMode)
1999 .addImm(SPDenormModeBitField);
2000 }
2001}
2002
2003bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2004 MachineRegisterInfo &MRI,
2005 MachineIRBuilder &B) const {
2006 B.setInstr(MI);
2007 Register Res = MI.getOperand(0).getReg();
2008 Register LHS = MI.getOperand(1).getReg();
2009 Register RHS = MI.getOperand(2).getReg();
2010
2011 uint16_t Flags = MI.getFlags();
2012
2013 LLT S32 = LLT::scalar(32);
2014 LLT S1 = LLT::scalar(1);
2015
2016 auto One = B.buildFConstant(S32, 1.0f);
2017
2018 auto DenominatorScaled =
2019 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2020 .addUse(RHS)
2021 .addUse(RHS)
2022 .addUse(LHS)
2023 .setMIFlags(Flags);
2024 auto NumeratorScaled =
2025 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2026 .addUse(LHS)
2027 .addUse(RHS)
2028 .addUse(LHS)
2029 .setMIFlags(Flags);
2030
2031 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2032 .addUse(DenominatorScaled.getReg(0))
2033 .setMIFlags(Flags);
2034 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2035
2036 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2037 // aren't modeled as reading it.
2038 if (!ST.hasFP32Denormals())
2039 toggleSPDenormMode(true, ST, B);
2040
2041 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2042 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2043 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2044 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2045 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2046 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2047
2048 if (!ST.hasFP32Denormals())
2049 toggleSPDenormMode(false, ST, B);
2050
2051 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2052 .addUse(Fma4.getReg(0))
2053 .addUse(Fma1.getReg(0))
2054 .addUse(Fma3.getReg(0))
2055 .addUse(NumeratorScaled.getReg(1))
2056 .setMIFlags(Flags);
2057
2058 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2059 .addUse(Fmas.getReg(0))
2060 .addUse(RHS)
2061 .addUse(LHS)
2062 .setMIFlags(Flags);
2063
2064 MI.eraseFromParent();
2065 return true;
2066}
2067
2068bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2069 MachineRegisterInfo &MRI,
2070 MachineIRBuilder &B) const {
2071 B.setInstr(MI);
2072 Register Res = MI.getOperand(0).getReg();
2073 Register LHS = MI.getOperand(2).getReg();
2074 Register RHS = MI.getOperand(3).getReg();
2075 uint16_t Flags = MI.getFlags();
2076
2077 LLT S32 = LLT::scalar(32);
2078 LLT S1 = LLT::scalar(1);
2079
2080 auto Abs = B.buildFAbs(S32, RHS, Flags);
2081 const APFloat C0Val(1.0f);
2082
2083 auto C0 = B.buildConstant(S32, 0x6f800000);
2084 auto C1 = B.buildConstant(S32, 0x2f800000);
2085 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2086
2087 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2088 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2089
2090 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2091
2092 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2093 .addUse(Mul0.getReg(0))
2094 .setMIFlags(Flags);
2095
2096 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2097
2098 B.buildFMul(Res, Sel, Mul1, Flags);
2099
2100 MI.eraseFromParent();
2101 return true;
2102}
2103
2104bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2105 MachineRegisterInfo &MRI,
2106 MachineIRBuilder &B) const {
2107 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2108 if (!MFI->isEntryFunction()) {
3
Assuming the condition is true
4
Taking true branch
2109 return legalizePreloadedArgIntrin(MI, MRI, B,
5
Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
2110 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2111 }
2112
2113 B.setInstr(MI);
2114
2115 uint64_t Offset =
2116 ST.getTargetLowering()->getImplicitParameterOffset(
2117 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2118 Register DstReg = MI.getOperand(0).getReg();
2119 LLT DstTy = MRI.getType(DstReg);
2120 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2121
2122 const ArgDescriptor *Arg;
2123 const TargetRegisterClass *RC;
2124 std::tie(Arg, RC)
2125 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2126 if (!Arg)
2127 return false;
2128
2129 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2130 if (!loadInputValue(KernargPtrReg, B, Arg))
2131 return false;
2132
2133 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2134 MI.eraseFromParent();
2135 return true;
2136}
2137
2138bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2139 MachineRegisterInfo &MRI,
2140 MachineIRBuilder &B,
2141 unsigned AddrSpace) const {
2142 B.setInstr(MI);
2143 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2144 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2145 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2146 MI.eraseFromParent();
2147 return true;
2148}
2149
2150/// Handle register layout difference for f16 images for some subtargets.
2151Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2152 MachineRegisterInfo &MRI,
2153 Register Reg) const {
2154 if (!ST.hasUnpackedD16VMem())
2155 return Reg;
2156
2157 const LLT S16 = LLT::scalar(16);
2158 const LLT S32 = LLT::scalar(32);
2159 LLT StoreVT = MRI.getType(Reg);
2160 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2161
2162 auto Unmerge = B.buildUnmerge(S16, Reg);
2163
2164 SmallVector<Register, 4> WideRegs;
2165 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2166 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2167
2168 int NumElts = StoreVT.getNumElements();
2169
2170 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2171}
2172
2173bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2174 MachineRegisterInfo &MRI,
2175 MachineIRBuilder &B,
2176 bool IsFormat) const {
2177 // TODO: Reject f16 format on targets where unsupported.
2178 Register VData = MI.getOperand(1).getReg();
2179 LLT Ty = MRI.getType(VData);
2180
2181 B.setInstr(MI);
2182
2183 const LLT S32 = LLT::scalar(32);
2184 const LLT S16 = LLT::scalar(16);
2185
2186 // Fixup illegal register types for i8 stores.
2187 if (Ty == LLT::scalar(8) || Ty == S16) {
2188 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2189 MI.getOperand(1).setReg(AnyExt);
2190 return true;
2191 }
2192
2193 if (Ty.isVector()) {
2194 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2195 if (IsFormat)
2196 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2197 return true;
2198 }
2199
2200 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2201 }
2202
2203 return Ty == S32;
2204}
2205
2206bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2207 MachineRegisterInfo &MRI,
2208 MachineIRBuilder &B) const {
2209 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2210 switch (MI.getIntrinsicID()) {
1
Control jumps to 'case amdgcn_implicitarg_ptr:' at line 2254
2211 case Intrinsic::amdgcn_if: {
2212 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2213 const SIRegisterInfo *TRI
2214 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2215
2216 B.setInstr(*BrCond);
2217 Register Def = MI.getOperand(1).getReg();
2218 Register Use = MI.getOperand(3).getReg();
2219 B.buildInstr(AMDGPU::SI_IF)
2220 .addDef(Def)
2221 .addUse(Use)
2222 .addMBB(BrCond->getOperand(1).getMBB());
2223
2224 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2225 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2226 MI.eraseFromParent();
2227 BrCond->eraseFromParent();
2228 return true;
2229 }
2230
2231 return false;
2232 }
2233 case Intrinsic::amdgcn_loop: {
2234 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2235 const SIRegisterInfo *TRI
2236 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2237
2238 B.setInstr(*BrCond);
2239 Register Reg = MI.getOperand(2).getReg();
2240 B.buildInstr(AMDGPU::SI_LOOP)
2241 .addUse(Reg)
2242 .addMBB(BrCond->getOperand(1).getMBB());
2243 MI.eraseFromParent();
2244 BrCond->eraseFromParent();
2245 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2246 return true;
2247 }
2248
2249 return false;
2250 }
2251 case Intrinsic::amdgcn_kernarg_segment_ptr:
2252 return legalizePreloadedArgIntrin(
2253 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2254 case Intrinsic::amdgcn_implicitarg_ptr:
2255 return legalizeImplicitArgPtr(MI, MRI, B);
2
Calling 'AMDGPULegalizerInfo::legalizeImplicitArgPtr'
2256 case Intrinsic::amdgcn_workitem_id_x:
2257 return legalizePreloadedArgIntrin(MI, MRI, B,
2258 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2259 case Intrinsic::amdgcn_workitem_id_y:
2260 return legalizePreloadedArgIntrin(MI, MRI, B,
2261 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2262 case Intrinsic::amdgcn_workitem_id_z:
2263 return legalizePreloadedArgIntrin(MI, MRI, B,
2264 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2265 case Intrinsic::amdgcn_workgroup_id_x:
2266 return legalizePreloadedArgIntrin(MI, MRI, B,
2267 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2268 case Intrinsic::amdgcn_workgroup_id_y:
2269 return legalizePreloadedArgIntrin(MI, MRI, B,
2270 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2271 case Intrinsic::amdgcn_workgroup_id_z:
2272 return legalizePreloadedArgIntrin(MI, MRI, B,
2273 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2274 case Intrinsic::amdgcn_dispatch_ptr:
2275 return legalizePreloadedArgIntrin(MI, MRI, B,
2276 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2277 case Intrinsic::amdgcn_queue_ptr:
2278 return legalizePreloadedArgIntrin(MI, MRI, B,
2279 AMDGPUFunctionArgInfo::QUEUE_PTR);
2280 case Intrinsic::amdgcn_implicit_buffer_ptr:
2281 return legalizePreloadedArgIntrin(
2282 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2283 case Intrinsic::amdgcn_dispatch_id:
2284 return legalizePreloadedArgIntrin(MI, MRI, B,
2285 AMDGPUFunctionArgInfo::DISPATCH_ID);
2286 case Intrinsic::amdgcn_fdiv_fast:
2287 return legalizeFDIVFastIntrin(MI, MRI, B);
2288 case Intrinsic::amdgcn_is_shared:
2289 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2290 case Intrinsic::amdgcn_is_private:
2291 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2292 case Intrinsic::amdgcn_wavefrontsize: {
2293 B.setInstr(MI);
2294 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2295 MI.eraseFromParent();
2296 return true;
2297 }
2298 case Intrinsic::amdgcn_raw_buffer_store:
2299 return legalizeRawBufferStore(MI, MRI, B, false);
2300 case Intrinsic::amdgcn_raw_buffer_store_format:
2301 return legalizeRawBufferStore(MI, MRI, B, true);
2302 default:
2303 return true;
2304 }
2305
2306 return true;
2307}

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
12
Assuming the condition is true
13
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181 assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include "llvm/Support/SwapByteOrder.h"
18#include <algorithm>
19#include <cassert>
20#include <climits>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
17.1
'ZB' is not equal to ZB_Undefined
18
Assuming 'Val' is equal to 0
19
Taking true branch
117 return 32;
20
Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
17
Calling 'TrailingZerosCounter::count'
21
Returning from 'TrailingZerosCounter::count'
22
Returning the value 32
161}
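For reference, the zero-input behavior the analyzer relies on above can be checked in isolation. This is a minimal self-contained sketch assuming the default ZB_Width policy; ctzWidth is a hypothetical stand-in, not a function from this header.

#include <cassert>
#include <limits>

// Hypothetical stand-in for countTrailingZeros(Val) with ZB_Width: a zero
// input yields the full bit width (32 for unsigned int) instead of a bit index.
template <typename T>
static unsigned ctzWidth(T Val) {
  if (Val == 0)
    return std::numeric_limits<T>::digits;
  unsigned N = 0;
  while ((Val & 1) == 0) { Val >>= 1; ++N; }
  return N;
}

int main() {
  assert(ctzWidth(0x8u) == 3);
  assert(ctzWidth(0u) == 32); // this 32 is the 'Shift' that later feeds 'Mask >> Shift'
  return 0;
}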
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
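Note that maskTrailingOnes above special-cases N == 0 precisely so that 'T(-1) >> (Bits - N)' never shifts by the full bit width; the same guard idiom would also protect a shift amount obtained from countTrailingZeros. A tiny self-contained sketch of that idiom follows (maskLow32 is a hypothetical 32-bit analogue, not this header's template):

#include <cassert>
#include <cstdint>

// Hypothetical 32-bit analogue of maskTrailingOnes<T>(N): the N == 0 case is
// special-cased so '~0u >> (32 - N)' never becomes a shift by 32.
static uint32_t maskLow32(unsigned N) {
  assert(N <= 32 && "Invalid bit index");
  return N == 0 ? 0u : (~UINT32_C(0) >> (32 - N));
}

int main() {
  assert(maskLow32(0) == 0u);
  assert(maskLow32(10) == 0x3FFu);
  assert(maskLow32(32) == 0xFFFFFFFFu);
  return 0;
}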
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315// NOTE: The following support functions use the _32/_64 extensions instead of
316// type overloading so that signed and unsigned integers can be used without
317// ambiguity.
318
319/// Return the high 32 bits of a 64 bit value.
320constexpr inline uint32_t Hi_32(uint64_t Value) {
321 return static_cast<uint32_t>(Value >> 32);
322}
323
324/// Return the low 32 bits of a 64 bit value.
325constexpr inline uint32_t Lo_32(uint64_t Value) {
326 return static_cast<uint32_t>(Value);
327}
328
329/// Make a 64-bit integer from a high / low pair of 32-bit integers.
330constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
331 return ((uint64_t)High << 32) | (uint64_t)Low;
332}
333
334/// Checks if an integer fits into the given bit width.
335template <unsigned N> constexpr inline bool isInt(int64_t x) {
336 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
337}
338// Template specializations to get better code for common cases.
339template <> constexpr inline bool isInt<8>(int64_t x) {
340 return static_cast<int8_t>(x) == x;
341}
342template <> constexpr inline bool isInt<16>(int64_t x) {
343 return static_cast<int16_t>(x) == x;
344}
345template <> constexpr inline bool isInt<32>(int64_t x) {
346 return static_cast<int32_t>(x) == x;
347}
348
349/// Checks if a signed integer is an N bit number shifted left by S.
350template <unsigned N, unsigned S>
351constexpr inline bool isShiftedInt(int64_t x) {
352 static_assert(
353 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
354 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
355 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
356}
357
358/// Checks if an unsigned integer fits into the given bit width.
359///
360/// This is written as two functions rather than as simply
361///
362/// return N >= 64 || X < (UINT64_C(1) << N);
363///
364/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
365/// left too many places.
366template <unsigned N>
367constexpr inline typename std::enable_if<(N < 64), bool>::type
368isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline typename std::enable_if<N >= 64, bool>::type
374isUInt(uint64_t X) {
375 return true;
376}
377
378// Template specializations to get better code for common cases.
379template <> constexpr inline bool isUInt<8>(uint64_t x) {
380 return static_cast<uint8_t>(x) == x;
381}
382template <> constexpr inline bool isUInt<16>(uint64_t x) {
383 return static_cast<uint16_t>(x) == x;
384}
385template <> constexpr inline bool isUInt<32>(uint64_t x) {
386 return static_cast<uint32_t>(x) == x;
387}
388
389/// Checks if a unsigned integer is an N bit number shifted left by S.
390template <unsigned N, unsigned S>
391constexpr inline bool isShiftedUInt(uint64_t x) {
392 static_assert(
393 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
394 static_assert(N + S <= 64,
395 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
396 // Per the two static_asserts above, S must be strictly less than 64. So
397 // 1 << S is not undefined behavior.
398 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
399}
400
401/// Gets the maximum value for a N-bit unsigned integer.
402inline uint64_t maxUIntN(uint64_t N) {
403 assert(N > 0 && N <= 64 && "integer width out of range");
404
405 // uint64_t(1) << 64 is undefined behavior, so we can't do
406 // (uint64_t(1) << N) - 1
407 // without checking first that N != 64. But this works and doesn't have a
408 // branch.
409 return UINT64_MAX >> (64 - N);
410}
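The comment above describes the branch-free trick maxUIntN uses to stay clear of 'uint64_t(1) << 64'. A quick standalone check of the same expression (maxUIntN64 is a hypothetical mirror written for illustration, not this header's function):

#include <cassert>
#include <cstdint>

// Mirrors the trick documented above: UINT64_MAX >> (64 - N) avoids the
// undefined 'uint64_t(1) << 64' that the naive '(1 << N) - 1' would hit at N == 64.
static uint64_t maxUIntN64(uint64_t N) {
  assert(N > 0 && N <= 64);
  return UINT64_MAX >> (64 - N);
}

int main() {
  assert(maxUIntN64(8) == 0xFFu);
  assert(maxUIntN64(64) == UINT64_MAX);
  return 0;
}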
411
412/// Gets the minimum value for a N-bit signed integer.
413inline int64_t minIntN(int64_t N) {
414 assert(N > 0 && N <= 64 && "integer width out of range");
415
416 return -(UINT64_C(1)<<(N-1));
417}
418
419/// Gets the maximum value for a N-bit signed integer.
420inline int64_t maxIntN(int64_t N) {
421 assert(N > 0 && N <= 64 && "integer width out of range");
422
423 // This relies on two's complement wraparound when N == 64, so we convert to
424 // int64_t only at the very end to avoid UB.
425 return (UINT64_C(1) << (N - 1)) - 1;
426}
427
428/// Checks if an unsigned integer fits into the given (dynamic) bit width.
429inline bool isUIntN(unsigned N, uint64_t x) {
430 return N >= 64 || x <= maxUIntN(N);
431}
432
433/// Checks if an signed integer fits into the given (dynamic) bit width.
434inline bool isIntN(unsigned N, int64_t x) {
435 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
436}
437
438/// Return true if the argument is a non-empty sequence of ones starting at the
439/// least significant bit with the remainder zero (32 bit version).
440/// Ex. isMask_32(0x0000FFFFU) == true.
441constexpr inline bool isMask_32(uint32_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument is a non-empty sequence of ones starting at the
446/// least significant bit with the remainder zero (64 bit version).
447constexpr inline bool isMask_64(uint64_t Value) {
448 return Value && ((Value + 1) & Value) == 0;
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
453constexpr inline bool isShiftedMask_32(uint32_t Value) {
454 return Value && isMask_32((Value - 1) | Value);
455}
456
457/// Return true if the argument contains a non-empty sequence of ones with the
458/// remainder zero (64 bit version.)
459constexpr inline bool isShiftedMask_64(uint64_t Value) {
460 return Value && isMask_64((Value - 1) | Value);
461}
462
463/// Return true if the argument is a power of two > 0.
464/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
465constexpr inline bool isPowerOf2_32(uint32_t Value) {
466 return Value && !(Value & (Value - 1));
467}
468
469/// Return true if the argument is a power of two > 0 (64 bit edition.)
470constexpr inline bool isPowerOf2_64(uint64_t Value) {
471 return Value && !(Value & (Value - 1));
472}
473
474/// Return a byte-swapped representation of the 16-bit argument.
475inline uint16_t ByteSwap_16(uint16_t Value) {
476 return sys::SwapByteOrder_16(Value);
477}
478
479/// Return a byte-swapped representation of the 32-bit argument.
480inline uint32_t ByteSwap_32(uint32_t Value) {
481 return sys::SwapByteOrder_32(Value);
482}
483
484/// Return a byte-swapped representation of the 64-bit argument.
485inline uint64_t ByteSwap_64(uint64_t Value) {
486 return sys::SwapByteOrder_64(Value);
487}
488
489/// Count the number of ones from the most significant bit to the first
490/// zero bit.
491///
492/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countLeadingZeros<T>(~Value, ZB);
503}
504
505/// Count the number of ones from the least significant bit to the first
506/// zero bit.
507///
508/// Ex. countTrailingOnes(0x00FF00FF) == 8.
509/// Only unsigned integral types are allowed.
510///
511/// \param ZB the behavior on an input of all ones. Only ZB_Width and
512/// ZB_Undefined are valid arguments.
513template <typename T>
514unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
515 static_assert(std::numeric_limits<T>::is_integer &&
516 !std::numeric_limits<T>::is_signed,
517 "Only unsigned integral types are allowed.");
518 return countTrailingZeros<T>(~Value, ZB);
519}
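// Illustrative only (hypothetical sketch, assuming <cassert>): the ones
// counters complement the input and defer to the corresponding zero counters;
// only unsigned arguments are accepted.
inline void exampleCountOnes() {
  assert(countLeadingOnes(0xFF0FFF00u) == 8);  // as in the doc comment above
  assert(countTrailingOnes(0x00FF00FFu) == 8); // as in the doc comment above
  assert(countTrailingOnes(0x7u) == 3);
}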
520
521namespace detail {
522template <typename T, std::size_t SizeOfT> struct PopulationCounter {
523 static unsigned count(T Value) {
524 // Generic version, forward to 32 bits.
525 static_assert(SizeOfT <= 4, "Not implemented!");
526#if defined(__GNUC__)
527 return __builtin_popcount(Value);
528#else
529 uint32_t v = Value;
530 v = v - ((v >> 1) & 0x55555555);
531 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
532 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
533#endif
534 }
535};
536
537template <typename T> struct PopulationCounter<T, 8> {
538 static unsigned count(T Value) {
539#if defined(__GNUC__)
540 return __builtin_popcountll(Value);
541#else
542 uint64_t v = Value;
543 v = v - ((v >> 1) & 0x5555555555555555ULL);
544 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
545 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
546 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
547#endif
548 }
549};
550} // namespace detail
551
552/// Count the number of set bits in a value.
553/// Ex. countPopulation(0xF000F000) = 8
554/// Returns 0 if the word is zero.
555template <typename T>
556inline unsigned countPopulation(T Value) {
557 static_assert(std::numeric_limits<T>::is_integer &&
558 !std::numeric_limits<T>::is_signed,
559 "Only unsigned integral types are allowed.");
560 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
561}
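// Illustrative only (hypothetical sketch, assuming <cassert>): countPopulation
// dispatches on sizeof(T), so 32-bit and 64-bit inputs select different
// detail::PopulationCounter specializations.
inline void examplePopulation() {
  assert(countPopulation(0xF000F000u) == 8);             // 32-bit path
  assert(countPopulation(0xF000F000F000F000ull) == 16);  // 64-bit path
  assert(countPopulation(0u) == 0);
}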
562
563/// Compile time Log2.
564/// Valid only for positive powers of two.
565template <size_t kValue> constexpr inline size_t CTLog2() {
566 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
567 "Value is not a valid power of 2");
568 return 1 + CTLog2<kValue / 2>();
569}
570
571template <> constexpr inline size_t CTLog2<1>() { return 0; }
572
573/// Return the log base 2 of the specified value.
574inline double Log2(double Value) {
575#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
576 return __builtin_log(Value) / __builtin_log(2.0);
577#else
578 return log2(Value);
579#endif
580}
581
582/// Return the floor log base 2 of the specified value, -1 if the value is zero.
583/// (32 bit edition.)
584/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
585inline unsigned Log2_32(uint32_t Value) {
586 return 31 - countLeadingZeros(Value);
587}
588
589/// Return the floor log base 2 of the specified value, -1 if the value is zero.
590/// (64 bit edition.)
591inline unsigned Log2_64(uint64_t Value) {
592 return 63 - countLeadingZeros(Value);
593}
594
595/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
596/// (32 bit edition).
597/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
598inline unsigned Log2_32_Ceil(uint32_t Value) {
599 return 32 - countLeadingZeros(Value - 1);
600}
601
602/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
603/// (64 bit edition.)
604inline unsigned Log2_64_Ceil(uint64_t Value) {
605 return 64 - countLeadingZeros(Value - 1);
606}
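// Illustrative only (hypothetical sketch, assuming <cassert>): the floor and
// ceiling variants agree on powers of two and differ by one otherwise; CTLog2
// gives the same answer at compile time for powers of two.
inline void exampleLog2() {
  assert(Log2_32(32) == 5 && Log2_32_Ceil(32) == 5);
  assert(Log2_32(6) == 2 && Log2_32_Ceil(6) == 3);
  assert(Log2_64(uint64_t(1) << 40) == 40);
  static_assert(CTLog2<64>() == 6, "compile-time log2 of a power of two");
}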
607
608/// Return the greatest common divisor of the values using Euclid's algorithm.
609template <typename T>
610inline T greatestCommonDivisor(T A, T B) {
611 while (B) {
612 T Tmp = B;
613 B = A % B;
614 A = Tmp;
615 }
616 return A;
617}
618
619inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
620 return greatestCommonDivisor<uint64_t>(A, B);
621}
622
623/// This function takes a 64-bit integer and returns the bit equivalent double.
624inline double BitsToDouble(uint64_t Bits) {
625 double D;
626 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
627 memcpy(&D, &Bits, sizeof(Bits));
628 return D;
629}
630
631/// This function takes a 32-bit integer and returns the bit equivalent float.
632inline float BitsToFloat(uint32_t Bits) {
633 float F;
634 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
635 memcpy(&F, &Bits, sizeof(Bits));
636 return F;
637}
638
639/// This function takes a double and returns the bit equivalent 64-bit integer.
640/// Note that copying doubles around changes the bits of NaNs on some hosts,
641/// notably x86, so this routine cannot be used if these bits are needed.
642inline uint64_t DoubleToBits(double Double) {
643 uint64_t Bits;
644 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
645 memcpy(&Bits, &Double, sizeof(Double));
646 return Bits;
647}
648
649/// This function takes a float and returns the bit equivalent 32-bit integer.
650/// Note that copying floats around changes the bits of NaNs on some hosts,
651/// notably x86, so this routine cannot be used if these bits are needed.
652inline uint32_t FloatToBits(float Float) {
653 uint32_t Bits;
654 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
655 memcpy(&Bits, &Float, sizeof(Float));
656 return Bits;
657}
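// Illustrative only (hypothetical sketch, assuming <cassert>): the bit-cast
// helpers round-trip exactly for values that are not NaN, e.g. the IEEE-754
// encodings of 1.0.
inline void exampleBitCasts() {
  assert(FloatToBits(1.0f) == 0x3F800000u);
  assert(BitsToFloat(0x3F800000u) == 1.0f);
  assert(DoubleToBits(1.0) == 0x3FF0000000000000ull);
  assert(BitsToDouble(DoubleToBits(3.5)) == 3.5);
}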
658
659/// A and B are either alignments or offsets. Return the minimum alignment that
660/// may be assumed after adding the two together.
661constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
662 // The largest power of 2 that divides both A and B.
663 //
664 // Replace "-Value" by "1+~Value" in the following commented code to avoid
665 // MSVC warning C4146
666 // return (A | B) & -(A | B);
667 return (A | B) & (1 + ~(A | B));
668}
669
670/// Returns the next power of two (in 64-bits) that is strictly greater than A.
671/// Returns zero on overflow.
672inline uint64_t NextPowerOf2(uint64_t A) {
673 A |= (A >> 1);
674 A |= (A >> 2);
675 A |= (A >> 4);
676 A |= (A >> 8);
677 A |= (A >> 16);
678 A |= (A >> 32);
679 return A + 1;
680}
681
682/// Returns the power of two which is less than or equal to the given value.
683/// Essentially, it is a floor operation across the domain of powers of two.
684inline uint64_t PowerOf2Floor(uint64_t A) {
685 if (!A) return 0;
686 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
687}
688
689/// Returns the power of two which is greater than or equal to the given value.
690/// Essentially, it is a ceil operation across the domain of powers of two.
691inline uint64_t PowerOf2Ceil(uint64_t A) {
692 if (!A)
693 return 0;
694 return NextPowerOf2(A - 1);
695}
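// Illustrative only (hypothetical sketch, assuming <cassert>): MinAlign
// extracts the lowest set bit of (A | B), and the power-of-two helpers bracket
// a value from below and above.
inline void examplePowersOfTwo() {
  assert(MinAlign(8, 12) == 4);   // largest power of 2 dividing both
  assert(NextPowerOf2(8) == 16);  // strictly greater than the input
  assert(PowerOf2Floor(5) == 4 && PowerOf2Ceil(5) == 8);
  assert(PowerOf2Floor(8) == 8 && PowerOf2Ceil(8) == 8);
}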
696
697/// Returns the next integer (mod 2**64) that is greater than or equal to
698/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
699///
700/// If non-zero \p Skew is specified, the return value will be a minimal
701/// integer that is greater than or equal to \p Value and equal to
702/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
703/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
704///
705/// Examples:
706/// \code
707/// alignTo(5, 8) = 8
708/// alignTo(17, 8) = 24
709/// alignTo(~0LL, 8) = 0
710/// alignTo(321, 255) = 510
711///
712/// alignTo(5, 8, 7) = 7
713/// alignTo(17, 8, 1) = 17
714/// alignTo(~0LL, 8, 3) = 3
715/// alignTo(321, 255, 42) = 552
716/// \endcode
717inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
718 assert(Align != 0u && "Align can't be 0.");
719 Skew %= Align;
720 return (Value + Align - 1 - Skew) / Align * Align + Skew;
721}
722
723/// Returns the next integer (mod 2**64) that is greater than or equal to
724/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
725template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
726 static_assert(Align != 0u, "Align must be non-zero");
727 return (Value + Align - 1) / Align * Align;
728}
729
730/// Returns the integer ceil(Numerator / Denominator).
731inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
732 return alignTo(Numerator, Denominator) / Denominator;
733}
734
735/// Returns the largest uint64_t less than or equal to \p Value that is
736/// \p Skew mod \p Align. \p Align must be non-zero.
737inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
738 assert(Align != 0u && "Align can't be 0.");
739 Skew %= Align;
740 return (Value - Skew) / Align * Align + Skew;
741}
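// Illustrative only (hypothetical sketch, assuming <cassert>): alignTo and
// alignDown bracket a value within the progression Align * N + Skew, and
// divideCeil is alignTo followed by a division.
inline void exampleAlignment() {
  assert(alignTo(17, 8) == 24 && alignDown(17, 8) == 16);
  assert(alignTo(17, 8, 1) == 17 && alignDown(17, 8, 1) == 17); // already on the grid
  assert(alignTo<8>(5) == 8);  // compile-time Align
  assert(divideCeil(17, 8) == 3);
}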
742
743/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
744/// Requires 0 < B <= 32.
745template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
746 static_assert(B > 0, "Bit width can't be 0.");
747 static_assert(B <= 32, "Bit width out of range.");
748 return int32_t(X << (32 - B)) >> (32 - B);
749}
750
751/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
752/// Requires 0 < B <= 32.
753inline int32_t SignExtend32(uint32_t X, unsigned B) {
754 assert(B > 0 && "Bit width can't be 0.");
755 assert(B <= 32 && "Bit width out of range.");
756 return int32_t(X << (32 - B)) >> (32 - B);
757}
758
759/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
760/// Requires 0 < B <= 64.
761template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 64, "Bit width out of range.");
764 return int64_t(x << (64 - B)) >> (64 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
768/// Requires 0 < B <= 64.
769inline int64_t SignExtend64(uint64_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 64 && "Bit width out of range.");
772 return int64_t(X << (64 - B)) >> (64 - B);
773}
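// Illustrative only (hypothetical sketch, assuming <cassert>): both SignExtend
// variants shift by (width - B), so B must stay within (0, width]; B == 0
// would shift by the full bit width, which is undefined behaviour and is what
// the asserts above guard against.
inline void exampleSignExtend() {
  assert(SignExtend32<8>(0xFFu) == -1);
  assert(SignExtend32(0x80u, 8) == -128);
  assert(SignExtend64(0xFFFFull, 16) == -1);
}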
774
775/// Subtract two unsigned integers, X and Y, of type T and return the absolute
776/// value of the result.
777template <typename T>
778typename std::enable_if<std::is_unsigned<T>::value, T>::type
779AbsoluteDifference(T X, T Y) {
780 return std::max(X, Y) - std::min(X, Y);
781}
782
783/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
784/// maximum representable value of T on overflow. ResultOverflowed indicates if
785/// the result is larger than the maximum representable value of type T.
786template <typename T>
787typename std::enable_if<std::is_unsigned<T>::value, T>::type
788SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
789 bool Dummy;
790 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
791 // Hacker's Delight, p. 29
792 T Z = X + Y;
793 Overflowed = (Z < X || Z < Y);
794 if (Overflowed)
795 return std::numeric_limits<T>::max();
796 else
797 return Z;
798}
799
800/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
801/// maximum representable value of T on overflow. ResultOverflowed indicates if
802/// the result is larger than the maximum representable value of type T.
803template <typename T>
804typename std::enable_if<std::is_unsigned<T>::value, T>::type
805SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
806 bool Dummy;
807 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
808
809 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
810 // because it fails for uint16_t (where multiplication can have undefined
811 // behavior due to promotion to int), and requires a division in addition
812 // to the multiplication.
813
814 Overflowed = false;
815
816 // Log2(Z) would be either Log2Z or Log2Z + 1.
817 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
818 // will necessarily be less than Log2Max as desired.
819 int Log2Z = Log2_64(X) + Log2_64(Y);
820 const T Max = std::numeric_limits<T>::max();
821 int Log2Max = Log2_64(Max);
822 if (Log2Z < Log2Max) {
823 return X * Y;
824 }
825 if (Log2Z > Log2Max) {
826 Overflowed = true;
827 return Max;
828 }
829
830 // We're going to use the top bit, and maybe overflow one
831 // bit past it. Multiply all but the bottom bit then add
832 // that on at the end.
833 T Z = (X >> 1) * Y;
834 if (Z & ~(Max >> 1)) {
835 Overflowed = true;
836 return Max;
837 }
838 Z <<= 1;
839 if (X & 1)
840 return SaturatingAdd(Z, Y, ResultOverflowed);
841
842 return Z;
843}
844
845/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
846/// the product. Clamp the result to the maximum representable value of T on
847/// overflow. ResultOverflowed indicates if the result is larger than the
848/// maximum representable value of type T.
849template <typename T>
850typename std::enable_if<std::is_unsigned<T>::value, T>::type
851SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
852 bool Dummy;
853 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
854
855 T Product = SaturatingMultiply(X, Y, &Overflowed);
856 if (Overflowed)
857 return Product;
858
859 return SaturatingAdd(A, Product, &Overflowed);
860}
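// Illustrative only (hypothetical sketch, assuming <cassert> and <cstdint>):
// the saturating helpers clamp to std::numeric_limits<T>::max() and report
// the clamp through the optional out-parameter.
inline void exampleSaturating() {
  bool Ovf = false;
  assert(SaturatingAdd<uint8_t>(200, 100, &Ovf) == 255 && Ovf);
  assert(SaturatingMultiply<uint16_t>(1000, 1000, &Ovf) == 65535 && Ovf);
  assert(SaturatingMultiplyAdd<uint64_t>(3, 4, 5, &Ovf) == 17 && !Ovf);
}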
861
862/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
863extern const float huge_valf;
864
865
866/// Add two signed integers, computing the two's complement truncated result,
867/// returning true if overflow occurred.
868template <typename T>
869typename std::enable_if<std::is_signed<T>::value, T>::type
870AddOverflow(T X, T Y, T &Result) {
871#if __has_builtin(__builtin_add_overflow)
872 return __builtin_add_overflow(X, Y, &Result);
873#else
874 // Perform the unsigned addition.
875 using U = typename std::make_unsigned<T>::type;
876 const U UX = static_cast<U>(X);
877 const U UY = static_cast<U>(Y);
878 const U UResult = UX + UY;
879
880 // Convert to signed.
881 Result = static_cast<T>(UResult);
882
883 // Adding two positive numbers should result in a positive number.
884 if (X > 0 && Y > 0)
885 return Result <= 0;
886 // Adding two negatives should result in a negative number.
887 if (X < 0 && Y < 0)
888 return Result >= 0;
889 return false;
890#endif
891}
892
893/// Subtract two signed integers, computing the two's complement truncated
894/// result, returning true if an overflow occurred.
895template <typename T>
896typename std::enable_if<std::is_signed<T>::value, T>::type
897SubOverflow(T X, T Y, T &Result) {
898#if __has_builtin(__builtin_sub_overflow)
899 return __builtin_sub_overflow(X, Y, &Result);
900#else
901 // Perform the unsigned subtraction.
902 using U = typename std::make_unsigned<T>::type;
903 const U UX = static_cast<U>(X);
904 const U UY = static_cast<U>(Y);
905 const U UResult = UX - UY;
906
907 // Convert to signed.
908 Result = static_cast<T>(UResult);
909
910 // Subtracting a positive number from a negative results in a negative number.
911 if (X <= 0 && Y > 0)
912 return Result >= 0;
913 // Subtracting a negative number from a positive results in a positive number.
914 if (X >= 0 && Y < 0)
915 return Result <= 0;
916 return false;
917#endif
918}
919
920
921/// Multiply two signed integers, computing the two's complement truncated
922/// result, returning true if an overflow occurred.
923template <typename T>
924typename std::enable_if<std::is_signed<T>::value, T>::type
925MulOverflow(T X, T Y, T &Result) {
926 // Perform the unsigned multiplication on absolute values.
927 using U = typename std::make_unsigned<T>::type;
928 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
929 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
930 const U UResult = UX * UY;
931
932 // Convert to signed.
933 const bool IsNegative = (X < 0) ^ (Y < 0);
934 Result = IsNegative ? (0 - UResult) : UResult;
935
936 // If any of the args was 0, result is 0 and no overflow occurs.
937 if (UX == 0 || UY == 0)
938 return false;
939
940 // UX and UY are in [1, 2^n], where n is the number of digits.
941 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
942 // positive) divided by an argument compares to the other.
943 if (IsNegative)
944 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
945 else
946 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
947}
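// Illustrative only (hypothetical sketch, assuming <cassert> and <cstdint>):
// the *Overflow helpers return true exactly when the mathematically correct
// result does not fit in T, while still storing the truncated two's
// complement result.
inline void exampleCheckedArithmetic() {
  int32_t R;
  assert(AddOverflow<int32_t>(2147483647, 1, R));   // wraps past INT32_MAX
  assert(!SubOverflow<int32_t>(-1, 2147483647, R)); // INT32_MIN still fits
  assert(MulOverflow<int32_t>(65536, 65536, R));    // 2^32 does not fit
  assert(!MulOverflow<int32_t>(-46341, 46340, R));  // negative product still fits
}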
948
949} // End llvm namespace
950
951#endif