Bug Summary

File: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 1763, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
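
The shift is undefined because C++ only defines shift amounts strictly smaller than the bit width of the (promoted) left operand, so shifting a 32-bit unsigned value by 32 has no defined result. The sketch below is not the code at line 1763 (that line lies outside the excerpt reproduced here); it is a minimal, self-contained illustration of the pattern the checker reports and two common ways to avoid it, using hypothetical names.

#include <cstdint>

// Undefined: when Shift == 32 the shift amount equals the width of
// 'unsigned int', so the standard gives the expression no meaning and the
// analyzer flags it.
uint32_t highBitsUnguarded(uint32_t Value, unsigned Shift) {
  return Value >> Shift; // UB when Shift >= 32
}

// One remedy: handle the boundary case explicitly.
uint32_t highBitsGuarded(uint32_t Value, unsigned Shift) {
  return Shift >= 32 ? 0 : Value >> Shift;
}

// Another: widen to 64 bits so a shift amount of 32 stays in range.
uint32_t highBitsWidened(uint32_t Value, unsigned Shift) {
  // Defined for Shift <= 63; the cast back truncates to 32 bits.
  return static_cast<uint32_t>(static_cast<uint64_t>(Value) >> Shift);
}

Which remedy is appropriate depends on whether a shift amount of 32 can actually occur at the flagged site or whether the analyzer's path is infeasible; this report alone does not establish that.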

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/include -I /build/llvm-toolchain-snapshot-10~svn374877/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~svn374877=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2019-10-15-233810-7101-1 -x c++ /build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#if defined(_MSC_VER) || defined(__MINGW32__)
15// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16// from the Visual C++ cmath / math.h headers:
17// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18#define _USE_MATH_DEFINES
19#endif
20
21#include "AMDGPU.h"
22#include "AMDGPULegalizerInfo.h"
23#include "AMDGPUTargetMachine.h"
24#include "SIMachineFunctionInfo.h"
25#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/TargetOpcodes.h"
28#include "llvm/CodeGen/ValueTypes.h"
29#include "llvm/IR/DerivedTypes.h"
30#include "llvm/IR/DiagnosticInfo.h"
31#include "llvm/IR/Type.h"
32#include "llvm/Support/Debug.h"
33
34#define DEBUG_TYPE "amdgpu-legalinfo"
35
36using namespace llvm;
37using namespace LegalizeActions;
38using namespace LegalizeMutations;
39using namespace LegalityPredicates;
40
41
42static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 1024) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48 };
49}
50
51static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52 return [=](const LegalityQuery &Query) {
53 return Query.Types[TypeIdx].getSizeInBits() == Size;
54 };
55}
56
57static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58 return [=](const LegalityQuery &Query) {
59 const LLT Ty = Query.Types[TypeIdx];
60 return Ty.isVector() &&
61 Ty.getNumElements() % 2 != 0 &&
62 Ty.getElementType().getSizeInBits() < 32 &&
63 Ty.getSizeInBits() % 32 != 0;
64 };
65}
66
67static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 const LLT EltTy = Ty.getScalarType();
71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72 };
73}
74
75static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 const LLT EltTy = Ty.getElementType();
79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80 };
81}
82
83static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84 return [=](const LegalityQuery &Query) {
85 const LLT Ty = Query.Types[TypeIdx];
86 const LLT EltTy = Ty.getElementType();
87 unsigned Size = Ty.getSizeInBits();
88 unsigned Pieces = (Size + 63) / 64;
89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91 };
92}
93
94// Increase the number of vector elements to reach the next multiple of 32-bit
95// type.
96static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99
100 const LLT EltTy = Ty.getElementType();
101 const int Size = Ty.getSizeInBits();
102 const int EltSize = EltTy.getSizeInBits();
103 const int NextMul32 = (Size + 31) / 32;
104
105 assert(EltSize < 32);
106
107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109 };
110}
111
112static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113 return [=](const LegalityQuery &Query) {
114 const LLT QueryTy = Query.Types[TypeIdx];
115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116 };
117}
118
119static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120 return [=](const LegalityQuery &Query) {
121 const LLT QueryTy = Query.Types[TypeIdx];
122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123 };
124}
125
126static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT QueryTy = Query.Types[TypeIdx];
129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130 };
131}
132
133// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134// v2s16.
135static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
138 if (Ty.isVector()) {
139 const int EltSize = Ty.getElementType().getSizeInBits();
140 return EltSize == 32 || EltSize == 64 ||
141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142 EltSize == 128 || EltSize == 256;
143 }
144
145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146 };
147}
148
149static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150 return [=](const LegalityQuery &Query) {
151 return Query.Types[TypeIdx].getElementType() == Type;
152 };
153}
154
155static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156 return [=](const LegalityQuery &Query) {
157 const LLT Ty = Query.Types[TypeIdx];
158 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160 };
161}
162
163AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164 const GCNTargetMachine &TM)
165 : ST(ST_) {
166 using namespace TargetOpcode;
167
168 auto GetAddrSpacePtr = [&TM](unsigned AS) {
169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170 };
171
172 const LLT S1 = LLT::scalar(1);
173 const LLT S8 = LLT::scalar(8);
174 const LLT S16 = LLT::scalar(16);
175 const LLT S32 = LLT::scalar(32);
176 const LLT S64 = LLT::scalar(64);
177 const LLT S96 = LLT::scalar(96);
178 const LLT S128 = LLT::scalar(128);
179 const LLT S256 = LLT::scalar(256);
180 const LLT S1024 = LLT::scalar(1024);
181
182 const LLT V2S16 = LLT::vector(2, 16);
183 const LLT V4S16 = LLT::vector(4, 16);
184
185 const LLT V2S32 = LLT::vector(2, 32);
186 const LLT V3S32 = LLT::vector(3, 32);
187 const LLT V4S32 = LLT::vector(4, 32);
188 const LLT V5S32 = LLT::vector(5, 32);
189 const LLT V6S32 = LLT::vector(6, 32);
190 const LLT V7S32 = LLT::vector(7, 32);
191 const LLT V8S32 = LLT::vector(8, 32);
192 const LLT V9S32 = LLT::vector(9, 32);
193 const LLT V10S32 = LLT::vector(10, 32);
194 const LLT V11S32 = LLT::vector(11, 32);
195 const LLT V12S32 = LLT::vector(12, 32);
196 const LLT V13S32 = LLT::vector(13, 32);
197 const LLT V14S32 = LLT::vector(14, 32);
198 const LLT V15S32 = LLT::vector(15, 32);
199 const LLT V16S32 = LLT::vector(16, 32);
200 const LLT V32S32 = LLT::vector(32, 32);
201
202 const LLT V2S64 = LLT::vector(2, 64);
203 const LLT V3S64 = LLT::vector(3, 64);
204 const LLT V4S64 = LLT::vector(4, 64);
205 const LLT V5S64 = LLT::vector(5, 64);
206 const LLT V6S64 = LLT::vector(6, 64);
207 const LLT V7S64 = LLT::vector(7, 64);
208 const LLT V8S64 = LLT::vector(8, 64);
209 const LLT V16S64 = LLT::vector(16, 64);
210
211 std::initializer_list<LLT> AllS32Vectors =
212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214 std::initializer_list<LLT> AllS64Vectors =
215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216
217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224
225 const LLT CodePtr = FlatPtr;
226
227 const std::initializer_list<LLT> AddrSpaces64 = {
228 GlobalPtr, ConstantPtr, FlatPtr
229 };
230
231 const std::initializer_list<LLT> AddrSpaces32 = {
232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233 };
234
235 const std::initializer_list<LLT> FPTypesBase = {
236 S32, S64
237 };
238
239 const std::initializer_list<LLT> FPTypes16 = {
240 S32, S64, S16
241 };
242
243 const std::initializer_list<LLT> FPTypesPK16 = {
244 S32, S64, S16, V2S16
245 };
246
247 setAction({G_BRCOND, S1}, Legal);
248
249 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250 // elements for v3s16
251 getActionDefinitionsBuilder(G_PHI)
252 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253 .legalFor(AllS32Vectors)
254 .legalFor(AllS64Vectors)
255 .legalFor(AddrSpaces64)
256 .legalFor(AddrSpaces32)
257 .clampScalar(0, S32, S256)
258 .widenScalarToNextPow2(0, 32)
259 .clampMaxNumElements(0, S32, 16)
260 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261 .legalIf(isPointer(0));
262
263 if (ST.has16BitInsts()) {
264 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265 .legalFor({S32, S16})
266 .clampScalar(0, S16, S32)
267 .scalarize(0);
268 } else {
269 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270 .legalFor({S32})
271 .clampScalar(0, S32, S32)
272 .scalarize(0);
273 }
274
275 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276 .legalFor({S32})
277 .clampScalar(0, S32, S32)
278 .scalarize(0);
279
280 // Report legal for any types we can handle anywhere. For the cases only legal
281 // on the SALU, RegBankSelect will be able to re-legalize.
282 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284 .clampScalar(0, S32, S64)
285 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287 .widenScalarToNextPow2(0)
288 .scalarize(0);
289
290 getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
291 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292 .legalFor({{S32, S1}})
293 .clampScalar(0, S32, S32)
294 .scalarize(0); // TODO: Implement.
295
296 getActionDefinitionsBuilder(G_BITCAST)
297 // Don't worry about the size constraint.
298 .legalIf(all(isRegisterType(0), isRegisterType(1)))
299 // FIXME: Testing hack
300 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
301
302 getActionDefinitionsBuilder(G_FCONSTANT)
303 .legalFor({S32, S64, S16})
304 .clampScalar(0, S16, S64);
305
306 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
307 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
308 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
309 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
310 .clampScalarOrElt(0, S32, S1024)
311 .legalIf(isMultiple32(0))
312 .widenScalarToNextPow2(0, 32)
313 .clampMaxNumElements(0, S32, 16);
314
315
316 // FIXME: i1 operands to intrinsics should always be legal, but other i1
317 // values may not be legal. We need to figure out how to distinguish
318 // between these two scenarios.
319 getActionDefinitionsBuilder(G_CONSTANT)
320 .legalFor({S1, S32, S64, S16, GlobalPtr,
321 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
322 .clampScalar(0, S32, S64)
323 .widenScalarToNextPow2(0)
324 .legalIf(isPointer(0));
325
326 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
327 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
328 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
329
330
331 auto &FPOpActions = getActionDefinitionsBuilder(
332 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
333 .legalFor({S32, S64});
334 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
335 .customFor({S32, S64});
336
337 if (ST.has16BitInsts()) {
338 if (ST.hasVOP3PInsts())
339 FPOpActions.legalFor({S16, V2S16});
340 else
341 FPOpActions.legalFor({S16});
342
343 TrigActions.customFor({S16});
344 }
345
346 auto &MinNumMaxNum = getActionDefinitionsBuilder({
347 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
348
349 if (ST.hasVOP3PInsts()) {
350 MinNumMaxNum.customFor(FPTypesPK16)
351 .clampMaxNumElements(0, S16, 2)
352 .clampScalar(0, S16, S64)
353 .scalarize(0);
354 } else if (ST.has16BitInsts()) {
355 MinNumMaxNum.customFor(FPTypes16)
356 .clampScalar(0, S16, S64)
357 .scalarize(0);
358 } else {
359 MinNumMaxNum.customFor(FPTypesBase)
360 .clampScalar(0, S32, S64)
361 .scalarize(0);
362 }
363
364 if (ST.hasVOP3PInsts())
365 FPOpActions.clampMaxNumElements(0, S16, 2);
366
367 FPOpActions
368 .scalarize(0)
369 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
370
371 TrigActions
372 .scalarize(0)
373 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
374
375 getActionDefinitionsBuilder({G_FNEG, G_FABS})
376 .legalFor(FPTypesPK16)
377 .clampMaxNumElements(0, S16, 2)
378 .scalarize(0)
379 .clampScalar(0, S16, S64);
380
381 // TODO: Implement
382 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
383
384 if (ST.has16BitInsts()) {
385 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
386 .legalFor({S32, S64, S16})
387 .scalarize(0)
388 .clampScalar(0, S16, S64);
389 } else {
390 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
391 .legalFor({S32, S64})
392 .scalarize(0)
393 .clampScalar(0, S32, S64);
394 }
395
396 getActionDefinitionsBuilder(G_FPTRUNC)
397 .legalFor({{S32, S64}, {S16, S32}})
398 .scalarize(0);
399
400 getActionDefinitionsBuilder(G_FPEXT)
401 .legalFor({{S64, S32}, {S32, S16}})
402 .lowerFor({{S64, S16}}) // FIXME: Implement
403 .scalarize(0);
404
405 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
406 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
407
408 getActionDefinitionsBuilder(G_FSUB)
409 // Use actual fsub instruction
410 .legalFor({S32})
411 // Must use fadd + fneg
412 .lowerFor({S64, S16, V2S16})
413 .scalarize(0)
414 .clampScalar(0, S32, S64);
415
416 // Whether this is legal depends on the floating point mode for the function.
417 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
418 if (ST.hasMadF16())
419 FMad.customFor({S32, S16});
420 else
421 FMad.customFor({S32});
422 FMad.scalarize(0)
423 .lower();
424
425 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
426 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
427 {S32, S1}, {S64, S1}, {S16, S1},
428 {S96, S32},
429 // FIXME: Hack
430 {S64, LLT::scalar(33)},
431 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
432 .scalarize(0);
433
434 // TODO: Split s1->s64 during regbankselect for VALU.
435 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
436 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
437 .lowerFor({{S32, S64}})
438 .customFor({{S64, S64}});
439 if (ST.has16BitInsts())
440 IToFP.legalFor({{S16, S16}});
441 IToFP.clampScalar(1, S32, S64)
442 .scalarize(0);
443
444 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
445 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
446 if (ST.has16BitInsts())
447 FPToI.legalFor({{S16, S16}});
448 else
449 FPToI.minScalar(1, S32);
450
451 FPToI.minScalar(0, S32)
452 .scalarize(0);
453
454 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
455 .legalFor({S32, S64})
456 .scalarize(0);
457
458 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
459 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
460 .legalFor({S32, S64})
461 .clampScalar(0, S32, S64)
462 .scalarize(0);
463 } else {
464 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
465 .legalFor({S32})
466 .customFor({S64})
467 .clampScalar(0, S32, S64)
468 .scalarize(0);
469 }
470
471 getActionDefinitionsBuilder(G_GEP)
472 .legalForCartesianProduct(AddrSpaces64, {S64})
473 .legalForCartesianProduct(AddrSpaces32, {S32})
474 .scalarize(0);
475
476 getActionDefinitionsBuilder(G_PTR_MASK)
477 .scalarize(0)
478 .alwaysLegal();
479
480 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
481
482 auto &CmpBuilder =
483 getActionDefinitionsBuilder(G_ICMP)
484 .legalForCartesianProduct(
485 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
486 .legalFor({{S1, S32}, {S1, S64}});
487 if (ST.has16BitInsts()) {
488 CmpBuilder.legalFor({{S1, S16}});
489 }
490
491 CmpBuilder
492 .widenScalarToNextPow2(1)
493 .clampScalar(1, S32, S64)
494 .scalarize(0)
495 .legalIf(all(typeIs(0, S1), isPointer(1)));
496
497 getActionDefinitionsBuilder(G_FCMP)
498 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
499 .widenScalarToNextPow2(1)
500 .clampScalar(1, S32, S64)
501 .scalarize(0);
502
503 // FIXME: fexp, flog2, flog10 needs to be custom lowered.
504 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
505 G_FLOG, G_FLOG2, G_FLOG10})
506 .legalFor({S32})
507 .scalarize(0);
508
509 // The 64-bit versions produce 32-bit results, but only on the SALU.
510 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
511 G_CTTZ, G_CTTZ_ZERO_UNDEF,
512 G_CTPOP})
513 .legalFor({{S32, S32}, {S32, S64}})
514 .clampScalar(0, S32, S32)
515 .clampScalar(1, S32, S64)
516 .scalarize(0)
517 .widenScalarToNextPow2(0, 32)
518 .widenScalarToNextPow2(1, 32);
519
520 // TODO: Expand for > s32
521 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
522 .legalFor({S32})
523 .clampScalar(0, S32, S32)
524 .scalarize(0);
525
526 if (ST.has16BitInsts()) {
527 if (ST.hasVOP3PInsts()) {
528 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
529 .legalFor({S32, S16, V2S16})
530 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
531 .clampMaxNumElements(0, S16, 2)
532 .clampScalar(0, S16, S32)
533 .widenScalarToNextPow2(0)
534 .scalarize(0);
535 } else {
536 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
537 .legalFor({S32, S16})
538 .widenScalarToNextPow2(0)
539 .clampScalar(0, S16, S32)
540 .scalarize(0);
541 }
542 } else {
543 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
544 .legalFor({S32})
545 .clampScalar(0, S32, S32)
546 .widenScalarToNextPow2(0)
547 .scalarize(0);
548 }
549
550 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
551 return [=](const LegalityQuery &Query) {
552 return Query.Types[TypeIdx0].getSizeInBits() <
553 Query.Types[TypeIdx1].getSizeInBits();
554 };
555 };
556
557 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
558 return [=](const LegalityQuery &Query) {
559 return Query.Types[TypeIdx0].getSizeInBits() >
560 Query.Types[TypeIdx1].getSizeInBits();
561 };
562 };
563
564 getActionDefinitionsBuilder(G_INTTOPTR)
565 // List the common cases
566 .legalForCartesianProduct(AddrSpaces64, {S64})
567 .legalForCartesianProduct(AddrSpaces32, {S32})
568 .scalarize(0)
569 // Accept any address space as long as the size matches
570 .legalIf(sameSize(0, 1))
571 .widenScalarIf(smallerThan(1, 0),
572 [](const LegalityQuery &Query) {
573 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
574 })
575 .narrowScalarIf(greaterThan(1, 0),
576 [](const LegalityQuery &Query) {
577 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
578 });
579
580 getActionDefinitionsBuilder(G_PTRTOINT)
581 // List the common cases
582 .legalForCartesianProduct(AddrSpaces64, {S64})
583 .legalForCartesianProduct(AddrSpaces32, {S32})
584 .scalarize(0)
585 // Accept any address space as long as the size matches
586 .legalIf(sameSize(0, 1))
587 .widenScalarIf(smallerThan(0, 1),
588 [](const LegalityQuery &Query) {
589 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
590 })
591 .narrowScalarIf(
592 greaterThan(0, 1),
593 [](const LegalityQuery &Query) {
594 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
595 });
596
597 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
598 .scalarize(0)
599 .custom();
600
601 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
602 // handle some operations by just promoting the register during
603 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
604 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
605 switch (AS) {
606 // FIXME: Private element size.
607 case AMDGPUAS::PRIVATE_ADDRESS:
608 return 32;
609 // FIXME: Check subtarget
610 case AMDGPUAS::LOCAL_ADDRESS:
611 return ST.useDS128() ? 128 : 64;
612
613 // Treat constant and global as identical. SMRD loads are sometimes usable
614 // for global loads (ideally constant address space should be eliminated)
615 // depending on the context. Legality cannot be context dependent, but
616 // RegBankSelect can split the load as necessary depending on the pointer
617 // register bank/uniformity and if the memory is invariant or not written in
618 // a kernel.
619 case AMDGPUAS::CONSTANT_ADDRESS:
620 case AMDGPUAS::GLOBAL_ADDRESS:
621 return 512;
622 default:
623 return 128;
624 }
625 };
626
627 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
628 const LLT DstTy = Query.Types[0];
629
630 // Split vector extloads.
631 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
632 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
633 return true;
634
635 const LLT PtrTy = Query.Types[1];
636 unsigned AS = PtrTy.getAddressSpace();
637 if (MemSize > maxSizeForAddrSpace(AS))
638 return true;
639
640 // Catch weird sized loads that don't evenly divide into the access sizes
641 // TODO: May be able to widen depending on alignment etc.
642 unsigned NumRegs = MemSize / 32;
643 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
644 return true;
645
646 unsigned Align = Query.MMODescrs[0].AlignInBits;
647 if (Align < MemSize) {
648 const SITargetLowering *TLI = ST.getTargetLowering();
649 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
650 }
651
652 return false;
653 };
654
655 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
656 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
657 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
658
659 // TODO: Refine based on subtargets which support unaligned access or 128-bit
660 // LDS
661 // TODO: Unsupported flat for SI.
662
663 for (unsigned Op : {G_LOAD, G_STORE}) {
664 const bool IsStore = Op == G_STORE;
665
666 auto &Actions = getActionDefinitionsBuilder(Op);
667 // Whitelist the common cases.
668 // TODO: Pointer loads
669 // TODO: Wide constant loads
670 // TODO: Only CI+ has 3x loads
671 // TODO: Loads to s16 on gfx9
672 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
673 {V2S32, GlobalPtr, 64, GlobalAlign32},
674 {V3S32, GlobalPtr, 96, GlobalAlign32},
675 {S96, GlobalPtr, 96, GlobalAlign32},
676 {V4S32, GlobalPtr, 128, GlobalAlign32},
677 {S128, GlobalPtr, 128, GlobalAlign32},
678 {S64, GlobalPtr, 64, GlobalAlign32},
679 {V2S64, GlobalPtr, 128, GlobalAlign32},
680 {V2S16, GlobalPtr, 32, GlobalAlign32},
681 {S32, GlobalPtr, 8, GlobalAlign8},
682 {S32, GlobalPtr, 16, GlobalAlign16},
683
684 {S32, LocalPtr, 32, 32},
685 {S64, LocalPtr, 64, 32},
686 {V2S32, LocalPtr, 64, 32},
687 {S32, LocalPtr, 8, 8},
688 {S32, LocalPtr, 16, 16},
689 {V2S16, LocalPtr, 32, 32},
690
691 {S32, PrivatePtr, 32, 32},
692 {S32, PrivatePtr, 8, 8},
693 {S32, PrivatePtr, 16, 16},
694 {V2S16, PrivatePtr, 32, 32},
695
696 {S32, FlatPtr, 32, GlobalAlign32},
697 {S32, FlatPtr, 16, GlobalAlign16},
698 {S32, FlatPtr, 8, GlobalAlign8},
699 {V2S16, FlatPtr, 32, GlobalAlign32},
700
701 {S32, ConstantPtr, 32, GlobalAlign32},
702 {V2S32, ConstantPtr, 64, GlobalAlign32},
703 {V3S32, ConstantPtr, 96, GlobalAlign32},
704 {V4S32, ConstantPtr, 128, GlobalAlign32},
705 {S64, ConstantPtr, 64, GlobalAlign32},
706 {S128, ConstantPtr, 128, GlobalAlign32},
707 {V2S32, ConstantPtr, 32, GlobalAlign32}});
708 Actions
709 .customIf(typeIs(1, Constant32Ptr))
710 .narrowScalarIf(
711 [=](const LegalityQuery &Query) -> bool {
712 return !Query.Types[0].isVector() && needToSplitLoad(Query);
713 },
714 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
715 const LLT DstTy = Query.Types[0];
716 const LLT PtrTy = Query.Types[1];
717
718 const unsigned DstSize = DstTy.getSizeInBits();
719 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
720
721 // Split extloads.
722 if (DstSize > MemSize)
723 return std::make_pair(0, LLT::scalar(MemSize));
724
725 if (DstSize > 32 && (DstSize % 32 != 0)) {
726 // FIXME: Need a way to specify non-extload of larger size if
727 // suitably aligned.
728 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
729 }
730
731 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
732 if (MemSize > MaxSize)
733 return std::make_pair(0, LLT::scalar(MaxSize));
734
735 unsigned Align = Query.MMODescrs[0].AlignInBits;
736 return std::make_pair(0, LLT::scalar(Align));
737 })
738 .fewerElementsIf(
739 [=](const LegalityQuery &Query) -> bool {
740 return Query.Types[0].isVector() && needToSplitLoad(Query);
741 },
742 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
743 const LLT DstTy = Query.Types[0];
744 const LLT PtrTy = Query.Types[1];
745
746 LLT EltTy = DstTy.getElementType();
747 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
748
749 // Split if it's too large for the address space.
750 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
751 unsigned NumElts = DstTy.getNumElements();
752 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
753
754 // FIXME: Refine when odd breakdowns handled
755 // The scalars will need to be re-legalized.
756 if (NumPieces == 1 || NumPieces >= NumElts ||
757 NumElts % NumPieces != 0)
758 return std::make_pair(0, EltTy);
759
760 return std::make_pair(0,
761 LLT::vector(NumElts / NumPieces, EltTy));
762 }
763
764 // Need to split because of alignment.
765 unsigned Align = Query.MMODescrs[0].AlignInBits;
766 unsigned EltSize = EltTy.getSizeInBits();
767 if (EltSize > Align &&
768 (EltSize / Align < DstTy.getNumElements())) {
769 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
770 }
771
772 // May need relegalization for the scalars.
773 return std::make_pair(0, EltTy);
774 })
775 .minScalar(0, S32);
776
777 if (IsStore)
778 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
779
780 // TODO: Need a bitcast lower option?
781 Actions
782 .legalIf([=](const LegalityQuery &Query) {
783 const LLT Ty0 = Query.Types[0];
784 unsigned Size = Ty0.getSizeInBits();
785 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
786 unsigned Align = Query.MMODescrs[0].AlignInBits;
787
788 // No extending vector loads.
789 if (Size > MemSize && Ty0.isVector())
790 return false;
791
792 // FIXME: Widening store from alignment not valid.
793 if (MemSize < Size)
794 MemSize = std::max(MemSize, Align);
795
796 switch (MemSize) {
797 case 8:
798 case 16:
799 return Size == 32;
800 case 32:
801 case 64:
802 case 128:
803 return true;
804 case 96:
805 return ST.hasDwordx3LoadStores();
806 case 256:
807 case 512:
808 return true;
809 default:
810 return false;
811 }
812 })
813 .widenScalarToNextPow2(0)
814 // TODO: v3s32->v4s32 with alignment
815 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
816 }
817
818 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
819 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
820 {S32, GlobalPtr, 16, 2 * 8},
821 {S32, LocalPtr, 8, 8},
822 {S32, LocalPtr, 16, 16},
823 {S32, PrivatePtr, 8, 8},
824 {S32, PrivatePtr, 16, 16},
825 {S32, ConstantPtr, 8, 8},
826 {S32, ConstantPtr, 16, 2 * 8}});
827 if (ST.hasFlatAddressSpace()) {
828 ExtLoads.legalForTypesWithMemDesc(
829 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
830 }
831
832 ExtLoads.clampScalar(0, S32, S32)
833 .widenScalarToNextPow2(0)
834 .unsupportedIfMemSizeNotPow2()
835 .lower();
836
837 auto &Atomics = getActionDefinitionsBuilder(
838 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
839 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
840 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
841 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
842 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
843 {S64, GlobalPtr}, {S64, LocalPtr}});
844 if (ST.hasFlatAddressSpace()) {
845 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
846 }
847
848 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
849 .legalFor({{S32, LocalPtr}});
850
851 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
852 .lower();
853
854 // TODO: Pointer types, any 32-bit or 64-bit vector
855 getActionDefinitionsBuilder(G_SELECT)
856 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
857 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
858 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
859 .clampScalar(0, S16, S64)
860 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
861 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
862 .scalarize(1)
863 .clampMaxNumElements(0, S32, 2)
864 .clampMaxNumElements(0, LocalPtr, 2)
865 .clampMaxNumElements(0, PrivatePtr, 2)
866 .scalarize(0)
867 .widenScalarToNextPow2(0)
868 .legalIf(all(isPointer(0), typeIs(1, S1)));
869
870 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
871 // be more flexible with the shift amount type.
872 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
873 .legalFor({{S32, S32}, {S64, S32}});
874 if (ST.has16BitInsts()) {
875 if (ST.hasVOP3PInsts()) {
876 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
877 .clampMaxNumElements(0, S16, 2);
878 } else
879 Shifts.legalFor({{S16, S32}, {S16, S16}});
880
881 Shifts.clampScalar(1, S16, S32);
882 Shifts.clampScalar(0, S16, S64);
883 Shifts.widenScalarToNextPow2(0, 16);
884 } else {
885 // Make sure we legalize the shift amount type first, as the general
886 // expansion for the shifted type will produce much worse code if it hasn't
887 // been truncated already.
888 Shifts.clampScalar(1, S32, S32);
889 Shifts.clampScalar(0, S32, S64);
890 Shifts.widenScalarToNextPow2(0, 32);
891 }
892 Shifts.scalarize(0);
893
894 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
895 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
896 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
897 unsigned IdxTypeIdx = 2;
898
899 getActionDefinitionsBuilder(Op)
900 .customIf([=](const LegalityQuery &Query) {
901 const LLT EltTy = Query.Types[EltTypeIdx];
902 const LLT VecTy = Query.Types[VecTypeIdx];
903 const LLT IdxTy = Query.Types[IdxTypeIdx];
904 return (EltTy.getSizeInBits() == 16 ||
905 EltTy.getSizeInBits() % 32 == 0) &&
906 VecTy.getSizeInBits() % 32 == 0 &&
907 VecTy.getSizeInBits() <= 1024 &&
908 IdxTy.getSizeInBits() == 32;
909 })
910 .clampScalar(EltTypeIdx, S32, S64)
911 .clampScalar(VecTypeIdx, S32, S64)
912 .clampScalar(IdxTypeIdx, S32, S32);
913 }
914
915 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
916 .unsupportedIf([=](const LegalityQuery &Query) {
917 const LLT &EltTy = Query.Types[1].getElementType();
918 return Query.Types[0] != EltTy;
919 });
920
921 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
922 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
923 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
924
925 // FIXME: Doesn't handle extract of illegal sizes.
926 getActionDefinitionsBuilder(Op)
927 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
928 // FIXME: Multiples of 16 should not be legal.
929 .legalIf([=](const LegalityQuery &Query) {
930 const LLT BigTy = Query.Types[BigTyIdx];
931 const LLT LitTy = Query.Types[LitTyIdx];
932 return (BigTy.getSizeInBits() % 32 == 0) &&
933 (LitTy.getSizeInBits() % 16 == 0);
934 })
935 .widenScalarIf(
936 [=](const LegalityQuery &Query) {
937 const LLT BigTy = Query.Types[BigTyIdx];
938 return (BigTy.getScalarSizeInBits() < 16);
939 },
940 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
941 .widenScalarIf(
942 [=](const LegalityQuery &Query) {
943 const LLT LitTy = Query.Types[LitTyIdx];
944 return (LitTy.getScalarSizeInBits() < 16);
945 },
946 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
947 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
948 .widenScalarToNextPow2(BigTyIdx, 32);
949
950 }
951
952 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
953 .legalForCartesianProduct(AllS32Vectors, {S32})
954 .legalForCartesianProduct(AllS64Vectors, {S64})
955 .clampNumElements(0, V16S32, V32S32)
956 .clampNumElements(0, V2S64, V16S64)
957 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
958
959 if (ST.hasScalarPackInsts())
960 BuildVector.legalFor({V2S16, S32});
961
962 BuildVector
963 .minScalarSameAs(1, 0)
964 .legalIf(isRegisterType(0))
965 .minScalarOrElt(0, S32);
966
967 if (ST.hasScalarPackInsts()) {
968 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
969 .legalFor({V2S16, S32})
970 .lower();
971 } else {
972 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
973 .lower();
974 }
975
976 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
977 .legalIf(isRegisterType(0));
978
979 // TODO: Don't fully scalarize v2s16 pieces
980 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
981
982 // Merge/Unmerge
983 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
984 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
985 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
986
987 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
988 const LLT &Ty = Query.Types[TypeIdx];
989 if (Ty.isVector()) {
990 const LLT &EltTy = Ty.getElementType();
991 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
992 return true;
993 if (!isPowerOf2_32(EltTy.getSizeInBits()))
994 return true;
995 }
996 return false;
997 };
998
999 auto &Builder = getActionDefinitionsBuilder(Op)
1000 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1001 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1002 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1003 // valid.
1004 .clampScalar(LitTyIdx, S16, S256)
1005 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1006 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1007 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1008 elementTypeIs(1, S16)),
1009 changeTo(1, V2S16))
1010 // Break up vectors with weird elements into scalars
1011 .fewerElementsIf(
1012 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1013 scalarize(0))
1014 .fewerElementsIf(
1015 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1016 scalarize(1))
1017 .clampScalar(BigTyIdx, S32, S1024)
1018 .lowerFor({{S16, V2S16}});
1019
1020 if (Op == G_MERGE_VALUES) {
1021 Builder.widenScalarIf(
1022 // TODO: Use 16-bit shifts if legal for 8-bit values?
1023 [=](const LegalityQuery &Query) {
1024 const LLT Ty = Query.Types[LitTyIdx];
1025 return Ty.getSizeInBits() < 32;
1026 },
1027 changeTo(LitTyIdx, S32));
1028 }
1029
1030 Builder.widenScalarIf(
1031 [=](const LegalityQuery &Query) {
1032 const LLT Ty = Query.Types[BigTyIdx];
1033 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1034 Ty.getSizeInBits() % 16 != 0;
1035 },
1036 [=](const LegalityQuery &Query) {
1037 // Pick the next power of 2, or a multiple of 64 over 128.
1038 // Whichever is smaller.
1039 const LLT &Ty = Query.Types[BigTyIdx];
1040 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1041 if (NewSizeInBits >= 256) {
1042 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1043 if (RoundedTo < NewSizeInBits)
1044 NewSizeInBits = RoundedTo;
1045 }
1046 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1047 })
1048 .legalIf([=](const LegalityQuery &Query) {
1049 const LLT &BigTy = Query.Types[BigTyIdx];
1050 const LLT &LitTy = Query.Types[LitTyIdx];
1051
1052 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1053 return false;
1054 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1055 return false;
1056
1057 return BigTy.getSizeInBits() % 16 == 0 &&
1058 LitTy.getSizeInBits() % 16 == 0 &&
1059 BigTy.getSizeInBits() <= 1024;
1060 })
1061 // Any vectors left are the wrong size. Scalarize them.
1062 .scalarize(0)
1063 .scalarize(1);
1064 }
1065
1066 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1067
1068 computeTables();
1069 verify(*ST.getInstrInfo());
1070}
1071
1072bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1073 MachineRegisterInfo &MRI,
1074 MachineIRBuilder &B,
1075 GISelChangeObserver &Observer) const {
1076 switch (MI.getOpcode()) {
1077 case TargetOpcode::G_ADDRSPACE_CAST:
1078 return legalizeAddrSpaceCast(MI, MRI, B);
1079 case TargetOpcode::G_FRINT:
1080 return legalizeFrint(MI, MRI, B);
1081 case TargetOpcode::G_FCEIL:
1082 return legalizeFceil(MI, MRI, B);
1083 case TargetOpcode::G_INTRINSIC_TRUNC:
1084 return legalizeIntrinsicTrunc(MI, MRI, B);
1085 case TargetOpcode::G_SITOFP:
1086 return legalizeITOFP(MI, MRI, B, true);
1087 case TargetOpcode::G_UITOFP:
1088 return legalizeITOFP(MI, MRI, B, false);
1089 case TargetOpcode::G_FMINNUM:
1090 case TargetOpcode::G_FMAXNUM:
1091 case TargetOpcode::G_FMINNUM_IEEE:
1092 case TargetOpcode::G_FMAXNUM_IEEE:
1093 return legalizeMinNumMaxNum(MI, MRI, B);
1094 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1095 return legalizeExtractVectorElt(MI, MRI, B);
1096 case TargetOpcode::G_INSERT_VECTOR_ELT:
1097 return legalizeInsertVectorElt(MI, MRI, B);
1098 case TargetOpcode::G_FSIN:
1099 case TargetOpcode::G_FCOS:
1100 return legalizeSinCos(MI, MRI, B);
1101 case TargetOpcode::G_GLOBAL_VALUE:
1102 return legalizeGlobalValue(MI, MRI, B);
1103 case TargetOpcode::G_LOAD:
1104 return legalizeLoad(MI, MRI, B, Observer);
1105 case TargetOpcode::G_FMAD:
1106 return legalizeFMad(MI, MRI, B);
1107 default:
1108 return false;
1109 }
1110
1111 llvm_unreachable("expected switch to return");
1112}
1113
1114Register AMDGPULegalizerInfo::getSegmentAperture(
1115 unsigned AS,
1116 MachineRegisterInfo &MRI,
1117 MachineIRBuilder &B) const {
1118 MachineFunction &MF = B.getMF();
1119 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1120 const LLT S32 = LLT::scalar(32);
1121
1122 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1123
1124 if (ST.hasApertureRegs()) {
1125 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1126 // getreg.
1127 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1128 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1129 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1130 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1131 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1132 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1133 unsigned Encoding =
1134 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1135 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1136 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1137
1138 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1139 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1140
1141 B.buildInstr(AMDGPU::S_GETREG_B32)
1142 .addDef(GetReg)
1143 .addImm(Encoding);
1144 MRI.setType(GetReg, S32);
1145
1146 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1147 B.buildInstr(TargetOpcode::G_SHL)
1148 .addDef(ApertureReg)
1149 .addUse(GetReg)
1150 .addUse(ShiftAmt.getReg(0));
1151
1152 return ApertureReg;
1153 }
1154
1155 Register QueuePtr = MRI.createGenericVirtualRegister(
1156 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1157
1158 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1159 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1160 return Register();
1161
1162 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1163 // private_segment_aperture_base_hi.
1164 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1165
1166 // FIXME: Don't use undef
1167 Value *V = UndefValue::get(PointerType::get(
1168 Type::getInt8Ty(MF.getFunction().getContext()),
1169 AMDGPUAS::CONSTANT_ADDRESS));
1170
1171 MachinePointerInfo PtrInfo(V, StructOffset);
1172 MachineMemOperand *MMO = MF.getMachineMemOperand(
1173 PtrInfo,
1174 MachineMemOperand::MOLoad |
1175 MachineMemOperand::MODereferenceable |
1176 MachineMemOperand::MOInvariant,
1177 4,
1178 MinAlign(64, StructOffset));
1179
1180 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1181 Register LoadAddr;
1182
1183 B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1184 B.buildLoad(LoadResult, LoadAddr, *MMO);
1185 return LoadResult;
1186}
1187
1188bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1189 MachineInstr &MI, MachineRegisterInfo &MRI,
1190 MachineIRBuilder &B) const {
1191 MachineFunction &MF = B.getMF();
1192
1193 B.setInstr(MI);
1194
1195 const LLT S32 = LLT::scalar(32);
1196 Register Dst = MI.getOperand(0).getReg();
1197 Register Src = MI.getOperand(1).getReg();
1198
1199 LLT DstTy = MRI.getType(Dst);
1200 LLT SrcTy = MRI.getType(Src);
1201 unsigned DestAS = DstTy.getAddressSpace();
1202 unsigned SrcAS = SrcTy.getAddressSpace();
1203
1204 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1205 // vector element.
1206 assert(!DstTy.isVector());
1207
1208 const AMDGPUTargetMachine &TM
1209 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1210
1211 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1212 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1213 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1214 return true;
1215 }
1216
1217 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1218 // Truncate.
1219 B.buildExtract(Dst, Src, 0);
1220 MI.eraseFromParent();
1221 return true;
1222 }
1223
1224 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1225 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1226 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1227
1228 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1229 // another. Merge operands are required to be the same type, but creating an
1230 // extra ptrtoint would be kind of pointless.
1231 auto HighAddr = B.buildConstant(
1232 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1233 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1234 MI.eraseFromParent();
1235 return true;
1236 }
1237
1238 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1239 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1240 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1241 unsigned NullVal = TM.getNullPointerValue(DestAS);
1242
1243 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1244 auto FlatNull = B.buildConstant(SrcTy, 0);
1245
1246 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1247
1248 // Extract low 32-bits of the pointer.
1249 B.buildExtract(PtrLo32, Src, 0);
1250
1251 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1252 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1253 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1254
1255 MI.eraseFromParent();
1256 return true;
1257 }
1258
1259 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1260 return false;
1261
1262 if (!ST.hasFlatAddressSpace())
1263 return false;
1264
1265 auto SegmentNull =
1266 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1267 auto FlatNull =
1268 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1269
1270 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1271 if (!ApertureReg.isValid())
1272 return false;
1273
1274 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1275 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1276
1277 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1278
1279 // Coerce the type of the low half of the result so we can use merge_values.
1280 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1281 B.buildInstr(TargetOpcode::G_PTRTOINT)
1282 .addDef(SrcAsInt)
1283 .addUse(Src);
1284
1285 // TODO: Should we allow mismatched types but matching sizes in merges to
1286 // avoid the ptrtoint?
1287 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1288 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1289
1290 MI.eraseFromParent();
1291 return true;
1292}
1293
1294bool AMDGPULegalizerInfo::legalizeFrint(
1295 MachineInstr &MI, MachineRegisterInfo &MRI,
1296 MachineIRBuilder &B) const {
1297 B.setInstr(MI);
1298
1299 Register Src = MI.getOperand(1).getReg();
1300 LLT Ty = MRI.getType(Src);
1301 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1302
1303 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1304 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1305
1306 auto C1 = B.buildFConstant(Ty, C1Val);
1307 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1308
1309 // TODO: Should this propagate fast-math-flags?
1310 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1311 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1312
1313 auto C2 = B.buildFConstant(Ty, C2Val);
1314 auto Fabs = B.buildFAbs(Ty, Src);
1315
1316 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1317 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1318 return true;
1319}
1320
1321bool AMDGPULegalizerInfo::legalizeFceil(
1322 MachineInstr &MI, MachineRegisterInfo &MRI,
1323 MachineIRBuilder &B) const {
1324 B.setInstr(MI);
1325
1326 const LLT S1 = LLT::scalar(1);
1327 const LLT S64 = LLT::scalar(64);
1328
1329 Register Src = MI.getOperand(1).getReg();
1330 assert(MRI.getType(Src) == S64);
1331
1332 // result = trunc(src)
1333 // if (src > 0.0 && src != result)
1334 // result += 1.0
1335
1336 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1337
1338 const auto Zero = B.buildFConstant(S64, 0.0);
1339 const auto One = B.buildFConstant(S64, 1.0);
1340 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1341 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1342 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1343 auto Add = B.buildSelect(S64, And, One, Zero);
1344
1345 // TODO: Should this propagate fast-math-flags?
1346 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1347 return true;
1348}
1349
1350static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1351 MachineIRBuilder &B) {
1352 const unsigned FractBits = 52;
1353 const unsigned ExpBits = 11;
1354 LLT S32 = LLT::scalar(32);
1355
1356 auto Const0 = B.buildConstant(S32, FractBits - 32);
1357 auto Const1 = B.buildConstant(S32, ExpBits);
1358
1359 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1360 .addUse(Const0.getReg(0))
1361 .addUse(Const1.getReg(0));
1362
1363 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1364}
1365
1366bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1367 MachineInstr &MI, MachineRegisterInfo &MRI,
1368 MachineIRBuilder &B) const {
1369 B.setInstr(MI);
1370
1371 const LLT S1 = LLT::scalar(1);
1372 const LLT S32 = LLT::scalar(32);
1373 const LLT S64 = LLT::scalar(64);
1374
1375 Register Src = MI.getOperand(1).getReg();
1376 assert(MRI.getType(Src) == S64);
1377
1378 // TODO: Should this use extract since the low half is unused?
1379 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1380 Register Hi = Unmerge.getReg(1);
1381
1382 // Extract the upper half, since this is where we will find the sign and
1383 // exponent.
1384 auto Exp = extractF64Exponent(Hi, B);
1385
1386 const unsigned FractBits = 52;
1387
1388 // Extract the sign bit.
1389 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1390 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1391
1392 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1393
1394 const auto Zero32 = B.buildConstant(S32, 0);
1395
1396 // Extend back to 64-bits.
1397 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1398
1399 auto Shr = B.buildAShr(S64, FractMask, Exp);
1400 auto Not = B.buildNot(S64, Shr);
1401 auto Tmp0 = B.buildAnd(S64, Src, Not);
1402 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1403
1404 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1405 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1406
1407 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1408 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1409 return true;
1410}
1411
1412bool AMDGPULegalizerInfo::legalizeITOFP(
1413 MachineInstr &MI, MachineRegisterInfo &MRI,
1414 MachineIRBuilder &B, bool Signed) const {
1415 B.setInstr(MI);
1416
1417 Register Dst = MI.getOperand(0).getReg();
1418 Register Src = MI.getOperand(1).getReg();
1419
1420 const LLT S64 = LLT::scalar(64);
1421 const LLT S32 = LLT::scalar(32);
1422
1423 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1424
1425 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1426
1427 auto CvtHi = Signed ?
1428 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1429 B.buildUITOFP(S64, Unmerge.getReg(1));
1430
1431 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1432
1433 auto ThirtyTwo = B.buildConstant(S32, 32);
1434 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1435 .addUse(CvtHi.getReg(0))
1436 .addUse(ThirtyTwo.getReg(0));
1437
1438 // TODO: Should this propagate fast-math-flags?
1439 B.buildFAdd(Dst, LdExp, CvtLo);
1440 MI.eraseFromParent();
1441 return true;
1442}
1443
1444bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1445 MachineInstr &MI, MachineRegisterInfo &MRI,
1446 MachineIRBuilder &B) const {
1447 MachineFunction &MF = B.getMF();
1448 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1449
1450 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1451 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1452
1453 // With ieee_mode disabled, the instructions have the correct behavior
1454 // already for G_FMINNUM/G_FMAXNUM
1455 if (!MFI->getMode().IEEE)
1456 return !IsIEEEOp;
1457
1458 if (IsIEEEOp)
1459 return true;
1460
1461 MachineIRBuilder HelperBuilder(MI);
1462 GISelObserverWrapper DummyObserver;
1463 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1464 HelperBuilder.setInstr(MI);
1465 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1466}
1467
1468bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1469 MachineInstr &MI, MachineRegisterInfo &MRI,
1470 MachineIRBuilder &B) const {
1471 // TODO: Should move some of this into LegalizerHelper.
1472
1473 // TODO: Promote dynamic indexing of s16 to s32
1474 // TODO: Dynamic s64 indexing is only legal for SGPR.
1475 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1476 if (!IdxVal) // Dynamic case will be selected to register indexing.
1477 return true;
1478
1479 Register Dst = MI.getOperand(0).getReg();
1480 Register Vec = MI.getOperand(1).getReg();
1481
1482 LLT VecTy = MRI.getType(Vec);
1483 LLT EltTy = VecTy.getElementType();
1484 assert(EltTy == MRI.getType(Dst));
1485
1486 B.setInstr(MI);
1487
1488 if (IdxVal.getValue() < VecTy.getNumElements())
1489 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1490 else
1491 B.buildUndef(Dst);
1492
1493 MI.eraseFromParent();
1494 return true;
1495}
1496
1497bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1498 MachineInstr &MI, MachineRegisterInfo &MRI,
1499 MachineIRBuilder &B) const {
1500 // TODO: Should move some of this into LegalizerHelper.
1501
1502 // TODO: Promote dynamic indexing of s16 to s32
1503 // TODO: Dynamic s64 indexing is only legal for SGPR.
1504 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1505 if (!IdxVal) // Dynamic case will be selected to register indexing.
1506 return true;
1507
1508 Register Dst = MI.getOperand(0).getReg();
1509 Register Vec = MI.getOperand(1).getReg();
1510 Register Ins = MI.getOperand(2).getReg();
1511
1512 LLT VecTy = MRI.getType(Vec);
1513 LLT EltTy = VecTy.getElementType();
1514 assert(EltTy == MRI.getType(Ins));
1515
1516 B.setInstr(MI);
1517
1518 if (IdxVal.getValue() < VecTy.getNumElements())
1519 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1520 else
1521 B.buildUndef(Dst);
1522
1523 MI.eraseFromParent();
1524 return true;
1525}
1526
1527bool AMDGPULegalizerInfo::legalizeSinCos(
1528 MachineInstr &MI, MachineRegisterInfo &MRI,
1529 MachineIRBuilder &B) const {
1530 B.setInstr(MI);
1531
1532 Register DstReg = MI.getOperand(0).getReg();
1533 Register SrcReg = MI.getOperand(1).getReg();
1534 LLT Ty = MRI.getType(DstReg);
1535 unsigned Flags = MI.getFlags();
1536
1537 Register TrigVal;
1538 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1539 if (ST.hasTrigReducedRange()) {
1540 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1541 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1542 .addUse(MulVal.getReg(0))
1543 .setMIFlags(Flags).getReg(0);
1544 } else
1545 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1546
1547 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1548 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1549 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1550 .addUse(TrigVal)
1551 .setMIFlags(Flags);
1552 MI.eraseFromParent();
1553 return true;
1554}
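
The hardware sin/cos intrinsics take their argument in full turns rather than radians, so legalizeSinCos multiplies by 0.5/pi first; on subtargets with a reduced trig range it also takes the fractional part via amdgcn_fract. A minimal scalar sketch of that range reduction, with hwSin standing in for the hardware intrinsic (illustrative names, not LLVM API):

#include <cmath>

static const float Pi = 3.14159265358979323846f;

// Models the intrinsic: input is in turns, i.e. x turns == x * 2*pi radians.
static float hwSin(float Turns) { return std::sin(Turns * 2.0f * Pi); }

static float loweredSin(float X, bool HasTrigReducedRange) {
  float Scaled = X * (0.5f / Pi);        // radians -> turns
  if (HasTrigReducedRange)
    Scaled -= std::floor(Scaled);        // amdgcn_fract: wrap into [0, 1)
  return hwSin(Scaled);
}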
1555
1556bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1557 Register DstReg, LLT PtrTy,
1558 MachineIRBuilder &B, const GlobalValue *GV,
1559 unsigned Offset, unsigned GAFlags) const {
1560 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1561 // to the following code sequence:
1562 //
1563 // For constant address space:
1564 // s_getpc_b64 s[0:1]
1565 // s_add_u32 s0, s0, $symbol
1566 // s_addc_u32 s1, s1, 0
1567 //
1568 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1569 // a fixup or relocation is emitted to replace $symbol with a literal
1570 // constant, which is a pc-relative offset from the encoding of the $symbol
1571 // operand to the global variable.
1572 //
1573 // For global address space:
1574 // s_getpc_b64 s[0:1]
1575 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1576 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1577 //
1578 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1579 // fixups or relocations are emitted to replace $symbol@*@lo and
1580 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1581 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1582 // operand to the global variable.
1583 //
1584 // What we want here is an offset from the value returned by s_getpc
1585 // (which is the address of the s_add_u32 instruction) to the global
1586 // variable, but since the encoding of $symbol starts 4 bytes after the start
1587 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1588 // small. This requires us to add 4 to the global variable offset in order to
1589 // compute the correct address.
1590
1591 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1592
1593 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1594 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1595
1596 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1597 .addDef(PCReg);
1598
1599 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1600 if (GAFlags == SIInstrInfo::MO_NONE)
1601 MIB.addImm(0);
1602 else
1603 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1604
1605 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1606
1607 if (PtrTy.getSizeInBits() == 32)
1608 B.buildExtract(DstReg, PCReg, 0);
1609 return true;
1610 }
1611
1612bool AMDGPULegalizerInfo::legalizeGlobalValue(
1613 MachineInstr &MI, MachineRegisterInfo &MRI,
1614 MachineIRBuilder &B) const {
1615 Register DstReg = MI.getOperand(0).getReg();
1616 LLT Ty = MRI.getType(DstReg);
1617 unsigned AS = Ty.getAddressSpace();
1618
1619 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1620 MachineFunction &MF = B.getMF();
1621 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1622 B.setInstr(MI);
1623
1624 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1625 if (!MFI->isEntryFunction()) {
1626 const Function &Fn = MF.getFunction();
1627 DiagnosticInfoUnsupported BadLDSDecl(
1628 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1629 Fn.getContext().diagnose(BadLDSDecl);
1630 }
1631
1632 // TODO: We could emit code to handle the initialization somewhere.
1633 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1634 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1635 MI.eraseFromParent();
1636 return true;
1637 }
1638
1639 const Function &Fn = MF.getFunction();
1640 DiagnosticInfoUnsupported BadInit(
1641 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1642 Fn.getContext().diagnose(BadInit);
1643 return true;
1644 }
1645
1646 const SITargetLowering *TLI = ST.getTargetLowering();
1647
1648 if (TLI->shouldEmitFixup(GV)) {
1649 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1650 MI.eraseFromParent();
1651 return true;
1652 }
1653
1654 if (TLI->shouldEmitPCReloc(GV)) {
1655 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1656 MI.eraseFromParent();
1657 return true;
1658 }
1659
1660 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1661 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1662
1663 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1664 MachinePointerInfo::getGOT(MF),
1665 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1666 MachineMemOperand::MOInvariant,
1667 8 /*Size*/, 8 /*Align*/);
1668
1669 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1670
1671 if (Ty.getSizeInBits() == 32) {
1672 // Truncate if this is a 32-bit constant address.
1673 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1674 B.buildExtract(DstReg, Load, 0);
1675 } else
1676 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1677
1678 MI.eraseFromParent();
1679 return true;
1680}
1681
1682bool AMDGPULegalizerInfo::legalizeLoad(
1683 MachineInstr &MI, MachineRegisterInfo &MRI,
1684 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1685 B.setInstr(MI);
1686 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1687 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1688 Observer.changingInstr(MI);
1689 MI.getOperand(1).setReg(Cast.getReg(0));
1690 Observer.changedInstr(MI);
1691 return true;
1692}
1693
1694bool AMDGPULegalizerInfo::legalizeFMad(
1695 MachineInstr &MI, MachineRegisterInfo &MRI,
1696 MachineIRBuilder &B) const {
1697 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1698 assert(Ty.isScalar());
1699
1700 // TODO: Always legal with future ftz flag.
1701 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1702 return true;
1703 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1704 return true;
1705
1706 MachineFunction &MF = B.getMF();
1707
1708 MachineIRBuilder HelperBuilder(MI);
1709 GISelObserverWrapper DummyObserver;
1710 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1711 HelperBuilder.setMBB(*MI.getParent());
1712 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1713}
1714
1715// Return the use branch instruction, otherwise null if the usage is invalid.
1716static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1717 MachineRegisterInfo &MRI) {
1718 Register CondDef = MI.getOperand(0).getReg();
1719 if (!MRI.hasOneNonDBGUse(CondDef))
1720 return nullptr;
1721
1722 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1723 return UseMI.getParent() == MI.getParent() &&
1724 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1725}
1726
1727Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1728 Register Reg, LLT Ty) const {
1729 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1730 if (LiveIn)
1731 return LiveIn;
1732
1733 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1734 MRI.addLiveIn(Reg, NewReg);
1735 return NewReg;
1736}
1737
1738bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1739 const ArgDescriptor *Arg) const {
1740 if (!Arg->isRegister() || !Arg->getRegister().isValid())
9
Taking false branch
1741 return false; // TODO: Handle these
1742
1743 assert(Arg->getRegister().isPhysical());
10
'?' condition is true
1744
1745 MachineRegisterInfo &MRI = *B.getMRI();
1746
1747 LLT Ty = MRI.getType(DstReg);
1748 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1749
1750 if (Arg->isMasked()) {
11
Calling 'ArgDescriptor::isMasked'
14
Returning from 'ArgDescriptor::isMasked'
15
Taking true branch
1751 // TODO: Should we try to emit this once in the entry block?
1752 const LLT S32 = LLT::scalar(32);
1753 const unsigned Mask = Arg->getMask();
1754 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
16
Calling 'countTrailingZeros<unsigned int>'
23
Returning from 'countTrailingZeros<unsigned int>'
24
'Shift' initialized to 32
1755
1756 Register AndMaskSrc = LiveIn;
1757
1758 if (Shift
24.1
'Shift' is not equal to 0
!= 0) {
25
Taking true branch
1759 auto ShiftAmt = B.buildConstant(S32, Shift);
1760 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1761 }
1762
1763 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
26
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
1764 } else
1765 B.buildCopy(DstReg, LiveIn);
1766
1767 // Insert the argument copy if it doesn't already exist.
1768 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1769 if (!MRI.getVRegDef(LiveIn)) {
1770 // FIXME: Should have scoped insert pt
1771 MachineBasicBlock &OrigInsBB = B.getMBB();
1772 auto OrigInsPt = B.getInsertPt();
1773
1774 MachineBasicBlock &EntryMBB = B.getMF().front();
1775 EntryMBB.addLiveIn(Arg->getRegister());
1776 B.setInsertPt(EntryMBB, EntryMBB.begin());
1777 B.buildCopy(LiveIn, Arg->getRegister());
1778
1779 B.setInsertPt(OrigInsBB, OrigInsPt);
1780 }
1781
1782 return true;
1783}
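
The path the analyzer reports reaches this point with a masked ArgDescriptor whose Mask is 0: isMasked() only requires Mask != ~0u, countTrailingZeros returns 32 for a zero input (ZB_Width), and "Mask >> 32" is then undefined for a 32-bit unsigned int. Below is a minimal standalone sketch of the flagged arithmetic together with one possible guard; it only illustrates the issue and is not the fix adopted in LLVM:

#include <cassert>
#include <cstdint>

// ZB_Width behavior: a zero input yields the full bit width, 32.
static unsigned ctz32(uint32_t Mask) {
  if (Mask == 0)
    return 32;
  unsigned N = 0;
  while ((Mask & 1u) == 0) {
    Mask >>= 1;
    ++N;
  }
  return N;
}

static uint32_t extractMaskedArg(uint32_t LiveIn, uint32_t Mask) {
  assert(Mask != 0 && "a zero mask would make both shifts below shift by 32");
  unsigned Shift = ctz32(Mask);
  return (LiveIn >> Shift) & (Mask >> Shift);   // well-defined once Mask != 0
}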
1784
1785bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1786 MachineInstr &MI,
1787 MachineRegisterInfo &MRI,
1788 MachineIRBuilder &B,
1789 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1790 B.setInstr(MI);
1791
1792 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1793
1794 const ArgDescriptor *Arg;
1795 const TargetRegisterClass *RC;
1796 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1797 if (!Arg) {
6
Assuming 'Arg' is non-null
7
Taking false branch
1798 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1799 return false;
1800 }
1801
1802 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
8
Calling 'AMDGPULegalizerInfo::loadInputValue'
1803 MI.eraseFromParent();
1804 return true;
1805 }
1806
1807 return false;
1808}
1809
1810bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1811 MachineRegisterInfo &MRI,
1812 MachineIRBuilder &B) const {
1813 B.setInstr(MI);
1814 Register Res = MI.getOperand(0).getReg();
1815 Register LHS = MI.getOperand(2).getReg();
1816 Register RHS = MI.getOperand(3).getReg();
1817 uint16_t Flags = MI.getFlags();
1818
1819 LLT S32 = LLT::scalar(32);
1820 LLT S1 = LLT::scalar(1);
1821
1822 auto Abs = B.buildFAbs(S32, RHS, Flags);
1823 const APFloat C0Val(1.0f);
1824
1825 auto C0 = B.buildConstant(S32, 0x6f800000);
1826 auto C1 = B.buildConstant(S32, 0x2f800000);
1827 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1828
1829 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1830 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1831
1832 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1833
1834 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1835 .addUse(Mul0.getReg(0))
1836 .setMIFlags(Flags);
1837
1838 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1839
1840 B.buildFMul(Res, Sel, Mul1, Flags);
1841
1842 MI.eraseFromParent();
1843 return true;
1844}
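
The constants in legalizeFDIVFast implement a scaling trick: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. When |RHS| is very large, both the denominator fed to amdgcn_rcp and the final product are multiplied by 2^-32; the two factors cancel, so the result is still LHS/RHS while the reciprocal stays in range. A scalar sketch under the assumption that amdgcn_rcp is modelled by an exact division:

#include <cmath>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

// Stand-in for the amdgcn_rcp approximation.
static float hwRcp(float X) { return 1.0f / X; }

static float fdivFast(float LHS, float RHS) {
  const float C0 = bitsToFloat(0x6f800000u);       // 2^96
  const float C1 = bitsToFloat(0x2f800000u);       // 2^-32
  float Sel = (std::fabs(RHS) > C0) ? C1 : 1.0f;   // scale huge denominators
  float Rcp = hwRcp(RHS * Sel);
  return Sel * (LHS * Rcp);                        // scalings cancel: ~LHS/RHS
}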
1845
1846bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1847 MachineRegisterInfo &MRI,
1848 MachineIRBuilder &B) const {
1849 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1850 if (!MFI->isEntryFunction()) {
3
Assuming the condition is true
4
Taking true branch
1851 return legalizePreloadedArgIntrin(MI, MRI, B,
5
Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
1852 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1853 }
1854
1855 B.setInstr(MI);
1856
1857 uint64_t Offset =
1858 ST.getTargetLowering()->getImplicitParameterOffset(
1859 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1860 Register DstReg = MI.getOperand(0).getReg();
1861 LLT DstTy = MRI.getType(DstReg);
1862 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1863
1864 const ArgDescriptor *Arg;
1865 const TargetRegisterClass *RC;
1866 std::tie(Arg, RC)
1867 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1868 if (!Arg)
1869 return false;
1870
1871 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1872 if (!loadInputValue(KernargPtrReg, B, Arg))
1873 return false;
1874
1875 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1876 MI.eraseFromParent();
1877 return true;
1878}
1879
1880bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1881 MachineRegisterInfo &MRI,
1882 MachineIRBuilder &B,
1883 unsigned AddrSpace) const {
1884 B.setInstr(MI);
1885 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1886 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1887 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1888 MI.eraseFromParent();
1889 return true;
1890}
1891
1892/// Handle register layout difference for f16 images for some subtargets.
1893Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1894 MachineRegisterInfo &MRI,
1895 Register Reg) const {
1896 if (!ST.hasUnpackedD16VMem())
1897 return Reg;
1898
1899 const LLT S16 = LLT::scalar(16);
1900 const LLT S32 = LLT::scalar(32);
1901 LLT StoreVT = MRI.getType(Reg);
1902 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1903
1904 auto Unmerge = B.buildUnmerge(S16, Reg);
1905
1906 SmallVector<Register, 4> WideRegs;
1907 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1908 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1909
1910 int NumElts = StoreVT.getNumElements();
1911
1912 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1913}
1914
1915bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1916 MachineRegisterInfo &MRI,
1917 MachineIRBuilder &B,
1918 bool IsFormat) const {
1919 // TODO: Reject f16 format on targets where unsupported.
1920 Register VData = MI.getOperand(1).getReg();
1921 LLT Ty = MRI.getType(VData);
1922
1923 B.setInstr(MI);
1924
1925 const LLT S32 = LLT::scalar(32);
1926 const LLT S16 = LLT::scalar(16);
1927
1928 // Fixup illegal register types for i8 stores.
1929 if (Ty == LLT::scalar(8) || Ty == S16) {
1930 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1931 MI.getOperand(1).setReg(AnyExt);
1932 return true;
1933 }
1934
1935 if (Ty.isVector()) {
1936 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1937 if (IsFormat)
1938 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1939 return true;
1940 }
1941
1942 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1943 }
1944
1945 return Ty == S32;
1946}
1947
1948bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1949 MachineRegisterInfo &MRI,
1950 MachineIRBuilder &B) const {
1951 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
1952 switch (MI.getIntrinsicID()) {
1
Control jumps to 'case amdgcn_implicitarg_ptr:' at line 1996
1953 case Intrinsic::amdgcn_if: {
1954 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1955 const SIRegisterInfo *TRI
1956 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1957
1958 B.setInstr(*BrCond);
1959 Register Def = MI.getOperand(1).getReg();
1960 Register Use = MI.getOperand(3).getReg();
1961 B.buildInstr(AMDGPU::SI_IF)
1962 .addDef(Def)
1963 .addUse(Use)
1964 .addMBB(BrCond->getOperand(1).getMBB());
1965
1966 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1967 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1968 MI.eraseFromParent();
1969 BrCond->eraseFromParent();
1970 return true;
1971 }
1972
1973 return false;
1974 }
1975 case Intrinsic::amdgcn_loop: {
1976 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1977 const SIRegisterInfo *TRI
1978 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1979
1980 B.setInstr(*BrCond);
1981 Register Reg = MI.getOperand(2).getReg();
1982 B.buildInstr(AMDGPU::SI_LOOP)
1983 .addUse(Reg)
1984 .addMBB(BrCond->getOperand(1).getMBB());
1985 MI.eraseFromParent();
1986 BrCond->eraseFromParent();
1987 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1988 return true;
1989 }
1990
1991 return false;
1992 }
1993 case Intrinsic::amdgcn_kernarg_segment_ptr:
1994 return legalizePreloadedArgIntrin(
1995 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1996 case Intrinsic::amdgcn_implicitarg_ptr:
1997 return legalizeImplicitArgPtr(MI, MRI, B);
2
Calling 'AMDGPULegalizerInfo::legalizeImplicitArgPtr'
1998 case Intrinsic::amdgcn_workitem_id_x:
1999 return legalizePreloadedArgIntrin(MI, MRI, B,
2000 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2001 case Intrinsic::amdgcn_workitem_id_y:
2002 return legalizePreloadedArgIntrin(MI, MRI, B,
2003 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2004 case Intrinsic::amdgcn_workitem_id_z:
2005 return legalizePreloadedArgIntrin(MI, MRI, B,
2006 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2007 case Intrinsic::amdgcn_workgroup_id_x:
2008 return legalizePreloadedArgIntrin(MI, MRI, B,
2009 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2010 case Intrinsic::amdgcn_workgroup_id_y:
2011 return legalizePreloadedArgIntrin(MI, MRI, B,
2012 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2013 case Intrinsic::amdgcn_workgroup_id_z:
2014 return legalizePreloadedArgIntrin(MI, MRI, B,
2015 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2016 case Intrinsic::amdgcn_dispatch_ptr:
2017 return legalizePreloadedArgIntrin(MI, MRI, B,
2018 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2019 case Intrinsic::amdgcn_queue_ptr:
2020 return legalizePreloadedArgIntrin(MI, MRI, B,
2021 AMDGPUFunctionArgInfo::QUEUE_PTR);
2022 case Intrinsic::amdgcn_implicit_buffer_ptr:
2023 return legalizePreloadedArgIntrin(
2024 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2025 case Intrinsic::amdgcn_dispatch_id:
2026 return legalizePreloadedArgIntrin(MI, MRI, B,
2027 AMDGPUFunctionArgInfo::DISPATCH_ID);
2028 case Intrinsic::amdgcn_fdiv_fast:
2029 return legalizeFDIVFast(MI, MRI, B);
2030 case Intrinsic::amdgcn_is_shared:
2031 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2032 case Intrinsic::amdgcn_is_private:
2033 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2034 case Intrinsic::amdgcn_wavefrontsize: {
2035 B.setInstr(MI);
2036 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2037 MI.eraseFromParent();
2038 return true;
2039 }
2040 case Intrinsic::amdgcn_raw_buffer_store:
2041 return legalizeRawBufferStore(MI, MRI, B, false);
2042 case Intrinsic::amdgcn_raw_buffer_store_format:
2043 return legalizeRawBufferStore(MI, MRI, B, true);
2044 default:
2045 return true;
2046 }
2047
2048 return true;
2049}

/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
12
Assuming the condition is true
13
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
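
A Mask narrower than the full register is how several small inputs share one physical register, and isMasked() treats anything other than ~0u as masked, including a Mask of 0, which is the value the analyzer assumes on the path above. A small sketch of how a masked descriptor's field would be decoded; the 10-bit packing is an assumption for the example, not something this header defines:

#include <cstdint>

// Hypothetical packing: three 10-bit IDs in one 32-bit register, selected by
// masks such as 0x3FF, 0x3FF << 10, 0x3FF << 20.
static uint32_t decodeMaskedArg(uint32_t RegValue, uint32_t Mask) {
  unsigned Shift = 0;
  while (Shift < 32 && ((Mask >> Shift) & 1u) == 0)
    ++Shift;                                   // locate the field
  if (Shift == 32)
    return 0;                                  // empty mask: nothing to extract
  return (RegValue >> Shift) & (Mask >> Shift);
}

// e.g. decodeMaskedArg(Packed, 0x3FFu << 10) == (Packed >> 10) & 0x3FF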
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181 assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-10~svn374877/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include "llvm/Support/SwapByteOrder.h"
18#include <algorithm>
19#include <cassert>
20#include <climits>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB
17.1
'ZB' is not equal to ZB_Undefined
!= ZB_Undefined && Val == 0)
18
Assuming 'Val' is equal to 0
19
Taking true branch
117 return 32;
20
Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
17
Calling 'TrailingZerosCounter::count'
21
Returning from 'TrailingZerosCounter::count'
22
Returning the value 32
161}
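
With the default ZB_Width behavior a zero input returns the type width, which is where the value 32 in the path above comes from. A quick check of the expected values, assuming the LLVM Support headers are on the include path:

#include <cassert>
#include "llvm/Support/MathExtras.h"

int main() {
  using llvm::countTrailingZeros;
  assert(countTrailingZeros(0x00000001u) == 0);
  assert(countTrailingZeros(0x00000300u) == 8);
  assert(countTrailingZeros(0x80000000u) == 31);
  // Zero input with ZB_Width yields the full width of unsigned int, 32 --
  // the value that later makes "Mask >> Shift" undefined in loadInputValue.
  assert(countTrailingZeros(0x00000000u) == 32);
  return 0;
}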
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
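
maskTrailingOnes avoids the width-sized shift explicitly: N == 0 is special-cased, and otherwise T(-1) is shifted by Bits - N, so the shift count never reaches the type width. A short check of the edge cases, again assuming the LLVM Support headers are available:

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  using llvm::maskTrailingOnes;
  assert(maskTrailingOnes<uint32_t>(0)  == 0x00000000u);  // explicit N == 0 case
  assert(maskTrailingOnes<uint32_t>(8)  == 0x000000FFu);
  assert(maskTrailingOnes<uint32_t>(32) == 0xFFFFFFFFu);  // shifts by 0, not 32
  return 0;
}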
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315// NOTE: The following support functions use the _32/_64 extensions instead of
316// type overloading so that signed and unsigned integers can be used without
317// ambiguity.
318
319/// Return the high 32 bits of a 64 bit value.
320constexpr inline uint32_t Hi_32(uint64_t Value) {
321 return static_cast<uint32_t>(Value >> 32);
322}
323
324/// Return the low 32 bits of a 64 bit value.
325constexpr inline uint32_t Lo_32(uint64_t Value) {
326 return static_cast<uint32_t>(Value);
327}
328
329/// Make a 64-bit integer from a high / low pair of 32-bit integers.
330constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
331 return ((uint64_t)High << 32) | (uint64_t)Low;
332}
333
334/// Checks if an integer fits into the given bit width.
335template <unsigned N> constexpr inline bool isInt(int64_t x) {
336 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
337}
338// Template specializations to get better code for common cases.
339template <> constexpr inline bool isInt<8>(int64_t x) {
340 return static_cast<int8_t>(x) == x;
341}
342template <> constexpr inline bool isInt<16>(int64_t x) {
343 return static_cast<int16_t>(x) == x;
344}
345template <> constexpr inline bool isInt<32>(int64_t x) {
346 return static_cast<int32_t>(x) == x;
347}
348
349/// Checks if a signed integer is an N bit number shifted left by S.
350template <unsigned N, unsigned S>
351constexpr inline bool isShiftedInt(int64_t x) {
352 static_assert(
353 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
354 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
355 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
356}
357
358/// Checks if an unsigned integer fits into the given bit width.
359///
360/// This is written as two functions rather than as simply
361///
362/// return N >= 64 || X < (UINT64_C(1) << N);
363///
364/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
365/// left too many places.
366template <unsigned N>
367constexpr inline typename std::enable_if<(N < 64), bool>::type
368isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline typename std::enable_if<N >= 64, bool>::type
374isUInt(uint64_t X) {
375 return true;
376}
377
378// Template specializations to get better code for common cases.
379template <> constexpr inline bool isUInt<8>(uint64_t x) {
380 return static_cast<uint8_t>(x) == x;
381}
382template <> constexpr inline bool isUInt<16>(uint64_t x) {
383 return static_cast<uint16_t>(x) == x;
384}
385template <> constexpr inline bool isUInt<32>(uint64_t x) {
386 return static_cast<uint32_t>(x) == x;
387}
388
389/// Checks if a unsigned integer is an N bit number shifted left by S.
390template <unsigned N, unsigned S>
391constexpr inline bool isShiftedUInt(uint64_t x) {
392 static_assert(
393 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
394 static_assert(N + S <= 64,
395 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
396 // Per the two static_asserts above, S must be strictly less than 64. So
397 // 1 << S is not undefined behavior.
398 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
399}
400
401/// Gets the maximum value for a N-bit unsigned integer.
402inline uint64_t maxUIntN(uint64_t N) {
403 assert(N > 0 && N <= 64 && "integer width out of range");
404
405 // uint64_t(1) << 64 is undefined behavior, so we can't do
406 // (uint64_t(1) << N) - 1
407 // without checking first that N != 64. But this works and doesn't have a
408 // branch.
409 return UINT64_MAX >> (64 - N);
410}
411
412/// Gets the minimum value for a N-bit signed integer.
413inline int64_t minIntN(int64_t N) {
414 assert(N > 0 && N <= 64 && "integer width out of range");
415
416 return -(UINT64_C(1)<<(N-1));
417}
418
419/// Gets the maximum value for a N-bit signed integer.
420inline int64_t maxIntN(int64_t N) {
421 assert(N > 0 && N <= 64 && "integer width out of range");
422
423 // This relies on two's complement wraparound when N == 64, so we convert to
424 // int64_t only at the very end to avoid UB.
425 return (UINT64_C(1) << (N - 1)) - 1;
426}
427
428/// Checks if an unsigned integer fits into the given (dynamic) bit width.
429inline bool isUIntN(unsigned N, uint64_t x) {
430 return N >= 64 || x <= maxUIntN(N);
431}
432
433/// Checks if an signed integer fits into the given (dynamic) bit width.
434inline bool isIntN(unsigned N, int64_t x) {
435 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
436}
437
438/// Return true if the argument is a non-empty sequence of ones starting at the
439/// least significant bit with the remainder zero (32 bit version).
440/// Ex. isMask_32(0x0000FFFFU) == true.
441constexpr inline bool isMask_32(uint32_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument is a non-empty sequence of ones starting at the
446/// least significant bit with the remainder zero (64 bit version).
447constexpr inline bool isMask_64(uint64_t Value) {
448 return Value && ((Value + 1) & Value) == 0;
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
453constexpr inline bool isShiftedMask_32(uint32_t Value) {
454 return Value && isMask_32((Value - 1) | Value);
455}
456
457/// Return true if the argument contains a non-empty sequence of ones with the
458/// remainder zero (64 bit version.)
459constexpr inline bool isShiftedMask_64(uint64_t Value) {
460 return Value && isMask_64((Value - 1) | Value);
461}
462
463/// Return true if the argument is a power of two > 0.
464/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
465constexpr inline bool isPowerOf2_32(uint32_t Value) {
466 return Value && !(Value & (Value - 1));
467}
468
469/// Return true if the argument is a power of two > 0 (64 bit edition.)
470constexpr inline bool isPowerOf2_64(uint64_t Value) {
471 return Value && !(Value & (Value - 1));
472}
473
474/// Return a byte-swapped representation of the 16-bit argument.
475inline uint16_t ByteSwap_16(uint16_t Value) {
476 return sys::SwapByteOrder_16(Value);
477}
478
479/// Return a byte-swapped representation of the 32-bit argument.
480inline uint32_t ByteSwap_32(uint32_t Value) {
481 return sys::SwapByteOrder_32(Value);
482}
483
484/// Return a byte-swapped representation of the 64-bit argument.
485inline uint64_t ByteSwap_64(uint64_t Value) {
486 return sys::SwapByteOrder_64(Value);
487}
488
489/// Count the number of ones from the most significant bit to the first
490/// zero bit.
491///
492/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countLeadingZeros<T>(~Value, ZB);
503}
504
505/// Count the number of ones from the least significant bit to the first
506/// zero bit.
507///
508/// Ex. countTrailingOnes(0x00FF00FF) == 8.
509/// Only unsigned integral types are allowed.
510///
511/// \param ZB the behavior on an input of all ones. Only ZB_Width and
512/// ZB_Undefined are valid arguments.
513template <typename T>
514unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
515 static_assert(std::numeric_limits<T>::is_integer &&
516 !std::numeric_limits<T>::is_signed,
517 "Only unsigned integral types are allowed.");
518 return countTrailingZeros<T>(~Value, ZB);
519}
520
521namespace detail {
522template <typename T, std::size_t SizeOfT> struct PopulationCounter {
523 static unsigned count(T Value) {
524 // Generic version, forward to 32 bits.
525 static_assert(SizeOfT <= 4, "Not implemented!");
526#if defined(__GNUC__)
527 return __builtin_popcount(Value);
528#else
529 uint32_t v = Value;
530 v = v - ((v >> 1) & 0x55555555);
531 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
532 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
533#endif
534 }
535};
536
537template <typename T> struct PopulationCounter<T, 8> {
538 static unsigned count(T Value) {
539#if defined(__GNUC__)
540 return __builtin_popcountll(Value);
541#else
542 uint64_t v = Value;
543 v = v - ((v >> 1) & 0x5555555555555555ULL);
544 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
545 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
546 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
547#endif
548 }
549};
550} // namespace detail
551
552/// Count the number of set bits in a value.
553/// Ex. countPopulation(0xF000F000) = 8
554/// Returns 0 if the word is zero.
555template <typename T>
556inline unsigned countPopulation(T Value) {
557 static_assert(std::numeric_limits<T>::is_integer &&
558 !std::numeric_limits<T>::is_signed,
559 "Only unsigned integral types are allowed.");
560 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
561}
562
563/// Compile time Log2.
564/// Valid only for positive powers of two.
565template <size_t kValue> constexpr inline size_t CTLog2() {
566 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
567 "Value is not a valid power of 2");
568 return 1 + CTLog2<kValue / 2>();
569}
570
571template <> constexpr inline size_t CTLog2<1>() { return 0; }
572
573/// Return the log base 2 of the specified value.
574inline double Log2(double Value) {
575#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
576 return __builtin_log(Value) / __builtin_log(2.0);
577#else
578 return log2(Value);
579#endif
580}
581
582/// Return the floor log base 2 of the specified value, -1 if the value is zero.
583/// (32 bit edition.)
584/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
585inline unsigned Log2_32(uint32_t Value) {
586 return 31 - countLeadingZeros(Value);
587}
588
589/// Return the floor log base 2 of the specified value, -1 if the value is zero.
590/// (64 bit edition.)
591inline unsigned Log2_64(uint64_t Value) {
592 return 63 - countLeadingZeros(Value);
593}
594
595/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
596/// (32 bit edition).
597/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
598inline unsigned Log2_32_Ceil(uint32_t Value) {
599 return 32 - countLeadingZeros(Value - 1);
600}
601
602/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
603/// (64 bit edition.)
604inline unsigned Log2_64_Ceil(uint64_t Value) {
605 return 64 - countLeadingZeros(Value - 1);
606}
607
608/// Return the greatest common divisor of the values using Euclid's algorithm.
609template <typename T>
610inline T greatestCommonDivisor(T A, T B) {
611 while (B) {
612 T Tmp = B;
613 B = A % B;
614 A = Tmp;
615 }
616 return A;
617}
618
619inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
620 return greatestCommonDivisor<uint64_t>(A, B);
621}
622
623/// This function takes a 64-bit integer and returns the bit equivalent double.
624inline double BitsToDouble(uint64_t Bits) {
625 double D;
626 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
627 memcpy(&D, &Bits, sizeof(Bits));
628 return D;
629}
630
631/// This function takes a 32-bit integer and returns the bit equivalent float.
632inline float BitsToFloat(uint32_t Bits) {
633 float F;
634 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
635 memcpy(&F, &Bits, sizeof(Bits));
636 return F;
637}
638
639/// This function takes a double and returns the bit equivalent 64-bit integer.
640/// Note that copying doubles around changes the bits of NaNs on some hosts,
641/// notably x86, so this routine cannot be used if these bits are needed.
642inline uint64_t DoubleToBits(double Double) {
643 uint64_t Bits;
644 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
645 memcpy(&Bits, &Double, sizeof(Double));
646 return Bits;
647}
648
649/// This function takes a float and returns the bit equivalent 32-bit integer.
650/// Note that copying floats around changes the bits of NaNs on some hosts,
651/// notably x86, so this routine cannot be used if these bits are needed.
652inline uint32_t FloatToBits(float Float) {
653 uint32_t Bits;
654 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
655 memcpy(&Bits, &Float, sizeof(Float));
656 return Bits;
657}
658
659/// A and B are either alignments or offsets. Return the minimum alignment that
660/// may be assumed after adding the two together.
661constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
662 // The largest power of 2 that divides both A and B.
663 //
664 // Replace "-Value" by "1+~Value" in the following commented code to avoid
665 // MSVC warning C4146
666 // return (A | B) & -(A | B);
667 return (A | B) & (1 + ~(A | B));
668}
669
670/// Returns the next power of two (in 64-bits) that is strictly greater than A.
671/// Returns zero on overflow.
672inline uint64_t NextPowerOf2(uint64_t A) {
673 A |= (A >> 1);
674 A |= (A >> 2);
675 A |= (A >> 4);
676 A |= (A >> 8);
677 A |= (A >> 16);
678 A |= (A >> 32);
679 return A + 1;
680}
681
682/// Returns the power of two which is less than or equal to the given value.
683/// Essentially, it is a floor operation across the domain of powers of two.
684inline uint64_t PowerOf2Floor(uint64_t A) {
685 if (!A) return 0;
686 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
687}
688
689/// Returns the power of two which is greater than or equal to the given value.
690/// Essentially, it is a ceil operation across the domain of powers of two.
691inline uint64_t PowerOf2Ceil(uint64_t A) {
692 if (!A)
693 return 0;
694 return NextPowerOf2(A - 1);
695}
696
697/// Returns the next integer (mod 2**64) that is greater than or equal to
698/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
699///
700/// If non-zero \p Skew is specified, the return value will be a minimal
701/// integer that is greater than or equal to \p Value and equal to
702/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
703/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
704///
705/// Examples:
706/// \code
707/// alignTo(5, 8) = 8
708/// alignTo(17, 8) = 24
709/// alignTo(~0LL, 8) = 0
710/// alignTo(321, 255) = 510
711///
712/// alignTo(5, 8, 7) = 7
713/// alignTo(17, 8, 1) = 17
714/// alignTo(~0LL, 8, 3) = 3
715/// alignTo(321, 255, 42) = 552
716/// \endcode
717inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
718 assert(Align != 0u && "Align can't be 0.");
719 Skew %= Align;
720 return (Value + Align - 1 - Skew) / Align * Align + Skew;
721}
722
723/// Returns the next integer (mod 2**64) that is greater than or equal to
724/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
725template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
726 static_assert(Align != 0u, "Align must be non-zero");
727 return (Value + Align - 1) / Align * Align;
728}
729
730/// Returns the integer ceil(Numerator / Denominator).
731inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
732 return alignTo(Numerator, Denominator) / Denominator;
733}
734
735/// Returns the largest uint64_t less than or equal to \p Value and is
736/// \p Skew mod \p Align. \p Align must be non-zero
737inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
738 assert(Align != 0u && "Align can't be 0.");
739 Skew %= Align;
740 return (Value - Skew) / Align * Align + Skew;
741}
742
743/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
744/// Requires 0 < B <= 32.
745template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
746 static_assert(B > 0, "Bit width can't be 0.");
747 static_assert(B <= 32, "Bit width out of range.");
748 return int32_t(X << (32 - B)) >> (32 - B);
749}
750
751/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
752/// Requires 0 < B < 32.
753inline int32_t SignExtend32(uint32_t X, unsigned B) {
754 assert(B > 0 && "Bit width can't be 0.");
755 assert(B <= 32 && "Bit width out of range.");
756 return int32_t(X << (32 - B)) >> (32 - B);
757}
758
759/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
760/// Requires 0 < B <= 64.
761template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 64, "Bit width out of range.");
764 return int64_t(x << (64 - B)) >> (64 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
768/// Requires 0 < B <= 64.
769inline int64_t SignExtend64(uint64_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 64 && "Bit width out of range.");
772 return int64_t(X << (64 - B)) >> (64 - B);
773}
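
The runtime 32-bit and the 64-bit variants behave the same way; a short illustrative sketch (helper name assumed):

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void checkSignExtend64Sketch() {
  assert(llvm::SignExtend32(0x80, 8) == -128);       // runtime bit width
  assert(llvm::SignExtend64<16>(0xFFFFull) == -1);   // all 16 bits set
  assert(llvm::SignExtend64(0x8000, 16) == -32768);  // 16-bit minimum value
}
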
774
775/// Subtract two unsigned integers, X and Y, of type T and return the absolute
776/// value of the result.
777template <typename T>
778typename std::enable_if<std::is_unsigned<T>::value, T>::type
779AbsoluteDifference(T X, T Y) {
780 return std::max(X, Y) - std::min(X, Y);
781}
782
783/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
784/// maximum representable value of T on overflow. ResultOverflowed indicates if
785/// the result is larger than the maximum representable value of type T.
786template <typename T>
787typename std::enable_if<std::is_unsigned<T>::value, T>::type
788SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
789 bool Dummy;
790 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
791 // Hacker's Delight, p. 29
792 T Z = X + Y;
793 Overflowed = (Z < X || Z < Y);
794 if (Overflowed)
795 return std::numeric_limits<T>::max();
796 else
797 return Z;
798}
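
A small sketch (illustrative only; helper name assumed) of the clamping behavior for uint8_t, whose maximum is 255:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void checkSaturatingAddSketch() {
  bool Ov = false;
  // No overflow: the exact sum is returned and the flag stays clear.
  assert(llvm::SaturatingAdd<uint8_t>(100, 100, &Ov) == 200 && !Ov);
  // 200 + 100 wraps past 255, so the result clamps and the flag is set.
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Ov) == 255 && Ov);
}
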
799
800/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
801/// maximum representable value of T on overflow. ResultOverflowed indicates if
802/// the result is larger than the maximum representable value of type T.
803template <typename T>
804typename std::enable_if<std::is_unsigned<T>::value, T>::type
805SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
806 bool Dummy;
807 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
808
809 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
810 // because it fails for uint16_t (where multiplication can have undefined
811 // behavior due to promotion to int), and requires a division in addition
812 // to the multiplication.
813
814 Overflowed = false;
815
816 // Log2(Z) would be either Log2Z or Log2Z + 1.
817 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
818 // will necessarily be less than Log2Max as desired.
819 int Log2Z = Log2_64(X) + Log2_64(Y);
820 const T Max = std::numeric_limits<T>::max();
821 int Log2Max = Log2_64(Max);
822 if (Log2Z < Log2Max) {
823 return X * Y;
824 }
825 if (Log2Z > Log2Max) {
826 Overflowed = true;
827 return Max;
828 }
829
830 // We're going to use the top bit, and maybe overflow one
831 // bit past it. Multiply all but the bottom bit then add
832 // that on at the end.
833 T Z = (X >> 1) * Y;
834 if (Z & ~(Max >> 1)) {
835 Overflowed = true;
836 return Max;
837 }
838 Z <<= 1;
839 if (X & 1)
840 return SaturatingAdd(Z, Y, ResultOverflowed);
841
842 return Z;
843}
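
A sketch of how the three cases above play out for uint8_t, where Max = 255 and Log2Max = 7 (illustrative only; helper name assumed):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void checkSaturatingMultiplySketch() {
  bool Ov = false;
  // Log2_64(10) + Log2_64(12) = 3 + 3 < 7: plain multiply, no overflow.
  assert(llvm::SaturatingMultiply<uint8_t>(10, 12, &Ov) == 120 && !Ov);
  // Log2_64(16) + Log2_64(16) = 8 > 7: definite overflow, clamp to Max.
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Ov) == 255 && Ov);
  // 3 + 4 == 7: borderline case; 15 * 17 = 255 still fits exactly.
  assert(llvm::SaturatingMultiply<uint8_t>(15, 17, &Ov) == 255 && !Ov);
}
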
844
845/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
846/// the product. Clamp the result to the maximum representable value of T on
847/// overflow. ResultOverflowed indicates if the result is larger than the
848/// maximum representable value of type T.
849template <typename T>
850typename std::enable_if<std::is_unsigned<T>::value, T>::type
851SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
852 bool Dummy;
853 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
854
855 T Product = SaturatingMultiply(X, Y, &Overflowed);
856 if (Overflowed)
857 return Product;
858
859 return SaturatingAdd(A, Product, &Overflowed);
860}
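
Since this is a composition of the two helpers above, a brief sketch suffices (illustrative only; helper name assumed):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void checkSaturatingMultiplyAddSketch() {
  bool Ov = false;
  // 10 * 12 + 100 = 220 fits in uint8_t.
  assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 12, 100, &Ov) == 220 && !Ov);
  // 10 * 12 + 200 = 320 overflows, so the result clamps to 255.
  assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 12, 200, &Ov) == 255 && Ov);
}
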
861
862/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
863extern const float huge_valf;
864
865
866/// Add two signed integers, computing the two's complement truncated result,
867/// returning true if overflow occurred.
868template <typename T>
869typename std::enable_if<std::is_signed<T>::value, T>::type
870AddOverflow(T X, T Y, T &Result) {
871#if __has_builtin(__builtin_add_overflow)
872 return __builtin_add_overflow(X, Y, &Result);
873#else
874 // Perform the unsigned addition.
875 using U = typename std::make_unsigned<T>::type;
876 const U UX = static_cast<U>(X);
877 const U UY = static_cast<U>(Y);
878 const U UResult = UX + UY;
879
880 // Convert to signed.
881 Result = static_cast<T>(UResult);
882
883 // Adding two positive numbers should result in a positive number.
884 if (X > 0 && Y > 0)
885 return Result <= 0;
886 // Adding two negatives should result in a negative number.
887 if (X < 0 && Y < 0)
888 return Result >= 0;
889 return false;
890#endif
891}
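
A usage sketch (illustrative only; helper name assumed) showing both the non-overflowing and the wrapping case on a two's-complement int:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <climits>

static void checkAddOverflowSketch() {
  int Res = 0;
  // No overflow: the return value is falsy and Res holds the exact sum.
  assert(!llvm::AddOverflow(1, 2, Res) && Res == 3);
  // INT_MAX + 1 overflows; the truncated result wraps around to INT_MIN.
  assert(llvm::AddOverflow(INT_MAX, 1, Res) && Res == INT_MIN);
}
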
892
893/// Subtract two signed integers, computing the two's complement truncated
894/// result, returning true if an overflow occurred.
895template <typename T>
896typename std::enable_if<std::is_signed<T>::value, T>::type
897SubOverflow(T X, T Y, T &Result) {
898#if __has_builtin(__builtin_sub_overflow)
899 return __builtin_sub_overflow(X, Y, &Result);
900#else
901 // Perform the unsigned subtraction.
902 using U = typename std::make_unsigned<T>::type;
903 const U UX = static_cast<U>(X);
904 const U UY = static_cast<U>(Y);
905 const U UResult = UX - UY;
906
907 // Convert to signed.
908 Result = static_cast<T>(UResult);
909
910 // Subtracting a positive number from a negative results in a negative number.
911 if (X <= 0 && Y > 0)
912 return Result >= 0;
913 // Subtracting a negative number from a positive results in a positive number.
914 if (X >= 0 && Y < 0)
915 return Result <= 0;
916 return false;
917#endif
918}
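
The subtraction counterpart can be sketched the same way (illustrative only; helper name assumed):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <climits>

static void checkSubOverflowSketch() {
  int Res = 0;
  // No overflow: the exact difference is stored.
  assert(!llvm::SubOverflow(5, 3, Res) && Res == 2);
  // INT_MIN - 1 overflows; the truncated result wraps around to INT_MAX.
  assert(llvm::SubOverflow(INT_MIN, 1, Res) && Res == INT_MAX);
}
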
919
920
921/// Multiply two signed integers, computing the two's complement truncated
922/// result, returning true if an overflow occurred.
923template <typename T>
924typename std::enable_if<std::is_signed<T>::value, T>::type
925MulOverflow(T X, T Y, T &Result) {
926 // Perform the unsigned multiplication on absolute values.
927 using U = typename std::make_unsigned<T>::type;
928 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
929 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
930 const U UResult = UX * UY;
931
932 // Convert to signed.
933 const bool IsNegative = (X < 0) ^ (Y < 0);
934 Result = IsNegative ? (0 - UResult) : UResult;
935
936 // If any of the args was 0, result is 0 and no overflow occurs.
937 if (UX == 0 || UY == 0)
938 return false;
939
940 // UX and UY are in [1, 2^n], where n is the number of digits.
941 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
942 // positive) divided by an argument compares to the other.
943 if (IsNegative)
944 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
945 else
946 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
947}
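
A short sketch (illustrative only; helper name assumed) of the division-based overflow check on a two's-complement int:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <climits>

static void checkMulOverflowSketch() {
  int Res = 0;
  // No overflow: UX = 100 is not greater than INT_MAX / 3.
  assert(!llvm::MulOverflow(100, 3, Res) && Res == 300);
  // UX = INT_MAX exceeds INT_MAX / 2, so overflow is reported; the
  // truncated two's-complement product 2 * INT_MAX comes out as -2.
  assert(llvm::MulOverflow(INT_MAX, 2, Res) && Res == -2);
}
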
948
949} // End llvm namespace
950
951#endif