Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 1854, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
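The flagged line (1854) lies outside the excerpt reproduced below, but the warning class is simple: in C++, shifting an unsigned int by an amount greater than or equal to its width (32 here) is undefined behavior, so the analyzer reports any path on which the shift amount can reach 32. The sketch below is not the flagged code from AMDGPULegalizerInfo.cpp; the names (shiftRightGuarded, Value, ShiftAmt) are purely illustrative and show the usual fix of special-casing a full-width shift before the >> operator is reached.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper, not taken from AMDGPULegalizerInfo.cpp: shifts Value
// right by ShiftAmt. Shifting a 32-bit unsigned value by 32 or more would be
// undefined behavior, so the full-width case is handled explicitly.
static uint32_t shiftRightGuarded(uint32_t Value, unsigned ShiftAmt) {
  assert(ShiftAmt <= 32 && "shift amount out of range");
  if (ShiftAmt == 32) // a full-width shift must never reach the >> operator
    return 0;
  return Value >> ShiftAmt; // ShiftAmt < 32 here, so the shift is defined
}

int main() {
  std::printf("%u\n", shiftRightGuarded(0xffffffffu, 32)); // prints 0, no UB
  std::printf("%u\n", shiftRightGuarded(0xffffffffu, 4));  // prints 268435455
  return 0;
}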

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/include -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-01-13-084841-49055-1 -x c++ /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#if defined(_MSC_VER) || defined(__MINGW32__)
15// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16// from the Visual C++ cmath / math.h headers:
17// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18#define _USE_MATH_DEFINES
19#endif
20
21#include "AMDGPU.h"
22#include "AMDGPULegalizerInfo.h"
23#include "AMDGPUTargetMachine.h"
24#include "SIMachineFunctionInfo.h"
25#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/TargetOpcodes.h"
28#include "llvm/CodeGen/ValueTypes.h"
29#include "llvm/IR/DerivedTypes.h"
30#include "llvm/IR/DiagnosticInfo.h"
31#include "llvm/IR/Type.h"
32#include "llvm/Support/Debug.h"
33
34#define DEBUG_TYPE "amdgpu-legalinfo"
35
36using namespace llvm;
37using namespace LegalizeActions;
38using namespace LegalizeMutations;
39using namespace LegalityPredicates;
40
41
42static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 1024) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48 };
49}
50
51static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52 return [=](const LegalityQuery &Query) {
53 return Query.Types[TypeIdx].getSizeInBits() == Size;
54 };
55}
56
57static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58 return [=](const LegalityQuery &Query) {
59 const LLT Ty = Query.Types[TypeIdx];
60 return Ty.isVector() &&
61 Ty.getNumElements() % 2 != 0 &&
62 Ty.getElementType().getSizeInBits() < 32 &&
63 Ty.getSizeInBits() % 32 != 0;
64 };
65}
66
67static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 const LLT EltTy = Ty.getScalarType();
71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72 };
73}
74
75static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 const LLT EltTy = Ty.getElementType();
79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80 };
81}
82
83static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84 return [=](const LegalityQuery &Query) {
85 const LLT Ty = Query.Types[TypeIdx];
86 const LLT EltTy = Ty.getElementType();
87 unsigned Size = Ty.getSizeInBits();
88 unsigned Pieces = (Size + 63) / 64;
89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91 };
92}
93
94// Increase the number of vector elements to reach the next multiple of 32-bit
95// type.
96static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99
100 const LLT EltTy = Ty.getElementType();
101 const int Size = Ty.getSizeInBits();
102 const int EltSize = EltTy.getSizeInBits();
103 const int NextMul32 = (Size + 31) / 32;
104
105 assert(EltSize < 32);
106
107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109 };
110}
111
112static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113 return [=](const LegalityQuery &Query) {
114 const LLT QueryTy = Query.Types[TypeIdx];
115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116 };
117}
118
119static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120 return [=](const LegalityQuery &Query) {
121 const LLT QueryTy = Query.Types[TypeIdx];
122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123 };
124}
125
126static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT QueryTy = Query.Types[TypeIdx];
129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130 };
131}
132
133// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134// v2s16.
135static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
138 if (Ty.isVector()) {
139 const int EltSize = Ty.getElementType().getSizeInBits();
140 return EltSize == 32 || EltSize == 64 ||
141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142 EltSize == 128 || EltSize == 256;
143 }
144
145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146 };
147}
148
149static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150 return [=](const LegalityQuery &Query) {
151 return Query.Types[TypeIdx].getElementType() == Type;
152 };
153}
154
155static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156 return [=](const LegalityQuery &Query) {
157 const LLT Ty = Query.Types[TypeIdx];
158 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160 };
161}
162
163AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164 const GCNTargetMachine &TM)
165 : ST(ST_) {
166 using namespace TargetOpcode;
167
168 auto GetAddrSpacePtr = [&TM](unsigned AS) {
169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170 };
171
172 const LLT S1 = LLT::scalar(1);
173 const LLT S8 = LLT::scalar(8);
174 const LLT S16 = LLT::scalar(16);
175 const LLT S32 = LLT::scalar(32);
176 const LLT S64 = LLT::scalar(64);
177 const LLT S96 = LLT::scalar(96);
178 const LLT S128 = LLT::scalar(128);
179 const LLT S256 = LLT::scalar(256);
180 const LLT S1024 = LLT::scalar(1024);
181
182 const LLT V2S16 = LLT::vector(2, 16);
183 const LLT V4S16 = LLT::vector(4, 16);
184
185 const LLT V2S32 = LLT::vector(2, 32);
186 const LLT V3S32 = LLT::vector(3, 32);
187 const LLT V4S32 = LLT::vector(4, 32);
188 const LLT V5S32 = LLT::vector(5, 32);
189 const LLT V6S32 = LLT::vector(6, 32);
190 const LLT V7S32 = LLT::vector(7, 32);
191 const LLT V8S32 = LLT::vector(8, 32);
192 const LLT V9S32 = LLT::vector(9, 32);
193 const LLT V10S32 = LLT::vector(10, 32);
194 const LLT V11S32 = LLT::vector(11, 32);
195 const LLT V12S32 = LLT::vector(12, 32);
196 const LLT V13S32 = LLT::vector(13, 32);
197 const LLT V14S32 = LLT::vector(14, 32);
198 const LLT V15S32 = LLT::vector(15, 32);
199 const LLT V16S32 = LLT::vector(16, 32);
200 const LLT V32S32 = LLT::vector(32, 32);
201
202 const LLT V2S64 = LLT::vector(2, 64);
203 const LLT V3S64 = LLT::vector(3, 64);
204 const LLT V4S64 = LLT::vector(4, 64);
205 const LLT V5S64 = LLT::vector(5, 64);
206 const LLT V6S64 = LLT::vector(6, 64);
207 const LLT V7S64 = LLT::vector(7, 64);
208 const LLT V8S64 = LLT::vector(8, 64);
209 const LLT V16S64 = LLT::vector(16, 64);
210
211 std::initializer_list<LLT> AllS32Vectors =
212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214 std::initializer_list<LLT> AllS64Vectors =
215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216
217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224
225 const LLT CodePtr = FlatPtr;
226
227 const std::initializer_list<LLT> AddrSpaces64 = {
228 GlobalPtr, ConstantPtr, FlatPtr
229 };
230
231 const std::initializer_list<LLT> AddrSpaces32 = {
232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233 };
234
235 const std::initializer_list<LLT> FPTypesBase = {
236 S32, S64
237 };
238
239 const std::initializer_list<LLT> FPTypes16 = {
240 S32, S64, S16
241 };
242
243 const std::initializer_list<LLT> FPTypesPK16 = {
244 S32, S64, S16, V2S16
245 };
246
247 setAction({G_BRCOND, S1}, Legal); // VCC branches
248 setAction({G_BRCOND, S32}, Legal); // SCC branches
249
250 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251 // elements for v3s16
252 getActionDefinitionsBuilder(G_PHI)
253 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254 .legalFor(AllS32Vectors)
255 .legalFor(AllS64Vectors)
256 .legalFor(AddrSpaces64)
257 .legalFor(AddrSpaces32)
258 .clampScalar(0, S32, S256)
259 .widenScalarToNextPow2(0, 32)
260 .clampMaxNumElements(0, S32, 16)
261 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262 .legalIf(isPointer(0));
263
264 if (ST.has16BitInsts()) {
265 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266 .legalFor({S32, S16})
267 .clampScalar(0, S16, S32)
268 .scalarize(0);
269 } else {
270 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271 .legalFor({S32})
272 .clampScalar(0, S32, S32)
273 .scalarize(0);
274 }
275
276 // FIXME: Not really legal. Placeholder for custom lowering.
277 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278 .legalFor({S32, S64})
279 .clampScalar(0, S32, S64)
280 .widenScalarToNextPow2(0, 32)
281 .scalarize(0);
282
283 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
284 .legalFor({S32})
285 .clampScalar(0, S32, S32)
286 .scalarize(0);
287
288 // Report legal for any types we can handle anywhere. For the cases only legal
289 // on the SALU, RegBankSelect will be able to re-legalize.
290 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292 .clampScalar(0, S32, S64)
293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295 .widenScalarToNextPow2(0)
296 .scalarize(0);
297
298 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300 .legalFor({{S32, S1}, {S32, S32}})
301 .clampScalar(0, S32, S32)
302 .scalarize(0); // TODO: Implement.
303
304 getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
305 .lower();
306
307 getActionDefinitionsBuilder(G_BITCAST)
308 // Don't worry about the size constraint.
309 .legalIf(all(isRegisterType(0), isRegisterType(1)))
310 // FIXME: Testing hack
311 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
312
313 getActionDefinitionsBuilder(G_FCONSTANT)
314 .legalFor({S32, S64, S16})
315 .clampScalar(0, S16, S64);
316
317 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
318 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
319 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
321 .clampScalarOrElt(0, S32, S1024)
322 .legalIf(isMultiple32(0))
323 .widenScalarToNextPow2(0, 32)
324 .clampMaxNumElements(0, S32, 16);
325
326
327 // FIXME: i1 operands to intrinsics should always be legal, but other i1
328 // values may not be legal. We need to figure out how to distinguish
329 // between these two scenarios.
330 getActionDefinitionsBuilder(G_CONSTANT)
331 .legalFor({S1, S32, S64, S16, GlobalPtr,
332 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333 .clampScalar(0, S32, S64)
334 .widenScalarToNextPow2(0)
335 .legalIf(isPointer(0));
336
337 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340
341
342 auto &FPOpActions = getActionDefinitionsBuilder(
343 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344 .legalFor({S32, S64});
345 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346 .customFor({S32, S64});
347 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348 .customFor({S32, S64});
349
350 if (ST.has16BitInsts()) {
351 if (ST.hasVOP3PInsts())
352 FPOpActions.legalFor({S16, V2S16});
353 else
354 FPOpActions.legalFor({S16});
355
356 TrigActions.customFor({S16});
357 FDIVActions.customFor({S16});
358 }
359
360 auto &MinNumMaxNum = getActionDefinitionsBuilder({
361 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362
363 if (ST.hasVOP3PInsts()) {
364 MinNumMaxNum.customFor(FPTypesPK16)
365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366 .clampMaxNumElements(0, S16, 2)
367 .clampScalar(0, S16, S64)
368 .scalarize(0);
369 } else if (ST.has16BitInsts()) {
370 MinNumMaxNum.customFor(FPTypes16)
371 .clampScalar(0, S16, S64)
372 .scalarize(0);
373 } else {
374 MinNumMaxNum.customFor(FPTypesBase)
375 .clampScalar(0, S32, S64)
376 .scalarize(0);
377 }
378
379 if (ST.hasVOP3PInsts())
380 FPOpActions.clampMaxNumElements(0, S16, 2);
381
382 FPOpActions
383 .scalarize(0)
384 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385
386 TrigActions
387 .scalarize(0)
388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389
390 FDIVActions
391 .scalarize(0)
392 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393
394 getActionDefinitionsBuilder({G_FNEG, G_FABS})
395 .legalFor(FPTypesPK16)
396 .clampMaxNumElements(0, S16, 2)
397 .scalarize(0)
398 .clampScalar(0, S16, S64);
399
400 // TODO: Implement
401 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
402
403 if (ST.has16BitInsts()) {
404 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
405 .legalFor({S32, S64, S16})
406 .scalarize(0)
407 .clampScalar(0, S16, S64);
408 } else {
409 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
410 .legalFor({S32, S64})
411 .scalarize(0)
412 .clampScalar(0, S32, S64);
413 }
414
415 getActionDefinitionsBuilder(G_FPTRUNC)
416 .legalFor({{S32, S64}, {S16, S32}})
417 .scalarize(0);
418
419 getActionDefinitionsBuilder(G_FPEXT)
420 .legalFor({{S64, S32}, {S32, S16}})
421 .lowerFor({{S64, S16}}) // FIXME: Implement
422 .scalarize(0);
423
424 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
425 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
426
427 getActionDefinitionsBuilder(G_FSUB)
428 // Use actual fsub instruction
429 .legalFor({S32})
430 // Must use fadd + fneg
431 .lowerFor({S64, S16, V2S16})
432 .scalarize(0)
433 .clampScalar(0, S32, S64);
434
435 // Whether this is legal depends on the floating point mode for the function.
436 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
437 if (ST.hasMadF16())
438 FMad.customFor({S32, S16});
439 else
440 FMad.customFor({S32});
441 FMad.scalarize(0)
442 .lower();
443
444 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
445 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
446 {S32, S1}, {S64, S1}, {S16, S1},
447 {S96, S32},
448 // FIXME: Hack
449 {S64, LLT::scalar(33)},
450 {S32, S8}, {S32, LLT::scalar(24)}})
451 .scalarize(0)
452 .clampScalar(0, S32, S64);
453
454 // TODO: Split s1->s64 during regbankselect for VALU.
455 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
456 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
457 .lowerFor({{S32, S64}})
458 .lowerIf(typeIs(1, S1))
459 .customFor({{S64, S64}});
460 if (ST.has16BitInsts())
461 IToFP.legalFor({{S16, S16}});
462 IToFP.clampScalar(1, S32, S64)
463 .scalarize(0);
464
465 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
466 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
467 if (ST.has16BitInsts())
468 FPToI.legalFor({{S16, S16}});
469 else
470 FPToI.minScalar(1, S32);
471
472 FPToI.minScalar(0, S32)
473 .scalarize(0);
474
475 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
476 .scalarize(0)
477 .lower();
478
479 if (ST.has16BitInsts()) {
480 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
481 .legalFor({S16, S32, S64})
482 .clampScalar(0, S16, S64)
483 .scalarize(0);
484 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
485 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
486 .legalFor({S32, S64})
487 .clampScalar(0, S32, S64)
488 .scalarize(0);
489 } else {
490 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
491 .legalFor({S32})
492 .customFor({S64})
493 .clampScalar(0, S32, S64)
494 .scalarize(0);
495 }
496
497 getActionDefinitionsBuilder(G_PTR_ADD)
498 .legalForCartesianProduct(AddrSpaces64, {S64})
499 .legalForCartesianProduct(AddrSpaces32, {S32})
500 .scalarize(0);
501
502 getActionDefinitionsBuilder(G_PTR_MASK)
503 .scalarize(0)
504 .alwaysLegal();
505
506 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
507
508 auto &CmpBuilder =
509 getActionDefinitionsBuilder(G_ICMP)
510 // The compare output type differs based on the register bank of the output,
511 // so make both s1 and s32 legal.
512 //
513 // Scalar compares producing output in scc will be promoted to s32, as that
514 // is the allocatable register type that will be needed for the copy from
515 // scc. This will be promoted during RegBankSelect, and we assume something
516 // before that won't try to use s32 result types.
517 //
518 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
519 // bank.
520 .legalForCartesianProduct(
521 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
522 .legalForCartesianProduct(
523 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
524 if (ST.has16BitInsts()) {
525 CmpBuilder.legalFor({{S1, S16}});
526 }
527
528 CmpBuilder
529 .widenScalarToNextPow2(1)
530 .clampScalar(1, S32, S64)
531 .scalarize(0)
532 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
533
534 getActionDefinitionsBuilder(G_FCMP)
535 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
536 .widenScalarToNextPow2(1)
537 .clampScalar(1, S32, S64)
538 .scalarize(0);
539
540 // FIXME: fexp, flog2, flog10 needs to be custom lowered.
541 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
542 G_FLOG, G_FLOG2, G_FLOG10})
543 .legalFor({S32})
544 .scalarize(0);
545
546 // The 64-bit versions produce 32-bit results, but only on the SALU.
547 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
548 G_CTTZ, G_CTTZ_ZERO_UNDEF,
549 G_CTPOP})
550 .legalFor({{S32, S32}, {S32, S64}})
551 .clampScalar(0, S32, S32)
552 .clampScalar(1, S32, S64)
553 .scalarize(0)
554 .widenScalarToNextPow2(0, 32)
555 .widenScalarToNextPow2(1, 32);
556
557 // TODO: Expand for > s32
558 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
559 .legalFor({S32})
560 .clampScalar(0, S32, S32)
561 .scalarize(0);
562
563 if (ST.has16BitInsts()) {
564 if (ST.hasVOP3PInsts()) {
565 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
566 .legalFor({S32, S16, V2S16})
567 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
568 .clampMaxNumElements(0, S16, 2)
569 .clampScalar(0, S16, S32)
570 .widenScalarToNextPow2(0)
571 .scalarize(0);
572 } else {
573 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
574 .legalFor({S32, S16})
575 .widenScalarToNextPow2(0)
576 .clampScalar(0, S16, S32)
577 .scalarize(0);
578 }
579 } else {
580 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
581 .legalFor({S32})
582 .clampScalar(0, S32, S32)
583 .widenScalarToNextPow2(0)
584 .scalarize(0);
585 }
586
587 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
588 return [=](const LegalityQuery &Query) {
589 return Query.Types[TypeIdx0].getSizeInBits() <
590 Query.Types[TypeIdx1].getSizeInBits();
591 };
592 };
593
594 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
595 return [=](const LegalityQuery &Query) {
596 return Query.Types[TypeIdx0].getSizeInBits() >
597 Query.Types[TypeIdx1].getSizeInBits();
598 };
599 };
600
601 getActionDefinitionsBuilder(G_INTTOPTR)
602 // List the common cases
603 .legalForCartesianProduct(AddrSpaces64, {S64})
604 .legalForCartesianProduct(AddrSpaces32, {S32})
605 .scalarize(0)
606 // Accept any address space as long as the size matches
607 .legalIf(sameSize(0, 1))
608 .widenScalarIf(smallerThan(1, 0),
609 [](const LegalityQuery &Query) {
610 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
611 })
612 .narrowScalarIf(greaterThan(1, 0),
613 [](const LegalityQuery &Query) {
614 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
615 });
616
617 getActionDefinitionsBuilder(G_PTRTOINT)
618 // List the common cases
619 .legalForCartesianProduct(AddrSpaces64, {S64})
620 .legalForCartesianProduct(AddrSpaces32, {S32})
621 .scalarize(0)
622 // Accept any address space as long as the size matches
623 .legalIf(sameSize(0, 1))
624 .widenScalarIf(smallerThan(0, 1),
625 [](const LegalityQuery &Query) {
626 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
627 })
628 .narrowScalarIf(
629 greaterThan(0, 1),
630 [](const LegalityQuery &Query) {
631 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
632 });
633
634 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
635 .scalarize(0)
636 .custom();
637
638 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
639 // handle some operations by just promoting the register during
640 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
641 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
642 switch (AS) {
643 // FIXME: Private element size.
644 case AMDGPUAS::PRIVATE_ADDRESS:
645 return 32;
646 // FIXME: Check subtarget
647 case AMDGPUAS::LOCAL_ADDRESS:
648 return ST.useDS128() ? 128 : 64;
649
650 // Treat constant and global as identical. SMRD loads are sometimes usable
651 // for global loads (ideally constant address space should be eliminated)
652 // depending on the context. Legality cannot be context dependent, but
653 // RegBankSelect can split the load as necessary depending on the pointer
654 // register bank/uniformity and if the memory is invariant or not written in
655 // a kernel.
656 case AMDGPUAS::CONSTANT_ADDRESS:
657 case AMDGPUAS::GLOBAL_ADDRESS:
658 return 512;
659 default:
660 return 128;
661 }
662 };
663
664 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
665 const LLT DstTy = Query.Types[0];
666
667 // Split vector extloads.
668 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
669 unsigned Align = Query.MMODescrs[0].AlignInBits;
670
671 if (MemSize < DstTy.getSizeInBits())
672 MemSize = std::max(MemSize, Align);
673
674 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
675 return true;
676
677 const LLT PtrTy = Query.Types[1];
678 unsigned AS = PtrTy.getAddressSpace();
679 if (MemSize > maxSizeForAddrSpace(AS))
680 return true;
681
682 // Catch weird sized loads that don't evenly divide into the access sizes
683 // TODO: May be able to widen depending on alignment etc.
684 unsigned NumRegs = MemSize / 32;
685 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
686 return true;
687
688 if (Align < MemSize) {
689 const SITargetLowering *TLI = ST.getTargetLowering();
690 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
691 }
692
693 return false;
694 };
695
696 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
697 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
698 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
699
700 // TODO: Refine based on subtargets which support unaligned access or 128-bit
701 // LDS
702 // TODO: Unsupported flat for SI.
703
704 for (unsigned Op : {G_LOAD, G_STORE}) {
705 const bool IsStore = Op == G_STORE;
706
707 auto &Actions = getActionDefinitionsBuilder(Op);
708 // Whitelist the common cases.
709 // TODO: Pointer loads
710 // TODO: Wide constant loads
711 // TODO: Only CI+ has 3x loads
712 // TODO: Loads to s16 on gfx9
713 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
714 {V2S32, GlobalPtr, 64, GlobalAlign32},
715 {V3S32, GlobalPtr, 96, GlobalAlign32},
716 {S96, GlobalPtr, 96, GlobalAlign32},
717 {V4S32, GlobalPtr, 128, GlobalAlign32},
718 {S128, GlobalPtr, 128, GlobalAlign32},
719 {S64, GlobalPtr, 64, GlobalAlign32},
720 {V2S64, GlobalPtr, 128, GlobalAlign32},
721 {V2S16, GlobalPtr, 32, GlobalAlign32},
722 {S32, GlobalPtr, 8, GlobalAlign8},
723 {S32, GlobalPtr, 16, GlobalAlign16},
724
725 {S32, LocalPtr, 32, 32},
726 {S64, LocalPtr, 64, 32},
727 {V2S32, LocalPtr, 64, 32},
728 {S32, LocalPtr, 8, 8},
729 {S32, LocalPtr, 16, 16},
730 {V2S16, LocalPtr, 32, 32},
731
732 {S32, PrivatePtr, 32, 32},
733 {S32, PrivatePtr, 8, 8},
734 {S32, PrivatePtr, 16, 16},
735 {V2S16, PrivatePtr, 32, 32},
736
737 {S32, FlatPtr, 32, GlobalAlign32},
738 {S32, FlatPtr, 16, GlobalAlign16},
739 {S32, FlatPtr, 8, GlobalAlign8},
740 {V2S16, FlatPtr, 32, GlobalAlign32},
741
742 {S32, ConstantPtr, 32, GlobalAlign32},
743 {V2S32, ConstantPtr, 64, GlobalAlign32},
744 {V3S32, ConstantPtr, 96, GlobalAlign32},
745 {V4S32, ConstantPtr, 128, GlobalAlign32},
746 {S64, ConstantPtr, 64, GlobalAlign32},
747 {S128, ConstantPtr, 128, GlobalAlign32},
748 {V2S32, ConstantPtr, 32, GlobalAlign32}});
749 Actions
750 .customIf(typeIs(1, Constant32Ptr))
751 .narrowScalarIf(
752 [=](const LegalityQuery &Query) -> bool {
753 return !Query.Types[0].isVector() && needToSplitLoad(Query);
754 },
755 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
756 const LLT DstTy = Query.Types[0];
757 const LLT PtrTy = Query.Types[1];
758
759 const unsigned DstSize = DstTy.getSizeInBits();
760 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
761
762 // Split extloads.
763 if (DstSize > MemSize)
764 return std::make_pair(0, LLT::scalar(MemSize));
765
766 if (DstSize > 32 && (DstSize % 32 != 0)) {
767 // FIXME: Need a way to specify non-extload of larger size if
768 // suitably aligned.
769 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
770 }
771
772 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
773 if (MemSize > MaxSize)
774 return std::make_pair(0, LLT::scalar(MaxSize));
775
776 unsigned Align = Query.MMODescrs[0].AlignInBits;
777 return std::make_pair(0, LLT::scalar(Align));
778 })
779 .fewerElementsIf(
780 [=](const LegalityQuery &Query) -> bool {
781 return Query.Types[0].isVector() && needToSplitLoad(Query);
782 },
783 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
784 const LLT DstTy = Query.Types[0];
785 const LLT PtrTy = Query.Types[1];
786
787 LLT EltTy = DstTy.getElementType();
788 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
789
790 // Split if it's too large for the address space.
791 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
792 unsigned NumElts = DstTy.getNumElements();
793 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
794
795 // FIXME: Refine when odd breakdowns handled
796 // The scalars will need to be re-legalized.
797 if (NumPieces == 1 || NumPieces >= NumElts ||
798 NumElts % NumPieces != 0)
799 return std::make_pair(0, EltTy);
800
801 return std::make_pair(0,
802 LLT::vector(NumElts / NumPieces, EltTy));
803 }
804
805 // Need to split because of alignment.
806 unsigned Align = Query.MMODescrs[0].AlignInBits;
807 unsigned EltSize = EltTy.getSizeInBits();
808 if (EltSize > Align &&
809 (EltSize / Align < DstTy.getNumElements())) {
810 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
811 }
812
813 // May need relegalization for the scalars.
814 return std::make_pair(0, EltTy);
815 })
816 .minScalar(0, S32);
817
818 if (IsStore)
819 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
820
821 // TODO: Need a bitcast lower option?
822 Actions
823 .legalIf([=](const LegalityQuery &Query) {
824 const LLT Ty0 = Query.Types[0];
825 unsigned Size = Ty0.getSizeInBits();
826 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
827 unsigned Align = Query.MMODescrs[0].AlignInBits;
828
829 // FIXME: Widening store from alignment not valid.
830 if (MemSize < Size)
831 MemSize = std::max(MemSize, Align);
832
833 // No extending vector loads.
834 if (Size > MemSize && Ty0.isVector())
835 return false;
836
837 switch (MemSize) {
838 case 8:
839 case 16:
840 return Size == 32;
841 case 32:
842 case 64:
843 case 128:
844 return true;
845 case 96:
846 return ST.hasDwordx3LoadStores();
847 case 256:
848 case 512:
849 return true;
850 default:
851 return false;
852 }
853 })
854 .widenScalarToNextPow2(0)
855 // TODO: v3s32->v4s32 with alignment
856 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
857 }
858
859 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
860 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
861 {S32, GlobalPtr, 16, 2 * 8},
862 {S32, LocalPtr, 8, 8},
863 {S32, LocalPtr, 16, 16},
864 {S32, PrivatePtr, 8, 8},
865 {S32, PrivatePtr, 16, 16},
866 {S32, ConstantPtr, 8, 8},
867 {S32, ConstantPtr, 16, 2 * 8}});
868 if (ST.hasFlatAddressSpace()) {
869 ExtLoads.legalForTypesWithMemDesc(
870 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
871 }
872
873 ExtLoads.clampScalar(0, S32, S32)
874 .widenScalarToNextPow2(0)
875 .unsupportedIfMemSizeNotPow2()
876 .lower();
877
878 auto &Atomics = getActionDefinitionsBuilder(
879 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
880 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
881 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
882 G_ATOMICRMW_UMIN})
883 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
884 {S64, GlobalPtr}, {S64, LocalPtr}});
885 if (ST.hasFlatAddressSpace()) {
886 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
887 }
888
889 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
890 .legalFor({{S32, LocalPtr}});
891
892 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
893 // demarshalling
894 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
895 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
896 {S32, FlatPtr}, {S64, FlatPtr}})
897 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
898 {S32, RegionPtr}, {S64, RegionPtr}});
899
900 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
901 .lower();
902
903 // TODO: Pointer types, any 32-bit or 64-bit vector
904
905 // Condition should be s32 for scalar, s1 for vector.
906 getActionDefinitionsBuilder(G_SELECT)
907 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
908 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
909 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
910 .clampScalar(0, S16, S64)
911 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
913 .scalarize(1)
914 .clampMaxNumElements(0, S32, 2)
915 .clampMaxNumElements(0, LocalPtr, 2)
916 .clampMaxNumElements(0, PrivatePtr, 2)
917 .scalarize(0)
918 .widenScalarToNextPow2(0)
919 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
920
921 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
922 // be more flexible with the shift amount type.
923 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
924 .legalFor({{S32, S32}, {S64, S32}});
925 if (ST.has16BitInsts()) {
926 if (ST.hasVOP3PInsts()) {
927 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
928 .clampMaxNumElements(0, S16, 2);
929 } else
930 Shifts.legalFor({{S16, S32}, {S16, S16}});
931
932 // TODO: Support 16-bit shift amounts
933 Shifts.clampScalar(1, S32, S32);
934 Shifts.clampScalar(0, S16, S64);
935 Shifts.widenScalarToNextPow2(0, 16);
936 } else {
937 // Make sure we legalize the shift amount type first, as the general
938 // expansion for the shifted type will produce much worse code if it hasn't
939 // been truncated already.
940 Shifts.clampScalar(1, S32, S32);
941 Shifts.clampScalar(0, S32, S64);
942 Shifts.widenScalarToNextPow2(0, 32);
943 }
944 Shifts.scalarize(0);
945
946 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
947 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
948 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
949 unsigned IdxTypeIdx = 2;
950
951 getActionDefinitionsBuilder(Op)
952 .customIf([=](const LegalityQuery &Query) {
953 const LLT EltTy = Query.Types[EltTypeIdx];
954 const LLT VecTy = Query.Types[VecTypeIdx];
955 const LLT IdxTy = Query.Types[IdxTypeIdx];
956 return (EltTy.getSizeInBits() == 16 ||
957 EltTy.getSizeInBits() % 32 == 0) &&
958 VecTy.getSizeInBits() % 32 == 0 &&
959 VecTy.getSizeInBits() <= 1024 &&
960 IdxTy.getSizeInBits() == 32;
961 })
962 .clampScalar(EltTypeIdx, S32, S64)
963 .clampScalar(VecTypeIdx, S32, S64)
964 .clampScalar(IdxTypeIdx, S32, S32);
965 }
966
967 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
968 .unsupportedIf([=](const LegalityQuery &Query) {
969 const LLT &EltTy = Query.Types[1].getElementType();
970 return Query.Types[0] != EltTy;
971 });
972
973 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
974 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
975 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
976
977 // FIXME: Doesn't handle extract of illegal sizes.
978 getActionDefinitionsBuilder(Op)
979 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
980 // FIXME: Multiples of 16 should not be legal.
981 .legalIf([=](const LegalityQuery &Query) {
982 const LLT BigTy = Query.Types[BigTyIdx];
983 const LLT LitTy = Query.Types[LitTyIdx];
984 return (BigTy.getSizeInBits() % 32 == 0) &&
985 (LitTy.getSizeInBits() % 16 == 0);
986 })
987 .widenScalarIf(
988 [=](const LegalityQuery &Query) {
989 const LLT BigTy = Query.Types[BigTyIdx];
990 return (BigTy.getScalarSizeInBits() < 16);
991 },
992 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
993 .widenScalarIf(
994 [=](const LegalityQuery &Query) {
995 const LLT LitTy = Query.Types[LitTyIdx];
996 return (LitTy.getScalarSizeInBits() < 16);
997 },
998 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
999 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1000 .widenScalarToNextPow2(BigTyIdx, 32);
1001
1002 }
1003
1004 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1005 .legalForCartesianProduct(AllS32Vectors, {S32})
1006 .legalForCartesianProduct(AllS64Vectors, {S64})
1007 .clampNumElements(0, V16S32, V32S32)
1008 .clampNumElements(0, V2S64, V16S64)
1009 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1010
1011 if (ST.hasScalarPackInsts())
1012 BuildVector.legalFor({V2S16, S32});
1013
1014 BuildVector
1015 .minScalarSameAs(1, 0)
1016 .legalIf(isRegisterType(0))
1017 .minScalarOrElt(0, S32);
1018
1019 if (ST.hasScalarPackInsts()) {
1020 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1021 .legalFor({V2S16, S32})
1022 .lower();
1023 } else {
1024 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1025 .lower();
1026 }
1027
1028 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1029 .legalIf(isRegisterType(0));
1030
1031 // TODO: Don't fully scalarize v2s16 pieces
1032 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1033
1034 // Merge/Unmerge
1035 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1036 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1037 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1038
1039 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1040 const LLT &Ty = Query.Types[TypeIdx];
1041 if (Ty.isVector()) {
1042 const LLT &EltTy = Ty.getElementType();
1043 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1044 return true;
1045 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1046 return true;
1047 }
1048 return false;
1049 };
1050
1051 auto &Builder = getActionDefinitionsBuilder(Op)
1052 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1053 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1054 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1055 // valid.
1056 .clampScalar(LitTyIdx, S16, S256)
1057 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1058 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1059 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1060 elementTypeIs(1, S16)),
1061 changeTo(1, V2S16))
1062 // Break up vectors with weird elements into scalars
1063 .fewerElementsIf(
1064 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1065 scalarize(0))
1066 .fewerElementsIf(
1067 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1068 scalarize(1))
1069 .clampScalar(BigTyIdx, S32, S1024)
1070 .lowerFor({{S16, V2S16}});
1071
1072 if (Op == G_MERGE_VALUES) {
1073 Builder.widenScalarIf(
1074 // TODO: Use 16-bit shifts if legal for 8-bit values?
1075 [=](const LegalityQuery &Query) {
1076 const LLT Ty = Query.Types[LitTyIdx];
1077 return Ty.getSizeInBits() < 32;
1078 },
1079 changeTo(LitTyIdx, S32));
1080 }
1081
1082 Builder.widenScalarIf(
1083 [=](const LegalityQuery &Query) {
1084 const LLT Ty = Query.Types[BigTyIdx];
1085 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1086 Ty.getSizeInBits() % 16 != 0;
1087 },
1088 [=](const LegalityQuery &Query) {
1089 // Pick the next power of 2, or a multiple of 64 over 128.
1090 // Whichever is smaller.
1091 const LLT &Ty = Query.Types[BigTyIdx];
1092 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1093 if (NewSizeInBits >= 256) {
1094 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1095 if (RoundedTo < NewSizeInBits)
1096 NewSizeInBits = RoundedTo;
1097 }
1098 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1099 })
1100 .legalIf([=](const LegalityQuery &Query) {
1101 const LLT &BigTy = Query.Types[BigTyIdx];
1102 const LLT &LitTy = Query.Types[LitTyIdx];
1103
1104 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1105 return false;
1106 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1107 return false;
1108
1109 return BigTy.getSizeInBits() % 16 == 0 &&
1110 LitTy.getSizeInBits() % 16 == 0 &&
1111 BigTy.getSizeInBits() <= 1024;
1112 })
1113 // Any vectors left are the wrong size. Scalarize them.
1114 .scalarize(0)
1115 .scalarize(1);
1116 }
1117
1118 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1119
1120 getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1121
1122 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1123 .legalFor({S64});
1124
1125 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1126 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1127 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1128 .unsupported();
1129
1130 computeTables();
1131 verify(*ST.getInstrInfo());
1132}
1133
1134bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1135 MachineRegisterInfo &MRI,
1136 MachineIRBuilder &B,
1137 GISelChangeObserver &Observer) const {
1138 switch (MI.getOpcode()) {
1139 case TargetOpcode::G_ADDRSPACE_CAST:
1140 return legalizeAddrSpaceCast(MI, MRI, B);
1141 case TargetOpcode::G_FRINT:
1142 return legalizeFrint(MI, MRI, B);
1143 case TargetOpcode::G_FCEIL:
1144 return legalizeFceil(MI, MRI, B);
1145 case TargetOpcode::G_INTRINSIC_TRUNC:
1146 return legalizeIntrinsicTrunc(MI, MRI, B);
1147 case TargetOpcode::G_SITOFP:
1148 return legalizeITOFP(MI, MRI, B, true);
1149 case TargetOpcode::G_UITOFP:
1150 return legalizeITOFP(MI, MRI, B, false);
1151 case TargetOpcode::G_FMINNUM:
1152 case TargetOpcode::G_FMAXNUM:
1153 case TargetOpcode::G_FMINNUM_IEEE:
1154 case TargetOpcode::G_FMAXNUM_IEEE:
1155 return legalizeMinNumMaxNum(MI, MRI, B);
1156 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1157 return legalizeExtractVectorElt(MI, MRI, B);
1158 case TargetOpcode::G_INSERT_VECTOR_ELT:
1159 return legalizeInsertVectorElt(MI, MRI, B);
1160 case TargetOpcode::G_FSIN:
1161 case TargetOpcode::G_FCOS:
1162 return legalizeSinCos(MI, MRI, B);
1163 case TargetOpcode::G_GLOBAL_VALUE:
1164 return legalizeGlobalValue(MI, MRI, B);
1165 case TargetOpcode::G_LOAD:
1166 return legalizeLoad(MI, MRI, B, Observer);
1167 case TargetOpcode::G_FMAD:
1168 return legalizeFMad(MI, MRI, B);
1169 case TargetOpcode::G_FDIV:
1170 return legalizeFDIV(MI, MRI, B);
1171 case TargetOpcode::G_ATOMIC_CMPXCHG:
1172 return legalizeAtomicCmpXChg(MI, MRI, B);
1173 default:
1174 return false;
1175 }
1176
1177 llvm_unreachable("expected switch to return");
1178}
1179
1180Register AMDGPULegalizerInfo::getSegmentAperture(
1181 unsigned AS,
1182 MachineRegisterInfo &MRI,
1183 MachineIRBuilder &B) const {
1184 MachineFunction &MF = B.getMF();
1185 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1186 const LLT S32 = LLT::scalar(32);
1187
1188 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1189
1190 if (ST.hasApertureRegs()) {
1191 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1192 // getreg.
1193 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1194 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1195 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1196 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1197 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1198 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1199 unsigned Encoding =
1200 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1201 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1202 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1203
1204 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1205 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1206
1207 B.buildInstr(AMDGPU::S_GETREG_B32)
1208 .addDef(GetReg)
1209 .addImm(Encoding);
1210 MRI.setType(GetReg, S32);
1211
1212 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1213 B.buildInstr(TargetOpcode::G_SHL)
1214 .addDef(ApertureReg)
1215 .addUse(GetReg)
1216 .addUse(ShiftAmt.getReg(0));
1217
1218 return ApertureReg;
1219 }
1220
1221 Register QueuePtr = MRI.createGenericVirtualRegister(
1222 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1223
1224 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1225 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1226 return Register();
1227
1228 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1229 // private_segment_aperture_base_hi.
1230 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1231
1232 // TODO: can we be smarter about machine pointer info?
1233 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1234 MachineMemOperand *MMO = MF.getMachineMemOperand(
1235 PtrInfo,
1236 MachineMemOperand::MOLoad |
1237 MachineMemOperand::MODereferenceable |
1238 MachineMemOperand::MOInvariant,
1239 4,
1240 MinAlign(64, StructOffset));
1241
1242 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1243 Register LoadAddr;
1244
1245 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1246 B.buildLoad(LoadResult, LoadAddr, *MMO);
1247 return LoadResult;
1248}
1249
1250bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1251 MachineInstr &MI, MachineRegisterInfo &MRI,
1252 MachineIRBuilder &B) const {
1253 MachineFunction &MF = B.getMF();
1254
1255 B.setInstr(MI);
1256
1257 const LLT S32 = LLT::scalar(32);
1258 Register Dst = MI.getOperand(0).getReg();
1259 Register Src = MI.getOperand(1).getReg();
1260
1261 LLT DstTy = MRI.getType(Dst);
1262 LLT SrcTy = MRI.getType(Src);
1263 unsigned DestAS = DstTy.getAddressSpace();
1264 unsigned SrcAS = SrcTy.getAddressSpace();
1265
1266 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1267 // vector element.
1268 assert(!DstTy.isVector());
1269
1270 const AMDGPUTargetMachine &TM
1271 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1272
1273 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1274 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1275 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1276 return true;
1277 }
1278
1279 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1280 // Truncate.
1281 B.buildExtract(Dst, Src, 0);
1282 MI.eraseFromParent();
1283 return true;
1284 }
1285
1286 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1287 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1288 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1289
1290 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1291 // another. Merge operands are required to be the same type, but creating an
1292 // extra ptrtoint would be kind of pointless.
1293 auto HighAddr = B.buildConstant(
1294 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1295 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1296 MI.eraseFromParent();
1297 return true;
1298 }
1299
1300 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1301 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1302 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1303 unsigned NullVal = TM.getNullPointerValue(DestAS);
1304
1305 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1306 auto FlatNull = B.buildConstant(SrcTy, 0);
1307
1308 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1309
1310 // Extract low 32-bits of the pointer.
1311 B.buildExtract(PtrLo32, Src, 0);
1312
1313 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1314 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1315 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1316
1317 MI.eraseFromParent();
1318 return true;
1319 }
1320
1321 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1322 return false;
1323
1324 if (!ST.hasFlatAddressSpace())
1325 return false;
1326
1327 auto SegmentNull =
1328 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1329 auto FlatNull =
1330 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1331
1332 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1333 if (!ApertureReg.isValid())
1334 return false;
1335
1336 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1337 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1338
1339 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1340
1341 // Coerce the type of the low half of the result so we can use merge_values.
1342 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1343 B.buildInstr(TargetOpcode::G_PTRTOINT)
1344 .addDef(SrcAsInt)
1345 .addUse(Src);
1346
1347 // TODO: Should we allow mismatched types but matching sizes in merges to
1348 // avoid the ptrtoint?
1349 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1350 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1351
1352 MI.eraseFromParent();
1353 return true;
1354}
1355
1356bool AMDGPULegalizerInfo::legalizeFrint(
1357 MachineInstr &MI, MachineRegisterInfo &MRI,
1358 MachineIRBuilder &B) const {
1359 B.setInstr(MI);
1360
1361 Register Src = MI.getOperand(1).getReg();
1362 LLT Ty = MRI.getType(Src);
1363 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1364
1365 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1366 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1367
1368 auto C1 = B.buildFConstant(Ty, C1Val);
1369 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1370
1371 // TODO: Should this propagate fast-math-flags?
1372 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1373 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1374
1375 auto C2 = B.buildFConstant(Ty, C2Val);
1376 auto Fabs = B.buildFAbs(Ty, Src);
1377
1378 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1379 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1380 return true;
1381}
1382
1383bool AMDGPULegalizerInfo::legalizeFceil(
1384 MachineInstr &MI, MachineRegisterInfo &MRI,
1385 MachineIRBuilder &B) const {
1386 B.setInstr(MI);
1387
1388 const LLT S1 = LLT::scalar(1);
1389 const LLT S64 = LLT::scalar(64);
1390
1391 Register Src = MI.getOperand(1).getReg();
1392 assert(MRI.getType(Src) == S64);
1393
1394 // result = trunc(src)
1395 // if (src > 0.0 && src != result)
1396 // result += 1.0
1397
1398 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1399
1400 const auto Zero = B.buildFConstant(S64, 0.0);
1401 const auto One = B.buildFConstant(S64, 1.0);
1402 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1403 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1404 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1405 auto Add = B.buildSelect(S64, And, One, Zero);
1406
1407 // TODO: Should this propagate fast-math-flags?
1408 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1409 return true;
1410}
1411
1412static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1413 MachineIRBuilder &B) {
1414 const unsigned FractBits = 52;
1415 const unsigned ExpBits = 11;
1416 LLT S32 = LLT::scalar(32);
1417
1418 auto Const0 = B.buildConstant(S32, FractBits - 32);
1419 auto Const1 = B.buildConstant(S32, ExpBits);
1420
1421 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1422 .addUse(Const0.getReg(0))
1423 .addUse(Const1.getReg(0));
1424
1425 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1426}
1427
1428bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1429 MachineInstr &MI, MachineRegisterInfo &MRI,
1430 MachineIRBuilder &B) const {
1431 B.setInstr(MI);
1432
1433 const LLT S1 = LLT::scalar(1);
1434 const LLT S32 = LLT::scalar(32);
1435 const LLT S64 = LLT::scalar(64);
1436
1437 Register Src = MI.getOperand(1).getReg();
1438 assert(MRI.getType(Src) == S64);
1439
1440 // TODO: Should this use extract since the low half is unused?
1441 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1442 Register Hi = Unmerge.getReg(1);
1443
1444 // Extract the upper half, since this is where we will find the sign and
1445 // exponent.
1446 auto Exp = extractF64Exponent(Hi, B);
1447
1448 const unsigned FractBits = 52;
1449
1450 // Extract the sign bit.
1451 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1452 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1453
1454 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1455
1456 const auto Zero32 = B.buildConstant(S32, 0);
1457
1458 // Extend back to 64-bits.
1459 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1460
1461 auto Shr = B.buildAShr(S64, FractMask, Exp);
1462 auto Not = B.buildNot(S64, Shr);
1463 auto Tmp0 = B.buildAnd(S64, Src, Not);
1464 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1465
1466 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1467 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1468
1469 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1470 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1471 return true;
1472}
1473
1474bool AMDGPULegalizerInfo::legalizeITOFP(
1475 MachineInstr &MI, MachineRegisterInfo &MRI,
1476 MachineIRBuilder &B, bool Signed) const {
1477 B.setInstr(MI);
1478
1479 Register Dst = MI.getOperand(0).getReg();
1480 Register Src = MI.getOperand(1).getReg();
1481
1482 const LLT S64 = LLT::scalar(64);
1483 const LLT S32 = LLT::scalar(32);
1484
1485 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1486
1487 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1488
1489 auto CvtHi = Signed ?
1490 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1491 B.buildUITOFP(S64, Unmerge.getReg(1));
1492
1493 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1494
1495 auto ThirtyTwo = B.buildConstant(S32, 32);
1496 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1497 .addUse(CvtHi.getReg(0))
1498 .addUse(ThirtyTwo.getReg(0));
1499
1500 // TODO: Should this propagate fast-math-flags?
1501 B.buildFAdd(Dst, LdExp, CvtLo);
1502 MI.eraseFromParent();
1503 return true;
1504}
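
A scalar sketch of the conversion strategy above, with std::ldexp standing in for the amdgcn_ldexp intrinsic (signed case shown; two's-complement wrap of the high half is assumed):

#include <cmath>
#include <cstdint>

static double sitofpViaHalves(int64_t Src) {
  uint64_t U = static_cast<uint64_t>(Src);
  uint32_t Lo = static_cast<uint32_t>(U);       // low half, always unsigned
  int32_t Hi = static_cast<int32_t>(U >> 32);   // high half carries the sign
  return std::ldexp(static_cast<double>(Hi), 32) + static_cast<double>(Lo);
}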
1505
1506bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1507 MachineInstr &MI, MachineRegisterInfo &MRI,
1508 MachineIRBuilder &B) const {
1509 MachineFunction &MF = B.getMF();
1510 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1511
1512 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1513 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1514
1515 // With ieee_mode disabled, the instructions have the correct behavior
1516 // already for G_FMINNUM/G_FMAXNUM
1517 if (!MFI->getMode().IEEE)
1518 return !IsIEEEOp;
1519
1520 if (IsIEEEOp)
1521 return true;
1522
1523 MachineIRBuilder HelperBuilder(MI);
1524 GISelObserverWrapper DummyObserver;
1525 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1526 HelperBuilder.setInstr(MI);
1527 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1528}
1529
1530bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1531 MachineInstr &MI, MachineRegisterInfo &MRI,
1532 MachineIRBuilder &B) const {
1533 // TODO: Should move some of this into LegalizerHelper.
1534
1535 // TODO: Promote dynamic indexing of s16 to s32
1536 // TODO: Dynamic s64 indexing is only legal for SGPR.
1537 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1538 if (!IdxVal) // Dynamic case will be selected to register indexing.
1539 return true;
1540
1541 Register Dst = MI.getOperand(0).getReg();
1542 Register Vec = MI.getOperand(1).getReg();
1543
1544 LLT VecTy = MRI.getType(Vec);
1545 LLT EltTy = VecTy.getElementType();
1546 assert(EltTy == MRI.getType(Dst));
1547
1548 B.setInstr(MI);
1549
1550 if (IdxVal.getValue() < VecTy.getNumElements())
1551 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1552 else
1553 B.buildUndef(Dst);
1554
1555 MI.eraseFromParent();
1556 return true;
1557}
1558
1559bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1560 MachineInstr &MI, MachineRegisterInfo &MRI,
1561 MachineIRBuilder &B) const {
1562 // TODO: Should move some of this into LegalizerHelper.
1563
1564 // TODO: Promote dynamic indexing of s16 to s32
1565 // TODO: Dynamic s64 indexing is only legal for SGPR.
1566 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1567 if (!IdxVal) // Dynamic case will be selected to register indexing.
1568 return true;
1569
1570 Register Dst = MI.getOperand(0).getReg();
1571 Register Vec = MI.getOperand(1).getReg();
1572 Register Ins = MI.getOperand(2).getReg();
1573
1574 LLT VecTy = MRI.getType(Vec);
1575 LLT EltTy = VecTy.getElementType();
1576 assert(EltTy == MRI.getType(Ins));
1577
1578 B.setInstr(MI);
1579
1580 if (IdxVal.getValue() < VecTy.getNumElements())
1581 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1582 else
1583 B.buildUndef(Dst);
1584
1585 MI.eraseFromParent();
1586 return true;
1587}
1588
1589bool AMDGPULegalizerInfo::legalizeSinCos(
1590 MachineInstr &MI, MachineRegisterInfo &MRI,
1591 MachineIRBuilder &B) const {
1592 B.setInstr(MI);
1593
1594 Register DstReg = MI.getOperand(0).getReg();
1595 Register SrcReg = MI.getOperand(1).getReg();
1596 LLT Ty = MRI.getType(DstReg);
1597 unsigned Flags = MI.getFlags();
1598
1599 Register TrigVal;
1600 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1601 if (ST.hasTrigReducedRange()) {
1602 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1603 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1604 .addUse(MulVal.getReg(0))
1605 .setMIFlags(Flags).getReg(0);
1606 } else
1607 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1608
1609 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1610 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1611 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1612 .addUse(TrigVal)
1613 .setMIFlags(Flags);
1614 MI.eraseFromParent();
1615 return true;
1616}
1617
1618bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1619 Register DstReg, LLT PtrTy,
1620 MachineIRBuilder &B, const GlobalValue *GV,
1621 unsigned Offset, unsigned GAFlags) const {
1622 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1623 // to the following code sequence:
1624 //
1625 // For constant address space:
1626 // s_getpc_b64 s[0:1]
1627 // s_add_u32 s0, s0, $symbol
1628 // s_addc_u32 s1, s1, 0
1629 //
1630 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1631 // a fixup or relocation is emitted to replace $symbol with a literal
1632 // constant, which is a pc-relative offset from the encoding of the $symbol
1633 // operand to the global variable.
1634 //
1635 // For global address space:
1636 // s_getpc_b64 s[0:1]
1637 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1638 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1639 //
1640 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1641 // fixups or relocations are emitted to replace $symbol@*@lo and
1642 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1643 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1644 // operand to the global variable.
1645 //
1646 // What we want here is an offset from the value returned by s_getpc
1647 // (which is the address of the s_add_u32 instruction) to the global
1648 // variable, but since the encoding of $symbol starts 4 bytes after the start
1649 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1650 // small. This requires us to add 4 to the global variable offset in order to
1651 // compute the correct address.
1652
1653 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1654
1655 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1656 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1657
1658 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1659 .addDef(PCReg);
1660
1661 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1662 if (GAFlags == SIInstrInfo::MO_NONE)
1663 MIB.addImm(0);
1664 else
1665 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1666
1667 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1668
1669 if (PtrTy.getSizeInBits() == 32)
1670 B.buildExtract(DstReg, PCReg, 0);
1671 return true;
1672 }
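
The arithmetic behind the "add 4" described in the comment above, written out as a sketch (names are illustrative; the formula is the generic rel32 fixup from the comment):

#include <cstdint>

// Literal = GV + Addend - FixupAddr, where FixupAddr = PC + 4 because the
// $symbol literal begins 4 bytes into s_add_u32. The hardware then computes
// PC + Literal, so requesting Addend = Offset + 4 lands on GV + Offset.
static uint64_t resolvedAddress(uint64_t PC, uint64_t GV, uint64_t Offset) {
  uint64_t FixupAddr = PC + 4;
  uint64_t Addend = Offset + 4;                // what the builder emits above
  uint64_t Literal = GV + Addend - FixupAddr;  // what the fixup resolves to
  return PC + Literal;                         // == GV + Offset
}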
1673
1674bool AMDGPULegalizerInfo::legalizeGlobalValue(
1675 MachineInstr &MI, MachineRegisterInfo &MRI,
1676 MachineIRBuilder &B) const {
1677 Register DstReg = MI.getOperand(0).getReg();
1678 LLT Ty = MRI.getType(DstReg);
1679 unsigned AS = Ty.getAddressSpace();
1680
1681 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1682 MachineFunction &MF = B.getMF();
1683 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1684 B.setInstr(MI);
1685
1686 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1687 if (!MFI->isEntryFunction()) {
1688 const Function &Fn = MF.getFunction();
1689 DiagnosticInfoUnsupported BadLDSDecl(
1690 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1691 Fn.getContext().diagnose(BadLDSDecl);
1692 }
1693
1694 // TODO: We could emit code to handle the initialization somewhere.
1695 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1696 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1697 MI.eraseFromParent();
1698 return true;
1699 }
1700
1701 const Function &Fn = MF.getFunction();
1702 DiagnosticInfoUnsupported BadInit(
1703 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1704 Fn.getContext().diagnose(BadInit);
1705 return true;
1706 }
1707
1708 const SITargetLowering *TLI = ST.getTargetLowering();
1709
1710 if (TLI->shouldEmitFixup(GV)) {
1711 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1712 MI.eraseFromParent();
1713 return true;
1714 }
1715
1716 if (TLI->shouldEmitPCReloc(GV)) {
1717 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1718 MI.eraseFromParent();
1719 return true;
1720 }
1721
1722 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1723 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1724
1725 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1726 MachinePointerInfo::getGOT(MF),
1727 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1728 MachineMemOperand::MOInvariant,
1729 8 /*Size*/, 8 /*Align*/);
1730
1731 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1732
1733 if (Ty.getSizeInBits() == 32) {
1734 // Truncate if this is a 32-bit constant address.
1735 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1736 B.buildExtract(DstReg, Load, 0);
1737 } else
1738 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1739
1740 MI.eraseFromParent();
1741 return true;
1742}
1743
1744bool AMDGPULegalizerInfo::legalizeLoad(
1745 MachineInstr &MI, MachineRegisterInfo &MRI,
1746 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1747 B.setInstr(MI);
1748 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1749 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1750 Observer.changingInstr(MI);
1751 MI.getOperand(1).setReg(Cast.getReg(0));
1752 Observer.changedInstr(MI);
1753 return true;
1754}
1755
1756bool AMDGPULegalizerInfo::legalizeFMad(
1757 MachineInstr &MI, MachineRegisterInfo &MRI,
1758 MachineIRBuilder &B) const {
1759 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1760 assert(Ty.isScalar());
1761
1762 MachineFunction &MF = B.getMF();
1763 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764
1765 // TODO: Always legal with future ftz flag.
1766 if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1767 return true;
1768 if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1769 return true;
1770
1771
1772 MachineIRBuilder HelperBuilder(MI);
1773 GISelObserverWrapper DummyObserver;
1774 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1775 HelperBuilder.setMBB(*MI.getParent());
1776 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1777}
1778
1779bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1780 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1781 Register DstReg = MI.getOperand(0).getReg();
1782 Register PtrReg = MI.getOperand(1).getReg();
1783 Register CmpVal = MI.getOperand(2).getReg();
1784 Register NewVal = MI.getOperand(3).getReg();
1785
1786 assert(SITargetLowering::isFlatGlobalAddrSpace(
1787 MRI.getType(PtrReg).getAddressSpace()) &&
1788 "this should not have been custom lowered");
1789
1790 LLT ValTy = MRI.getType(CmpVal);
1791 LLT VecTy = LLT::vector(2, ValTy);
1792
1793 B.setInstr(MI);
1794 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1795
1796 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1797 .addDef(DstReg)
1798 .addUse(PtrReg)
1799 .addUse(PackedVal)
1800 .setMemRefs(MI.memoperands());
1801
1802 MI.eraseFromParent();
1803 return true;
1804}
1805
1806// Return the use branch instruction, otherwise null if the usage is invalid.
1807static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1808 MachineRegisterInfo &MRI) {
1809 Register CondDef = MI.getOperand(0).getReg();
1810 if (!MRI.hasOneNonDBGUse(CondDef))
1811 return nullptr;
1812
1813 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1814 return UseMI.getParent() == MI.getParent() &&
1815 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1816}
1817
1818Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1819 Register Reg, LLT Ty) const {
1820 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1821 if (LiveIn)
1822 return LiveIn;
1823
1824 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1825 MRI.addLiveIn(Reg, NewReg);
1826 return NewReg;
1827}
1828
1829bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1830 const ArgDescriptor *Arg) const {
1831 if (!Arg->isRegister() || !Arg->getRegister().isValid())
9
Taking false branch
1832 return false; // TODO: Handle these
1833
1834 assert(Arg->getRegister().isPhysical());
10
'?' condition is true
1835
1836 MachineRegisterInfo &MRI = *B.getMRI();
1837
1838 LLT Ty = MRI.getType(DstReg);
1839 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1840
1841 if (Arg->isMasked()) {
11
Calling 'ArgDescriptor::isMasked'
14
Returning from 'ArgDescriptor::isMasked'
15
Taking true branch
1842 // TODO: Should we try to emit this once in the entry block?
1843 const LLT S32 = LLT::scalar(32);
1844 const unsigned Mask = Arg->getMask();
1845 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
16
Calling 'countTrailingZeros<unsigned int>'
23
Returning from 'countTrailingZeros<unsigned int>'
24
'Shift' initialized to 32
1846
1847 Register AndMaskSrc = LiveIn;
1848
1849 if (Shift != 0) {
24.1
'Shift' is not equal to 0
25
Taking true branch
1850 auto ShiftAmt = B.buildConstant(S32, Shift);
1851 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1852 }
1853
1854 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
26
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
1855 } else
1856 B.buildCopy(DstReg, LiveIn);
1857
1858 // Insert the argument copy if it doesn't already exist.
1859 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1860 if (!MRI.getVRegDef(LiveIn)) {
1861 // FIXME: Should have scoped insert pt
1862 MachineBasicBlock &OrigInsBB = B.getMBB();
1863 auto OrigInsPt = B.getInsertPt();
1864
1865 MachineBasicBlock &EntryMBB = B.getMF().front();
1866 EntryMBB.addLiveIn(Arg->getRegister());
1867 B.setInsertPt(EntryMBB, EntryMBB.begin());
1868 B.buildCopy(LiveIn, Arg->getRegister());
1869
1870 B.setInsertPt(OrigInsBB, OrigInsPt);
1871 }
1872
1873 return true;
1874}
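
The warning at line 1854 follows directly from the path above: a descriptor whose Mask is 0 still satisfies isMasked(), countTrailingZeros then returns 32 under ZB_Width, and Mask >> Shift shifts a 32-bit value by 32. A hypothetical guard (not a change present in the source) would make the invariant explicit:

#include <cassert>

// __builtin_ctz stands in for llvm::countTrailingZeros here; the assert pins
// down the "mask selects at least one bit" assumption so the shift below can
// no longer be by 32.
static unsigned maskedFieldBits(unsigned Mask) {
  assert(Mask != 0 && "masked ArgDescriptor must select at least one bit");
  unsigned Shift = __builtin_ctz(Mask);
  return Mask >> Shift;   // well defined once Mask != 0 is established
}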
1875
1876bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1877 MachineInstr &MI,
1878 MachineRegisterInfo &MRI,
1879 MachineIRBuilder &B,
1880 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1881 B.setInstr(MI);
1882
1883 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1884
1885 const ArgDescriptor *Arg;
1886 const TargetRegisterClass *RC;
1887 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1888 if (!Arg) {
6
Assuming 'Arg' is non-null
7
Taking false branch
1889 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1890 return false;
1891 }
1892
1893 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
8
Calling 'AMDGPULegalizerInfo::loadInputValue'
1894 MI.eraseFromParent();
1895 return true;
1896 }
1897
1898 return false;
1899}
1900
1901bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1902 MachineRegisterInfo &MRI,
1903 MachineIRBuilder &B) const {
1904 B.setInstr(MI);
1905 Register Dst = MI.getOperand(0).getReg();
1906 LLT DstTy = MRI.getType(Dst);
1907 LLT S16 = LLT::scalar(16);
1908 LLT S32 = LLT::scalar(32);
1909 LLT S64 = LLT::scalar(64);
1910
1911 if (legalizeFastUnsafeFDIV(MI, MRI, B))
1912 return true;
1913
1914 if (DstTy == S16)
1915 return legalizeFDIV16(MI, MRI, B);
1916 if (DstTy == S32)
1917 return legalizeFDIV32(MI, MRI, B);
1918 if (DstTy == S64)
1919 return legalizeFDIV64(MI, MRI, B);
1920
1921 return false;
1922}
1923
1924bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1925 MachineRegisterInfo &MRI,
1926 MachineIRBuilder &B) const {
1927 Register Res = MI.getOperand(0).getReg();
1928 Register LHS = MI.getOperand(1).getReg();
1929 Register RHS = MI.getOperand(2).getReg();
1930
1931 uint16_t Flags = MI.getFlags();
1932
1933 LLT ResTy = MRI.getType(Res);
1934 LLT S32 = LLT::scalar(32);
1935 LLT S64 = LLT::scalar(64);
1936
1937 const MachineFunction &MF = B.getMF();
1938 bool Unsafe =
1939 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1940
1941 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1942 return false;
1943
1944 if (!Unsafe && ResTy == S32 &&
1945 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1946 return false;
1947
1948 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1949 // 1 / x -> RCP(x)
1950 if (CLHS->isExactlyValue(1.0)) {
1951 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1952 .addUse(RHS)
1953 .setMIFlags(Flags);
1954
1955 MI.eraseFromParent();
1956 return true;
1957 }
1958
1959 // -1 / x -> RCP( FNEG(x) )
1960 if (CLHS->isExactlyValue(-1.0)) {
1961 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1962 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1963 .addUse(FNeg.getReg(0))
1964 .setMIFlags(Flags);
1965
1966 MI.eraseFromParent();
1967 return true;
1968 }
1969 }
1970
1971 // x / y -> x * (1.0 / y)
1972 if (Unsafe) {
1973 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1974 .addUse(RHS)
1975 .setMIFlags(Flags);
1976 B.buildFMul(Res, LHS, RCP, Flags);
1977
1978 MI.eraseFromParent();
1979 return true;
1980 }
1981
1982 return false;
1983}
1984
1985bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1986 MachineRegisterInfo &MRI,
1987 MachineIRBuilder &B) const {
1988 B.setInstr(MI);
1989 Register Res = MI.getOperand(0).getReg();
1990 Register LHS = MI.getOperand(1).getReg();
1991 Register RHS = MI.getOperand(2).getReg();
1992
1993 uint16_t Flags = MI.getFlags();
1994
1995 LLT S16 = LLT::scalar(16);
1996 LLT S32 = LLT::scalar(32);
1997
1998 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1999 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2000
2001 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2002 .addUse(RHSExt.getReg(0))
2003 .setMIFlags(Flags);
2004
2005 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2006 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2007
2008 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2009 .addUse(RDst.getReg(0))
2010 .addUse(RHS)
2011 .addUse(LHS)
2012 .setMIFlags(Flags);
2013
2014 MI.eraseFromParent();
2015 return true;
2016}
2017
2018// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2019// to enable denorm mode. When 'Enable' is false, disable denorm mode.
2020static void toggleSPDenormMode(bool Enable,
2021 MachineIRBuilder &B,
2022 const GCNSubtarget &ST,
2023 AMDGPU::SIModeRegisterDefaults Mode) {
2024 // Set SP denorm mode to this value.
2025 unsigned SPDenormMode =
2026 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2027
2028 if (ST.hasDenormModeInst()) {
2029 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2030 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2031 ? FP_DENORM_FLUSH_NONE
2032 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2033
2034 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2035 B.buildInstr(AMDGPU::S_DENORM_MODE)
2036 .addImm(NewDenormModeValue);
2037
2038 } else {
2039 // Select FP32 bit field in mode register.
2040 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2041 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2042 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2043
2044 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2045 .addImm(SPDenormMode)
2046 .addImm(SPDenormModeBitField);
2047 }
2048}
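
A small sketch of the immediate packed for S_DENORM_MODE above, using the expanded values visible in the listing (FP_DENORM_FLUSH_NONE is 3, FP_DENORM_FLUSH_IN_FLUSH_OUT is 0); the field layout is read off the SPDenormMode | (DPDenormModeDefault << 2) computation:

// Bits [1:0] carry the FP32 denorm mode, bits [3:2] the preserved FP64/FP16
// default, mirroring the code above.
static unsigned denormModeImm(bool EnableFP32Denorms, bool FP64FP16Denormals) {
  unsigned SP = EnableFP32Denorms ? 3u : 0u;
  unsigned DP = FP64FP16Denormals ? 3u : 0u;
  return SP | (DP << 2);
}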
2049
2050bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2051 MachineRegisterInfo &MRI,
2052 MachineIRBuilder &B) const {
2053 B.setInstr(MI);
2054 Register Res = MI.getOperand(0).getReg();
2055 Register LHS = MI.getOperand(1).getReg();
2056 Register RHS = MI.getOperand(2).getReg();
2057 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2058 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2059
2060 uint16_t Flags = MI.getFlags();
2061
2062 LLT S32 = LLT::scalar(32);
2063 LLT S1 = LLT::scalar(1);
2064
2065 auto One = B.buildFConstant(S32, 1.0f);
2066
2067 auto DenominatorScaled =
2068 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2069 .addUse(RHS)
2070 .addUse(LHS)
2071 .addImm(1)
2072 .setMIFlags(Flags);
2073 auto NumeratorScaled =
2074 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2075 .addUse(LHS)
2076 .addUse(RHS)
2077 .addImm(0)
2078 .setMIFlags(Flags);
2079
2080 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2081 .addUse(DenominatorScaled.getReg(0))
2082 .setMIFlags(Flags);
2083 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2084
2085 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2086 // aren't modeled as reading it.
2087 if (!Mode.FP32Denormals)
2088 toggleSPDenormMode(true, B, ST, Mode);
2089
2090 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2091 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2092 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2093 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2094 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2095 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2096
2097 if (!Mode.FP32Denormals)
2098 toggleSPDenormMode(false, B, ST, Mode);
2099
2100 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2101 .addUse(Fma4.getReg(0))
2102 .addUse(Fma1.getReg(0))
2103 .addUse(Fma3.getReg(0))
2104 .addUse(NumeratorScaled.getReg(1))
2105 .setMIFlags(Flags);
2106
2107 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2108 .addUse(Fmas.getReg(0))
2109 .addUse(RHS)
2110 .addUse(LHS)
2111 .setMIFlags(Flags);
2112
2113 MI.eraseFromParent();
2114 return true;
2115}
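
A rough scalar reading of the fma chain above; the div_scale, div_fmas and div_fixup exponent handling and special-case fixups are deliberately omitted, so this is an approximation rather than the lowering itself:

#include <cmath>

static float refineQuotient(float N, float D, float ApproxRcp) {
  float Fma0 = std::fma(-D, ApproxRcp, 1.0f);        // reciprocal residual
  float Fma1 = std::fma(Fma0, ApproxRcp, ApproxRcp); // refined 1/D
  float Mul = N * Fma1;                              // first quotient estimate
  float Fma2 = std::fma(-D, Mul, N);                 // quotient residual
  float Fma3 = std::fma(Fma2, Fma1, Mul);            // refined quotient
  float Fma4 = std::fma(-D, Fma3, N);                // residual fed to div_fmas
  return std::fma(Fma4, Fma1, Fma3);                 // roughly what div_fmas computes
}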
2116
2117bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2118 MachineRegisterInfo &MRI,
2119 MachineIRBuilder &B) const {
2120 B.setInstr(MI);
2121 Register Res = MI.getOperand(0).getReg();
2122 Register LHS = MI.getOperand(1).getReg();
2123 Register RHS = MI.getOperand(2).getReg();
2124
2125 uint16_t Flags = MI.getFlags();
2126
2127 LLT S64 = LLT::scalar(64);
2128 LLT S1 = LLT::scalar(1);
2129
2130 auto One = B.buildFConstant(S64, 1.0);
2131
2132 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2133 .addUse(LHS)
2134 .addUse(RHS)
2135 .addImm(1)
2136 .setMIFlags(Flags);
2137
2138 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2139
2140 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2141 .addUse(DivScale0.getReg(0))
2142 .setMIFlags(Flags);
2143
2144 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2145 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2146 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2147
2148 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2149 .addUse(LHS)
2150 .addUse(RHS)
2151 .addImm(0)
2152 .setMIFlags(Flags);
2153
2154 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2155 auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2156 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2157
2158 Register Scale;
2159 if (!ST.hasUsableDivScaleConditionOutput()) {
2160 // Workaround a hardware bug on SI where the condition output from div_scale
2161 // is not usable.
2162
2163 Scale = MRI.createGenericVirtualRegister(S1);
2164
2165 LLT S32 = LLT::scalar(32);
2166
2167 auto NumUnmerge = B.buildUnmerge(S32, LHS);
2168 auto DenUnmerge = B.buildUnmerge(S32, RHS);
2169 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2170 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2171
2172 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2173 Scale1Unmerge.getReg(1));
2174 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2175 Scale0Unmerge.getReg(1));
2176 B.buildXor(Scale, CmpNum, CmpDen);
2177 } else {
2178 Scale = DivScale1.getReg(1);
2179 }
2180
2181 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2182 .addUse(Fma4.getReg(0))
2183 .addUse(Fma3.getReg(0))
2184 .addUse(Mul.getReg(0))
2185 .addUse(Scale)
2186 .setMIFlags(Flags);
2187
2188 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2189 .addUse(Fmas.getReg(0))
2190 .addUse(RHS)
2191 .addUse(LHS)
2192 .setMIFlags(Flags);
2193
2194 MI.eraseFromParent();
2195 return true;
2196}
2197
2198bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2199 MachineRegisterInfo &MRI,
2200 MachineIRBuilder &B) const {
2201 B.setInstr(MI);
2202 Register Res = MI.getOperand(0).getReg();
2203 Register LHS = MI.getOperand(2).getReg();
2204 Register RHS = MI.getOperand(3).getReg();
2205 uint16_t Flags = MI.getFlags();
2206
2207 LLT S32 = LLT::scalar(32);
2208 LLT S1 = LLT::scalar(1);
2209
2210 auto Abs = B.buildFAbs(S32, RHS, Flags);
2211 const APFloat C0Val(1.0f);
2212
2213 auto C0 = B.buildConstant(S32, 0x6f800000);
2214 auto C1 = B.buildConstant(S32, 0x2f800000);
2215 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2216
2217 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2218 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2219
2220 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2221
2222 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2223 .addUse(Mul0.getReg(0))
2224 .setMIFlags(Flags);
2225
2226 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2227
2228 B.buildFMul(Res, Sel, Mul1, Flags);
2229
2230 MI.eraseFromParent();
2231 return true;
2232}
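
The two magic constants above are plain float bit patterns: 0x6f800000 decodes to 2^96 and 0x2f800000 to 2^-32. A scalar sketch of the scaling trick, with 1.0f/x standing in for amdgcn_rcp:

#include <cmath>
#include <cstdint>
#include <cstring>

static float fdivFastSketch(float LHS, float RHS) {
  float C0, C1;
  const uint32_t B0 = 0x6f800000u, B1 = 0x2f800000u;
  std::memcpy(&C0, &B0, sizeof(C0));              // 2^96
  std::memcpy(&C1, &B1, sizeof(C1));              // 2^-32
  float Sel = (std::fabs(RHS) > C0) ? C1 : 1.0f;  // pre-scale huge denominators
  float Rcp = 1.0f / (RHS * Sel);
  return Sel * (LHS * Rcp);                       // re-apply the scale to the quotient
}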
2233
2234bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2235 MachineRegisterInfo &MRI,
2236 MachineIRBuilder &B) const {
2237 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2238 if (!MFI->isEntryFunction()) {
3
Assuming the condition is true
4
Taking true branch
2239 return legalizePreloadedArgIntrin(MI, MRI, B,
5
Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
2240 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2241 }
2242
2243 B.setInstr(MI);
2244
2245 uint64_t Offset =
2246 ST.getTargetLowering()->getImplicitParameterOffset(
2247 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2248 Register DstReg = MI.getOperand(0).getReg();
2249 LLT DstTy = MRI.getType(DstReg);
2250 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2251
2252 const ArgDescriptor *Arg;
2253 const TargetRegisterClass *RC;
2254 std::tie(Arg, RC)
2255 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2256 if (!Arg)
2257 return false;
2258
2259 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2260 if (!loadInputValue(KernargPtrReg, B, Arg))
2261 return false;
2262
2263 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2264 MI.eraseFromParent();
2265 return true;
2266}
2267
2268bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2269 MachineRegisterInfo &MRI,
2270 MachineIRBuilder &B,
2271 unsigned AddrSpace) const {
2272 B.setInstr(MI);
2273 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2274 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2275 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2276 MI.eraseFromParent();
2277 return true;
2278}
2279
2280/// Handle register layout difference for f16 images for some subtargets.
2281Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2282 MachineRegisterInfo &MRI,
2283 Register Reg) const {
2284 if (!ST.hasUnpackedD16VMem())
2285 return Reg;
2286
2287 const LLT S16 = LLT::scalar(16);
2288 const LLT S32 = LLT::scalar(32);
2289 LLT StoreVT = MRI.getType(Reg);
2290 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2291
2292 auto Unmerge = B.buildUnmerge(S16, Reg);
2293
2294 SmallVector<Register, 4> WideRegs;
2295 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2296 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2297
2298 int NumElts = StoreVT.getNumElements();
2299
2300 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2301}
2302
2303bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2304 MachineRegisterInfo &MRI,
2305 MachineIRBuilder &B,
2306 bool IsFormat) const {
2307 // TODO: Reject f16 format on targets where unsupported.
2308 Register VData = MI.getOperand(1).getReg();
2309 LLT Ty = MRI.getType(VData);
2310
2311 B.setInstr(MI);
2312
2313 const LLT S32 = LLT::scalar(32);
2314 const LLT S16 = LLT::scalar(16);
2315
2316 // Fixup illegal register types for i8 stores.
2317 if (Ty == LLT::scalar(8) || Ty == S16) {
2318 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2319 MI.getOperand(1).setReg(AnyExt);
2320 return true;
2321 }
2322
2323 if (Ty.isVector()) {
2324 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2325 if (IsFormat)
2326 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2327 return true;
2328 }
2329
2330 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2331 }
2332
2333 return Ty == S32;
2334}
2335
2336bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2337 MachineRegisterInfo &MRI,
2338 MachineIRBuilder &B) const {
2339 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2340 auto IntrID = MI.getIntrinsicID();
2341 switch (IntrID) {
1
Control jumps to 'case amdgcn_implicitarg_ptr:' at line 2395
2342 case Intrinsic::amdgcn_if:
2343 case Intrinsic::amdgcn_else: {
2344 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2345 const SIRegisterInfo *TRI
2346 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2347
2348 B.setInstr(*BrCond);
2349 Register Def = MI.getOperand(1).getReg();
2350 Register Use = MI.getOperand(3).getReg();
2351
2352 if (IntrID == Intrinsic::amdgcn_if) {
2353 B.buildInstr(AMDGPU::SI_IF)
2354 .addDef(Def)
2355 .addUse(Use)
2356 .addMBB(BrCond->getOperand(1).getMBB());
2357 } else {
2358 B.buildInstr(AMDGPU::SI_ELSE)
2359 .addDef(Def)
2360 .addUse(Use)
2361 .addMBB(BrCond->getOperand(1).getMBB())
2362 .addImm(0);
2363 }
2364
2365 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2366 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2367 MI.eraseFromParent();
2368 BrCond->eraseFromParent();
2369 return true;
2370 }
2371
2372 return false;
2373 }
2374 case Intrinsic::amdgcn_loop: {
2375 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2376 const SIRegisterInfo *TRI
2377 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2378
2379 B.setInstr(*BrCond);
2380 Register Reg = MI.getOperand(2).getReg();
2381 B.buildInstr(AMDGPU::SI_LOOP)
2382 .addUse(Reg)
2383 .addMBB(BrCond->getOperand(1).getMBB());
2384 MI.eraseFromParent();
2385 BrCond->eraseFromParent();
2386 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2387 return true;
2388 }
2389
2390 return false;
2391 }
2392 case Intrinsic::amdgcn_kernarg_segment_ptr:
2393 return legalizePreloadedArgIntrin(
2394 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2395 case Intrinsic::amdgcn_implicitarg_ptr:
2396 return legalizeImplicitArgPtr(MI, MRI, B);
2
Calling 'AMDGPULegalizerInfo::legalizeImplicitArgPtr'
2397 case Intrinsic::amdgcn_workitem_id_x:
2398 return legalizePreloadedArgIntrin(MI, MRI, B,
2399 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2400 case Intrinsic::amdgcn_workitem_id_y:
2401 return legalizePreloadedArgIntrin(MI, MRI, B,
2402 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2403 case Intrinsic::amdgcn_workitem_id_z:
2404 return legalizePreloadedArgIntrin(MI, MRI, B,
2405 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2406 case Intrinsic::amdgcn_workgroup_id_x:
2407 return legalizePreloadedArgIntrin(MI, MRI, B,
2408 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2409 case Intrinsic::amdgcn_workgroup_id_y:
2410 return legalizePreloadedArgIntrin(MI, MRI, B,
2411 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2412 case Intrinsic::amdgcn_workgroup_id_z:
2413 return legalizePreloadedArgIntrin(MI, MRI, B,
2414 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2415 case Intrinsic::amdgcn_dispatch_ptr:
2416 return legalizePreloadedArgIntrin(MI, MRI, B,
2417 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2418 case Intrinsic::amdgcn_queue_ptr:
2419 return legalizePreloadedArgIntrin(MI, MRI, B,
2420 AMDGPUFunctionArgInfo::QUEUE_PTR);
2421 case Intrinsic::amdgcn_implicit_buffer_ptr:
2422 return legalizePreloadedArgIntrin(
2423 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2424 case Intrinsic::amdgcn_dispatch_id:
2425 return legalizePreloadedArgIntrin(MI, MRI, B,
2426 AMDGPUFunctionArgInfo::DISPATCH_ID);
2427 case Intrinsic::amdgcn_fdiv_fast:
2428 return legalizeFDIVFastIntrin(MI, MRI, B);
2429 case Intrinsic::amdgcn_is_shared:
2430 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2431 case Intrinsic::amdgcn_is_private:
2432 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2433 case Intrinsic::amdgcn_wavefrontsize: {
2434 B.setInstr(MI);
2435 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2436 MI.eraseFromParent();
2437 return true;
2438 }
2439 case Intrinsic::amdgcn_raw_buffer_store:
2440 return legalizeRawBufferStore(MI, MRI, B, false);
2441 case Intrinsic::amdgcn_raw_buffer_store_format:
2442 return legalizeRawBufferStore(MI, MRI, B, true);
2443 default:
2444 return true;
2445 }
2446
2447 return true;
2448}

/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
12
Assuming the condition is true
13
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
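
Note that isMasked() above treats every Mask other than ~0u as masked, including 0. A hypothetical packed-field mask and the values the legalizer derives from it (plain constants, not the descriptor API):

constexpr unsigned PackedMask = 0x3ffu << 10;  // e.g. a 10-bit field at bit 10
constexpr unsigned FieldShift = 10;            // countTrailingZeros(PackedMask)
constexpr unsigned FieldBits = PackedMask >> FieldShift;
static_assert(FieldBits == 0x3ffu, "shift recovers the field width mask");
// A Mask of 0 would also read as masked, and countTrailingZeros(0) is 32 under
// ZB_Width, which is the value the analyzer tracks into the shift at line 1854.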
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181 assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include "llvm/Support/SwapByteOrder.h"
18#include <algorithm>
19#include <cassert>
20#include <climits>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
17.1
'ZB' is not equal to ZB_Undefined
18
Assuming 'Val' is equal to 0
19
Taking true branch
117 return 32;
20
Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
17
Calling 'TrailingZerosCounter::count'
21
Returning from 'TrailingZerosCounter::count'
22
Returning the value 32
161}
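
The behavior relevant to this report: with the default ZB_Width a zero input returns the type width instead of being rejected, for example:

#include "llvm/Support/MathExtras.h"
#include <cassert>

void demoCountTrailingZeros() {
  assert(llvm::countTrailingZeros(0x400u) == 10);
  assert(llvm::countTrailingZeros(0u) == 32);  // how 'Shift' becomes 32 above
  // Callers that go on to shift by the result must rule out a zero input first.
}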
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
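
For contrast, maskTrailingOnes above special-cases N == 0 precisely because T(-1) >> Bits would be the same shift-by-width problem flagged at line 1854; a quick usage check:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void demoMaskTrailingOnes() {
  assert(llvm::maskTrailingOnes<uint32_t>(0) == 0u);
  assert(llvm::maskTrailingOnes<uint32_t>(10) == 0x3ffu);
  assert(llvm::maskTrailingOnes<uint32_t>(32) == 0xffffffffu);
}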
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315// NOTE: The following support functions use the _32/_64 extensions instead of
316// type overloading so that signed and unsigned integers can be used without
317// ambiguity.
318
319/// Return the high 32 bits of a 64 bit value.
320constexpr inline uint32_t Hi_32(uint64_t Value) {
321 return static_cast<uint32_t>(Value >> 32);
322}
323
324/// Return the low 32 bits of a 64 bit value.
325constexpr inline uint32_t Lo_32(uint64_t Value) {
326 return static_cast<uint32_t>(Value);
327}
328
329/// Make a 64-bit integer from a high / low pair of 32-bit integers.
330constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
331 return ((uint64_t)High << 32) | (uint64_t)Low;
332}
333
334/// Checks if an integer fits into the given bit width.
335template <unsigned N> constexpr inline bool isInt(int64_t x) {
336 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
337}
338// Template specializations to get better code for common cases.
339template <> constexpr inline bool isInt<8>(int64_t x) {
340 return static_cast<int8_t>(x) == x;
341}
342template <> constexpr inline bool isInt<16>(int64_t x) {
343 return static_cast<int16_t>(x) == x;
344}
345template <> constexpr inline bool isInt<32>(int64_t x) {
346 return static_cast<int32_t>(x) == x;
347}
348
349/// Checks if a signed integer is an N bit number shifted left by S.
350template <unsigned N, unsigned S>
351constexpr inline bool isShiftedInt(int64_t x) {
352 static_assert(
353 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
354 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
355 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
356}
357
358/// Checks if an unsigned integer fits into the given bit width.
359///
360/// This is written as two functions rather than as simply
361///
362/// return N >= 64 || X < (UINT64_C(1) << N);
363///
364/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
365/// left too many places.
366template <unsigned N>
367constexpr inline typename std::enable_if<(N < 64), bool>::type
368isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline typename std::enable_if<N >= 64, bool>::type
374isUInt(uint64_t X) {
375 return true;
376}
377
378// Template specializations to get better code for common cases.
379template <> constexpr inline bool isUInt<8>(uint64_t x) {
380 return static_cast<uint8_t>(x) == x;
381}
382template <> constexpr inline bool isUInt<16>(uint64_t x) {
383 return static_cast<uint16_t>(x) == x;
384}
385template <> constexpr inline bool isUInt<32>(uint64_t x) {
386 return static_cast<uint32_t>(x) == x;
387}
388
389/// Checks if an unsigned integer is an N bit number shifted left by S.
390template <unsigned N, unsigned S>
391constexpr inline bool isShiftedUInt(uint64_t x) {
392 static_assert(
393 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
394 static_assert(N + S <= 64,
395 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
396 // Per the two static_asserts above, S must be strictly less than 64. So
397 // 1 << S is not undefined behavior.
398 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
399}
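
The unsigned counterparts behave analogously (same include assumption as above); for example, isShiftedUInt<12, 2> accepts the multiples of 4 up to 4095 * 4:

  #include "llvm/Support/MathExtras.h"

  static_assert(llvm::isUInt<8>(255) && !llvm::isUInt<8>(256), "uint8 range");
  static_assert(llvm::isUInt<64>(~0ULL), "any value fits in 64 bits");
  static_assert(llvm::isShiftedUInt<12, 2>(4095 * 4), "in range and aligned");
  static_assert(!llvm::isShiftedUInt<12, 2>(4095 * 4 + 2),
                "not a multiple of 1 << 2");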
400
401/// Gets the maximum value for a N-bit unsigned integer.
402inline uint64_t maxUIntN(uint64_t N) {
403 assert(N > 0 && N <= 64 && "integer width out of range");
404
405 // uint64_t(1) << 64 is undefined behavior, so we can't do
406 // (uint64_t(1) << N) - 1
407 // without checking first that N != 64. But this works and doesn't have a
408 // branch.
409 return UINT64_MAX >> (64 - N);
410}
411
412/// Gets the minimum value for a N-bit signed integer.
413inline int64_t minIntN(int64_t N) {
414 assert(N > 0 && N <= 64 && "integer width out of range");
415
416 return -(UINT64_C(1)<<(N-1));
417}
418
419/// Gets the maximum value for a N-bit signed integer.
420inline int64_t maxIntN(int64_t N) {
421 assert(N > 0 && N <= 64 && "integer width out of range");
422
423 // This relies on two's complement wraparound when N == 64, so we convert to
424 // int64_t only at the very end to avoid UB.
425 return (UINT64_C(1) << (N - 1)) - 1;
426}
427
428/// Checks if an unsigned integer fits into the given (dynamic) bit width.
429inline bool isUIntN(unsigned N, uint64_t x) {
430 return N >= 64 || x <= maxUIntN(N);
431}
432
433/// Checks if a signed integer fits into the given (dynamic) bit width.
434inline bool isIntN(unsigned N, int64_t x) {
435 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
436}
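
Unlike the templated isInt<N>/isUInt<N>, these helpers take the width as a runtime value. A brief runtime sketch (same include assumption as above):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::maxUIntN(8) == 255u);
    assert(llvm::maxUIntN(64) == UINT64_MAX);  // branch-free: UINT64_MAX >> 0
    assert(llvm::minIntN(8) == -128 && llvm::maxIntN(8) == 127);
    assert(llvm::isUIntN(12, 4095) && !llvm::isUIntN(12, 4096));
    assert(llvm::isIntN(16, -32768) && !llvm::isIntN(16, 32768));
    return 0;
  }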
437
438/// Return true if the argument is a non-empty sequence of ones starting at the
439/// least significant bit with the remainder zero (32 bit version).
440/// Ex. isMask_32(0x0000FFFFU) == true.
441constexpr inline bool isMask_32(uint32_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument is a non-empty sequence of ones starting at the
446/// least significant bit with the remainder zero (64 bit version).
447constexpr inline bool isMask_64(uint64_t Value) {
448 return Value && ((Value + 1) & Value) == 0;
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
453constexpr inline bool isShiftedMask_32(uint32_t Value) {
454 return Value && isMask_32((Value - 1) | Value);
455}
456
457/// Return true if the argument contains a non-empty sequence of ones with the
458/// remainder zero (64 bit version.)
459constexpr inline bool isShiftedMask_64(uint64_t Value) {
460 return Value && isMask_64((Value - 1) | Value);
461}
462
463/// Return true if the argument is a power of two > 0.
464/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
465constexpr inline bool isPowerOf2_32(uint32_t Value) {
466 return Value && !(Value & (Value - 1));
467}
468
469/// Return true if the argument is a power of two > 0 (64 bit edition.)
470constexpr inline bool isPowerOf2_64(uint64_t Value) {
471 return Value && !(Value & (Value - 1));
472}
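
A compile-time sketch of the mask and power-of-two predicates (same include assumption as above):

  #include "llvm/Support/MathExtras.h"

  static_assert(llvm::isMask_32(0x0000FFFFu) && !llvm::isMask_32(0x0000FF00u),
                "contiguous ones starting at bit 0");
  static_assert(llvm::isShiftedMask_32(0x0000FF00u) &&
                    !llvm::isShiftedMask_32(0x0000FF0Fu),
                "a single contiguous run of ones anywhere");
  static_assert(llvm::isPowerOf2_64(1ULL << 32) && !llvm::isPowerOf2_64(0),
                "exactly one set bit; zero is excluded");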
473
474/// Return a byte-swapped representation of the 16-bit argument.
475inline uint16_t ByteSwap_16(uint16_t Value) {
476 return sys::SwapByteOrder_16(Value);
477}
478
479/// Return a byte-swapped representation of the 32-bit argument.
480inline uint32_t ByteSwap_32(uint32_t Value) {
481 return sys::SwapByteOrder_32(Value);
482}
483
484/// Return a byte-swapped representation of the 64-bit argument.
485inline uint64_t ByteSwap_64(uint64_t Value) {
486 return sys::SwapByteOrder_64(Value);
487}
488
489/// Count the number of ones from the most significant bit to the first
490/// zero bit.
491///
492/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countLeadingZeros<T>(~Value, ZB);
503}
504
505/// Count the number of ones from the least significant bit to the first
506/// zero bit.
507///
508/// Ex. countTrailingOnes(0x00FF00FF) == 8.
509/// Only unsigned integral types are allowed.
510///
511/// \param ZB the behavior on an input of all ones. Only ZB_Width and
512/// ZB_Undefined are valid arguments.
513template <typename T>
514unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
515 static_assert(std::numeric_limits<T>::is_integer &&
516 !std::numeric_limits<T>::is_signed,
517 "Only unsigned integral types are allowed.");
518 return countTrailingZeros<T>(~Value, ZB);
519}
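
A brief runtime sketch (same include assumption as above); only unsigned operands compile, per the static_asserts, and an all-ones input yields the full width under the default ZB_Width:

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::countLeadingOnes(uint32_t(0xFF0FFF00u)) == 8);
    assert(llvm::countTrailingOnes(uint32_t(0x00FF00FFu)) == 8);
    assert(llvm::countLeadingOnes(uint32_t(~0u)) == 32);  // ZB_Width default
    return 0;
  }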
520
521namespace detail {
522template <typename T, std::size_t SizeOfT> struct PopulationCounter {
523 static unsigned count(T Value) {
524 // Generic version, forward to 32 bits.
525 static_assert(SizeOfT <= 4, "Not implemented!");
526#if defined(__GNUC__)
527 return __builtin_popcount(Value);
528#else
529 uint32_t v = Value;
530 v = v - ((v >> 1) & 0x55555555);
531 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
532 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
533#endif
534 }
535};
536
537template <typename T> struct PopulationCounter<T, 8> {
538 static unsigned count(T Value) {
539#if defined(__GNUC__)
540 return __builtin_popcountll(Value);
541#else
542 uint64_t v = Value;
543 v = v - ((v >> 1) & 0x5555555555555555ULL);
544 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
545 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
546 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
547#endif
548 }
549};
550} // namespace detail
551
552/// Count the number of set bits in a value.
553/// Ex. countPopulation(0xF000F000) = 8
554/// Returns 0 if the word is zero.
555template <typename T>
556inline unsigned countPopulation(T Value) {
557 static_assert(std::numeric_limits<T>::is_integer &&
558 !std::numeric_limits<T>::is_signed,
559 "Only unsigned integral types are allowed.");
560 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
561}
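
A brief runtime sketch (same include assumption as above):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::countPopulation(uint32_t(0xF000F000u)) == 8);
    assert(llvm::countPopulation(uint64_t(0)) == 0);
    assert(llvm::countPopulation(~uint64_t(0)) == 64);
    return 0;
  }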
562
563/// Compile time Log2.
564/// Valid only for positive powers of two.
565template <size_t kValue> constexpr inline size_t CTLog2() {
566 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
567 "Value is not a valid power of 2");
568 return 1 + CTLog2<kValue / 2>();
569}
570
571template <> constexpr inline size_t CTLog2<1>() { return 0; }
572
573/// Return the log base 2 of the specified value.
574inline double Log2(double Value) {
575#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
576 return __builtin_log(Value) / __builtin_log(2.0);
577#else
578 return log2(Value);
579#endif
580}
581
582/// Return the floor log base 2 of the specified value, -1 if the value is zero.
583/// (32 bit edition.)
584/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
585inline unsigned Log2_32(uint32_t Value) {
586 return 31 - countLeadingZeros(Value);
587}
588
589/// Return the floor log base 2 of the specified value, -1 if the value is zero.
590/// (64 bit edition.)
591inline unsigned Log2_64(uint64_t Value) {
592 return 63 - countLeadingZeros(Value);
593}
594
595/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
596/// (32 bit edition).
597/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
598inline unsigned Log2_32_Ceil(uint32_t Value) {
599 return 32 - countLeadingZeros(Value - 1);
600}
601
602/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
603/// (64 bit edition.)
604inline unsigned Log2_64_Ceil(uint64_t Value) {
605 return 64 - countLeadingZeros(Value - 1);
606}
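
A brief sketch contrasting the floor and ceiling variants, plus the compile-time CTLog2 (same include assumption as above):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::Log2_32(32) == 5 && llvm::Log2_32(6) == 2);        // floor
    assert(llvm::Log2_32_Ceil(6) == 3 && llvm::Log2_32_Ceil(32) == 5);
    assert(llvm::Log2_64(1ULL << 40) == 40);
    static_assert(llvm::CTLog2<16>() == 4, "compile-time log2 of a power of 2");
    return 0;
  }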
607
608/// Return the greatest common divisor of the values using Euclid's algorithm.
609template <typename T>
610inline T greatestCommonDivisor(T A, T B) {
611 while (B) {
612 T Tmp = B;
613 B = A % B;
614 A = Tmp;
615 }
616 return A;
617}
618
619inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
620 return greatestCommonDivisor<uint64_t>(A, B);
621}
622
623/// This function takes a 64-bit integer and returns the bit equivalent double.
624inline double BitsToDouble(uint64_t Bits) {
625 double D;
626 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
627 memcpy(&D, &Bits, sizeof(Bits));
628 return D;
629}
630
631/// This function takes a 32-bit integer and returns the bit equivalent float.
632inline float BitsToFloat(uint32_t Bits) {
633 float F;
634 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
635 memcpy(&F, &Bits, sizeof(Bits));
636 return F;
637}
638
639/// This function takes a double and returns the bit equivalent 64-bit integer.
640/// Note that copying doubles around changes the bits of NaNs on some hosts,
641/// notably x86, so this routine cannot be used if these bits are needed.
642inline uint64_t DoubleToBits(double Double) {
643 uint64_t Bits;
644 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
645 memcpy(&Bits, &Double, sizeof(Double));
646 return Bits;
647}
648
649/// This function takes a float and returns the bit equivalent 32-bit integer.
650/// Note that copying floats around changes the bits of NaNs on some hosts,
651/// notably x86, so this routine cannot be used if these bits are needed.
652inline uint32_t FloatToBits(float Float) {
653 uint32_t Bits;
654 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
655 memcpy(&Bits, &Float, sizeof(Float));
656 return Bits;
657}
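
A brief runtime sketch of the bit-pattern conversions (same include assumption as above); 0x3F800000 is the IEEE-754 single-precision encoding of 1.0f:

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::BitsToFloat(0x3F800000u) == 1.0f);
    assert(llvm::FloatToBits(1.0f) == 0x3F800000u);
    uint64_t Bits = llvm::DoubleToBits(3.25);      // round-trip a double
    assert(llvm::BitsToDouble(Bits) == 3.25);
    return 0;
  }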
658
659/// A and B are either alignments or offsets. Return the minimum alignment that
660/// may be assumed after adding the two together.
661constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
662 // The largest power of 2 that divides both A and B.
663 //
664 // Replace "-Value" by "1+~Value" in the following commented code to avoid
665 // MSVC warning C4146
666 // return (A | B) & -(A | B);
667 return (A | B) & (1 + ~(A | B));
668}
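
A compile-time sketch (same include assumption as above); the result is the largest power of two dividing both arguments:

  #include "llvm/Support/MathExtras.h"

  static_assert(llvm::MinAlign(16, 4) == 4, "common power-of-two factor");
  static_assert(llvm::MinAlign(8, 8) == 8, "identical alignments");
  static_assert(llvm::MinAlign(24, 16) == 8,
                "an offset of 24 from a 16-aligned base keeps 8-byte alignment");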
669
670/// Returns the next power of two (in 64-bits) that is strictly greater than A.
671/// Returns zero on overflow.
672inline uint64_t NextPowerOf2(uint64_t A) {
673 A |= (A >> 1);
674 A |= (A >> 2);
675 A |= (A >> 4);
676 A |= (A >> 8);
677 A |= (A >> 16);
678 A |= (A >> 32);
679 return A + 1;
680}
681
682/// Returns the power of two which is less than or equal to the given value.
683/// Essentially, it is a floor operation across the domain of powers of two.
684inline uint64_t PowerOf2Floor(uint64_t A) {
685 if (!A) return 0;
686 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
687}
688
689/// Returns the power of two which is greater than or equal to the given value.
690/// Essentially, it is a ceil operation across the domain of powers of two.
691inline uint64_t PowerOf2Ceil(uint64_t A) {
692 if (!A)
693 return 0;
694 return NextPowerOf2(A - 1);
695}
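
A brief runtime sketch distinguishing the strict and non-strict rounding helpers (same include assumption as above):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::NextPowerOf2(5) == 8);    // strictly greater than the input
    assert(llvm::NextPowerOf2(8) == 16);
    assert(llvm::PowerOf2Ceil(8) == 8);    // greater than or equal
    assert(llvm::PowerOf2Floor(9) == 8);
    assert(llvm::PowerOf2Floor(0) == 0 && llvm::PowerOf2Ceil(0) == 0);
    return 0;
  }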
696
697/// Returns the next integer (mod 2**64) that is greater than or equal to
698/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
699///
700/// If non-zero \p Skew is specified, the return value will be a minimal
701/// integer that is greater than or equal to \p Value and equal to
702/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
703/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
704///
705/// Examples:
706/// \code
707/// alignTo(5, 8) = 8
708/// alignTo(17, 8) = 24
709/// alignTo(~0LL, 8) = 0
710/// alignTo(321, 255) = 510
711///
712/// alignTo(5, 8, 7) = 7
713/// alignTo(17, 8, 1) = 17
714/// alignTo(~0LL, 8, 3) = 3
715/// alignTo(321, 255, 42) = 552
716/// \endcode
717inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
718 assert(Align != 0u && "Align can't be 0.");
719 Skew %= Align;
720 return (Value + Align - 1 - Skew) / Align * Align + Skew;
721}
722
723/// Returns the next integer (mod 2**64) that is greater than or equal to
724/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
725template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
726 static_assert(Align != 0u, "Align must be non-zero");
727 return (Value + Align - 1) / Align * Align;
728}
729
730/// Returns the integer ceil(Numerator / Denominator).
731inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
732 return alignTo(Numerator, Denominator) / Denominator;
733}
734
735/// Returns the integer nearest(Numerator / Denominator).
736inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
737 return (Numerator + (Denominator / 2)) / Denominator;
738}
739
740/// Returns the largest uint64_t less than or equal to \p Value that is
741/// \p Skew mod \p Align. \p Align must be non-zero.
742inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
743 assert(Align != 0u && "Align can't be 0.");
744 Skew %= Align;
745 return (Value - Skew) / Align * Align + Skew;
746}
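
A brief runtime sketch of the alignment and division helpers, including the skewed forms (same include assumption as above):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::alignTo(17, 8) == 24);
    assert(llvm::alignTo(17, 8, /*Skew=*/1) == 17);   // 8 * 2 + 1
    assert(llvm::alignDown(17, 8) == 16);
    assert(llvm::alignDown(17, 8, /*Skew=*/1) == 17);
    assert(llvm::divideCeil(17, 8) == 3);
    assert(llvm::divideNearest(17, 8) == 2);          // 2.125 rounds down
    return 0;
  }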
747
748/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
749/// Requires 0 < B <= 32.
750template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
751 static_assert(B > 0, "Bit width can't be 0.");
752 static_assert(B <= 32, "Bit width out of range.");
753 return int32_t(X << (32 - B)) >> (32 - B);
754}
755
756/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
757/// Requires 0 < B <= 32.
758inline int32_t SignExtend32(uint32_t X, unsigned B) {
759 assert(B > 0 && "Bit width can't be 0.");
760 assert(B <= 32 && "Bit width out of range.");
761 return int32_t(X << (32 - B)) >> (32 - B);
762}
763
764/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
765/// Requires 0 < B <= 64.
766template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
767 static_assert(B > 0, "Bit width can't be 0.");
768 static_assert(B <= 64, "Bit width out of range.");
769 return int64_t(x << (64 - B)) >> (64 - B);
770}
771
772/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
773/// Requires 0 < B <= 64.
774inline int64_t SignExtend64(uint64_t X, unsigned B) {
775 assert(B > 0 && "Bit width can't be 0.");
776 assert(B <= 64 && "Bit width out of range.");
777 return int64_t(X << (64 - B)) >> (64 - B);
778}
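
A brief sketch of sign extension from a narrow bit-field (same include assumption as above); the templated forms are constexpr, while the others take the width at runtime:

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>

  static_assert(llvm::SignExtend32<4>(0xF) == -1, "0b1111 as a 4-bit value");
  static_assert(llvm::SignExtend32<4>(0x7) == 7, "sign bit clear");
  static_assert(llvm::SignExtend64<20>(0x80000) == -524288,
                "bit 19 is the sign bit");

  int main() {
    assert(llvm::SignExtend32(0xFFu, 8) == -1);  // runtime-width variant
    return 0;
  }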
779
780/// Subtract two unsigned integers, X and Y, of type T and return the absolute
781/// value of the result.
782template <typename T>
783typename std::enable_if<std::is_unsigned<T>::value, T>::type
784AbsoluteDifference(T X, T Y) {
785 return std::max(X, Y) - std::min(X, Y);
786}
787
788/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
789/// maximum representable value of T on overflow. ResultOverflowed indicates if
790/// the result is larger than the maximum representable value of type T.
791template <typename T>
792typename std::enable_if<std::is_unsigned<T>::value, T>::type
793SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
794 bool Dummy;
795 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
796 // Hacker's Delight, p. 29
797 T Z = X + Y;
798 Overflowed = (Z < X || Z < Y);
799 if (Overflowed)
800 return std::numeric_limits<T>::max();
801 else
802 return Z;
803}
804
805/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
806/// maximum representable value of T on overflow. ResultOverflowed indicates if
807/// the result is larger than the maximum representable value of type T.
808template <typename T>
809typename std::enable_if<std::is_unsigned<T>::value, T>::type
810SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
811 bool Dummy;
812 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
813
814 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
815 // because it fails for uint16_t (where multiplication can have undefined
816 // behavior due to promotion to int), and requires a division in addition
817 // to the multiplication.
818
819 Overflowed = false;
820
821 // Log2(Z) would be either Log2Z or Log2Z + 1.
822 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
823 // will necessarily be less than Log2Max as desired.
824 int Log2Z = Log2_64(X) + Log2_64(Y);
825 const T Max = std::numeric_limits<T>::max();
826 int Log2Max = Log2_64(Max);
827 if (Log2Z < Log2Max) {
828 return X * Y;
829 }
830 if (Log2Z > Log2Max) {
831 Overflowed = true;
832 return Max;
833 }
834
835 // We're going to use the top bit, and maybe overflow one
836 // bit past it. Multiply all but the bottom bit then add
837 // that on at the end.
838 T Z = (X >> 1) * Y;
839 if (Z & ~(Max >> 1)) {
840 Overflowed = true;
841 return Max;
842 }
843 Z <<= 1;
844 if (X & 1)
845 return SaturatingAdd(Z, Y, ResultOverflowed);
846
847 return Z;
848}
849
850/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
851/// the product. Clamp the result to the maximum representable value of T on
852/// overflow. ResultOverflowed indicates if the result is larger than the
853/// maximum representable value of type T.
854template <typename T>
855typename std::enable_if<std::is_unsigned<T>::value, T>::type
856SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
857 bool Dummy;
858 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
859
860 T Product = SaturatingMultiply(X, Y, &Overflowed);
861 if (Overflowed)
862 return Product;
863
864 return SaturatingAdd(A, Product, &Overflowed);
865}
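
A brief runtime sketch of the saturating helpers (same include assumption as above); note the explicit template arguments, since plain integer literals would otherwise deduce a signed type and fail the enable_if constraint:

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>
  #include <limits>

  int main() {
    bool Overflowed = false;
    assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 &&
           Overflowed);
    assert(llvm::SaturatingMultiply<uint32_t>(1u << 16, 1u << 16, &Overflowed) ==
               std::numeric_limits<uint32_t>::max() &&
           Overflowed);
    assert(llvm::SaturatingMultiplyAdd<uint16_t>(100, 100, 1000) == 11000);
    return 0;
  }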
866
867/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
868extern const float huge_valf;
869
870
871/// Add two signed integers, computing the two's complement truncated result,
872/// returning true if overflow occurred.
873template <typename T>
874typename std::enable_if<std::is_signed<T>::value, T>::type
875AddOverflow(T X, T Y, T &Result) {
876#if __has_builtin(__builtin_add_overflow)
877 return __builtin_add_overflow(X, Y, &Result);
878#else
879 // Perform the unsigned addition.
880 using U = typename std::make_unsigned<T>::type;
881 const U UX = static_cast<U>(X);
882 const U UY = static_cast<U>(Y);
883 const U UResult = UX + UY;
884
885 // Convert to signed.
886 Result = static_cast<T>(UResult);
887
888 // Adding two positive numbers should result in a positive number.
889 if (X > 0 && Y > 0)
890 return Result <= 0;
891 // Adding two negatives should result in a negative number.
892 if (X < 0 && Y < 0)
893 return Result >= 0;
894 return false;
895#endif
896}
897
898/// Subtract two signed integers, computing the two's complement truncated
899/// result, returning true if an overflow occurred.
900template <typename T>
901typename std::enable_if<std::is_signed<T>::value, T>::type
902SubOverflow(T X, T Y, T &Result) {
903#if __has_builtin(__builtin_sub_overflow)
904 return __builtin_sub_overflow(X, Y, &Result);
905#else
906 // Perform the unsigned addition.
907 using U = typename std::make_unsigned<T>::type;
908 const U UX = static_cast<U>(X);
909 const U UY = static_cast<U>(Y);
910 const U UResult = UX - UY;
911
912 // Convert to signed.
913 Result = static_cast<T>(UResult);
914
915 // Subtracting a positive number from a negative results in a negative number.
916 if (X <= 0 && Y > 0)
917 return Result >= 0;
918 // Subtracting a negative number from a positive results in a positive number.
919 if (X >= 0 && Y < 0)
920 return Result <= 0;
921 return false;
922#endif
923}
924
925
926/// Multiply two signed integers, computing the two's complement truncated
927/// result, returning true if an overflow occurred.
928template <typename T>
929typename std::enable_if<std::is_signed<T>::value, T>::type
930MulOverflow(T X, T Y, T &Result) {
931 // Perform the unsigned multiplication on absolute values.
932 using U = typename std::make_unsigned<T>::type;
933 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
934 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
935 const U UResult = UX * UY;
936
937 // Convert to signed.
938 const bool IsNegative = (X < 0) ^ (Y < 0);
939 Result = IsNegative ? (0 - UResult) : UResult;
940
941 // If any of the args was 0, result is 0 and no overflow occurs.
942 if (UX == 0 || UY == 0)
943 return false;
944
945 // UX and UY are in [1, 2^n], where n is the number of digits.
946 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
947 // positive) divided by an argument compares to the other.
948 if (IsNegative)
949 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
950 else
951 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
952}
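
A brief runtime sketch of the checked signed arithmetic (same include assumption as above); each helper returns a nonzero value exactly when the mathematically correct result does not fit in T:

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>
  #include <limits>

  int main() {
    int32_t Result;
    assert(!llvm::AddOverflow<int32_t>(1, 2, Result) && Result == 3);
    assert(llvm::AddOverflow<int32_t>(std::numeric_limits<int32_t>::max(), 1,
                                      Result));
    assert(llvm::SubOverflow<int32_t>(std::numeric_limits<int32_t>::min(), 1,
                                      Result));
    assert(!llvm::MulOverflow<int32_t>(-4, 5, Result) && Result == -20);
    assert(llvm::MulOverflow<int32_t>(1 << 20, 1 << 20, Result));
    return 0;
  }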
953
954} // End llvm namespace
955
956#endif