Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2294, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
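
For context: in C++, shifting a 32-bit integer by 32 or more positions is undefined behavior, so the analyzer flags any path on which the shift amount can reach the full width of 'unsigned int'. A minimal standalone sketch (not taken from AMDGPULegalizerInfo.cpp) of the failure mode and one common guard:

    // Undefined behavior: when Amount == 32 the shift count equals the
    // width of 'unsigned', and C++ leaves the result undefined.
    unsigned shiftUnchecked(unsigned Value, unsigned Amount) {
      return Value >> Amount; // UB if Amount >= 32
    }

    // One possible guard (hypothetical helper, not the fix used in LLVM):
    // treat shift amounts at or beyond the bit width as "everything
    // shifted out" and return an explicit zero instead of shifting.
    unsigned shiftChecked(unsigned Value, unsigned Amount) {
      return Amount >= 32 ? 0u : Value >> Amount;
    }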

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fdenormal-fp-math=ieee,ieee -fdenormal-fp-math-f32=ieee,ieee -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62=. -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-05-084736-29294-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#if defined(_MSC_VER) || defined(__MINGW32__)
15// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16// from the Visual C++ cmath / math.h headers:
17// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18#define _USE_MATH_DEFINES
19#endif
20
21#include "AMDGPULegalizerInfo.h"
22
23#include "AMDGPU.h"
24#include "AMDGPUGlobalISelUtils.h"
25#include "AMDGPUTargetMachine.h"
26#include "SIMachineFunctionInfo.h"
27#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30#include "llvm/CodeGen/TargetOpcodes.h"
31#include "llvm/CodeGen/ValueTypes.h"
32#include "llvm/IR/DerivedTypes.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/IR/Type.h"
35#include "llvm/Support/Debug.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Round the number of elements to the next power of two elements
46static LLT getPow2VectorType(LLT Ty) {
47 unsigned NElts = Ty.getNumElements();
48 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
49 return Ty.changeNumElements(Pow2NElts);
50}
51
52// Round the number of bits to the next power of two bits
53static LLT getPow2ScalarType(LLT Ty) {
54 unsigned Bits = Ty.getSizeInBits();
55 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
56 return LLT::scalar(Pow2Bits);
57}
58
59static LegalityPredicate isMultiple32(unsigned TypeIdx,
60 unsigned MaxSize = 1024) {
61 return [=](const LegalityQuery &Query) {
62 const LLT Ty = Query.Types[TypeIdx];
63 const LLT EltTy = Ty.getScalarType();
64 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65 };
66}
67
68static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69 return [=](const LegalityQuery &Query) {
70 return Query.Types[TypeIdx].getSizeInBits() == Size;
71 };
72}
73
74static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75 return [=](const LegalityQuery &Query) {
76 const LLT Ty = Query.Types[TypeIdx];
77 return Ty.isVector() &&
78 Ty.getNumElements() % 2 != 0 &&
79 Ty.getElementType().getSizeInBits() < 32 &&
80 Ty.getSizeInBits() % 32 != 0;
81 };
82}
83
84static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85 return [=](const LegalityQuery &Query) {
86 const LLT Ty = Query.Types[TypeIdx];
87 const LLT EltTy = Ty.getScalarType();
88 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89 };
90}
91
92static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93 return [=](const LegalityQuery &Query) {
94 const LLT Ty = Query.Types[TypeIdx];
95 const LLT EltTy = Ty.getElementType();
96 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97 };
98}
99
100static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101 return [=](const LegalityQuery &Query) {
102 const LLT Ty = Query.Types[TypeIdx];
103 const LLT EltTy = Ty.getElementType();
104 unsigned Size = Ty.getSizeInBits();
105 unsigned Pieces = (Size + 63) / 64;
106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108 };
109}
110
111// Increase the number of vector elements to reach the next multiple of 32-bit
112// type.
113static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116
117 const LLT EltTy = Ty.getElementType();
118 const int Size = Ty.getSizeInBits();
119 const int EltSize = EltTy.getSizeInBits();
120 const int NextMul32 = (Size + 31) / 32;
121
122 assert(EltSize < 32);
123
124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126 };
127}
128
129static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130 return [=](const LegalityQuery &Query) {
131 const LLT QueryTy = Query.Types[TypeIdx];
132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133 };
134}
135
136static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137 return [=](const LegalityQuery &Query) {
138 const LLT QueryTy = Query.Types[TypeIdx];
139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140 };
141}
142
143static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144 return [=](const LegalityQuery &Query) {
145 const LLT QueryTy = Query.Types[TypeIdx];
146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147 };
148}
149
150// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151// v2s16.
152static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 if (Ty.isVector()) {
156 const int EltSize = Ty.getElementType().getSizeInBits();
157 return EltSize == 32 || EltSize == 64 ||
158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159 EltSize == 128 || EltSize == 256;
160 }
161
162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163 };
164}
165
166static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167 return [=](const LegalityQuery &Query) {
168 const LLT QueryTy = Query.Types[TypeIdx];
169 return QueryTy.isVector() && QueryTy.getElementType() == Type;
170 };
171}
172
173static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174 return [=](const LegalityQuery &Query) {
175 const LLT Ty = Query.Types[TypeIdx];
176 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178 };
179}
180
181static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182 return [=](const LegalityQuery &Query) {
183 return Query.Types[TypeIdx0].getSizeInBits() <
184 Query.Types[TypeIdx1].getSizeInBits();
185 };
186}
187
188static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189 return [=](const LegalityQuery &Query) {
190 return Query.Types[TypeIdx0].getSizeInBits() >
191 Query.Types[TypeIdx1].getSizeInBits();
192 };
193}
194
195AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196 const GCNTargetMachine &TM)
197 : ST(ST_) {
198 using namespace TargetOpcode;
199
200 auto GetAddrSpacePtr = [&TM](unsigned AS) {
201 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202 };
203
204 const LLT S1 = LLT::scalar(1);
205 const LLT S16 = LLT::scalar(16);
206 const LLT S32 = LLT::scalar(32);
207 const LLT S64 = LLT::scalar(64);
208 const LLT S128 = LLT::scalar(128);
209 const LLT S256 = LLT::scalar(256);
210 const LLT S1024 = LLT::scalar(1024);
211
212 const LLT V2S16 = LLT::vector(2, 16);
213 const LLT V4S16 = LLT::vector(4, 16);
214
215 const LLT V2S32 = LLT::vector(2, 32);
216 const LLT V3S32 = LLT::vector(3, 32);
217 const LLT V4S32 = LLT::vector(4, 32);
218 const LLT V5S32 = LLT::vector(5, 32);
219 const LLT V6S32 = LLT::vector(6, 32);
220 const LLT V7S32 = LLT::vector(7, 32);
221 const LLT V8S32 = LLT::vector(8, 32);
222 const LLT V9S32 = LLT::vector(9, 32);
223 const LLT V10S32 = LLT::vector(10, 32);
224 const LLT V11S32 = LLT::vector(11, 32);
225 const LLT V12S32 = LLT::vector(12, 32);
226 const LLT V13S32 = LLT::vector(13, 32);
227 const LLT V14S32 = LLT::vector(14, 32);
228 const LLT V15S32 = LLT::vector(15, 32);
229 const LLT V16S32 = LLT::vector(16, 32);
230 const LLT V32S32 = LLT::vector(32, 32);
231
232 const LLT V2S64 = LLT::vector(2, 64);
233 const LLT V3S64 = LLT::vector(3, 64);
234 const LLT V4S64 = LLT::vector(4, 64);
235 const LLT V5S64 = LLT::vector(5, 64);
236 const LLT V6S64 = LLT::vector(6, 64);
237 const LLT V7S64 = LLT::vector(7, 64);
238 const LLT V8S64 = LLT::vector(8, 64);
239 const LLT V16S64 = LLT::vector(16, 64);
240
241 std::initializer_list<LLT> AllS32Vectors =
242 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244 std::initializer_list<LLT> AllS64Vectors =
245 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246
247 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254
255 const LLT CodePtr = FlatPtr;
256
257 const std::initializer_list<LLT> AddrSpaces64 = {
258 GlobalPtr, ConstantPtr, FlatPtr
259 };
260
261 const std::initializer_list<LLT> AddrSpaces32 = {
262 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263 };
264
265 const std::initializer_list<LLT> FPTypesBase = {
266 S32, S64
267 };
268
269 const std::initializer_list<LLT> FPTypes16 = {
270 S32, S64, S16
271 };
272
273 const std::initializer_list<LLT> FPTypesPK16 = {
274 S32, S64, S16, V2S16
275 };
276
277 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278
279 setAction({G_BRCOND, S1}, Legal); // VCC branches
280 setAction({G_BRCOND, S32}, Legal); // SCC branches
281
282 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283 // elements for v3s16
284 getActionDefinitionsBuilder(G_PHI)
285 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286 .legalFor(AllS32Vectors)
287 .legalFor(AllS64Vectors)
288 .legalFor(AddrSpaces64)
289 .legalFor(AddrSpaces32)
290 .clampScalar(0, S32, S256)
291 .widenScalarToNextPow2(0, 32)
292 .clampMaxNumElements(0, S32, 16)
293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294 .legalIf(isPointer(0));
295
296 if (ST.hasVOP3PInsts()) {
297 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298 .legalFor({S32, S16, V2S16})
299 .clampScalar(0, S16, S32)
300 .clampMaxNumElements(0, S16, 2)
301 .scalarize(0)
302 .widenScalarToNextPow2(0, 32);
303 } else if (ST.has16BitInsts()) {
304 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
305 .legalFor({S32, S16})
306 .clampScalar(0, S16, S32)
307 .scalarize(0)
308 .widenScalarToNextPow2(0, 32);
309 } else {
310 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
311 .legalFor({S32})
312 .clampScalar(0, S32, S32)
313 .scalarize(0);
314 }
315
316 // FIXME: Not really legal. Placeholder for custom lowering.
317 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
318 .customFor({S32, S64})
319 .clampScalar(0, S32, S64)
320 .widenScalarToNextPow2(0, 32)
321 .scalarize(0);
322
323 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
324 .legalFor({S32})
325 .clampScalar(0, S32, S32)
326 .scalarize(0);
327
328 // Report legal for any types we can handle anywhere. For the cases only legal
329 // on the SALU, RegBankSelect will be able to re-legalize.
330 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
331 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
332 .clampScalar(0, S32, S64)
333 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
334 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
335 .widenScalarToNextPow2(0)
336 .scalarize(0);
337
338 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
339 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
340 .legalFor({{S32, S1}, {S32, S32}})
341 .minScalar(0, S32)
342 // TODO: .scalarize(0)
343 .lower();
344
345 getActionDefinitionsBuilder(G_BITCAST)
346 // Don't worry about the size constraint.
347 .legalIf(all(isRegisterType(0), isRegisterType(1)))
348 .lower();
349
350
351 getActionDefinitionsBuilder(G_CONSTANT)
352 .legalFor({S1, S32, S64, S16, GlobalPtr,
353 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
354 .clampScalar(0, S32, S64)
355 .widenScalarToNextPow2(0)
356 .legalIf(isPointer(0));
357
358 getActionDefinitionsBuilder(G_FCONSTANT)
359 .legalFor({S32, S64, S16})
360 .clampScalar(0, S16, S64);
361
362 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
363 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
364 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366 .clampScalarOrElt(0, S32, S1024)
367 .legalIf(isMultiple32(0))
368 .widenScalarToNextPow2(0, 32)
369 .clampMaxNumElements(0, S32, 16);
370
371 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
372 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
373 .unsupportedFor({PrivatePtr})
374 .custom();
375 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
376
377 auto &FPOpActions = getActionDefinitionsBuilder(
378 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
379 .legalFor({S32, S64});
380 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
381 .customFor({S32, S64});
382 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
383 .customFor({S32, S64});
384
385 if (ST.has16BitInsts()) {
386 if (ST.hasVOP3PInsts())
387 FPOpActions.legalFor({S16, V2S16});
388 else
389 FPOpActions.legalFor({S16});
390
391 TrigActions.customFor({S16});
392 FDIVActions.customFor({S16});
393 }
394
395 auto &MinNumMaxNum = getActionDefinitionsBuilder({
396 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
397
398 if (ST.hasVOP3PInsts()) {
399 MinNumMaxNum.customFor(FPTypesPK16)
400 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
401 .clampMaxNumElements(0, S16, 2)
402 .clampScalar(0, S16, S64)
403 .scalarize(0);
404 } else if (ST.has16BitInsts()) {
405 MinNumMaxNum.customFor(FPTypes16)
406 .clampScalar(0, S16, S64)
407 .scalarize(0);
408 } else {
409 MinNumMaxNum.customFor(FPTypesBase)
410 .clampScalar(0, S32, S64)
411 .scalarize(0);
412 }
413
414 if (ST.hasVOP3PInsts())
415 FPOpActions.clampMaxNumElements(0, S16, 2);
416
417 FPOpActions
418 .scalarize(0)
419 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
420
421 TrigActions
422 .scalarize(0)
423 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
424
425 FDIVActions
426 .scalarize(0)
427 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
428
429 getActionDefinitionsBuilder({G_FNEG, G_FABS})
430 .legalFor(FPTypesPK16)
431 .clampMaxNumElements(0, S16, 2)
432 .scalarize(0)
433 .clampScalar(0, S16, S64);
434
435 if (ST.has16BitInsts()) {
436 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
437 .legalFor({S32, S64, S16})
438 .scalarize(0)
439 .clampScalar(0, S16, S64);
440 } else {
441 getActionDefinitionsBuilder(G_FSQRT)
442 .legalFor({S32, S64})
443 .scalarize(0)
444 .clampScalar(0, S32, S64);
445
446 if (ST.hasFractBug()) {
447 getActionDefinitionsBuilder(G_FFLOOR)
448 .customFor({S64})
449 .legalFor({S32, S64})
450 .scalarize(0)
451 .clampScalar(0, S32, S64);
452 } else {
453 getActionDefinitionsBuilder(G_FFLOOR)
454 .legalFor({S32, S64})
455 .scalarize(0)
456 .clampScalar(0, S32, S64);
457 }
458 }
459
460 getActionDefinitionsBuilder(G_FPTRUNC)
461 .legalFor({{S32, S64}, {S16, S32}})
462 .scalarize(0)
463 .lower();
464
465 getActionDefinitionsBuilder(G_FPEXT)
466 .legalFor({{S64, S32}, {S32, S16}})
467 .lowerFor({{S64, S16}}) // FIXME: Implement
468 .scalarize(0);
469
470 getActionDefinitionsBuilder(G_FSUB)
471 // Use actual fsub instruction
472 .legalFor({S32})
473 // Must use fadd + fneg
474 .lowerFor({S64, S16, V2S16})
475 .scalarize(0)
476 .clampScalar(0, S32, S64);
477
478 // Whether this is legal depends on the floating point mode for the function.
479 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
480 if (ST.hasMadF16())
481 FMad.customFor({S32, S16});
482 else
483 FMad.customFor({S32});
484 FMad.scalarize(0)
485 .lower();
486
487 getActionDefinitionsBuilder(G_TRUNC)
488 .alwaysLegal();
489
490 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
491 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
492 {S32, S1}, {S64, S1}, {S16, S1}})
493 .scalarize(0)
494 .clampScalar(0, S32, S64)
495 .widenScalarToNextPow2(1, 32);
496
497 // TODO: Split s1->s64 during regbankselect for VALU.
498 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
499 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
500 .lowerFor({{S32, S64}})
501 .lowerIf(typeIs(1, S1))
502 .customFor({{S64, S64}});
503 if (ST.has16BitInsts())
504 IToFP.legalFor({{S16, S16}});
505 IToFP.clampScalar(1, S32, S64)
506 .scalarize(0)
507 .widenScalarToNextPow2(1);
508
509 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
510 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
511 .customFor({{S64, S64}});
512 if (ST.has16BitInsts())
513 FPToI.legalFor({{S16, S16}});
514 else
515 FPToI.minScalar(1, S32);
516
517 FPToI.minScalar(0, S32)
518 .scalarize(0)
519 .lower();
520
521 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
522 .scalarize(0)
523 .lower();
524
525 if (ST.has16BitInsts()) {
526 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
527 .legalFor({S16, S32, S64})
528 .clampScalar(0, S16, S64)
529 .scalarize(0);
530 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
531 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
532 .legalFor({S32, S64})
533 .clampScalar(0, S32, S64)
534 .scalarize(0);
535 } else {
536 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
537 .legalFor({S32})
538 .customFor({S64})
539 .clampScalar(0, S32, S64)
540 .scalarize(0);
541 }
542
543 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
544 .scalarize(0)
545 .alwaysLegal();
546
547 auto &CmpBuilder =
548 getActionDefinitionsBuilder(G_ICMP)
549 // The compare output type differs based on the register bank of the output,
550 // so make both s1 and s32 legal.
551 //
552 // Scalar compares producing output in scc will be promoted to s32, as that
553 // is the allocatable register type that will be needed for the copy from
554 // scc. This will be promoted during RegBankSelect, and we assume something
555 // before that won't try to use s32 result types.
556 //
557 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
558 // bank.
559 .legalForCartesianProduct(
560 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
561 .legalForCartesianProduct(
562 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
563 if (ST.has16BitInsts()) {
564 CmpBuilder.legalFor({{S1, S16}});
565 }
566
567 CmpBuilder
568 .widenScalarToNextPow2(1)
569 .clampScalar(1, S32, S64)
570 .scalarize(0)
571 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
572
573 getActionDefinitionsBuilder(G_FCMP)
574 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
575 .widenScalarToNextPow2(1)
576 .clampScalar(1, S32, S64)
577 .scalarize(0);
578
579 // FIXME: fpow has a selection pattern that should move to custom lowering.
580 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
581 if (ST.has16BitInsts())
582 Exp2Ops.legalFor({S32, S16});
583 else
584 Exp2Ops.legalFor({S32});
585 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
586 Exp2Ops.scalarize(0);
587
588 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
589 if (ST.has16BitInsts())
590 ExpOps.customFor({{S32}, {S16}});
591 else
592 ExpOps.customFor({S32});
593 ExpOps.clampScalar(0, MinScalarFPTy, S32)
594 .scalarize(0);
595
596 // The 64-bit versions produce 32-bit results, but only on the SALU.
597 getActionDefinitionsBuilder(G_CTPOP)
598 .legalFor({{S32, S32}, {S32, S64}})
599 .clampScalar(0, S32, S32)
600 .clampScalar(1, S32, S64)
601 .scalarize(0)
602 .widenScalarToNextPow2(0, 32)
603 .widenScalarToNextPow2(1, 32);
604
605 // The hardware instructions return a different result on 0 than the generic
606 // instructions expect. The hardware produces -1, but these produce the
607 // bitwidth.
608 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
609 .scalarize(0)
610 .clampScalar(0, S32, S32)
611 .clampScalar(1, S32, S64)
612 .widenScalarToNextPow2(0, 32)
613 .widenScalarToNextPow2(1, 32)
614 .lower();
615
616 // The 64-bit versions produce 32-bit results, but only on the SALU.
617 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
618 .legalFor({{S32, S32}, {S32, S64}})
619 .clampScalar(0, S32, S32)
620 .clampScalar(1, S32, S64)
621 .scalarize(0)
622 .widenScalarToNextPow2(0, 32)
623 .widenScalarToNextPow2(1, 32);
624
625 getActionDefinitionsBuilder(G_BITREVERSE)
626 .legalFor({S32})
627 .clampScalar(0, S32, S32)
628 .scalarize(0);
629
630 if (ST.has16BitInsts()) {
631 getActionDefinitionsBuilder(G_BSWAP)
632 .legalFor({S16, S32, V2S16})
633 .clampMaxNumElements(0, S16, 2)
634 // FIXME: Fixing non-power-of-2 before clamp is workaround for
635 // narrowScalar limitation.
636 .widenScalarToNextPow2(0)
637 .clampScalar(0, S16, S32)
638 .scalarize(0);
639
640 if (ST.hasVOP3PInsts()) {
641 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642 .legalFor({S32, S16, V2S16})
643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644 .clampMaxNumElements(0, S16, 2)
645 .minScalar(0, S16)
646 .widenScalarToNextPow2(0)
647 .scalarize(0)
648 .lower();
649 } else {
650 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
651 .legalFor({S32, S16})
652 .widenScalarToNextPow2(0)
653 .minScalar(0, S16)
654 .scalarize(0)
655 .lower();
656 }
657 } else {
658 // TODO: Should have same legality without v_perm_b32
659 getActionDefinitionsBuilder(G_BSWAP)
660 .legalFor({S32})
661 .lowerIf(narrowerThan(0, 32))
662 // FIXME: Fixing non-power-of-2 before clamp is workaround for
663 // narrowScalar limitation.
664 .widenScalarToNextPow2(0)
665 .maxScalar(0, S32)
666 .scalarize(0)
667 .lower();
668
669 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
670 .legalFor({S32})
671 .minScalar(0, S32)
672 .widenScalarToNextPow2(0)
673 .scalarize(0)
674 .lower();
675 }
676
677 getActionDefinitionsBuilder(G_INTTOPTR)
678 // List the common cases
679 .legalForCartesianProduct(AddrSpaces64, {S64})
680 .legalForCartesianProduct(AddrSpaces32, {S32})
681 .scalarize(0)
682 // Accept any address space as long as the size matches
683 .legalIf(sameSize(0, 1))
684 .widenScalarIf(smallerThan(1, 0),
685 [](const LegalityQuery &Query) {
686 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
687 })
688 .narrowScalarIf(greaterThan(1, 0),
689 [](const LegalityQuery &Query) {
690 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
691 });
692
693 getActionDefinitionsBuilder(G_PTRTOINT)
694 // List the common cases
695 .legalForCartesianProduct(AddrSpaces64, {S64})
696 .legalForCartesianProduct(AddrSpaces32, {S32})
697 .scalarize(0)
698 // Accept any address space as long as the size matches
699 .legalIf(sameSize(0, 1))
700 .widenScalarIf(smallerThan(0, 1),
701 [](const LegalityQuery &Query) {
702 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
703 })
704 .narrowScalarIf(
705 greaterThan(0, 1),
706 [](const LegalityQuery &Query) {
707 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
708 });
709
710 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
711 .scalarize(0)
712 .custom();
713
714 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
715 // handle some operations by just promoting the register during
716 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
717 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
718 switch (AS) {
719 // FIXME: Private element size.
720 case AMDGPUAS::PRIVATE_ADDRESS:
721 return 32;
722 // FIXME: Check subtarget
723 case AMDGPUAS::LOCAL_ADDRESS:
724 return ST.useDS128() ? 128 : 64;
725
726 // Treat constant and global as identical. SMRD loads are sometimes usable
727 // for global loads (ideally constant address space should be eliminated)
728 // depending on the context. Legality cannot be context dependent, but
729 // RegBankSelect can split the load as necessary depending on the pointer
730 // register bank/uniformity and if the memory is invariant or not written in
731 // a kernel.
732 case AMDGPUAS::CONSTANT_ADDRESS:
733 case AMDGPUAS::GLOBAL_ADDRESS:
734 return IsLoad ? 512 : 128;
735 default:
736 return 128;
737 }
738 };
739
740 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
741 bool IsLoad) -> bool {
742 const LLT DstTy = Query.Types[0];
743
744 // Split vector extloads.
745 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
746 unsigned Align = Query.MMODescrs[0].AlignInBits;
747
748 if (MemSize < DstTy.getSizeInBits())
749 MemSize = std::max(MemSize, Align);
750
751 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
752 return true;
753
754 const LLT PtrTy = Query.Types[1];
755 unsigned AS = PtrTy.getAddressSpace();
756 if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
757 return true;
758
759 // Catch weird sized loads that don't evenly divide into the access sizes
760 // TODO: May be able to widen depending on alignment etc.
761 unsigned NumRegs = (MemSize + 31) / 32;
762 if (NumRegs == 3) {
763 if (!ST.hasDwordx3LoadStores())
764 return true;
765 } else {
766 // If the alignment allows, these should have been widened.
767 if (!isPowerOf2_32(NumRegs))
768 return true;
769 }
770
771 if (Align < MemSize) {
772 const SITargetLowering *TLI = ST.getTargetLowering();
773 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
774 }
775
776 return false;
777 };
778
779 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
780 unsigned Size = Query.Types[0].getSizeInBits();
781 if (isPowerOf2_32(Size))
782 return false;
783
784 if (Size == 96 && ST.hasDwordx3LoadStores())
785 return false;
786
787 unsigned AddrSpace = Query.Types[1].getAddressSpace();
788 if (Size >= maxSizeForAddrSpace(AddrSpace, true))
789 return false;
790
791 unsigned Align = Query.MMODescrs[0].AlignInBits;
792 unsigned RoundedSize = NextPowerOf2(Size);
793 return (Align >= RoundedSize);
794 };
795
796 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
797 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
798 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
799
800 // TODO: Refine based on subtargets which support unaligned access or 128-bit
801 // LDS
802 // TODO: Unsupported flat for SI.
803
804 for (unsigned Op : {G_LOAD, G_STORE}) {
805 const bool IsStore = Op == G_STORE;
806
807 auto &Actions = getActionDefinitionsBuilder(Op);
808 // Whitelist the common cases.
809 // TODO: Loads to s16 on gfx9
810 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
811 {V2S32, GlobalPtr, 64, GlobalAlign32},
812 {V4S32, GlobalPtr, 128, GlobalAlign32},
813 {S128, GlobalPtr, 128, GlobalAlign32},
814 {S64, GlobalPtr, 64, GlobalAlign32},
815 {V2S64, GlobalPtr, 128, GlobalAlign32},
816 {V2S16, GlobalPtr, 32, GlobalAlign32},
817 {S32, GlobalPtr, 8, GlobalAlign8},
818 {S32, GlobalPtr, 16, GlobalAlign16},
819
820 {S32, LocalPtr, 32, 32},
821 {S64, LocalPtr, 64, 32},
822 {V2S32, LocalPtr, 64, 32},
823 {S32, LocalPtr, 8, 8},
824 {S32, LocalPtr, 16, 16},
825 {V2S16, LocalPtr, 32, 32},
826
827 {S32, PrivatePtr, 32, 32},
828 {S32, PrivatePtr, 8, 8},
829 {S32, PrivatePtr, 16, 16},
830 {V2S16, PrivatePtr, 32, 32},
831
832 {S32, FlatPtr, 32, GlobalAlign32},
833 {S32, FlatPtr, 16, GlobalAlign16},
834 {S32, FlatPtr, 8, GlobalAlign8},
835 {V2S16, FlatPtr, 32, GlobalAlign32},
836
837 {S32, ConstantPtr, 32, GlobalAlign32},
838 {V2S32, ConstantPtr, 64, GlobalAlign32},
839 {V4S32, ConstantPtr, 128, GlobalAlign32},
840 {S64, ConstantPtr, 64, GlobalAlign32},
841 {S128, ConstantPtr, 128, GlobalAlign32},
842 {V2S32, ConstantPtr, 32, GlobalAlign32}});
843 Actions
844 .customIf(typeIs(1, Constant32Ptr))
845 // Widen suitably aligned loads by loading extra elements.
846 .moreElementsIf([=](const LegalityQuery &Query) {
847 const LLT Ty = Query.Types[0];
848 return Op == G_LOAD && Ty.isVector() &&
849 shouldWidenLoadResult(Query);
850 }, moreElementsToNextPow2(0))
851 .widenScalarIf([=](const LegalityQuery &Query) {
852 const LLT Ty = Query.Types[0];
853 return Op == G_LOAD && !Ty.isVector() &&
854 shouldWidenLoadResult(Query);
855 }, widenScalarOrEltToNextPow2(0))
856 .narrowScalarIf(
857 [=](const LegalityQuery &Query) -> bool {
858 return !Query.Types[0].isVector() &&
859 needToSplitMemOp(Query, Op == G_LOAD);
860 },
861 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
862 const LLT DstTy = Query.Types[0];
863 const LLT PtrTy = Query.Types[1];
864
865 const unsigned DstSize = DstTy.getSizeInBits();
866 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
867
868 // Split extloads.
869 if (DstSize > MemSize)
870 return std::make_pair(0, LLT::scalar(MemSize));
871
872 if (!isPowerOf2_32(DstSize)) {
873 // We're probably decomposing an odd sized store. Try to split
874 // to the widest type. TODO: Account for alignment. As-is it
875 // should be OK, since the new parts will be further legalized.
876 unsigned FloorSize = PowerOf2Floor(DstSize);
877 return std::make_pair(0, LLT::scalar(FloorSize));
878 }
879
880 if (DstSize > 32 && (DstSize % 32 != 0)) {
881 // FIXME: Need a way to specify non-extload of larger size if
882 // suitably aligned.
883 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
884 }
885
886 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
887 Op == G_LOAD);
888 if (MemSize > MaxSize)
889 return std::make_pair(0, LLT::scalar(MaxSize));
890
891 unsigned Align = Query.MMODescrs[0].AlignInBits;
892 return std::make_pair(0, LLT::scalar(Align));
893 })
894 .fewerElementsIf(
895 [=](const LegalityQuery &Query) -> bool {
896 return Query.Types[0].isVector() &&
897 needToSplitMemOp(Query, Op == G_LOAD);
898 },
899 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
900 const LLT DstTy = Query.Types[0];
901 const LLT PtrTy = Query.Types[1];
902
903 LLT EltTy = DstTy.getElementType();
904 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
905 Op == G_LOAD);
906
907 // FIXME: Handle widened to power of 2 results better. This ends
908 // up scalarizing.
909 // FIXME: 3 element stores scalarized on SI
910
911 // Split if it's too large for the address space.
912 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
913 unsigned NumElts = DstTy.getNumElements();
914 unsigned EltSize = EltTy.getSizeInBits();
915
916 if (MaxSize % EltSize == 0) {
917 return std::make_pair(
918 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
919 }
920
921 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
922
923 // FIXME: Refine when odd breakdowns handled
924 // The scalars will need to be re-legalized.
925 if (NumPieces == 1 || NumPieces >= NumElts ||
926 NumElts % NumPieces != 0)
927 return std::make_pair(0, EltTy);
928
929 return std::make_pair(0,
930 LLT::vector(NumElts / NumPieces, EltTy));
931 }
932
933 // FIXME: We could probably handle weird extending loads better.
934 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
935 if (DstTy.getSizeInBits() > MemSize)
936 return std::make_pair(0, EltTy);
937
938 unsigned EltSize = EltTy.getSizeInBits();
939 unsigned DstSize = DstTy.getSizeInBits();
940 if (!isPowerOf2_32(DstSize)) {
941 // We're probably decomposing an odd sized store. Try to split
942 // to the widest type. TODO: Account for alignment. As-is it
943 // should be OK, since the new parts will be further legalized.
944 unsigned FloorSize = PowerOf2Floor(DstSize);
945 return std::make_pair(
946 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
947 }
948
949 // Need to split because of alignment.
950 unsigned Align = Query.MMODescrs[0].AlignInBits;
951 if (EltSize > Align &&
952 (EltSize / Align < DstTy.getNumElements())) {
953 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
954 }
955
956 // May need relegalization for the scalars.
957 return std::make_pair(0, EltTy);
958 })
959 .minScalar(0, S32);
960
961 if (IsStore)
962 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
963
964 // TODO: Need a bitcast lower option?
965 Actions
966 .legalIf([=](const LegalityQuery &Query) {
967 const LLT Ty0 = Query.Types[0];
968 unsigned Size = Ty0.getSizeInBits();
969 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
970 unsigned Align = Query.MMODescrs[0].AlignInBits;
971
972 // FIXME: Widening store from alignment not valid.
973 if (MemSize < Size)
974 MemSize = std::max(MemSize, Align);
975
976 // No extending vector loads.
977 if (Size > MemSize && Ty0.isVector())
978 return false;
979
980 switch (MemSize) {
981 case 8:
982 case 16:
983 return Size == 32;
984 case 32:
985 case 64:
986 case 128:
987 return true;
988 case 96:
989 return ST.hasDwordx3LoadStores();
990 case 256:
991 case 512:
992 return true;
993 default:
994 return false;
995 }
996 })
997 .widenScalarToNextPow2(0)
998 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
999 }
1000
1001 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1002 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1003 {S32, GlobalPtr, 16, 2 * 8},
1004 {S32, LocalPtr, 8, 8},
1005 {S32, LocalPtr, 16, 16},
1006 {S32, PrivatePtr, 8, 8},
1007 {S32, PrivatePtr, 16, 16},
1008 {S32, ConstantPtr, 8, 8},
1009 {S32, ConstantPtr, 16, 2 * 8}});
1010 if (ST.hasFlatAddressSpace()) {
1011 ExtLoads.legalForTypesWithMemDesc(
1012 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1013 }
1014
1015 ExtLoads.clampScalar(0, S32, S32)
1016 .widenScalarToNextPow2(0)
1017 .unsupportedIfMemSizeNotPow2()
1018 .lower();
1019
1020 auto &Atomics = getActionDefinitionsBuilder(
1021 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1022 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1023 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1024 G_ATOMICRMW_UMIN})
1025 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1026 {S64, GlobalPtr}, {S64, LocalPtr}});
1027 if (ST.hasFlatAddressSpace()) {
1028 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1029 }
1030
1031 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1032 .legalFor({{S32, LocalPtr}});
1033
1034 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1035 // demarshalling
1036 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1037 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1038 {S32, FlatPtr}, {S64, FlatPtr}})
1039 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1040 {S32, RegionPtr}, {S64, RegionPtr}});
1041 // TODO: Pointer types, any 32-bit or 64-bit vector
1042
1043 // Condition should be s32 for scalar, s1 for vector.
1044 getActionDefinitionsBuilder(G_SELECT)
1045 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1046 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1047 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1048 .clampScalar(0, S16, S64)
1049 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1050 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1051 .scalarize(1)
1052 .clampMaxNumElements(0, S32, 2)
1053 .clampMaxNumElements(0, LocalPtr, 2)
1054 .clampMaxNumElements(0, PrivatePtr, 2)
1055 .scalarize(0)
1056 .widenScalarToNextPow2(0)
1057 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1058
1059 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1060 // be more flexible with the shift amount type.
1061 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1062 .legalFor({{S32, S32}, {S64, S32}});
1063 if (ST.has16BitInsts()) {
1064 if (ST.hasVOP3PInsts()) {
1065 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1066 .clampMaxNumElements(0, S16, 2);
1067 } else
1068 Shifts.legalFor({{S16, S32}, {S16, S16}});
1069
1070 // TODO: Support 16-bit shift amounts
1071 Shifts.clampScalar(1, S32, S32);
1072 Shifts.clampScalar(0, S16, S64);
1073 Shifts.widenScalarToNextPow2(0, 16);
1074 } else {
1075 // Make sure we legalize the shift amount type first, as the general
1076 // expansion for the shifted type will produce much worse code if it hasn't
1077 // been truncated already.
1078 Shifts.clampScalar(1, S32, S32);
1079 Shifts.clampScalar(0, S32, S64);
1080 Shifts.widenScalarToNextPow2(0, 32);
1081 }
1082 Shifts.scalarize(0);
1083
1084 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1085 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1086 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1087 unsigned IdxTypeIdx = 2;
1088
1089 getActionDefinitionsBuilder(Op)
1090 .customIf([=](const LegalityQuery &Query) {
1091 const LLT EltTy = Query.Types[EltTypeIdx];
1092 const LLT VecTy = Query.Types[VecTypeIdx];
1093 const LLT IdxTy = Query.Types[IdxTypeIdx];
1094 return (EltTy.getSizeInBits() == 16 ||
1095 EltTy.getSizeInBits() % 32 == 0) &&
1096 VecTy.getSizeInBits() % 32 == 0 &&
1097 VecTy.getSizeInBits() <= 1024 &&
1098 IdxTy.getSizeInBits() == 32;
1099 })
1100 .clampScalar(EltTypeIdx, S32, S64)
1101 .clampScalar(VecTypeIdx, S32, S64)
1102 .clampScalar(IdxTypeIdx, S32, S32);
1103 }
1104
1105 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1106 .unsupportedIf([=](const LegalityQuery &Query) {
1107 const LLT &EltTy = Query.Types[1].getElementType();
1108 return Query.Types[0] != EltTy;
1109 });
1110
1111 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1112 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1113 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1114
1115 // FIXME: Doesn't handle extract of illegal sizes.
1116 getActionDefinitionsBuilder(Op)
1117 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1118 // FIXME: Multiples of 16 should not be legal.
1119 .legalIf([=](const LegalityQuery &Query) {
1120 const LLT BigTy = Query.Types[BigTyIdx];
1121 const LLT LitTy = Query.Types[LitTyIdx];
1122 return (BigTy.getSizeInBits() % 32 == 0) &&
1123 (LitTy.getSizeInBits() % 16 == 0);
1124 })
1125 .widenScalarIf(
1126 [=](const LegalityQuery &Query) {
1127 const LLT BigTy = Query.Types[BigTyIdx];
1128 return (BigTy.getScalarSizeInBits() < 16);
1129 },
1130 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1131 .widenScalarIf(
1132 [=](const LegalityQuery &Query) {
1133 const LLT LitTy = Query.Types[LitTyIdx];
1134 return (LitTy.getScalarSizeInBits() < 16);
1135 },
1136 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1137 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1138 .widenScalarToNextPow2(BigTyIdx, 32);
1139
1140 }
1141
1142 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1143 .legalForCartesianProduct(AllS32Vectors, {S32})
1144 .legalForCartesianProduct(AllS64Vectors, {S64})
1145 .clampNumElements(0, V16S32, V32S32)
1146 .clampNumElements(0, V2S64, V16S64)
1147 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1148
1149 if (ST.hasScalarPackInsts()) {
1150 BuildVector
1151 // FIXME: Should probably widen s1 vectors straight to s32
1152 .minScalarOrElt(0, S16)
1153 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1154 .minScalar(1, S32);
1155
1156 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1157 .legalFor({V2S16, S32})
1158 .lower();
1159 BuildVector.minScalarOrElt(0, S32);
1160 } else {
1161 BuildVector.customFor({V2S16, S16});
1162 BuildVector.minScalarOrElt(0, S32);
1163
1164 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1165 .customFor({V2S16, S32})
1166 .lower();
1167 }
1168
1169 BuildVector.legalIf(isRegisterType(0));
1170
1171 // FIXME: Clamp maximum size
1172 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1173 .legalIf(isRegisterType(0));
1174
1175 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1176 // pre-legalize.
1177 if (ST.hasVOP3PInsts()) {
1178 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1179 .customFor({V2S16, V2S16})
1180 .lower();
1181 } else
1182 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1183
1184 // Merge/Unmerge
1185 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1186 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1187 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1188
1189 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1190 const LLT &Ty = Query.Types[TypeIdx];
1191 if (Ty.isVector()) {
1192 const LLT &EltTy = Ty.getElementType();
1193 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1194 return true;
1195 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1196 return true;
1197 }
1198 return false;
1199 };
1200
1201 auto &Builder = getActionDefinitionsBuilder(Op)
1202 // Try to widen to s16 first for small types.
1203 // TODO: Only do this on targets with legal s16 shifts
1204 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1205
1206 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1207 .lowerFor({{S16, V2S16}})
1208 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1209 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1210 elementTypeIs(1, S16)),
1211 changeTo(1, V2S16))
1212 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1213 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1214 // valid.
1215 .clampScalar(LitTyIdx, S32, S256)
1216 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1217 // Break up vectors with weird elements into scalars
1218 .fewerElementsIf(
1219 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1220 scalarize(0))
1221 .fewerElementsIf(
1222 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1223 scalarize(1))
1224 .clampScalar(BigTyIdx, S32, S1024);
1225
1226 if (Op == G_MERGE_VALUES) {
1227 Builder.widenScalarIf(
1228 // TODO: Use 16-bit shifts if legal for 8-bit values?
1229 [=](const LegalityQuery &Query) {
1230 const LLT Ty = Query.Types[LitTyIdx];
1231 return Ty.getSizeInBits() < 32;
1232 },
1233 changeTo(LitTyIdx, S32));
1234 }
1235
1236 Builder.widenScalarIf(
1237 [=](const LegalityQuery &Query) {
1238 const LLT Ty = Query.Types[BigTyIdx];
1239 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1240 Ty.getSizeInBits() % 16 != 0;
1241 },
1242 [=](const LegalityQuery &Query) {
1243 // Pick the next power of 2, or a multiple of 64 over 128.
1244 // Whichever is smaller.
1245 const LLT &Ty = Query.Types[BigTyIdx];
1246 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1247 if (NewSizeInBits >= 256) {
1248 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1249 if (RoundedTo < NewSizeInBits)
1250 NewSizeInBits = RoundedTo;
1251 }
1252 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1253 })
1254 .legalIf([=](const LegalityQuery &Query) {
1255 const LLT &BigTy = Query.Types[BigTyIdx];
1256 const LLT &LitTy = Query.Types[LitTyIdx];
1257
1258 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1259 return false;
1260 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1261 return false;
1262
1263 return BigTy.getSizeInBits() % 16 == 0 &&
1264 LitTy.getSizeInBits() % 16 == 0 &&
1265 BigTy.getSizeInBits() <= 1024;
1266 })
1267 // Any vectors left are the wrong size. Scalarize them.
1268 .scalarize(0)
1269 .scalarize(1);
1270 }
1271
1272 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1273 // RegBankSelect.
1274 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1275 .legalFor({{S32}, {S64}});
1276
1277 if (ST.hasVOP3PInsts()) {
1278 SextInReg.lowerFor({{V2S16}})
1279 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1280 // get more vector shift opportunities, since we'll get those when
1281 // expanded.
1282 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1283 } else if (ST.has16BitInsts()) {
1284 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1285 } else {
1286 // Prefer to promote to s32 before lowering if we don't have 16-bit
1287 // shifts. This avoid a lot of intermediate truncate and extend operations.
1288 SextInReg.lowerFor({{S32}, {S64}});
1289 }
1290
1291 SextInReg
1292 .scalarize(0)
1293 .clampScalar(0, S32, S64)
1294 .lower();
1295
1296 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1297 .legalFor({S64});
1298
1299 getActionDefinitionsBuilder({
1300 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1301 G_FCOPYSIGN,
1302
1303 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1304 G_READ_REGISTER,
1305 G_WRITE_REGISTER,
1306
1307 G_SADDO, G_SSUBO,
1308
1309 // TODO: Implement
1310 G_FMINIMUM, G_FMAXIMUM
1311 }).lower();
1312
1313 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1314 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1315 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1316 .unsupported();
1317
1318 computeTables();
1319 verify(*ST.getInstrInfo());
1320}
1321
1322bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1323 MachineRegisterInfo &MRI,
1324 MachineIRBuilder &B,
1325 GISelChangeObserver &Observer) const {
1326 switch (MI.getOpcode()) {
1327 case TargetOpcode::G_ADDRSPACE_CAST:
1328 return legalizeAddrSpaceCast(MI, MRI, B);
1329 case TargetOpcode::G_FRINT:
1330 return legalizeFrint(MI, MRI, B);
1331 case TargetOpcode::G_FCEIL:
1332 return legalizeFceil(MI, MRI, B);
1333 case TargetOpcode::G_INTRINSIC_TRUNC:
1334 return legalizeIntrinsicTrunc(MI, MRI, B);
1335 case TargetOpcode::G_SITOFP:
1336 return legalizeITOFP(MI, MRI, B, true);
1337 case TargetOpcode::G_UITOFP:
1338 return legalizeITOFP(MI, MRI, B, false);
1339 case TargetOpcode::G_FPTOSI:
1340 return legalizeFPTOI(MI, MRI, B, true);
1341 case TargetOpcode::G_FPTOUI:
1342 return legalizeFPTOI(MI, MRI, B, false);
1343 case TargetOpcode::G_FMINNUM:
1344 case TargetOpcode::G_FMAXNUM:
1345 case TargetOpcode::G_FMINNUM_IEEE:
1346 case TargetOpcode::G_FMAXNUM_IEEE:
1347 return legalizeMinNumMaxNum(MI, MRI, B);
1348 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1349 return legalizeExtractVectorElt(MI, MRI, B);
1350 case TargetOpcode::G_INSERT_VECTOR_ELT:
1351 return legalizeInsertVectorElt(MI, MRI, B);
1352 case TargetOpcode::G_SHUFFLE_VECTOR:
1353 return legalizeShuffleVector(MI, MRI, B);
1354 case TargetOpcode::G_FSIN:
1355 case TargetOpcode::G_FCOS:
1356 return legalizeSinCos(MI, MRI, B);
1357 case TargetOpcode::G_GLOBAL_VALUE:
1358 return legalizeGlobalValue(MI, MRI, B);
1359 case TargetOpcode::G_LOAD:
1360 return legalizeLoad(MI, MRI, B, Observer);
1361 case TargetOpcode::G_FMAD:
1362 return legalizeFMad(MI, MRI, B);
1363 case TargetOpcode::G_FDIV:
1364 return legalizeFDIV(MI, MRI, B);
1365 case TargetOpcode::G_UDIV:
1366 case TargetOpcode::G_UREM:
1367 return legalizeUDIV_UREM(MI, MRI, B);
1368 case TargetOpcode::G_SDIV:
1369 case TargetOpcode::G_SREM:
1370 return legalizeSDIV_SREM(MI, MRI, B);
1371 case TargetOpcode::G_ATOMIC_CMPXCHG:
1372 return legalizeAtomicCmpXChg(MI, MRI, B);
1373 case TargetOpcode::G_FLOG:
1374 return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1375 case TargetOpcode::G_FLOG10:
1376 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1377 case TargetOpcode::G_FEXP:
1378 return legalizeFExp(MI, B);
1379 case TargetOpcode::G_FPOW:
1380 return legalizeFPow(MI, B);
1381 case TargetOpcode::G_FFLOOR:
1382 return legalizeFFloor(MI, MRI, B);
1383 case TargetOpcode::G_BUILD_VECTOR:
1384 return legalizeBuildVector(MI, MRI, B);
1385 default:
1386 return false;
1387 }
1388
1389 llvm_unreachable("expected switch to return")::llvm::llvm_unreachable_internal("expected switch to return"
, "/build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp"
, 1389)
;
1390}
1391
1392Register AMDGPULegalizerInfo::getSegmentAperture(
1393 unsigned AS,
1394 MachineRegisterInfo &MRI,
1395 MachineIRBuilder &B) const {
1396 MachineFunction &MF = B.getMF();
1397 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1398 const LLT S32 = LLT::scalar(32);
1399
1400 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1401
1402 if (ST.hasApertureRegs()) {
1403 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1404 // getreg.
1405 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1406 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1407 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1408 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1409 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1410 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1411 unsigned Encoding =
1412 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1413 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1414 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1415
1416 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1417
1418 B.buildInstr(AMDGPU::S_GETREG_B32)
1419 .addDef(GetReg)
1420 .addImm(Encoding);
1421 MRI.setType(GetReg, S32);
1422
1423 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1424 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1425 }
1426
1427 Register QueuePtr = MRI.createGenericVirtualRegister(
1428 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1429
1430 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1431 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1432 return Register();
1433
1434 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1435 // private_segment_aperture_base_hi.
1436 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1437
1438 // TODO: can we be smarter about machine pointer info?
1439 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1440 MachineMemOperand *MMO = MF.getMachineMemOperand(
1441 PtrInfo,
1442 MachineMemOperand::MOLoad |
1443 MachineMemOperand::MODereferenceable |
1444 MachineMemOperand::MOInvariant,
1445 4,
1446 MinAlign(64, StructOffset));
1447
1448 Register LoadAddr;
1449
1450 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1451 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1452}
1453
1454bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1455 MachineInstr &MI, MachineRegisterInfo &MRI,
1456 MachineIRBuilder &B) const {
1457 MachineFunction &MF = B.getMF();
1458
1459 B.setInstr(MI);
1460
1461 const LLT S32 = LLT::scalar(32);
1462 Register Dst = MI.getOperand(0).getReg();
1463 Register Src = MI.getOperand(1).getReg();
1464
1465 LLT DstTy = MRI.getType(Dst);
1466 LLT SrcTy = MRI.getType(Src);
1467 unsigned DestAS = DstTy.getAddressSpace();
1468 unsigned SrcAS = SrcTy.getAddressSpace();
1469
1470 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1471 // vector element.
1472 assert(!DstTy.isVector());
1473
1474 const AMDGPUTargetMachine &TM
1475 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1476
1477 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1478 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1479 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1480 return true;
1481 }
1482
1483 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1484 // Truncate.
1485 B.buildExtract(Dst, Src, 0);
1486 MI.eraseFromParent();
1487 return true;
1488 }
1489
1490 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1491 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1492 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1493
1494 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1495 // another. Merge operands are required to be the same type, but creating an
1496 // extra ptrtoint would be kind of pointless.
1497 auto HighAddr = B.buildConstant(
1498 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1499 B.buildMerge(Dst, {Src, HighAddr});
1500 MI.eraseFromParent();
1501 return true;
1502 }
1503
1504 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1505 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1506 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1507 unsigned NullVal = TM.getNullPointerValue(DestAS);
1508
1509 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1510 auto FlatNull = B.buildConstant(SrcTy, 0);
1511
1512 // Extract low 32-bits of the pointer.
1513 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1514
1515 auto CmpRes =
1516 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1517 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1518
1519 MI.eraseFromParent();
1520 return true;
1521 }
1522
1523 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1524 return false;
1525
1526 if (!ST.hasFlatAddressSpace())
1527 return false;
1528
1529 auto SegmentNull =
1530 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1531 auto FlatNull =
1532 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1533
1534 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1535 if (!ApertureReg.isValid())
1536 return false;
1537
1538 auto CmpRes =
1539 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1540
1541 // Coerce the type of the low half of the result so we can use merge_values.
1542 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1543
1544 // TODO: Should we allow mismatched types but matching sizes in merges to
1545 // avoid the ptrtoint?
1546 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1547 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1548
1549 MI.eraseFromParent();
1550 return true;
1551}
1552
1553bool AMDGPULegalizerInfo::legalizeFrint(
1554 MachineInstr &MI, MachineRegisterInfo &MRI,
1555 MachineIRBuilder &B) const {
1556 B.setInstr(MI);
1557
1558 Register Src = MI.getOperand(1).getReg();
1559 LLT Ty = MRI.getType(Src);
1560 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1561
1562 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1563 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1564
1565 auto C1 = B.buildFConstant(Ty, C1Val);
1566 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1567
1568 // TODO: Should this propagate fast-math-flags?
1569 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1570 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1571
1572 auto C2 = B.buildFConstant(Ty, C2Val);
1573 auto Fabs = B.buildFAbs(Ty, Src);
1574
1575 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1576 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1577 return true;
1578}
1579
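The constants above implement the classic add-then-subtract rounding trick: adding copysign(2^52, x) forces the FPU to round x to an integer under the default round-to-nearest-even mode, and 0x1.fffffffffffffp+51 is the threshold beyond which every f64 is already integral. A minimal host-side sketch of the same idea (illustrative only; the legalizer emits G_* instructions, not host code):

#include <cmath>
#include <cstdio>

// Sketch of the rint-via-2^52 trick used by legalizeFrint.
static double rintViaMagic(double X) {
  const double C1 = 0x1.0p+52;              // adding 2^52 forces rounding
  const double C2 = 0x1.fffffffffffffp+51;  // above this, X is already integral
  double Magic = std::copysign(C1, X);
  double Rounded = (X + Magic) - Magic;     // round-to-nearest-even happens here
  return std::fabs(X) > C2 ? X : Rounded;
}

int main() {
  // Prints 2 -2 4 under the default rounding mode.
  std::printf("%g %g %g\n", rintViaMagic(2.5), rintViaMagic(-2.5),
              rintViaMagic(3.7));
}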
1580bool AMDGPULegalizerInfo::legalizeFceil(
1581 MachineInstr &MI, MachineRegisterInfo &MRI,
1582 MachineIRBuilder &B) const {
1583 B.setInstr(MI);
1584
1585 const LLT S1 = LLT::scalar(1);
1586 const LLT S64 = LLT::scalar(64);
1587
1588 Register Src = MI.getOperand(1).getReg();
1589 assert(MRI.getType(Src) == S64);
1590
1591 // result = trunc(src)
1592 // if (src > 0.0 && src != result)
1593 // result += 1.0
1594
1595 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1596
1597 const auto Zero = B.buildFConstant(S64, 0.0);
1598 const auto One = B.buildFConstant(S64, 1.0);
1599 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1600 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1601 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1602 auto Add = B.buildSelect(S64, And, One, Zero);
1603
1604 // TODO: Should this propagate fast-math-flags?
1605 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1606 return true;
1607}
1608
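The pseudocode comment above maps one-to-one onto host arithmetic; a small sketch of the same ceil-from-trunc expansion (assuming a correct trunc, which G_INTRINSIC_TRUNC provides here):

#include <cmath>

// Ceil built from trunc, mirroring the fcmp/and/select sequence above.
static double ceilFromTrunc(double Src) {
  double Trunc = std::trunc(Src);
  // Add 1.0 only when Src was positive and had a fractional part.
  double Add = (Src > 0.0 && Src != Trunc) ? 1.0 : 0.0;
  return Trunc + Add;
}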
1609static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1610 MachineIRBuilder &B) {
1611 const unsigned FractBits = 52;
1612 const unsigned ExpBits = 11;
1613 LLT S32 = LLT::scalar(32);
1614
1615 auto Const0 = B.buildConstant(S32, FractBits - 32);
1616 auto Const1 = B.buildConstant(S32, ExpBits);
1617
1618 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1619 .addUse(Const0.getReg(0))
1620 .addUse(Const1.getReg(0));
1621
1622 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1623}
1624
1625bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1626 MachineInstr &MI, MachineRegisterInfo &MRI,
1627 MachineIRBuilder &B) const {
1628 B.setInstr(MI);
1629
1630 const LLT S1 = LLT::scalar(1);
1631 const LLT S32 = LLT::scalar(32);
1632 const LLT S64 = LLT::scalar(64);
1633
1634 Register Src = MI.getOperand(1).getReg();
1635 assert(MRI.getType(Src) == S64);
1636
1637 // TODO: Should this use extract since the low half is unused?
1638 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1639 Register Hi = Unmerge.getReg(1);
1640
1641 // Extract the upper half, since this is where we will find the sign and
1642 // exponent.
1643 auto Exp = extractF64Exponent(Hi, B);
1644
1645 const unsigned FractBits = 52;
1646
1647 // Extract the sign bit.
1648 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1649 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1650
1651 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1652
1653 const auto Zero32 = B.buildConstant(S32, 0);
1654
1655 // Extend back to 64-bits.
1656 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1657
1658 auto Shr = B.buildAShr(S64, FractMask, Exp);
1659 auto Not = B.buildNot(S64, Shr);
1660 auto Tmp0 = B.buildAnd(S64, Src, Not);
1661 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1662
1663 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1664 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1665
1666 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1667 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1668 return true;
1669}
1670
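For reference, the same exponent/mantissa manipulation expressed with host bit operations; a sketch only, where the ubfe on the high dword at offset 20 corresponds to the (bits >> 52) & 0x7ff below:

#include <cstdint>
#include <cstring>

// Bit-level model of the f64 trunc expansion above.
static double truncF64Bits(double Src) {
  uint64_t Bits;
  std::memcpy(&Bits, &Src, sizeof(Bits));
  const unsigned FractBits = 52;
  int Exp = (int)((Bits >> FractBits) & 0x7ff) - 1023;  // unbiased exponent
  uint64_t SignBit = Bits & (UINT64_C(1) << 63);
  uint64_t FractMask = (UINT64_C(1) << FractBits) - 1;

  uint64_t Result;
  if (Exp < 0)
    Result = SignBit;                     // |Src| < 1.0 -> +/-0.0
  else if (Exp > 51)
    return Src;                           // already integral (or inf/nan)
  else
    Result = Bits & ~(FractMask >> Exp);  // clear the fractional bits

  double Out;
  std::memcpy(&Out, &Result, sizeof(Out));
  return Out;
}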
1671bool AMDGPULegalizerInfo::legalizeITOFP(
1672 MachineInstr &MI, MachineRegisterInfo &MRI,
1673 MachineIRBuilder &B, bool Signed) const {
1674 B.setInstr(MI);
1675
1676 Register Dst = MI.getOperand(0).getReg();
1677 Register Src = MI.getOperand(1).getReg();
1678
1679 const LLT S64 = LLT::scalar(64);
1680 const LLT S32 = LLT::scalar(32);
1681
1682 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1683
1684 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1685
1686 auto CvtHi = Signed ?
1687 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1688 B.buildUITOFP(S64, Unmerge.getReg(1));
1689
1690 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1691
1692 auto ThirtyTwo = B.buildConstant(S32, 32);
1693 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1694 .addUse(CvtHi.getReg(0))
1695 .addUse(ThirtyTwo.getReg(0));
1696
1697 // TODO: Should this propagate fast-math-flags?
1698 B.buildFAdd(Dst, LdExp, CvtLo);
1699 MI.eraseFromParent();
1700 return true;
1701}
1702
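The strategy above in plain C++: convert the two 32-bit halves separately, scale the high half by 2^32 with ldexp (what amdgcn.ldexp does in the MIR), and add. A hedged sketch of the signed case:

#include <cmath>
#include <cstdint>

// 64-bit integer to f64, converted 32 bits at a time.
static double sitofp64(int64_t V) {
  int32_t Hi = (int32_t)(V >> 32);  // carries the sign for the signed case
  uint32_t Lo = (uint32_t)V;        // always converted as unsigned
  return std::ldexp((double)Hi, 32) + (double)Lo;
}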
1703// TODO: Copied from DAG implementation. Verify logic and document how this
1704// actually works.
1705bool AMDGPULegalizerInfo::legalizeFPTOI(
1706 MachineInstr &MI, MachineRegisterInfo &MRI,
1707 MachineIRBuilder &B, bool Signed) const {
1708 B.setInstr(MI);
1709
1710 Register Dst = MI.getOperand(0).getReg();
1711 Register Src = MI.getOperand(1).getReg();
1712
1713 const LLT S64 = LLT::scalar(64);
1714 const LLT S32 = LLT::scalar(32);
1715
1716 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1717
1718 unsigned Flags = MI.getFlags();
1719
1720 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1721 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1722 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1723
1724 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1725 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1726 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1727
1728 auto Hi = Signed ?
1729 B.buildFPTOSI(S32, FloorMul) :
1730 B.buildFPTOUI(S32, FloorMul);
1731 auto Lo = B.buildFPTOUI(S32, Fma);
1732
1733 B.buildMerge(Dst, { Lo, Hi });
1734 MI.eraseFromParent();
1735
1736 return true;
1737}
1738
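K0 is the bit pattern of 2^-32 and K1 of -2^32, so the sequence splits the truncated value into a high 32-bit part and an exactly computed remainder. A host sketch of the unsigned case (the signed case differs only in converting the high half with a signed conversion); as with the MIR expansion, out-of-range inputs are not handled specially:

#include <cmath>
#include <cstdint>

// f64 -> u64 via two 32-bit conversions, mirroring the expansion above.
static uint64_t fptoui64(double Src) {
  const double K0 = 0x1.0p-32;                 // 0x3df0000000000000
  const double K1 = -0x1.0p+32;                // 0xc1f0000000000000
  double Trunc = std::trunc(Src);
  double FloorMul = std::floor(Trunc * K0);    // high 32 bits, as a double
  double Rem = std::fma(FloorMul, K1, Trunc);  // Trunc - FloorMul * 2^32, exactly
  uint32_t Hi = (uint32_t)FloorMul;
  uint32_t Lo = (uint32_t)Rem;
  return ((uint64_t)Hi << 32) | Lo;
}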
1739bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1740 MachineInstr &MI, MachineRegisterInfo &MRI,
1741 MachineIRBuilder &B) const {
1742 MachineFunction &MF = B.getMF();
1743 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1744
1745 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1746 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1747
1748 // With ieee_mode disabled, the instructions have the correct behavior
1749 // already for G_FMINNUM/G_FMAXNUM
1750 if (!MFI->getMode().IEEE)
1751 return !IsIEEEOp;
1752
1753 if (IsIEEEOp)
1754 return true;
1755
1756 MachineIRBuilder HelperBuilder(MI);
1757 GISelObserverWrapper DummyObserver;
1758 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1759 HelperBuilder.setInstr(MI);
1760 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1761}
1762
1763bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1764 MachineInstr &MI, MachineRegisterInfo &MRI,
1765 MachineIRBuilder &B) const {
1766 // TODO: Should move some of this into LegalizerHelper.
1767
1768 // TODO: Promote dynamic indexing of s16 to s32
1769
1770 // FIXME: Artifact combiner probably should have replaced the truncated
1771 // constant before this, so we shouldn't need
1772 // getConstantVRegValWithLookThrough.
1773 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1774 MI.getOperand(2).getReg(), MRI);
1775 if (!IdxVal) // Dynamic case will be selected to register indexing.
1776 return true;
1777
1778 Register Dst = MI.getOperand(0).getReg();
1779 Register Vec = MI.getOperand(1).getReg();
1780
1781 LLT VecTy = MRI.getType(Vec);
1782 LLT EltTy = VecTy.getElementType();
1783 assert(EltTy == MRI.getType(Dst));
1784
1785 B.setInstr(MI);
1786
1787 if (IdxVal->Value < VecTy.getNumElements())
1788 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1789 else
1790 B.buildUndef(Dst);
1791
1792 MI.eraseFromParent();
1793 return true;
1794}
1795
1796bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1797 MachineInstr &MI, MachineRegisterInfo &MRI,
1798 MachineIRBuilder &B) const {
1799 // TODO: Should move some of this into LegalizerHelper.
1800
1801 // TODO: Promote dynamic indexing of s16 to s32
1802
1803 // FIXME: Artifact combiner probably should have replaced the truncated
1804 // constant before this, so we shouldn't need
1805 // getConstantVRegValWithLookThrough.
1806 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1807 MI.getOperand(3).getReg(), MRI);
1808 if (!IdxVal) // Dynamic case will be selected to register indexing.
1809 return true;
1810
1811 Register Dst = MI.getOperand(0).getReg();
1812 Register Vec = MI.getOperand(1).getReg();
1813 Register Ins = MI.getOperand(2).getReg();
1814
1815 LLT VecTy = MRI.getType(Vec);
1816 LLT EltTy = VecTy.getElementType();
1817 assert(EltTy == MRI.getType(Ins));
1818
1819 B.setInstr(MI);
1820
1821 if (IdxVal->Value < VecTy.getNumElements())
1822 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1823 else
1824 B.buildUndef(Dst);
1825
1826 MI.eraseFromParent();
1827 return true;
1828}
1829
1830bool AMDGPULegalizerInfo::legalizeShuffleVector(
1831 MachineInstr &MI, MachineRegisterInfo &MRI,
1832 MachineIRBuilder &B) const {
1833 const LLT V2S16 = LLT::vector(2, 16);
1834
1835 Register Dst = MI.getOperand(0).getReg();
1836 Register Src0 = MI.getOperand(1).getReg();
1837 LLT DstTy = MRI.getType(Dst);
1838 LLT SrcTy = MRI.getType(Src0);
1839
1840 if (SrcTy == V2S16 && DstTy == V2S16 &&
1841 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1842 return true;
1843
1844 MachineIRBuilder HelperBuilder(MI);
1845 GISelObserverWrapper DummyObserver;
1846 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1847 HelperBuilder.setInstr(MI);
1848 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1849}
1850
1851bool AMDGPULegalizerInfo::legalizeSinCos(
1852 MachineInstr &MI, MachineRegisterInfo &MRI,
1853 MachineIRBuilder &B) const {
1854 B.setInstr(MI);
1855
1856 Register DstReg = MI.getOperand(0).getReg();
1857 Register SrcReg = MI.getOperand(1).getReg();
1858 LLT Ty = MRI.getType(DstReg);
1859 unsigned Flags = MI.getFlags();
1860
1861 Register TrigVal;
1862 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1863 if (ST.hasTrigReducedRange()) {
1864 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1865 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1866 .addUse(MulVal.getReg(0))
1867 .setMIFlags(Flags).getReg(0);
1868 } else
1869 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1870
1871 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1872 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1873 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1874 .addUse(TrigVal)
1875 .setMIFlags(Flags);
1876 MI.eraseFromParent();
1877 return true;
1878}
1879
1880bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1881 Register DstReg, LLT PtrTy,
1882 MachineIRBuilder &B, const GlobalValue *GV,
1883 unsigned Offset, unsigned GAFlags) const {
1884 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1885 // to the following code sequence:
1886 //
1887 // For constant address space:
1888 // s_getpc_b64 s[0:1]
1889 // s_add_u32 s0, s0, $symbol
1890 // s_addc_u32 s1, s1, 0
1891 //
1892 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1893 // a fixup or relocation is emitted to replace $symbol with a literal
1894 // constant, which is a pc-relative offset from the encoding of the $symbol
1895 // operand to the global variable.
1896 //
1897 // For global address space:
1898 // s_getpc_b64 s[0:1]
1899 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1900 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1901 //
1902 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1903 // fixups or relocations are emitted to replace $symbol@*@lo and
1904 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1905 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1906 // operand to the global variable.
1907 //
1908 // What we want here is an offset from the value returned by s_getpc
1909 // (which is the address of the s_add_u32 instruction) to the global
1910 // variable, but since the encoding of $symbol starts 4 bytes after the start
1911 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1912 // small. This requires us to add 4 to the global variable offset in order to
1913 // compute the correct address.
1914
1915 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1916
1917 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1918 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1919
1920 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1921 .addDef(PCReg);
1922
1923 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1924 if (GAFlags == SIInstrInfo::MO_NONE)
1925 MIB.addImm(0);
1926 else
1927 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1928
1929 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1930
1931 if (PtrTy.getSizeInBits() == 32)
1932 B.buildExtract(DstReg, PCReg, 0);
1933 return true;
1934 }
1935
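A worked example of the +4 bias described in the comment, using made-up addresses: if s_getpc returns P, the $symbol literal is encoded at P + 4, and a rel32-style fixup resolves to S + A - (P + 4), so passing A = Offset + 4 makes the final sum come out to S + Offset.

#include <cstdint>

// Hypothetical address arithmetic only; illustrates why Offset + 4 is added.
static uint64_t resolvePCRel(uint64_t SymAddr, uint64_t GetPCResult,
                             uint64_t Offset) {
  uint64_t LiteralAddr = GetPCResult + 4;                      // $symbol location
  uint64_t FixupValue = SymAddr + (Offset + 4) - LiteralAddr;  // S + A - P
  return GetPCResult + FixupValue;                             // == SymAddr + Offset
}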
1936bool AMDGPULegalizerInfo::legalizeGlobalValue(
1937 MachineInstr &MI, MachineRegisterInfo &MRI,
1938 MachineIRBuilder &B) const {
1939 Register DstReg = MI.getOperand(0).getReg();
1940 LLT Ty = MRI.getType(DstReg);
1941 unsigned AS = Ty.getAddressSpace();
1942
1943 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1944 MachineFunction &MF = B.getMF();
1945 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1946 B.setInstr(MI);
1947
1948 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1949 if (!MFI->isEntryFunction()) {
1950 const Function &Fn = MF.getFunction();
1951 DiagnosticInfoUnsupported BadLDSDecl(
1952 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1953 Fn.getContext().diagnose(BadLDSDecl);
1954 }
1955
1956 // TODO: We could emit code to handle the initialization somewhere.
1957 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1958 const SITargetLowering *TLI = ST.getTargetLowering();
1959 if (!TLI->shouldUseLDSConstAddress(GV)) {
1960 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1961 return true; // Leave in place;
1962 }
1963
1964 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1965 MI.eraseFromParent();
1966 return true;
1967 }
1968
1969 const Function &Fn = MF.getFunction();
1970 DiagnosticInfoUnsupported BadInit(
1971 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1972 Fn.getContext().diagnose(BadInit);
1973 return true;
1974 }
1975
1976 const SITargetLowering *TLI = ST.getTargetLowering();
1977
1978 if (TLI->shouldEmitFixup(GV)) {
1979 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1980 MI.eraseFromParent();
1981 return true;
1982 }
1983
1984 if (TLI->shouldEmitPCReloc(GV)) {
1985 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1986 MI.eraseFromParent();
1987 return true;
1988 }
1989
1990 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1991 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1992
1993 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1994 MachinePointerInfo::getGOT(MF),
1995 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1996 MachineMemOperand::MOInvariant,
1997 8 /*Size*/, 8 /*Align*/);
1998
1999 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2000
2001 if (Ty.getSizeInBits() == 32) {
2002 // Truncate if this is a 32-bit constant address.
2003 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2004 B.buildExtract(DstReg, Load, 0);
2005 } else
2006 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2007
2008 MI.eraseFromParent();
2009 return true;
2010}
2011
2012bool AMDGPULegalizerInfo::legalizeLoad(
2013 MachineInstr &MI, MachineRegisterInfo &MRI,
2014 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2015 B.setInstr(MI);
2016 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2017 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2018 Observer.changingInstr(MI);
2019 MI.getOperand(1).setReg(Cast.getReg(0));
2020 Observer.changedInstr(MI);
2021 return true;
2022}
2023
2024bool AMDGPULegalizerInfo::legalizeFMad(
2025 MachineInstr &MI, MachineRegisterInfo &MRI,
2026 MachineIRBuilder &B) const {
2027 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2028 assert(Ty.isScalar());
2029
2030 MachineFunction &MF = B.getMF();
2031 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2032
2033 // TODO: Always legal with future ftz flag.
2034 // FIXME: Do we need just output?
2035 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2036 return true;
2037 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2038 return true;
2039
2040 MachineIRBuilder HelperBuilder(MI);
2041 GISelObserverWrapper DummyObserver;
2042 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2043 HelperBuilder.setMBB(*MI.getParent());
2044 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2045}
2046
2047bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2048 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2049 Register DstReg = MI.getOperand(0).getReg();
2050 Register PtrReg = MI.getOperand(1).getReg();
2051 Register CmpVal = MI.getOperand(2).getReg();
2052 Register NewVal = MI.getOperand(3).getReg();
2053
2054 assert(SITargetLowering::isFlatGlobalAddrSpace(
2055            MRI.getType(PtrReg).getAddressSpace()) &&
2056        "this should not have been custom lowered");
2057
2058 LLT ValTy = MRI.getType(CmpVal);
2059 LLT VecTy = LLT::vector(2, ValTy);
2060
2061 B.setInstr(MI);
2062 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2063
2064 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2065 .addDef(DstReg)
2066 .addUse(PtrReg)
2067 .addUse(PackedVal)
2068 .setMemRefs(MI.memoperands());
2069
2070 MI.eraseFromParent();
2071 return true;
2072}
2073
2074bool AMDGPULegalizerInfo::legalizeFlog(
2075 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2076 Register Dst = MI.getOperand(0).getReg();
2077 Register Src = MI.getOperand(1).getReg();
2078 LLT Ty = B.getMRI()->getType(Dst);
2079 unsigned Flags = MI.getFlags();
2080 B.setInstr(MI);
2081
2082 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2083 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2084
2085 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2086 MI.eraseFromParent();
2087 return true;
2088}
2089
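This is the change-of-base identity log_b(x) = log2(x) * (1 / log2(b)); Log2BaseInverted is presumably 1/log2(e) for G_FLOG and 1/log2(10) for G_FLOG10 (the callers are not shown here). The equivalent host expression, as a sketch:

#include <cmath>

// log_b(x) computed from log2, as in legalizeFlog.
static double flogSketch(double X, double Log2BaseInverted) {
  return std::log2(X) * Log2BaseInverted;
}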
2090bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2091 MachineIRBuilder &B) const {
2092 Register Dst = MI.getOperand(0).getReg();
2093 Register Src = MI.getOperand(1).getReg();
2094 unsigned Flags = MI.getFlags();
2095 LLT Ty = B.getMRI()->getType(Dst);
2096 B.setInstr(MI);
2097
2098 auto K = B.buildFConstant(Ty, numbers::log2e);
2099 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2100 B.buildFExp2(Dst, Mul, Flags);
2101 MI.eraseFromParent();
2102 return true;
2103}
2104
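The same identity in the other direction: e^x = 2^(x * log2 e), which is exactly the constant-times-exp2 sequence above.

#include <cmath>

// exp(x) via exp2, matching legalizeFExp.
static double fexpSketch(double X) {
  const double Log2E = 1.4426950408889634;  // numbers::log2e
  return std::exp2(X * Log2E);
}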
2105bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2106 MachineIRBuilder &B) const {
2107 Register Dst = MI.getOperand(0).getReg();
2108 Register Src0 = MI.getOperand(1).getReg();
2109 Register Src1 = MI.getOperand(2).getReg();
2110 unsigned Flags = MI.getFlags();
2111 LLT Ty = B.getMRI()->getType(Dst);
2112 B.setInstr(MI);
2113 const LLT S16 = LLT::scalar(16);
2114 const LLT S32 = LLT::scalar(32);
2115
2116 if (Ty == S32) {
2117 auto Log = B.buildFLog2(S32, Src0, Flags);
2118 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2119 .addUse(Log.getReg(0))
2120 .addUse(Src1)
2121 .setMIFlags(Flags);
2122 B.buildFExp2(Dst, Mul, Flags);
2123 } else if (Ty == S16) {
2124 // There's no f16 fmul_legacy, so we need to convert for it.
2125 auto Log = B.buildFLog2(S16, Src0, Flags);
2126 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2127 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2128 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2129 .addUse(Ext0.getReg(0))
2130 .addUse(Ext1.getReg(0))
2131 .setMIFlags(Flags);
2132
2133 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2134 } else
2135 return false;
2136
2137 MI.eraseFromParent();
2138 return true;
2139}
2140
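pow is lowered through the same pair of identities, x^y = 2^(y * log2 x), with fmul_legacy supplying the multiply; the f16 path round-trips through f32 only because there is no 16-bit fmul_legacy. A host sketch, valid for x > 0 like the expansion itself:

#include <cmath>

// powf via exp2/log2, as legalizeFPow builds it for f32.
static float fpowSketch(float X, float Y) {
  return std::exp2f(Y * std::log2f(X));
}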
2141// Find a source register, ignoring any possible source modifiers.
2142static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2143 Register ModSrc = OrigSrc;
2144 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2145 ModSrc = SrcFNeg->getOperand(1).getReg();
2146 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2147 ModSrc = SrcFAbs->getOperand(1).getReg();
2148 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2149 ModSrc = SrcFAbs->getOperand(1).getReg();
2150 return ModSrc;
2151}
2152
2153bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2154 MachineRegisterInfo &MRI,
2155 MachineIRBuilder &B) const {
2156 B.setInstr(MI);
2157
2158 const LLT S1 = LLT::scalar(1);
2159 const LLT S64 = LLT::scalar(64);
2160 Register Dst = MI.getOperand(0).getReg();
2161 Register OrigSrc = MI.getOperand(1).getReg();
2162 unsigned Flags = MI.getFlags();
2163 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2164        "this should not have been custom lowered");
2165
2166 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2167 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2168 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2169 // V_FRACT bug is:
2170 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2171 //
2172 // Convert floor(x) to (x - fract(x))
2173
2174 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2175 .addUse(OrigSrc)
2176 .setMIFlags(Flags);
2177
2178 // Give source modifier matching some assistance before obscuring a foldable
2179 // pattern.
2180
2181 // TODO: We can avoid the neg on the fract? The input sign to fract
2182 // shouldn't matter?
2183 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2184
2185 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2186
2187 Register Min = MRI.createGenericVirtualRegister(S64);
2188
2189 // We don't need to concern ourselves with the snan handling difference, so
2190 // use the one which will directly select.
2191 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2192 if (MFI->getMode().IEEE)
2193 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2194 else
2195 B.buildFMinNum(Min, Fract, Const, Flags);
2196
2197 Register CorrectedFract = Min;
2198 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2199 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2200 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2201 }
2202
2203 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2204 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2205
2206 MI.eraseFromParent();
2207 return true;
2208}
2209
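The workaround comment condenses to: compute fract(x), clamp it just below 1.0 to paper over the V_FRACT bug, pass NaNs through, and form floor(x) = x - fract(x). A host model for checking the select logic only (fract is simulated with floor here, so it is deliberately circular):

#include <cmath>

// floor(x) = x - fract(x), with the clamp-and-NaN handling from above.
static double floorViaFract(double X) {
  double Fract = X - std::floor(X);                         // what V_FRACT_F64 computes
  double Clamped = std::fmin(Fract, 0x1.fffffffffffffp-1);  // 0x3fefffffffffffff
  double Corrected = std::isnan(X) ? X : Clamped;
  return X - Corrected;                                     // NaN in -> NaN out
}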
2210// Turn an illegal packed v2s16 build vector into bit operations.
2211// TODO: This should probably be a bitcast action in LegalizerHelper.
2212bool AMDGPULegalizerInfo::legalizeBuildVector(
2213 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2214 Register Dst = MI.getOperand(0).getReg();
2215 LLT DstTy = MRI.getType(Dst);
2216 const LLT S32 = LLT::scalar(32);
2217 const LLT V2S16 = LLT::vector(2, 16);
2218 (void)DstTy;
2219 (void)V2S16;
2220 assert(DstTy == V2S16);
2221
2222 Register Src0 = MI.getOperand(1).getReg();
2223 Register Src1 = MI.getOperand(2).getReg();
2224 assert(MRI.getType(Src0) == LLT::scalar(16));
2225
2226 B.setInstr(MI);
2227 auto Merge = B.buildMerge(S32, {Src0, Src1});
2228 B.buildBitcast(Dst, Merge);
2229
2230 MI.eraseFromParent();
2231 return true;
2232}
2233
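At the bit level, the merge plus bitcast above is just packing two 16-bit values into one 32-bit word, element 0 in the low half:

#include <cstdint>

// v2s16 build_vector modeled as integer packing.
static uint32_t packV2S16(uint16_t Elt0, uint16_t Elt1) {
  return (uint32_t)Elt0 | ((uint32_t)Elt1 << 16);
}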
2234// Return the use branch instruction, otherwise null if the usage is invalid.
2235static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2236 MachineRegisterInfo &MRI,
2237 MachineInstr *&Br) {
2238 Register CondDef = MI.getOperand(0).getReg();
2239 if (!MRI.hasOneNonDBGUse(CondDef))
2240 return nullptr;
2241
2242 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2243 if (UseMI.getParent() != MI.getParent() ||
2244 UseMI.getOpcode() != AMDGPU::G_BRCOND)
2245 return nullptr;
2246
2247 // Make sure the cond br is followed by a G_BR
2248 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2249 if (Next != MI.getParent()->end()) {
2250 if (Next->getOpcode() != AMDGPU::G_BR)
2251 return nullptr;
2252 Br = &*Next;
2253 }
2254
2255 return &UseMI;
2256}
2257
2258Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2259 Register Reg, LLT Ty) const {
2260 Register LiveIn = MRI.getLiveInVirtReg(Reg);
2261 if (LiveIn)
2262 return LiveIn;
2263
2264 Register NewReg = MRI.createGenericVirtualRegister(Ty);
2265 MRI.addLiveIn(Reg, NewReg);
2266 return NewReg;
2267}
2268
2269bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2270 const ArgDescriptor *Arg) const {
2271 if (!Arg->isRegister() || !Arg->getRegister().isValid())
9
Taking false branch
2272 return false; // TODO: Handle these
2273
2274 assert(Arg->getRegister().isPhysical());
10
'?' condition is true
2275
2276 MachineRegisterInfo &MRI = *B.getMRI();
2277
2278 LLT Ty = MRI.getType(DstReg);
2279 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2280
2281 if (Arg->isMasked()) {
11
Calling 'ArgDescriptor::isMasked'
14
Returning from 'ArgDescriptor::isMasked'
15
Taking true branch
2282 // TODO: Should we try to emit this once in the entry block?
2283 const LLT S32 = LLT::scalar(32);
2284 const unsigned Mask = Arg->getMask();
2285 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
16
Calling 'countTrailingZeros<unsigned int>'
23
Returning from 'countTrailingZeros<unsigned int>'
24
'Shift' initialized to 32
2286
2287 Register AndMaskSrc = LiveIn;
2288
2289 if (Shift != 0) {
24.1
'Shift' is not equal to 0
25
Taking true branch
2290 auto ShiftAmt = B.buildConstant(S32, Shift);
2291 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2292 }
2293
2294 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
26
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
2295 } else
2296 B.buildCopy(DstReg, LiveIn);
2297
2298 // Insert the argument copy if it doesn't already exist.
2299 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2300 if (!MRI.getVRegDef(LiveIn)) {
2301 // FIXME: Should have scoped insert pt
2302 MachineBasicBlock &OrigInsBB = B.getMBB();
2303 auto OrigInsPt = B.getInsertPt();
2304
2305 MachineBasicBlock &EntryMBB = B.getMF().front();
2306 EntryMBB.addLiveIn(Arg->getRegister());
2307 B.setInsertPt(EntryMBB, EntryMBB.begin());
2308 B.buildCopy(LiveIn, Arg->getRegister());
2309
2310 B.setInsertPt(OrigInsBB, OrigInsPt);
2311 }
2312
2313 return true;
2314}
2315
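The path the analyzer reports boils down to: an ArgDescriptor whose mask is zero makes countTrailingZeros<unsigned> return the bit width (32), and Mask >> 32 on a 32-bit unsigned is undefined behavior in C++. A guarded rewrite of the extraction, purely as a sketch under that assumption and not the upstream fix:

#include <cassert>
#include <cstdint>

// Guarded version of the masked-argument extraction; Mask == 0 is the case
// that makes the shift above undefined, so it is excluded explicitly here.
static uint32_t maskedInputValue(uint32_t LiveIn, uint32_t Mask) {
  assert(Mask != 0 && "a mask of 0 selects no bits");
  unsigned Shift = __builtin_ctz(Mask);  // countTrailingZeros; < 32 once Mask != 0
  uint32_t Src = LiveIn >> Shift;
  return Src & (Mask >> Shift);          // both shifts are now well defined
}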
2316bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2317 MachineInstr &MI,
2318 MachineRegisterInfo &MRI,
2319 MachineIRBuilder &B,
2320 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2321 B.setInstr(MI);
2322
2323 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2324
2325 const ArgDescriptor *Arg;
2326 const TargetRegisterClass *RC;
2327 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2328 if (!Arg) {
6
Assuming 'Arg' is non-null
7
Taking false branch
2329 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2330 return false;
2331 }
2332
2333 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
8
Calling 'AMDGPULegalizerInfo::loadInputValue'
2334 MI.eraseFromParent();
2335 return true;
2336 }
2337
2338 return false;
2339}
2340
2341bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2342 MachineRegisterInfo &MRI,
2343 MachineIRBuilder &B) const {
2344 B.setInstr(MI);
2345 Register Dst = MI.getOperand(0).getReg();
2346 LLT DstTy = MRI.getType(Dst);
2347 LLT S16 = LLT::scalar(16);
2348 LLT S32 = LLT::scalar(32);
2349 LLT S64 = LLT::scalar(64);
2350
2351 if (legalizeFastUnsafeFDIV(MI, MRI, B))
2352 return true;
2353
2354 if (DstTy == S16)
2355 return legalizeFDIV16(MI, MRI, B);
2356 if (DstTy == S32)
2357 return legalizeFDIV32(MI, MRI, B);
2358 if (DstTy == S64)
2359 return legalizeFDIV64(MI, MRI, B);
2360
2361 return false;
2362}
2363
2364static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2365 const LLT S32 = LLT::scalar(32);
2366
2367 auto Cvt0 = B.buildUITOFP(S32, Src);
2368 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2369 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2370 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2371 return B.buildFPTOUI(S32, Mul).getReg(0);
2372}
2373
2374void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2375 Register DstReg,
2376 Register Num,
2377 Register Den,
2378 bool IsRem) const {
2379 const LLT S1 = LLT::scalar(1);
2380 const LLT S32 = LLT::scalar(32);
2381
2382 // RCP = URECIP(Den) = 2^32 / Den + e
2383 // e is rounding error.
2384 auto RCP = buildDivRCP(B, Den);
2385
2386 // RCP_LO = mul(RCP, Den)
2387 auto RCP_LO = B.buildMul(S32, RCP, Den);
2388
2389 // RCP_HI = mulhu(RCP, Den)
2390 auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2391
2392 // NEG_RCP_LO = -RCP_LO
2393 auto Zero = B.buildConstant(S32, 0);
2394 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2395
2396 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2397 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2398 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2399
2400 // Calculate the rounding error from the URECIP instruction
2401 // E = mulhu(ABS_RCP_LO, RCP)
2402 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2403
2404 // RCP_A_E = RCP + E
2405 auto RCP_A_E = B.buildAdd(S32, RCP, E);
2406
2407 // RCP_S_E = RCP - E
2408 auto RCP_S_E = B.buildSub(S32, RCP, E);
2409
2410 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2411 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2412
2413 // Quotient = mulhu(Tmp0, Num)
2414 auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2415
2416 // Num_S_Remainder = Quotient * Den
2417 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2418
2419 // Remainder = Num - Num_S_Remainder
2420 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2421
2422 // Remainder_GE_Den = Remainder >= Den
2423 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2424
2425 // Remainder_GE_Zero = Num >= Num_S_Remainder;
2426 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2427 Num, Num_S_Remainder);
2428
2429 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2430 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2431
2432 // Calculate Division result:
2433
2434 // Quotient_A_One = Quotient + 1
2435 auto One = B.buildConstant(S32, 1);
2436 auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2437
2438 // Quotient_S_One = Quotient - 1
2439 auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2440
2441 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2442 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2443
2444 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2445 if (IsRem) {
2446 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2447
2448 // Calculate Rem result:
2449 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2450
2451 // Remainder_A_Den = Remainder + Den
2452 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2453
2454 // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2455 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2456
2457 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2458 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2459 } else {
2460 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2461 }
2462}
2463
2464bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2465 MachineRegisterInfo &MRI,
2466 MachineIRBuilder &B) const {
2467 B.setInstr(MI);
2468 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2469 Register DstReg = MI.getOperand(0).getReg();
2470 Register Num = MI.getOperand(1).getReg();
2471 Register Den = MI.getOperand(2).getReg();
2472 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2473 MI.eraseFromParent();
2474 return true;
2475}
2476
2477bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2478 MachineRegisterInfo &MRI,
2479 MachineIRBuilder &B) const {
2480 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2481 return legalizeUDIV_UREM32(MI, MRI, B);
2482 return false;
2483}
2484
2485bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2486 MachineRegisterInfo &MRI,
2487 MachineIRBuilder &B) const {
2488 B.setInstr(MI);
2489 const LLT S32 = LLT::scalar(32);
2490
2491 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2492 Register DstReg = MI.getOperand(0).getReg();
2493 Register LHS = MI.getOperand(1).getReg();
2494 Register RHS = MI.getOperand(2).getReg();
2495
2496 auto ThirtyOne = B.buildConstant(S32, 31);
2497 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2498 auto RHSign = B.buildAShr(S32, LHS, ThirtyOne);
2499
2500 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2501 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2502
2503 LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2504 RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2505
2506 Register UDivRem = MRI.createGenericVirtualRegister(S32);
2507 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2508
2509 if (IsRem) {
2510 auto RSign = LHSign; // Remainder sign is the same as LHS
2511 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2512 B.buildSub(DstReg, UDivRem, RSign);
2513 } else {
2514 auto DSign = B.buildXor(S32, LHSign, RHSign);
2515 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2516 B.buildSub(DstReg, UDivRem, DSign);
2517 }
2518
2519 MI.eraseFromParent();
2520 return true;
2521}
2522
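The sign handling above is the usual (x + sign) ^ sign absolute-value trick followed by an unsigned divide and the mirror-image fixup on the result. A host sketch of the intended quotient path (the remainder path applies the fixup with the dividend's sign instead):

#include <cstdint>

// Signed 32-bit division built from an unsigned divide plus sign fixups.
static int32_t sdiv32ViaUdiv(int32_t LHS, int32_t RHS) {
  // Sign masks (0 or 0xffffffff); computed with comparisons since shifting a
  // negative int right is only guaranteed to be arithmetic from C++20 on.
  uint32_t LSign = LHS < 0 ? 0xffffffffu : 0u;
  uint32_t RSign = RHS < 0 ? 0xffffffffu : 0u;
  uint32_t AbsL = ((uint32_t)LHS + LSign) ^ LSign;  // |LHS|, modulo 2^32
  uint32_t AbsR = ((uint32_t)RHS + RSign) ^ RSign;  // |RHS|
  uint32_t Quot = AbsL / AbsR;
  uint32_t DSign = LSign ^ RSign;                   // sign of the quotient
  // Conditional negate; two's-complement wrap-around assumed on the cast back.
  return (int32_t)((Quot ^ DSign) - DSign);
}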
2523bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2524 MachineRegisterInfo &MRI,
2525 MachineIRBuilder &B) const {
2526 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2527 return legalizeSDIV_SREM32(MI, MRI, B);
2528 return false;
2529}
2530
2531bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2532 MachineRegisterInfo &MRI,
2533 MachineIRBuilder &B) const {
2534 Register Res = MI.getOperand(0).getReg();
2535 Register LHS = MI.getOperand(1).getReg();
2536 Register RHS = MI.getOperand(2).getReg();
2537
2538 uint16_t Flags = MI.getFlags();
2539
2540 LLT ResTy = MRI.getType(Res);
2541 LLT S32 = LLT::scalar(32);
2542 LLT S64 = LLT::scalar(64);
2543
2544 const MachineFunction &MF = B.getMF();
2545 bool Unsafe =
2546 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2547
2548 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2549 return false;
2550
2551 if (!Unsafe && ResTy == S32 &&
2552 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2553 return false;
2554
2555 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2556 // 1 / x -> RCP(x)
2557 if (CLHS->isExactlyValue(1.0)) {
2558 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2559 .addUse(RHS)
2560 .setMIFlags(Flags);
2561
2562 MI.eraseFromParent();
2563 return true;
2564 }
2565
2566 // -1 / x -> RCP( FNEG(x) )
2567 if (CLHS->isExactlyValue(-1.0)) {
2568 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2569 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2570 .addUse(FNeg.getReg(0))
2571 .setMIFlags(Flags);
2572
2573 MI.eraseFromParent();
2574 return true;
2575 }
2576 }
2577
2578 // x / y -> x * (1.0 / y)
2579 if (Unsafe) {
2580 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2581 .addUse(RHS)
2582 .setMIFlags(Flags);
2583 B.buildFMul(Res, LHS, RCP, Flags);
2584
2585 MI.eraseFromParent();
2586 return true;
2587 }
2588
2589 return false;
2590}
2591
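Under unsafe math the rewrites above are the textbook reciprocal tricks: 1/x and -1/x become a single rcp, and x/y becomes x * (1/y). In host terms, with amdgcn.rcp modeled as an exact reciprocal:

// x / y rewritten as x * (1 / y), as in the Unsafe branch above.
static float fastFdivSketch(float LHS, float RHS) {
  return LHS * (1.0f / RHS);
}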
2592bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2593 MachineRegisterInfo &MRI,
2594 MachineIRBuilder &B) const {
2595 B.setInstr(MI);
2596 Register Res = MI.getOperand(0).getReg();
2597 Register LHS = MI.getOperand(1).getReg();
2598 Register RHS = MI.getOperand(2).getReg();
2599
2600 uint16_t Flags = MI.getFlags();
2601
2602 LLT S16 = LLT::scalar(16);
2603 LLT S32 = LLT::scalar(32);
2604
2605 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2606 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2607
2608 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2609 .addUse(RHSExt.getReg(0))
2610 .setMIFlags(Flags);
2611
2612 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2613 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2614
2615 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2616 .addUse(RDst.getReg(0))
2617 .addUse(RHS)
2618 .addUse(LHS)
2619 .setMIFlags(Flags);
2620
2621 MI.eraseFromParent();
2622 return true;
2623}
2624
2625// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2626// to enable denorm mode. When 'Enable' is false, disable denorm mode.
2627static void toggleSPDenormMode(bool Enable,
2628 MachineIRBuilder &B,
2629 const GCNSubtarget &ST,
2630 AMDGPU::SIModeRegisterDefaults Mode) {
2631 // Set SP denorm mode to this value.
2632 unsigned SPDenormMode =
2633 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2634
2635 if (ST.hasDenormModeInst()) {
2636 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2637 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2638
2639 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2640 B.buildInstr(AMDGPU::S_DENORM_MODE)
2641 .addImm(NewDenormModeValue);
2642
2643 } else {
2644 // Select FP32 bit field in mode register.
2645 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2646 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2647 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2648
2649 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2650 .addImm(SPDenormMode)
2651 .addImm(SPDenormModeBitField);
2652 }
2653}
2654
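The S_DENORM_MODE immediate packs the two 2-bit fields shown above: bits [1:0] are the FP32 denorm mode and bits [3:2] the FP64/FP16 mode, hence SPDenormMode | (DPDenormModeDefault << 2). As a tiny helper:

#include <cstdint>

// Immediate layout used by S_DENORM_MODE above.
static uint32_t denormModeImm(uint32_t SPMode, uint32_t DPMode) {
  return (SPMode & 3) | ((DPMode & 3) << 2);
}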
2655bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2656 MachineRegisterInfo &MRI,
2657 MachineIRBuilder &B) const {
2658 B.setInstr(MI);
2659 Register Res = MI.getOperand(0).getReg();
2660 Register LHS = MI.getOperand(1).getReg();
2661 Register RHS = MI.getOperand(2).getReg();
2662 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2663 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2664
2665 uint16_t Flags = MI.getFlags();
2666
2667 LLT S32 = LLT::scalar(32);
2668 LLT S1 = LLT::scalar(1);
2669
2670 auto One = B.buildFConstant(S32, 1.0f);
2671
2672 auto DenominatorScaled =
2673 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2674 .addUse(RHS)
2675 .addUse(LHS)
2676 .addImm(1)
2677 .setMIFlags(Flags);
2678 auto NumeratorScaled =
2679 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2680 .addUse(LHS)
2681 .addUse(RHS)
2682 .addImm(0)
2683 .setMIFlags(Flags);
2684
2685 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2686 .addUse(DenominatorScaled.getReg(0))
2687 .setMIFlags(Flags);
2688 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2689
2690 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2691 // aren't modeled as reading it.
2692 if (!Mode.allFP32Denormals())
2693 toggleSPDenormMode(true, B, ST, Mode);
2694
2695 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2696 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2697 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2698 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2699 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2700 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2701
2702 if (!Mode.allFP32Denormals())
2703 toggleSPDenormMode(false, B, ST, Mode);
2704
2705 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2706 .addUse(Fma4.getReg(0))
2707 .addUse(Fma1.getReg(0))
2708 .addUse(Fma3.getReg(0))
2709 .addUse(NumeratorScaled.getReg(1))
2710 .setMIFlags(Flags);
2711
2712 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2713 .addUse(Fmas.getReg(0))
2714 .addUse(RHS)
2715 .addUse(LHS)
2716 .setMIFlags(Flags);
2717
2718 MI.eraseFromParent();
2719 return true;
2720}
2721
2722bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2723 MachineRegisterInfo &MRI,
2724 MachineIRBuilder &B) const {
2725 B.setInstr(MI);
2726 Register Res = MI.getOperand(0).getReg();
2727 Register LHS = MI.getOperand(1).getReg();
2728 Register RHS = MI.getOperand(2).getReg();
2729
2730 uint16_t Flags = MI.getFlags();
2731
2732 LLT S64 = LLT::scalar(64);
2733 LLT S1 = LLT::scalar(1);
2734
2735 auto One = B.buildFConstant(S64, 1.0);
2736
2737 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2738 .addUse(LHS)
2739 .addUse(RHS)
2740 .addImm(1)
2741 .setMIFlags(Flags);
2742
2743 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2744
2745 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2746 .addUse(DivScale0.getReg(0))
2747 .setMIFlags(Flags);
2748
2749 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2750 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2751 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2752
2753 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2754 .addUse(LHS)
2755 .addUse(RHS)
2756 .addImm(0)
2757 .setMIFlags(Flags);
2758
2759 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2760 auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2761 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2762
2763 Register Scale;
2764 if (!ST.hasUsableDivScaleConditionOutput()) {
2765 // Workaround a hardware bug on SI where the condition output from div_scale
2766 // is not usable.
2767
2768 LLT S32 = LLT::scalar(32);
2769
2770 auto NumUnmerge = B.buildUnmerge(S32, LHS);
2771 auto DenUnmerge = B.buildUnmerge(S32, RHS);
2772 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2773 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2774
2775 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2776 Scale1Unmerge.getReg(1));
2777 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2778 Scale0Unmerge.getReg(1));
2779 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2780 } else {
2781 Scale = DivScale1.getReg(1);
2782 }
2783
2784 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2785 .addUse(Fma4.getReg(0))
2786 .addUse(Fma3.getReg(0))
2787 .addUse(Mul.getReg(0))
2788 .addUse(Scale)
2789 .setMIFlags(Flags);
2790
2791 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2792 .addUse(Fmas.getReg(0))
2793 .addUse(RHS)
2794 .addUse(LHS)
2795 .setMIFlags(Flags);
2796
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2801bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2802 MachineRegisterInfo &MRI,
2803 MachineIRBuilder &B) const {
2804 B.setInstr(MI);
2805 Register Res = MI.getOperand(0).getReg();
2806 Register LHS = MI.getOperand(2).getReg();
2807 Register RHS = MI.getOperand(3).getReg();
2808 uint16_t Flags = MI.getFlags();
2809
2810 LLT S32 = LLT::scalar(32);
2811 LLT S1 = LLT::scalar(1);
2812
2813 auto Abs = B.buildFAbs(S32, RHS, Flags);
2814 const APFloat C0Val(1.0f);
2815
2816 auto C0 = B.buildConstant(S32, 0x6f800000);
2817 auto C1 = B.buildConstant(S32, 0x2f800000);
2818 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2819
2820 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2821 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2822
2823 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2824
2825 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2826 .addUse(Mul0.getReg(0))
2827 .setMIFlags(Flags);
2828
2829 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2830
2831 B.buildFMul(Res, Sel, Mul1, Flags);
2832
2833 MI.eraseFromParent();
2834 return true;
2835}
2836
2837bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2838 MachineRegisterInfo &MRI,
2839 MachineIRBuilder &B) const {
2840 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2841 if (!MFI->isEntryFunction()) {
3
Assuming the condition is true
4
Taking true branch
2842 return legalizePreloadedArgIntrin(MI, MRI, B,
5
Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
2843 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2844 }
2845
2846 B.setInstr(MI);
2847
2848 uint64_t Offset =
2849 ST.getTargetLowering()->getImplicitParameterOffset(
2850 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2851 Register DstReg = MI.getOperand(0).getReg();
2852 LLT DstTy = MRI.getType(DstReg);
2853 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2854
2855 const ArgDescriptor *Arg;
2856 const TargetRegisterClass *RC;
2857 std::tie(Arg, RC)
2858 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2859 if (!Arg)
2860 return false;
2861
2862 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2863 if (!loadInputValue(KernargPtrReg, B, Arg))
2864 return false;
2865
2866 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2867 MI.eraseFromParent();
2868 return true;
2869}
2870
2871bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2872 MachineRegisterInfo &MRI,
2873 MachineIRBuilder &B,
2874 unsigned AddrSpace) const {
2875 B.setInstr(MI);
2876 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2877 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2878 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2879 MI.eraseFromParent();
2880 return true;
2881}
2882
2883// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2884// offset (the offset that is included in bounds checking and swizzling, to be
2885// split between the instruction's voffset and immoffset fields) and soffset
2886// (the offset that is excluded from bounds checking and swizzling, to go in
2887// the instruction's soffset field). This function takes the first kind of
2888// offset and figures out how to split it between voffset and immoffset.
2889std::tuple<Register, unsigned, unsigned>
2890AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2891 Register OrigOffset) const {
2892 const unsigned MaxImm = 4095;
2893 Register BaseReg;
2894 unsigned TotalConstOffset;
2895 MachineInstr *OffsetDef;
2896 const LLT S32 = LLT::scalar(32);
2897
2898 std::tie(BaseReg, TotalConstOffset, OffsetDef)
2899 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2900
2901 unsigned ImmOffset = TotalConstOffset;
2902
2903 // If the immediate value is too big for the immoffset field, put the value
2904 // and -4096 into the immoffset field so that the value that is copied/added
2905 // for the voffset field is a multiple of 4096, and it stands more chance
2906 // of being CSEd with the copy/add for another similar load/store.
2907 // However, do not do that rounding down to a multiple of 4096 if that is a
2908 // negative number, as it appears to be illegal to have a negative offset
2909 // in the vgpr, even if adding the immediate offset makes it positive.
2910 unsigned Overflow = ImmOffset & ~MaxImm;
2911 ImmOffset -= Overflow;
2912 if ((int32_t)Overflow < 0) {
2913 Overflow += ImmOffset;
2914 ImmOffset = 0;
2915 }
2916
2917 if (Overflow != 0) {
2918 if (!BaseReg) {
2919 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2920 } else {
2921 auto OverflowVal = B.buildConstant(S32, Overflow);
2922 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2923 }
2924 }
2925
2926 if (!BaseReg)
2927 BaseReg = B.buildConstant(S32, 0).getReg(0);
2928
2929 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2930}
2931
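Concretely, a constant offset of 5000 keeps 904 in immoffset and pushes 4096 to the voffset register, while 4095 or less moves nothing. The arithmetic extracted into a standalone helper for illustration (register and CSE bookkeeping omitted):

#include <cstdint>
#include <utility>

// Split a buffer offset into (voffset part, immoffset part), as above.
static std::pair<uint32_t, uint32_t> splitOffsetSketch(uint32_t TotalConstOffset) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = TotalConstOffset;
  uint32_t Overflow = ImmOffset & ~MaxImm;  // multiple of 4096 for the VGPR
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {              // never leave a negative voffset
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}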
2932/// Handle register layout difference for f16 images for some subtargets.
2933Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2934 MachineRegisterInfo &MRI,
2935 Register Reg) const {
2936 if (!ST.hasUnpackedD16VMem())
2937 return Reg;
2938
2939 const LLT S16 = LLT::scalar(16);
2940 const LLT S32 = LLT::scalar(32);
2941 LLT StoreVT = MRI.getType(Reg);
2942 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2943
2944 auto Unmerge = B.buildUnmerge(S16, Reg);
2945
2946 SmallVector<Register, 4> WideRegs;
2947 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2948 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2949
2950 int NumElts = StoreVT.getNumElements();
2951
2952 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2953}
2954
2955Register AMDGPULegalizerInfo::fixStoreSourceType(
2956 MachineIRBuilder &B, Register VData, bool IsFormat) const {
2957 MachineRegisterInfo *MRI = B.getMRI();
2958 LLT Ty = MRI->getType(VData);
2959
2960 const LLT S16 = LLT::scalar(16);
2961
2962 // Fixup illegal register types for i8 stores.
2963 if (Ty == LLT::scalar(8) || Ty == S16) {
2964 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2965 return AnyExt;
2966 }
2967
2968 if (Ty.isVector()) {
2969 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2970 if (IsFormat)
2971 return handleD16VData(B, *MRI, VData);
2972 }
2973 }
2974
2975 return VData;
2976}
2977
2978bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2979 MachineRegisterInfo &MRI,
2980 MachineIRBuilder &B,
2981 bool IsTyped,
2982 bool IsFormat) const {
2983 B.setInstr(MI);
2984
2985 Register VData = MI.getOperand(1).getReg();
2986 LLT Ty = MRI.getType(VData);
2987 LLT EltTy = Ty.getScalarType();
2988 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2989 const LLT S32 = LLT::scalar(32);
2990
2991 VData = fixStoreSourceType(B, VData, IsFormat);
2992 Register RSrc = MI.getOperand(2).getReg();
2993
2994 MachineMemOperand *MMO = *MI.memoperands_begin();
2995 const int MemSize = MMO->getSize();
2996
2997 unsigned ImmOffset;
2998 unsigned TotalOffset;
2999
3000 // The typed intrinsics add an immediate after the registers.
3001 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3002
3003 // The struct intrinsic variants add one additional operand over raw.
3004 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3005 Register VIndex;
3006 int OpOffset = 0;
3007 if (HasVIndex) {
3008 VIndex = MI.getOperand(3).getReg();
3009 OpOffset = 1;
3010 }
3011
3012 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3013 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3014
3015 unsigned Format = 0;
3016 if (IsTyped) {
3017 Format = MI.getOperand(5 + OpOffset).getImm();
3018 ++OpOffset;
3019 }
3020
3021 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3022
3023 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3024 if (TotalOffset != 0)
3025 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3026
3027 unsigned Opc;
3028 if (IsTyped) {
3029 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3030 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3031 } else if (IsFormat) {
3032 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3033 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3034 } else {
3035 switch (MemSize) {
3036 case 1:
3037 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3038 break;
3039 case 2:
3040 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3041 break;
3042 default:
3043 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3044 break;
3045 }
3046 }
3047
3048 if (!VIndex)
3049 VIndex = B.buildConstant(S32, 0).getReg(0);
3050
3051 auto MIB = B.buildInstr(Opc)
3052 .addUse(VData) // vdata
3053 .addUse(RSrc) // rsrc
3054 .addUse(VIndex) // vindex
3055 .addUse(VOffset) // voffset
3056 .addUse(SOffset) // soffset
3057 .addImm(ImmOffset); // offset(imm)
3058
3059 if (IsTyped)
3060 MIB.addImm(Format);
3061
3062 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3063 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3064 .addMemOperand(MMO);
3065
3066 MI.eraseFromParent();
3067 return true;
3068}
3069
3070bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3071 MachineRegisterInfo &MRI,
3072 MachineIRBuilder &B,
3073 bool IsFormat,
3074 bool IsTyped) const {
3075 B.setInstr(MI);
3076
3077 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3078 MachineMemOperand *MMO = *MI.memoperands_begin();
3079 const int MemSize = MMO->getSize();
3080 const LLT S32 = LLT::scalar(32);
3081
3082 Register Dst = MI.getOperand(0).getReg();
3083 Register RSrc = MI.getOperand(2).getReg();
3084
3085 // The typed intrinsics add an immediate after the registers.
3086 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3087
3088 // The struct intrinsic variants add one additional operand over raw.
3089 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3090 Register VIndex;
3091 int OpOffset = 0;
3092 if (HasVIndex) {
3093 VIndex = MI.getOperand(3).getReg();
3094 OpOffset = 1;
3095 }
3096
3097 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3098 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3099
3100 unsigned Format = 0;
3101 if (IsTyped) {
3102 Format = MI.getOperand(5 + OpOffset).getImm();
3103 ++OpOffset;
3104 }
3105
3106 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3107 unsigned ImmOffset;
3108 unsigned TotalOffset;
3109
3110 LLT Ty = MRI.getType(Dst);
3111 LLT EltTy = Ty.getScalarType();
3112 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3113 const bool Unpacked = ST.hasUnpackedD16VMem();
3114
3115 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3116 if (TotalOffset != 0)
3117 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3118
3119 unsigned Opc;
3120
3121 if (IsTyped) {
3122 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3123 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3124 } else if (IsFormat) {
3125 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3126 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3127 } else {
3128 switch (MemSize) {
3129 case 1:
3130 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3131 break;
3132 case 2:
3133 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3134 break;
3135 default:
3136 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3137 break;
3138 }
3139 }
3140
3141 Register LoadDstReg;
3142
3143 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3144 LLT UnpackedTy = Ty.changeElementSize(32);
3145
3146 if (IsExtLoad)
3147 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3148 else if (Unpacked && IsD16 && Ty.isVector())
3149 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3150 else
3151 LoadDstReg = Dst;
3152
3153 if (!VIndex)
3154 VIndex = B.buildConstant(S32, 0).getReg(0);
3155
3156 auto MIB = B.buildInstr(Opc)
3157 .addDef(LoadDstReg) // vdata
3158 .addUse(RSrc) // rsrc
3159 .addUse(VIndex) // vindex
3160 .addUse(VOffset) // voffset
3161 .addUse(SOffset) // soffset
3162 .addImm(ImmOffset); // offset(imm)
3163
3164 if (IsTyped)
3165 MIB.addImm(Format);
3166
3167 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3168 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3169 .addMemOperand(MMO);
3170
3171 if (LoadDstReg != Dst) {
3172 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3173
3174 // The result of an extending load was widened; truncate back to the original type.
3175 if (IsExtLoad)
3176 B.buildTrunc(Dst, LoadDstReg);
3177 else {
3178 // Repack to original 16-bit vector result
3179 // FIXME: G_TRUNC should work, but legalization currently fails
3180 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3181 SmallVector<Register, 4> Repack;
3182 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3183 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3184 B.buildMerge(Dst, Repack);
3185 }
3186 }
3187
3188 MI.eraseFromParent();
3189 return true;
3190}
3191
3192bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3193 MachineIRBuilder &B,
3194 bool IsInc) const {
3195 B.setInstr(MI);
3196 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3197 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3198 B.buildInstr(Opc)
3199 .addDef(MI.getOperand(0).getReg())
3200 .addUse(MI.getOperand(2).getReg())
3201 .addUse(MI.getOperand(3).getReg())
3202 .cloneMemRefs(MI);
3203 MI.eraseFromParent();
3204 return true;
3205}
3206
3207static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3208 switch (IntrID) {
3209 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3210 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3211 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3212 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3213 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3214 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3215 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3216 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3217 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3218 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3219 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3220 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3221 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3222 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3223 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3224 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3225 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3226 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3227 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3228 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3229 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3230 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3231 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3232 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3233 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3234 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3235 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3236 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3237 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3238 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3239 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3240 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3241 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3242 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3243 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3244 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3245 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3246 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3247 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3248 default:
3249 llvm_unreachable("unhandled atomic opcode");
3250 }
3251}
3252
3253bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3254 MachineIRBuilder &B,
3255 Intrinsic::ID IID) const {
3256 B.setInstr(MI);
3257
3258 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3259 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3260
3261 Register Dst = MI.getOperand(0).getReg();
3262 Register VData = MI.getOperand(2).getReg();
3263
3264 Register CmpVal;
3265 int OpOffset = 0;
3266
3267 if (IsCmpSwap) {
3268 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3269 ++OpOffset;
3270 }
3271
3272 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3273 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3274
3275 // The struct intrinsic variants add one additional operand over raw.
3276 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3277 Register VIndex;
3278 if (HasVIndex) {
3279 VIndex = MI.getOperand(4 + OpOffset).getReg();
3280 ++OpOffset;
3281 }
3282
3283 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3284 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3285 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3286
3287 MachineMemOperand *MMO = *MI.memoperands_begin();
3288
3289 unsigned ImmOffset;
3290 unsigned TotalOffset;
3291 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3292 if (TotalOffset != 0)
3293 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3294
3295 if (!VIndex)
3296 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3297
3298 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3299 .addDef(Dst)
3300 .addUse(VData); // vdata
3301
3302 if (IsCmpSwap)
3303 MIB.addReg(CmpVal);
3304
3305 MIB.addUse(RSrc) // rsrc
3306 .addUse(VIndex) // vindex
3307 .addUse(VOffset) // voffset
3308 .addUse(SOffset) // soffset
3309 .addImm(ImmOffset) // offset(imm)
3310 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3311 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3312 .addMemOperand(MMO);
3313
3314 MI.eraseFromParent();
3315 return true;
3316}
3317
3318// Produce a vector of s16 elements from s32 pieces.
3319static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3320 ArrayRef<Register> UnmergeParts) {
3321 const LLT S16 = LLT::scalar(16);
3322
3323 SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3324 for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3325 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3326
3327 B.buildBuildVector(DstReg, RemergeParts);
3328}
3329
3330/// Convert a set of s32 registers to a result vector with s16 elements.
3331static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3332 ArrayRef<Register> UnmergeParts) {
3333 MachineRegisterInfo &MRI = *B.getMRI();
3334 const LLT V2S16 = LLT::vector(2, 16);
3335 LLT TargetTy = MRI.getType(DstReg);
3336 int NumElts = UnmergeParts.size();
3337
3338 if (NumElts == 1) {
3339 assert(TargetTy == V2S16);
3340 B.buildBitcast(DstReg, UnmergeParts[0]);
3341 return;
3342 }
3343
3344 SmallVector<Register, 4> RemergeParts(NumElts);
3345 for (int I = 0; I != NumElts; ++I)
3346 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3347
3348 if (TargetTy.getSizeInBits() == 32u * NumElts) {
3349 B.buildConcatVectors(DstReg, RemergeParts);
3350 return;
3351 }
3352
3353 const LLT V3S16 = LLT::vector(3, 16);
3354 const LLT V6S16 = LLT::vector(6, 16);
3355
3356 // Widen to v6s16 and unpack v3 parts.
3357 assert(TargetTy == V3S16);
3358
3359 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3360 auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3361 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3362}
3363
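A rough scalar model (not part of this file) of the repacking above: every 32-bit register carries two 16-bit lanes, and the v3s16 case is handled by padding to v6s16 and ignoring the extra lanes:

#include <cstdint>
#include <vector>

std::vector<uint16_t> dwordsToHalvesSketch(const std::vector<uint32_t> &Dwords,
                                           unsigned NumHalves) {
  std::vector<uint16_t> Halves;
  for (uint32_t D : Dwords) {
    Halves.push_back(uint16_t(D));       // low half of the dword
    Halves.push_back(uint16_t(D >> 16)); // high half of the dword
  }
  Halves.resize(NumHalves);              // drop padding lanes (e.g. v3s16)
  return Halves;
}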
3364// FIXME: Just vector trunc should be sufficient, but legalization is currently
3365// broken.
3366static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3367 Register WideDstReg) {
3368 const LLT S32 = LLT::scalar(32);
3369 const LLT S16 = LLT::scalar(16);
3370
3371 auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3372
3373 int NumOps = Unmerge->getNumOperands() - 1;
3374 SmallVector<Register, 4> RemergeParts(NumOps);
3375 for (int I = 0; I != NumOps; ++I)
3376 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3377
3378 B.buildBuildVector(DstReg, RemergeParts);
3379}
3380
3381bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3382 MachineInstr &MI, MachineIRBuilder &B,
3383 GISelChangeObserver &Observer,
3384 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3385 bool IsTFE = MI.getNumExplicitDefs() == 2;
3386
3387 // We are only processing the operands of d16 image operations on subtargets
3388 // that use the unpacked register layout, or need to repack the TFE result.
3389
3390 // TODO: Need to handle a16 images too
3391 // TODO: Do we need to guard against already legalized intrinsics?
3392 if (!IsTFE && !ST.hasUnpackedD16VMem())
3393 return true;
3394
3395 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3396 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3397
3398 if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3399 return true;
3400
3401 B.setInstr(MI);
3402
3403 MachineRegisterInfo *MRI = B.getMRI();
3404 const LLT S32 = LLT::scalar(32);
3405 const LLT S16 = LLT::scalar(16);
3406
3407 if (BaseOpcode->Store) { // No TFE for stores?
3408 Register VData = MI.getOperand(1).getReg();
3409 LLT Ty = MRI->getType(VData);
3410 if (!Ty.isVector() || Ty.getElementType() != S16)
3411 return true;
3412
3413 B.setInstr(MI);
3414
3415 Observer.changingInstr(MI);
3416 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3417 Observer.changedInstr(MI);
3418 return true;
3419 }
3420
3421 Register DstReg = MI.getOperand(0).getReg();
3422 LLT Ty = MRI->getType(DstReg);
3423 const LLT EltTy = Ty.getScalarType();
3424 const bool IsD16 = Ty.getScalarType() == S16;
3425 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3426
3427 if (IsTFE) {
3428 // In the IR, TFE is supposed to be used with a 2 element struct return
3429// type. The instruction really returns these two values in one contiguous
3430 // register, with one additional dword beyond the loaded data. Rewrite the
3431 // return type to use a single register result.
3432 Register Dst1Reg = MI.getOperand(1).getReg();
3433 if (MRI->getType(Dst1Reg) != S32)
3434 return false;
3435
3436 // TODO: Make sure the TFE operand bit is set.
3437
3438 // The raw dword aligned data component of the load. The only legal cases
3439 // where this matters should be when using the packed D16 format, for
3440// s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3441 LLT RoundedTy;
3442 LLT TFETy;
3443
3444 if (IsD16 && ST.hasUnpackedD16VMem()) {
3445 RoundedTy = LLT::scalarOrVector(NumElts, 32);
3446 TFETy = LLT::vector(NumElts + 1, 32);
3447 } else {
3448 unsigned EltSize = Ty.getScalarSizeInBits();
3449 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3450 unsigned RoundedSize = 32 * RoundedElts;
3451 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3452 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3453 }
3454
3455 Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3456 Observer.changingInstr(MI);
3457
3458 MI.getOperand(0).setReg(TFEReg);
3459 MI.RemoveOperand(1);
3460
3461 Observer.changedInstr(MI);
3462
3463 // Insert after the instruction.
3464 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3465
3466 // Now figure out how to copy the new result register back into the old
3467 // result.
3468
3469 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3470 int NumDataElts = TFETy.getNumElements() - 1;
3471
3472 if (!Ty.isVector()) {
3473 // Simplest case is a trivial unmerge (plus a truncate for d16).
3474 UnmergeResults[0] = Ty == S32 ?
3475 DstReg : MRI->createGenericVirtualRegister(S32);
3476
3477 B.buildUnmerge(UnmergeResults, TFEReg);
3478 if (Ty != S32)
3479 B.buildTrunc(DstReg, UnmergeResults[0]);
3480 return true;
3481 }
3482
3483 // We have to repack into a new vector of some kind.
3484 for (int I = 0; I != NumDataElts; ++I)
3485 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3486 B.buildUnmerge(UnmergeResults, TFEReg);
3487
3488 // Drop the final TFE element.
3489 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3490
3491 if (EltTy == S32)
3492 B.buildBuildVector(DstReg, DataPart);
3493 else if (ST.hasUnpackedD16VMem())
3494 truncToS16Vector(B, DstReg, DataPart);
3495 else
3496 bitcastToS16Vector(B, DstReg, DataPart);
3497
3498 return true;
3499 }
3500
3501 // Must be an image load.
3502 if (!Ty.isVector() || Ty.getElementType() != S16)
3503 return true;
3504
3505 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3506
3507 LLT WidenedTy = Ty.changeElementType(S32);
3508 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3509
3510 Observer.changingInstr(MI);
3511 MI.getOperand(0).setReg(WideDstReg);
3512 Observer.changedInstr(MI);
3513
3514 repackUnpackedD16Load(B, DstReg, WideDstReg);
3515 return true;
3516}
3517
3518bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3519 MachineInstr &MI, MachineIRBuilder &B,
3520 GISelChangeObserver &Observer) const {
3521 Register Dst = MI.getOperand(0).getReg();
3522 LLT Ty = B.getMRI()->getType(Dst);
3523 unsigned Size = Ty.getSizeInBits();
3524 MachineFunction &MF = B.getMF();
3525
3526 Observer.changingInstr(MI);
3527
3528 // FIXME: We don't really need this intermediate instruction. The intrinsic
3529 // should be fixed to have a memory operand. Since it's readnone, we're not
3530 // allowed to add one.
3531 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3532 MI.RemoveOperand(1); // Remove intrinsic ID
3533
3534 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3535 // TODO: Should this use datalayout alignment?
3536 const unsigned MemSize = (Size + 7) / 8;
3537 const unsigned MemAlign = 4;
3538 MachineMemOperand *MMO = MF.getMachineMemOperand(
3539 MachinePointerInfo(),
3540 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3541 MachineMemOperand::MOInvariant, MemSize, MemAlign);
3542 MI.addMemOperand(MF, MMO);
3543
3544 // There are no 96-bit result scalar loads, but widening to 128-bit should
3545 // always be legal. We may need to restore this to a 96-bit result if it turns
3546 // out this needs to be converted to a vector load during RegBankSelect.
3547 if (!isPowerOf2_32(Size)) {
3548 LegalizerHelper Helper(MF, *this, Observer, B);
3549 B.setInstr(MI);
3550
3551 if (Ty.isVector())
3552 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3553 else
3554 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3555 }
3556
3557 Observer.changedInstr(MI);
3558 return true;
3559}
3560
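The widening decision above only fires for non-power-of-two result sizes; a minimal sketch of that predicate, reusing isPowerOf2_32 from MathExtras.h:

#include "llvm/Support/MathExtras.h"

// Sketch only: a 96-bit result (e.g. v3s32) is widened, a 128-bit result is not.
bool sBufferLoadNeedsWidening(unsigned SizeInBits) {
  return !llvm::isPowerOf2_32(SizeInBits); // true for 96, false for 128
}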
3561bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3562 MachineIRBuilder &B,
3563 GISelChangeObserver &Observer) const {
3564 MachineRegisterInfo &MRI = *B.getMRI();
3565
3566 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
3567 auto IntrID = MI.getIntrinsicID();
3568 switch (IntrID) {
1
Control jumps to 'case amdgcn_implicitarg_ptr:' at line 3640
3569 case Intrinsic::amdgcn_if:
3570 case Intrinsic::amdgcn_else: {
3571 MachineInstr *Br = nullptr;
3572 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3573 const SIRegisterInfo *TRI
3574 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3575
3576 B.setInstr(*BrCond);
3577 Register Def = MI.getOperand(1).getReg();
3578 Register Use = MI.getOperand(3).getReg();
3579
3580 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3581 if (Br)
3582 BrTarget = Br->getOperand(0).getMBB();
3583
3584 if (IntrID == Intrinsic::amdgcn_if) {
3585 B.buildInstr(AMDGPU::SI_IF)
3586 .addDef(Def)
3587 .addUse(Use)
3588 .addMBB(BrTarget);
3589 } else {
3590 B.buildInstr(AMDGPU::SI_ELSE)
3591 .addDef(Def)
3592 .addUse(Use)
3593 .addMBB(BrTarget)
3594 .addImm(0);
3595 }
3596
3597 if (Br)
3598 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3599
3600 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3601 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3602 MI.eraseFromParent();
3603 BrCond->eraseFromParent();
3604 return true;
3605 }
3606
3607 return false;
3608 }
3609 case Intrinsic::amdgcn_loop: {
3610 MachineInstr *Br = nullptr;
3611 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3612 const SIRegisterInfo *TRI
3613 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3614
3615 B.setInstr(*BrCond);
3616
3617 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3618 if (Br)
3619 BrTarget = Br->getOperand(0).getMBB();
3620
3621 Register Reg = MI.getOperand(2).getReg();
3622 B.buildInstr(AMDGPU::SI_LOOP)
3623 .addUse(Reg)
3624 .addMBB(BrTarget);
3625
3626 if (Br)
3627 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3628
3629 MI.eraseFromParent();
3630 BrCond->eraseFromParent();
3631 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3632 return true;
3633 }
3634
3635 return false;
3636 }
3637 case Intrinsic::amdgcn_kernarg_segment_ptr:
3638 return legalizePreloadedArgIntrin(
3639 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3640 case Intrinsic::amdgcn_implicitarg_ptr:
3641 return legalizeImplicitArgPtr(MI, MRI, B);
2
Calling 'AMDGPULegalizerInfo::legalizeImplicitArgPtr'
3642 case Intrinsic::amdgcn_workitem_id_x:
3643 return legalizePreloadedArgIntrin(MI, MRI, B,
3644 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3645 case Intrinsic::amdgcn_workitem_id_y:
3646 return legalizePreloadedArgIntrin(MI, MRI, B,
3647 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3648 case Intrinsic::amdgcn_workitem_id_z:
3649 return legalizePreloadedArgIntrin(MI, MRI, B,
3650 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3651 case Intrinsic::amdgcn_workgroup_id_x:
3652 return legalizePreloadedArgIntrin(MI, MRI, B,
3653 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3654 case Intrinsic::amdgcn_workgroup_id_y:
3655 return legalizePreloadedArgIntrin(MI, MRI, B,
3656 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3657 case Intrinsic::amdgcn_workgroup_id_z:
3658 return legalizePreloadedArgIntrin(MI, MRI, B,
3659 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3660 case Intrinsic::amdgcn_dispatch_ptr:
3661 return legalizePreloadedArgIntrin(MI, MRI, B,
3662 AMDGPUFunctionArgInfo::DISPATCH_PTR);
3663 case Intrinsic::amdgcn_queue_ptr:
3664 return legalizePreloadedArgIntrin(MI, MRI, B,
3665 AMDGPUFunctionArgInfo::QUEUE_PTR);
3666 case Intrinsic::amdgcn_implicit_buffer_ptr:
3667 return legalizePreloadedArgIntrin(
3668 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3669 case Intrinsic::amdgcn_dispatch_id:
3670 return legalizePreloadedArgIntrin(MI, MRI, B,
3671 AMDGPUFunctionArgInfo::DISPATCH_ID);
3672 case Intrinsic::amdgcn_fdiv_fast:
3673 return legalizeFDIVFastIntrin(MI, MRI, B);
3674 case Intrinsic::amdgcn_is_shared:
3675 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3676 case Intrinsic::amdgcn_is_private:
3677 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3678 case Intrinsic::amdgcn_wavefrontsize: {
3679 B.setInstr(MI);
3680 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3681 MI.eraseFromParent();
3682 return true;
3683 }
3684 case Intrinsic::amdgcn_s_buffer_load:
3685 return legalizeSBufferLoad(MI, B, Observer);
3686 case Intrinsic::amdgcn_raw_buffer_store:
3687 case Intrinsic::amdgcn_struct_buffer_store:
3688 return legalizeBufferStore(MI, MRI, B, false, false);
3689 case Intrinsic::amdgcn_raw_buffer_store_format:
3690 case Intrinsic::amdgcn_struct_buffer_store_format:
3691 return legalizeBufferStore(MI, MRI, B, false, true);
3692 case Intrinsic::amdgcn_raw_tbuffer_store:
3693 case Intrinsic::amdgcn_struct_tbuffer_store:
3694 return legalizeBufferStore(MI, MRI, B, true, true);
3695 case Intrinsic::amdgcn_raw_buffer_load:
3696 case Intrinsic::amdgcn_struct_buffer_load:
3697 return legalizeBufferLoad(MI, MRI, B, false, false);
3698 case Intrinsic::amdgcn_raw_buffer_load_format:
3699 case Intrinsic::amdgcn_struct_buffer_load_format:
3700 return legalizeBufferLoad(MI, MRI, B, true, false);
3701 case Intrinsic::amdgcn_raw_tbuffer_load:
3702 case Intrinsic::amdgcn_struct_tbuffer_load:
3703 return legalizeBufferLoad(MI, MRI, B, true, true);
3704 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3705 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3706 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3707 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3708 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3709 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3710 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3711 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3712 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3713 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3714 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3715 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3716 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3717 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3718 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3719 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3720 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3721 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3722 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3723 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3724 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3725 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3726 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3727 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3728 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3729 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3730 return legalizeBufferAtomic(MI, B, IntrID);
3731 case Intrinsic::amdgcn_atomic_inc:
3732 return legalizeAtomicIncDec(MI, B, true);
3733 case Intrinsic::amdgcn_atomic_dec:
3734 return legalizeAtomicIncDec(MI, B, false);
3735 default: {
3736 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3737 AMDGPU::getImageDimIntrinsicInfo(IntrID))
3738 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3739 return true;
3740 }
3741 }
3742
3743 return true;
3744}

/build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
12
Assuming the condition is true
13
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181 assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-11~++20200304121622+a8706b22a62/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <algorithm>
18#include <cassert>
19#include <climits>
20#include <cmath>
21#include <cstdint>
22#include <cstring>
23#include <limits>
24#include <type_traits>
25
26#ifdef __ANDROID_NDK__
27#include <android/api-level.h>
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather than including intrin.h. It's very
32// expensive, and MathExtras.h is popular.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44/// The behavior an operation has on an input of 0.
45enum ZeroBehavior {
46 /// The returned value is undefined.
47 ZB_Undefined,
48 /// The returned value is numeric_limits<T>::max()
49 ZB_Max,
50 /// The returned value is numeric_limits<T>::digits
51 ZB_Width
52};
53
54/// Mathematical constants.
55namespace numbers {
56// TODO: Track C++20 std::numbers.
57// TODO: Favor using the hexadecimal FP constants (requires C++17).
58constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
59 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
60 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
61 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
62 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
63 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
64 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
65 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
66 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
67 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
68 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
69 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
70 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
71 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
72 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
73constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
74 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
75 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
76 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
77 log2ef = 1.44269504F, // (0x1.715476P+0)
78 log10ef = .434294482F, // (0x1.bcb7b2P-2)
79 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
80 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
81 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
82 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
83 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
84 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
85 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
86 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
87 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
88} // namespace numbers
89
90namespace detail {
91template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
92 static unsigned count(T Val, ZeroBehavior) {
93 if (!Val)
94 return std::numeric_limits<T>::digits;
95 if (Val & 0x1)
96 return 0;
97
98 // Bisection method.
99 unsigned ZeroBits = 0;
100 T Shift = std::numeric_limits<T>::digits >> 1;
101 T Mask = std::numeric_limits<T>::max() >> Shift;
102 while (Shift) {
103 if ((Val & Mask) == 0) {
104 Val >>= Shift;
105 ZeroBits |= Shift;
106 }
107 Shift >>= 1;
108 Mask >>= Shift;
109 }
110 return ZeroBits;
111 }
112};
113
114#if defined(__GNUC__) || defined(_MSC_VER)
115template <typename T> struct TrailingZerosCounter<T, 4> {
116 static unsigned count(T Val, ZeroBehavior ZB) {
117 if (ZB
17.1
'ZB' is not equal to ZB_Undefined
!= ZB_Undefined && Val == 0)
18
Assuming 'Val' is equal to 0
19
Taking true branch
118 return 32;
20
Returning the value 32
119
120#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
121 return __builtin_ctz(Val);
122#elif defined(_MSC_VER)
123 unsigned long Index;
124 _BitScanForward(&Index, Val);
125 return Index;
126#endif
127 }
128};
129
130#if !defined(_MSC_VER) || defined(_M_X64)
131template <typename T> struct TrailingZerosCounter<T, 8> {
132 static unsigned count(T Val, ZeroBehavior ZB) {
133 if (ZB != ZB_Undefined && Val == 0)
134 return 64;
135
136#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
137 return __builtin_ctzll(Val);
138#elif defined(_MSC_VER)
139 unsigned long Index;
140 _BitScanForward64(&Index, Val);
141 return Index;
142#endif
143 }
144};
145#endif
146#endif
147} // namespace detail
148
149/// Count number of 0's from the least significant bit to the most
150/// stopping at the first 1.
151///
152/// Only unsigned integral types are allowed.
153///
154/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
155/// valid arguments.
156template <typename T>
157unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
158 static_assert(std::numeric_limits<T>::is_integer &&
159 !std::numeric_limits<T>::is_signed,
160 "Only unsigned integral types are allowed.");
161 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
17
Calling 'TrailingZerosCounter::count'
21
Returning from 'TrailingZerosCounter::count'
22
Returning the value 32
162}
163
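A small usage sketch (assuming the documented ZB_Width behaviour): a zero input returns the full bit width, so callers that feed the result into a shift need to treat 0 specially:

#include "llvm/Support/MathExtras.h"

unsigned ctzOfSixteen() { return llvm::countTrailingZeros(16u); } // 4
unsigned ctzOfZero() { return llvm::countTrailingZeros(0u); }     // 32 (bit width)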
164namespace detail {
165template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
166 static unsigned count(T Val, ZeroBehavior) {
167 if (!Val)
168 return std::numeric_limits<T>::digits;
169
170 // Bisection method.
171 unsigned ZeroBits = 0;
172 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
173 T Tmp = Val >> Shift;
174 if (Tmp)
175 Val = Tmp;
176 else
177 ZeroBits |= Shift;
178 }
179 return ZeroBits;
180 }
181};
182
183#if defined(__GNUC__) || defined(_MSC_VER)
184template <typename T> struct LeadingZerosCounter<T, 4> {
185 static unsigned count(T Val, ZeroBehavior ZB) {
186 if (ZB != ZB_Undefined && Val == 0)
187 return 32;
188
189#if __has_builtin(__builtin_clz) || defined(__GNUC__)
190 return __builtin_clz(Val);
191#elif defined(_MSC_VER)
192 unsigned long Index;
193 _BitScanReverse(&Index, Val);
194 return Index ^ 31;
195#endif
196 }
197};
198
199#if !defined(_MSC_VER) || defined(_M_X64)
200template <typename T> struct LeadingZerosCounter<T, 8> {
201 static unsigned count(T Val, ZeroBehavior ZB) {
202 if (ZB != ZB_Undefined && Val == 0)
203 return 64;
204
205#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
206 return __builtin_clzll(Val);
207#elif defined(_MSC_VER)
208 unsigned long Index;
209 _BitScanReverse64(&Index, Val);
210 return Index ^ 63;
211#endif
212 }
213};
214#endif
215#endif
216} // namespace detail
217
218/// Count number of 0's from the most significant bit to the least
219/// stopping at the first 1.
220///
221/// Only unsigned integral types are allowed.
222///
223/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
224/// valid arguments.
225template <typename T>
226unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
227 static_assert(std::numeric_limits<T>::is_integer &&
228 !std::numeric_limits<T>::is_signed,
229 "Only unsigned integral types are allowed.");
230 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
231}
232
233/// Get the index of the first set bit starting from the least
234/// significant bit.
235///
236/// Only unsigned integral types are allowed.
237///
238/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
239/// valid arguments.
240template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
241 if (ZB == ZB_Max && Val == 0)
242 return std::numeric_limits<T>::max();
243
244 return countTrailingZeros(Val, ZB_Undefined);
245}
246
247/// Create a bitmask with the N right-most bits set to 1, and all other
248/// bits set to 0. Only unsigned types are allowed.
249template <typename T> T maskTrailingOnes(unsigned N) {
250 static_assert(std::is_unsigned<T>::value, "Invalid type!");
251 const unsigned Bits = CHAR_BIT * sizeof(T);
252 assert(N <= Bits && "Invalid bit index");
253 return N == 0 ? 0 : (T(-1) >> (Bits - N));
254}
255
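Usage sketch for the helper above; the explicit N == 0 branch matters because T(-1) >> Bits, a shift by the full type width, would be undefined behaviour:

#include <cstdint>
#include "llvm/Support/MathExtras.h"

uint32_t lowTwelveBits() { return llvm::maskTrailingOnes<uint32_t>(12); } // 0xFFF
uint32_t noBits() { return llvm::maskTrailingOnes<uint32_t>(0); }         // 0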
256/// Create a bitmask with the N left-most bits set to 1, and all other
257/// bits set to 0. Only unsigned types are allowed.
258template <typename T> T maskLeadingOnes(unsigned N) {
259 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
260}
261
262/// Create a bitmask with the N right-most bits set to 0, and all other
263/// bits set to 1. Only unsigned types are allowed.
264template <typename T> T maskTrailingZeros(unsigned N) {
265 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
266}
267
268/// Create a bitmask with the N left-most bits set to 0, and all other
269/// bits set to 1. Only unsigned types are allowed.
270template <typename T> T maskLeadingZeros(unsigned N) {
271 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
272}
273
274/// Get the index of the last set bit starting from the least
275/// significant bit.
276///
277/// Only unsigned integral types are allowed.
278///
279/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
280/// valid arguments.
281template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
282 if (ZB == ZB_Max && Val == 0)
283 return std::numeric_limits<T>::max();
284
285 // Use ^ instead of - because both gcc and llvm can remove the associated ^
286 // in the __builtin_clz intrinsic on x86.
287 return countLeadingZeros(Val, ZB_Undefined) ^
288 (std::numeric_limits<T>::digits - 1);
289}
290
291/// Macro compressed bit reversal table for 256 bits.
292///
293/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
294static const unsigned char BitReverseTable256[256] = {
295#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
296#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
297#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
298 R6(0), R6(2), R6(1), R6(3)
299#undef R2
300#undef R4
301#undef R6
302};
303
304/// Reverse the bits in \p Val.
305template <typename T>
306T reverseBits(T Val) {
307 unsigned char in[sizeof(Val)];
308 unsigned char out[sizeof(Val)];
309 std::memcpy(in, &Val, sizeof(Val));
310 for (unsigned i = 0; i < sizeof(Val); ++i)
311 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
312 std::memcpy(&Val, out, sizeof(Val));
313 return Val;
314}
315
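A quick sanity sketch of the table-driven, byte-wise reversal above (the helper name is illustrative only):

#include <cstdint>
#include "llvm/Support/MathExtras.h"

uint8_t reversedOne() { return llvm::reverseBits<uint8_t>(0x01); } // 0x80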
316// NOTE: The following support functions use the _32/_64 extensions instead of
317// type overloading so that signed and unsigned integers can be used without
318// ambiguity.
319
320/// Return the high 32 bits of a 64 bit value.
321constexpr inline uint32_t Hi_32(uint64_t Value) {
322 return static_cast<uint32_t>(Value >> 32);
323}
324
325/// Return the low 32 bits of a 64 bit value.
326constexpr inline uint32_t Lo_32(uint64_t Value) {
327 return static_cast<uint32_t>(Value);
328}
329
330/// Make a 64-bit integer from a high / low pair of 32-bit integers.
331constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
332 return ((uint64_t)High << 32) | (uint64_t)Low;
333}
334
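The _32/_64 helpers above compose losslessly; since they are constexpr, the round-trip can be checked at compile time:

#include "llvm/Support/MathExtras.h"

static_assert(llvm::Make_64(llvm::Hi_32(0x0123456789ABCDEFULL),
                            llvm::Lo_32(0x0123456789ABCDEFULL)) ==
                  0x0123456789ABCDEFULL,
              "Hi_32/Lo_32/Make_64 round-trip");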
335/// Checks if an integer fits into the given bit width.
336template <unsigned N> constexpr inline bool isInt(int64_t x) {
337 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
338}
339// Template specializations to get better code for common cases.
340template <> constexpr inline bool isInt<8>(int64_t x) {
341 return static_cast<int8_t>(x) == x;
342}
343template <> constexpr inline bool isInt<16>(int64_t x) {
344 return static_cast<int16_t>(x) == x;
345}
346template <> constexpr inline bool isInt<32>(int64_t x) {
347 return static_cast<int32_t>(x) == x;
348}
349
350/// Checks if a signed integer is an N bit number shifted left by S.
351template <unsigned N, unsigned S>
352constexpr inline bool isShiftedInt(int64_t x) {
353 static_assert(
354 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
355 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
356 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
357}
358
359/// Checks if an unsigned integer fits into the given bit width.
360///
361/// This is written as two functions rather than as simply
362///
363/// return N >= 64 || X < (UINT64_C(1) << N);
364///
365/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
366/// left too many places.
367template <unsigned N>
368constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t X) {
374 return true;
375}
376
377// Template specializations to get better code for common cases.
378template <> constexpr inline bool isUInt<8>(uint64_t x) {
379 return static_cast<uint8_t>(x) == x;
380}
381template <> constexpr inline bool isUInt<16>(uint64_t x) {
382 return static_cast<uint16_t>(x) == x;
383}
384template <> constexpr inline bool isUInt<32>(uint64_t x) {
385 return static_cast<uint32_t>(x) == x;
386}
387
388/// Checks if a unsigned integer is an N bit number shifted left by S.
389template <unsigned N, unsigned S>
390constexpr inline bool isShiftedUInt(uint64_t x) {
391 static_assert(
392 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
393 static_assert(N + S <= 64,
394 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
395 // Per the two static_asserts above, S must be strictly less than 64. So
396 // 1 << S is not undefined behavior.
397 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
398}
399
400/// Gets the maximum value for a N-bit unsigned integer.
401inline uint64_t maxUIntN(uint64_t N) {
402 assert(N > 0 && N <= 64 && "integer width out of range");
403
404 // uint64_t(1) << 64 is undefined behavior, so we can't do
405 // (uint64_t(1) << N) - 1
406 // without checking first that N != 64. But this works and doesn't have a
407 // branch.
408 return UINT64_MAX >> (64 - N);
409}
410
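Worked example of the branch-free trick in maxUIntN above, which avoids the undefined uint64_t(1) << 64 case:

#include <cstdint>
#include "llvm/Support/MathExtras.h"

uint64_t byteMax() { return llvm::maxUIntN(8); }  // UINT64_MAX >> 56 == 0xFF
uint64_t fullMax() { return llvm::maxUIntN(64); } // UINT64_MAX >> 0, no UB shift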
411/// Gets the minimum value for a N-bit signed integer.
412inline int64_t minIntN(int64_t N) {
413 assert(N > 0 && N <= 64 && "integer width out of range");
414
415 return -(UINT64_C(1)<<(N-1));
416}
417
418/// Gets the maximum value for a N-bit signed integer.
419inline int64_t maxIntN(int64_t N) {
420 assert(N > 0 && N <= 64 && "integer width out of range");
421
422 // This relies on two's complement wraparound when N == 64, so we convert to
423 // int64_t only at the very end to avoid UB.
424 return (UINT64_C(1) << (N - 1)) - 1;
425}
426
427/// Checks if an unsigned integer fits into the given (dynamic) bit width.
428inline bool isUIntN(unsigned N, uint64_t x) {
429 return N >= 64 || x <= maxUIntN(N);
430}
431
432/// Checks if an signed integer fits into the given (dynamic) bit width.
433inline bool isIntN(unsigned N, int64_t x) {
434 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
435}
436
437/// Return true if the argument is a non-empty sequence of ones starting at the
438/// least significant bit with the remainder zero (32 bit version).
439/// Ex. isMask_32(0x0000FFFFU) == true.
440constexpr inline bool isMask_32(uint32_t Value) {
441 return Value && ((Value + 1) & Value) == 0;
442}
443
444/// Return true if the argument is a non-empty sequence of ones starting at the
445/// least significant bit with the remainder zero (64 bit version).
446constexpr inline bool isMask_64(uint64_t Value) {
447 return Value && ((Value + 1) & Value) == 0;
448}
449
450/// Return true if the argument contains a non-empty sequence of ones with the
451/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
452constexpr inline bool isShiftedMask_32(uint32_t Value) {
453 return Value && isMask_32((Value - 1) | Value);
454}
455
456/// Return true if the argument contains a non-empty sequence of ones with the
457/// remainder zero (64 bit version.)
458constexpr inline bool isShiftedMask_64(uint64_t Value) {
459 return Value && isMask_64((Value - 1) | Value);
460}
461
462/// Return true if the argument is a power of two > 0.
463/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
464constexpr inline bool isPowerOf2_32(uint32_t Value) {
465 return Value && !(Value & (Value - 1));
466}
467
468/// Return true if the argument is a power of two > 0 (64 bit edition.)
469constexpr inline bool isPowerOf2_64(uint64_t Value) {
470 return Value && !(Value & (Value - 1));
471}
472
473/// Count the number of ones from the most significant bit to the first
474/// zero bit.
475///
476/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
477/// Only unsigned integral types are allowed.
478///
479/// \param ZB the behavior on an input of all ones. Only ZB_Width and
480/// ZB_Undefined are valid arguments.
481template <typename T>
482unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
483 static_assert(std::numeric_limits<T>::is_integer &&
484 !std::numeric_limits<T>::is_signed,
485 "Only unsigned integral types are allowed.");
486 return countLeadingZeros<T>(~Value, ZB);
487}
488
489/// Count the number of ones from the least significant bit to the first
490/// zero bit.
491///
492/// Ex. countTrailingOnes(0x00FF00FF) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countTrailingZeros<T>(~Value, ZB);
503}
504
505namespace detail {
506template <typename T, std::size_t SizeOfT> struct PopulationCounter {
507 static unsigned count(T Value) {
508 // Generic version, forward to 32 bits.
509 static_assert(SizeOfT <= 4, "Not implemented!");
510#if defined(__GNUC__)
511 return __builtin_popcount(Value);
512#else
513 uint32_t v = Value;
514 v = v - ((v >> 1) & 0x55555555);
515 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
516 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
517#endif
518 }
519};
520
521template <typename T> struct PopulationCounter<T, 8> {
522 static unsigned count(T Value) {
523#if defined(__GNUC__)
524 return __builtin_popcountll(Value);
525#else
526 uint64_t v = Value;
527 v = v - ((v >> 1) & 0x5555555555555555ULL);
528 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
529 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
530 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
531#endif
532 }
533};
534} // namespace detail
535
536/// Count the number of set bits in a value.
537/// Ex. countPopulation(0xF000F000) = 8
538/// Returns 0 if the word is zero.
539template <typename T>
540inline unsigned countPopulation(T Value) {
541 static_assert(std::numeric_limits<T>::is_integer &&
542 !std::numeric_limits<T>::is_signed,
543 "Only unsigned integral types are allowed.");
544 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
545}
546
547/// Compile time Log2.
548/// Valid only for positive powers of two.
549template <size_t kValue> constexpr inline size_t CTLog2() {
550 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
551 "Value is not a valid power of 2");
552 return 1 + CTLog2<kValue / 2>();
553}
554
555template <> constexpr inline size_t CTLog2<1>() { return 0; }
556
557/// Return the log base 2 of the specified value.
558inline double Log2(double Value) {
559#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
560 return __builtin_log(Value) / __builtin_log(2.0);
561#else
562 return log2(Value);
563#endif
564}
565
566/// Return the floor log base 2 of the specified value, -1 if the value is zero.
567/// (32 bit edition.)
568/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
569inline unsigned Log2_32(uint32_t Value) {
570 return 31 - countLeadingZeros(Value);
571}
572
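Usage sketch for Log2_32 above: for a zero input countLeadingZeros returns 32, so 31 - 32 wraps to (unsigned)-1, which is how the documented "-1 if the value is zero" result comes about:

#include "llvm/Support/MathExtras.h"

unsigned floorLog2OfSix() { return llvm::Log2_32(6); }   // 2
unsigned floorLog2OfZero() { return llvm::Log2_32(0); }  // (unsigned)-1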
573/// Return the floor log base 2 of the specified value, -1 if the value is zero.
574/// (64 bit edition.)
575inline unsigned Log2_64(uint64_t Value) {
576 return 63 - countLeadingZeros(Value);
577}
578
579/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
580/// (32 bit edition).
581/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
582inline unsigned Log2_32_Ceil(uint32_t Value) {
583 return 32 - countLeadingZeros(Value - 1);
584}
585
586/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
587/// (64 bit edition.)
588inline unsigned Log2_64_Ceil(uint64_t Value) {
589 return 64 - countLeadingZeros(Value - 1);
590}
591
592/// Return the greatest common divisor of the values using Euclid's algorithm.
593template <typename T>
594inline T greatestCommonDivisor(T A, T B) {
595 while (B) {
596 T Tmp = B;
597 B = A % B;
598 A = Tmp;
599 }
600 return A;
601}
602
603inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
604 return greatestCommonDivisor<uint64_t>(A, B);
605}
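A quick standalone check of the Euclidean loop above; the helper name is illustrative.

#include <cassert>
#include <cstdint>

static uint64_t gcd64(uint64_t A, uint64_t B) {
  while (B) {          // classic Euclid step: (A, B) -> (B, A mod B)
    uint64_t Tmp = B;
    B = A % B;
    A = Tmp;
  }
  return A;
}

int main() {
  assert(gcd64(48, 36) == 12);
  assert(gcd64(7, 0) == 7); // gcd(x, 0) == x
  return 0;
}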
606
607/// This function takes a 64-bit integer and returns the bit equivalent double.
608inline double BitsToDouble(uint64_t Bits) {
609 double D;
610 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
611 memcpy(&D, &Bits, sizeof(Bits));
612 return D;
613}
614
615/// This function takes a 32-bit integer and returns the bit equivalent float.
616inline float BitsToFloat(uint32_t Bits) {
617 float F;
618 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
619 memcpy(&F, &Bits, sizeof(Bits));
620 return F;
621}
622
623/// This function takes a double and returns the bit equivalent 64-bit integer.
624/// Note that copying doubles around changes the bits of NaNs on some hosts,
625/// notably x86, so this routine cannot be used if these bits are needed.
626inline uint64_t DoubleToBits(double Double) {
627 uint64_t Bits;
628 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
629 memcpy(&Bits, &Double, sizeof(Double));
630 return Bits;
631}
632
633/// This function takes a float and returns the bit equivalent 32-bit integer.
634/// Note that copying floats around changes the bits of NaNs on some hosts,
635/// notably x86, so this routine cannot be used if these bits are needed.
636inline uint32_t FloatToBits(float Float) {
637 uint32_t Bits;
638 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
639 memcpy(&Bits, &Float, sizeof(Float));
640 return Bits;
641}
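The memcpy pattern above is the well-defined way to reinterpret bits between integer and floating-point types. A minimal standalone sketch, with illustrative names and IEEE-754 single precision assumed:

#include <cassert>
#include <cstdint>
#include <cstring>

static float bitsToFloatSketch(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(Bits)); // byte-wise copy, no type-punning UB
  return F;
}

static uint32_t floatToBitsSketch(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(F));
  return Bits;
}

int main() {
  assert(floatToBitsSketch(1.0f) == 0x3F800000u);             // IEEE-754 encoding of 1.0
  assert(bitsToFloatSketch(floatToBitsSketch(2.5f)) == 2.5f); // lossless round trip
  return 0;
}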
642
643/// A and B are either alignments or offsets. Return the minimum alignment that
644/// may be assumed after adding the two together.
645constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
646 // The largest power of 2 that divides both A and B.
647 //
648 // Replace "-Value" by "1+~Value" in the following commented code to avoid
649 // MSVC warning C4146
650 // return (A | B) & -(A | B);
651 return (A | B) & (1 + ~(A | B));
652}
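MinAlign is the classic lowest-set-bit isolation x & -x applied to (A | B), spelled (1 + ~x) to avoid MSVC warning C4146. A standalone check with an illustrative name:

#include <cstdint>

constexpr uint64_t minAlignSketch(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B)); // lowest set bit of (A | B)
}

static_assert(minAlignSketch(16, 24) == 8,  "24 is only 8-aligned");
static_assert(minAlignSketch(32, 32) == 32, "both inputs 32-aligned");
static_assert(minAlignSketch(8, 12) == 4,   "12 is only 4-aligned");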
653
654/// Returns the next power of two (in 64-bits) that is strictly greater than A.
655/// Returns zero on overflow.
656inline uint64_t NextPowerOf2(uint64_t A) {
657 A |= (A >> 1);
658 A |= (A >> 2);
659 A |= (A >> 4);
660 A |= (A >> 8);
661 A |= (A >> 16);
662 A |= (A >> 32);
663 return A + 1;
664}
665
666/// Returns the power of two which is less than or equal to the given value.
667/// Essentially, it is a floor operation across the domain of powers of two.
668inline uint64_t PowerOf2Floor(uint64_t A) {
669 if (!A) return 0;
670 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
671}
672
673/// Returns the power of two which is greater than or equal to the given value.
674/// Essentially, it is a ceil operation across the domain of powers of two.
675inline uint64_t PowerOf2Ceil(uint64_t A) {
676 if (!A)
677 return 0;
678 return NextPowerOf2(A - 1);
679}
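The OR-smearing in NextPowerOf2 fills every bit below the highest set bit, so adding one lands on the next power of two strictly above the input. A standalone check of that behavior, including the documented overflow-to-zero case (illustrative name):

#include <cassert>
#include <cstdint>

static uint64_t nextPow2Sketch(uint64_t A) {
  A |= (A >> 1);  A |= (A >> 2);  A |= (A >> 4);   // smear the top set bit downward
  A |= (A >> 8);  A |= (A >> 16); A |= (A >> 32);
  return A + 1;
}

int main() {
  assert(nextPow2Sketch(0) == 1);      // strictly greater than 0
  assert(nextPow2Sketch(5) == 8);
  assert(nextPow2Sketch(8) == 16);     // strictly greater, not >=
  assert(nextPow2Sketch(~0ULL) == 0);  // wraps to zero on overflow
  return 0;
}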
680
681/// Returns the next integer (mod 2**64) that is greater than or equal to
682/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
683///
684/// If non-zero \p Skew is specified, the return value will be a minimal
685/// integer that is greater than or equal to \p Value and equal to
686/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
687/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
688///
689/// Examples:
690/// \code
691/// alignTo(5, 8) = 8
692/// alignTo(17, 8) = 24
693/// alignTo(~0LL, 8) = 0
694/// alignTo(321, 255) = 510
695///
696/// alignTo(5, 8, 7) = 7
697/// alignTo(17, 8, 1) = 17
698/// alignTo(~0LL, 8, 3) = 3
699/// alignTo(321, 255, 42) = 552
700/// \endcode
701inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
702 assert(Align != 0u && "Align can't be 0.");
703 Skew %= Align;
704 return (Value + Align - 1 - Skew) / Align * Align + Skew;
705}
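The documented examples can be reproduced with a standalone restatement of the round-up-then-add-skew arithmetic above (illustrative name, no assertion on Align):

#include <cassert>
#include <cstdint>

static uint64_t alignToSketch(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  Skew %= Align;                                      // fold Skew into [0, Align)
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}

int main() {
  assert(alignToSketch(5, 8) == 8 && alignToSketch(17, 8) == 24);
  assert(alignToSketch(321, 255) == 510);
  assert(alignToSketch(5, 8, 7) == 7 && alignToSketch(17, 8, 1) == 17);
  assert(alignToSketch(321, 255, 42) == 552);
  return 0;
}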
706
707/// Returns the next integer (mod 2**64) that is greater than or equal to
708/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
709template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
710 static_assert(Align != 0u, "Align must be non-zero");
711 return (Value + Align - 1) / Align * Align;
712}
713
714/// Returns the integer ceil(Numerator / Denominator).
715inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
716 return alignTo(Numerator, Denominator) / Denominator;
717}
718
719/// Returns the integer nearest(Numerator / Denominator).
720inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
721 return (Numerator + (Denominator / 2)) / Denominator;
722}
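divideCeil rounds the quotient up and divideNearest rounds it to the closest integer (halves away from zero). A quick standalone check, using the direct (N + D - 1) / D form of the ceiling in place of the alignTo-based one above:

#include <cassert>
#include <cstdint>

static uint64_t divCeilSketch(uint64_t N, uint64_t D)    { return (N + D - 1) / D; } // assumes N + D does not wrap
static uint64_t divNearestSketch(uint64_t N, uint64_t D) { return (N + D / 2) / D; }

int main() {
  assert(divCeilSketch(10, 4) == 3);     // 2.5 rounds up
  assert(divNearestSketch(10, 4) == 3);  // 2.5 rounds up (half away from zero)
  assert(divNearestSketch(9, 4) == 2);   // 2.25 rounds down
  return 0;
}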
723
724/// Returns the largest uint64_t less than or equal to \p Value that is
725/// \p Skew mod \p Align. \p Align must be non-zero.
726inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
727 assert(Align != 0u && "Align can't be 0.");
728 Skew %= Align;
729 return (Value - Skew) / Align * Align + Skew;
730}
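alignDown is the rounding-down counterpart of alignTo; a standalone check with an illustrative name:

#include <cassert>
#include <cstdint>

static uint64_t alignDownSketch(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  Skew %= Align;
  return (Value - Skew) / Align * Align + Skew; // largest Align * N + Skew <= Value
}

int main() {
  assert(alignDownSketch(17, 8) == 16);
  assert(alignDownSketch(16, 8) == 16);    // exact multiples map to themselves
  assert(alignDownSketch(17, 8, 3) == 11); // 8 * 1 + 3
  return 0;
}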
731
732/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
733/// Requires 0 < B <= 32.
734template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
735 static_assert(B > 0, "Bit width can't be 0.");
736 static_assert(B <= 32, "Bit width out of range.");
737 return int32_t(X << (32 - B)) >> (32 - B);
738}
739
740/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
741/// Requires 0 < B <= 32.
742inline int32_t SignExtend32(uint32_t X, unsigned B) {
743 assert(B > 0 && "Bit width can't be 0.");
744 assert(B <= 32 && "Bit width out of range.");
745 return int32_t(X << (32 - B)) >> (32 - B);
746}
747
748/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
749/// Requires 0 < B <= 64.
750template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
751 static_assert(B > 0, "Bit width can't be 0.");
752 static_assert(B <= 64, "Bit width out of range.");
753 return int64_t(x << (64 - B)) >> (64 - B);
754}
755
756/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
757/// Requires 0 < B <= 64.
758inline int64_t SignExtend64(uint64_t X, unsigned B) {
759 assert(B > 0 && "Bit width can't be 0.");
760 assert(B <= 64 && "Bit width out of range.");
761 return int64_t(X << (64 - B)) >> (64 - B);
762}
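The shift-left-then-arithmetic-shift-right trick above replicates the field's top bit into the high bits. A standalone check on a 12-bit field, assuming two's complement and an arithmetic right shift of negative values, as the helpers above do:

#include <cassert>
#include <cstdint>

static int32_t signExtend32Sketch(uint32_t X, unsigned B) {
  return int32_t(X << (32 - B)) >> (32 - B); // top bit of the B-bit field becomes the sign
}

int main() {
  assert(signExtend32Sketch(0xFFF, 12) == -1);    // all-ones 12-bit field
  assert(signExtend32Sketch(0x800, 12) == -2048); // most negative 12-bit value
  assert(signExtend32Sketch(0x7FF, 12) == 2047);  // positive values pass through
  return 0;
}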
763
764/// Subtract two unsigned integers, X and Y, of type T and return the absolute
765/// value of the result.
766template <typename T>
767std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
768 return std::max(X, Y) - std::min(X, Y);
769}
770
771/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
772/// maximum representable value of T on overflow. ResultOverflowed indicates if
773/// the result is larger than the maximum representable value of type T.
774template <typename T>
775std::enable_if_t<std::is_unsigned<T>::value, T>
776SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
777 bool Dummy;
778 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
779 // Hacker's Delight, p. 29
780 T Z = X + Y;
781 Overflowed = (Z < X || Z < Y);
782 if (Overflowed)
783 return std::numeric_limits<T>::max();
784 else
785 return Z;
786}
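A behavior sketch of the saturating add above on uint8_t, where the clamp is easy to hit; the reference helper mirrors the unsigned wrap-detection test rather than calling the header:

#include <cassert>
#include <cstdint>
#include <limits>

template <typename T> static T satAddSketch(T X, T Y, bool *Ov = nullptr) {
  T Z = X + Y;                        // result wraps modulo 2^N on overflow
  bool Overflowed = (Z < X || Z < Y); // a wrapped sum is smaller than an operand
  if (Ov)
    *Ov = Overflowed;
  return Overflowed ? std::numeric_limits<T>::max() : Z;
}

int main() {
  bool Ov = false;
  assert(satAddSketch<uint8_t>(200, 100, &Ov) == 255 && Ov);  // clamped to UINT8_MAX
  assert(satAddSketch<uint8_t>(100, 100, &Ov) == 200 && !Ov);
  return 0;
}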
787
788/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
789/// maximum representable value of T on overflow. ResultOverflowed indicates if
790/// the result is larger than the maximum representable value of type T.
791template <typename T>
792std::enable_if_t<std::is_unsigned<T>::value, T>
793SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
794 bool Dummy;
795 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
796
797 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
798 // because it fails for uint16_t (where multiplication can have undefined
799 // behavior due to promotion to int), and requires a division in addition
800 // to the multiplication.
801
802 Overflowed = false;
803
804 // Log2(Z) would be either Log2Z or Log2Z + 1.
805 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
806 // will necessarily be less than Log2Max as desired.
807 int Log2Z = Log2_64(X) + Log2_64(Y);
808 const T Max = std::numeric_limits<T>::max();
809 int Log2Max = Log2_64(Max);
810 if (Log2Z < Log2Max) {
811 return X * Y;
812 }
813 if (Log2Z > Log2Max) {
814 Overflowed = true;
815 return Max;
816 }
817
818 // We're going to use the top bit, and maybe overflow one
819 // bit past it. Multiply all but the bottom bit then add
820 // that on at the end.
821 T Z = (X >> 1) * Y;
822 if (Z & ~(Max >> 1)) {
823 Overflowed = true;
824 return Max;
825 }
826 Z <<= 1;
827 if (X & 1)
828 return SaturatingAdd(Z, Y, ResultOverflowed);
829
830 return Z;
831}
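The semantics can be checked against a simple division-based reference (which, as the comment above notes, the header deliberately avoids in favor of the Log2-based pre-check); names are illustrative:

#include <cassert>
#include <cstdint>
#include <limits>

template <typename T> static T satMulSketch(T X, T Y, bool *Ov = nullptr) {
  const T Max = std::numeric_limits<T>::max();
  bool Overflowed = X != 0 && Y > Max / X; // product would exceed Max
  if (Ov)
    *Ov = Overflowed;
  return Overflowed ? Max : T(X * Y);
}

int main() {
  bool Ov = false;
  assert(satMulSketch<uint16_t>(300, 300, &Ov) == 65535 && Ov); // 90000 clamps
  assert(satMulSketch<uint16_t>(255, 255, &Ov) == 65025 && !Ov);
  return 0;
}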
832
833/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
834/// the product. Clamp the result to the maximum representable value of T on
835/// overflow. ResultOverflowed indicates if the result is larger than the
836/// maximum representable value of type T.
837template <typename T>
838std::enable_if_t<std::is_unsigned<T>::value, T>
839SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
840 bool Dummy;
841 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
842
843 T Product = SaturatingMultiply(X, Y, &Overflowed);
844 if (Overflowed)
845 return Product;
846
847 return SaturatingAdd(A, Product, &Overflowed);
848}
849
850/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
851extern const float huge_valf;
852
853
854/// Add two signed integers, computing the two's complement truncated result,
855/// returning true if overflow occurred.
856template <typename T>
857std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
858#if __has_builtin(__builtin_add_overflow)
859 return __builtin_add_overflow(X, Y, &Result);
860#else
861 // Perform the unsigned addition.
862 using U = std::make_unsigned_t<T>;
863 const U UX = static_cast<U>(X);
864 const U UY = static_cast<U>(Y);
865 const U UResult = UX + UY;
866
867 // Convert to signed.
868 Result = static_cast<T>(UResult);
869
870 // Adding two positive numbers should result in a positive number.
871 if (X > 0 && Y > 0)
872 return Result <= 0;
873 // Adding two negatives should result in a negative number.
874 if (X < 0 && Y < 0)
875 return Result >= 0;
876 return false;
877#endif
878}
879
880/// Subtract two signed integers, computing the two's complement truncated
881/// result, returning true if an overflow occurred.
882template <typename T>
883std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
884#if __has_builtin(__builtin_sub_overflow)
885 return __builtin_sub_overflow(X, Y, &Result);
886#else
887 // Perform the unsigned addition.
888 using U = std::make_unsigned_t<T>;
889 const U UX = static_cast<U>(X);
890 const U UY = static_cast<U>(Y);
891 const U UResult = UX - UY;
892
893 // Convert to signed.
894 Result = static_cast<T>(UResult);
895
896 // Subtracting a positive number from a negative results in a negative number.
897 if (X <= 0 && Y > 0)
898 return Result >= 0;
899 // Subtracting a negative number from a positive results in a positive number.
900 if (X >= 0 && Y < 0)
901 return Result <= 0;
902 return false;
903#endif
904}
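On GCC and Clang the builtin path above is taken; the builtins write the truncated two's complement result and return true on overflow, for example:

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  int32_t R = 0;
  // INT32_MAX + 1 overflows and the truncated result wraps to INT32_MIN.
  assert(__builtin_add_overflow(std::numeric_limits<int32_t>::max(), int32_t(1), &R));
  assert(R == std::numeric_limits<int32_t>::min());
  // INT32_MIN - 1 overflows and wraps to INT32_MAX.
  assert(__builtin_sub_overflow(std::numeric_limits<int32_t>::min(), int32_t(1), &R));
  assert(R == std::numeric_limits<int32_t>::max());
  // No overflow: the plain result is stored and false is returned.
  assert(!__builtin_add_overflow(int32_t(2), int32_t(3), &R) && R == 5);
  return 0;
}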
905
906/// Multiply two signed integers, computing the two's complement truncated
907/// result, returning true if an overflow occurred.
908template <typename T>
909std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
910 // Perform the unsigned multiplication on absolute values.
911 using U = std::make_unsigned_t<T>;
912 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
913 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
914 const U UResult = UX * UY;
915
916 // Convert to signed.
917 const bool IsNegative = (X < 0) ^ (Y < 0);
918 Result = IsNegative ? (0 - UResult) : UResult;
919
920 // If any of the args was 0, result is 0 and no overflow occurs.
921 if (UX == 0 || UY == 0)
922 return false;
923
924 // UX and UY are in [1, 2^n], where n is the number of digits.
925 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
926 // positive) divided by an argument compares to the other.
927 if (IsNegative)
928 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
929 else
930 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
931}
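A 32-bit restatement of the check above, spelling out the bound: a negative product may reach magnitude 2^31, while a positive one is limited to 2^31 - 1. Two's complement conversion of the out-of-range magnitude is assumed, as in the function above; names are illustrative.

#include <cassert>
#include <cstdint>

static bool mulOverflow32Sketch(int32_t X, int32_t Y, int32_t &Result) {
  uint32_t UX = X < 0 ? 0u - uint32_t(X) : uint32_t(X); // |X|
  uint32_t UY = Y < 0 ? 0u - uint32_t(Y) : uint32_t(Y); // |Y|
  uint32_t UR = UX * UY;                                // truncated magnitude
  bool Neg = (X < 0) ^ (Y < 0);
  Result = int32_t(Neg ? 0u - UR : UR);                 // two's complement assumed
  if (UX == 0 || UY == 0)
    return false;
  uint32_t MaxMag = Neg ? 0x80000000u : 0x7FFFFFFFu;    // 2^31 vs 2^31 - 1
  return UX > MaxMag / UY;
}

int main() {
  int32_t R = 0;
  assert(!mulOverflow32Sketch(-65536, 32768, R) && R == INT32_MIN); // -2^31 just fits
  assert(mulOverflow32Sketch(65536, 32768, R));                     // +2^31 does not
  assert(!mulOverflow32Sketch(1000, -1000, R) && R == -1000000);
  return 0;
}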
932
933} // End llvm namespace
934
935#endif