Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4370, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
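
Note: source line 4370 lies outside the excerpt reproduced below, so the flagged expression itself is not shown here. The defect class is straightforward, though: in C++, shifting a value of type 'unsigned int' by an amount greater than or equal to its width (32 bits on this target) is undefined behavior, so any dynamic shift count that can reach 32 must be guarded. The following sketch is hypothetical and is not the AMDGPU lowering code; it only illustrates the pattern the analyzer diagnoses and one common way to guard the shift count (assuming Bits <= 32).

#include <cstdint>

// Hypothetical example of the diagnosed pattern: if Bits can be 0, the
// expression shifts a 32-bit unsigned value by 32, which is undefined.
uint32_t highBitsUnsafe(uint32_t Value, unsigned Bits) {
  return Value >> (32u - Bits); // UB when Bits == 0 (shift count == 32)
}

// A common fix: handle the out-of-range count explicitly so the shift
// amount always stays within [0, 31].
uint32_t highBitsGuarded(uint32_t Value, unsigned Bits) {
  if (Bits == 0)
    return 0; // define the "shift everything out" case explicitly
  return Value >> (32u - Bits);
}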

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/tmp/scan-build-2022-01-27-232948-117823-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/IR/DiagnosticInfo.h"
23#include "llvm/IR/IntrinsicsAMDGPU.h"
24#include "llvm/Support/CommandLine.h"
25#include "llvm/Support/KnownBits.h"
26#include "llvm/Target/TargetMachine.h"
27
28using namespace llvm;
29
30#include "AMDGPUGenCallingConv.inc"
31
32static cl::opt<bool> AMDGPUBypassSlowDiv(
33 "amdgpu-bypass-slow-div",
34 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
35 cl::init(true));
36
37// Find a larger type to do a load / store of a vector with.
38EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
39 unsigned StoreSize = VT.getStoreSizeInBits();
40 if (StoreSize <= 32)
41 return EVT::getIntegerVT(Ctx, StoreSize);
42
43 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
44 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
45}
46
47unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
48 return DAG.computeKnownBits(Op).countMaxActiveBits();
49}
50
51unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
52 // In order for this to be a signed 24-bit value, bit 23, must
53 // be a sign bit.
54 return DAG.ComputeMaxSignificantBits(Op);
55}
56
57AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
58 const AMDGPUSubtarget &STI)
59 : TargetLowering(TM), Subtarget(&STI) {
60 // Lower floating point store/load to integer store/load to reduce the number
61 // of patterns in tablegen.
62 setOperationAction(ISD::LOAD, MVT::f32, Promote);
63 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
64
65 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
66 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
67
68 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
69 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
70
71 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
72 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
73
74 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
75 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
76
77 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
78 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
79
80 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
81 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
82
83 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
84 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
85
86 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
87 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
88
89 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
90 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
91
92 setOperationAction(ISD::LOAD, MVT::i64, Promote);
93 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
94
95 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
96 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
97
98 setOperationAction(ISD::LOAD, MVT::f64, Promote);
99 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
100
101 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
102 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
103
104 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
106
107 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
109
110 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
112
113 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
115
116 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
118
119 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
120 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
121
122 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
123 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
124
125 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
126 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
127
128 // There are no 64-bit extloads. These should be done as a 32-bit extload and
129 // an extension to 64-bit.
130 for (MVT VT : MVT::integer_valuetypes()) {
131 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
132 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
133 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
134 }
135
136 for (MVT VT : MVT::integer_valuetypes()) {
137 if (VT == MVT::i64)
138 continue;
139
140 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
141 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
143 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
144
145 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
146 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
147 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
148 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
149
150 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
151 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
152 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
153 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
154 }
155
156 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
158 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
159 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
160 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
161 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
162 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
163 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
164 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
165 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
166 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
167 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
168 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
169 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
170 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
171 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
172 }
173
174 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
175 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
177 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
178 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
181
182 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
183 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
184 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
185 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
188
189 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
190 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
191 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
192 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
193 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
195
196 setOperationAction(ISD::STORE, MVT::f32, Promote);
197 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
198
199 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
200 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
201
202 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
203 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
204
205 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
206 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
207
208 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
209 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
210
211 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
212 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
213
214 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
215 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
216
217 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
218 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
219
220 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
221 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
222
223 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
224 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
225
226 setOperationAction(ISD::STORE, MVT::i64, Promote);
227 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
228
229 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
230 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
231
232 setOperationAction(ISD::STORE, MVT::f64, Promote);
233 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
234
235 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
236 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
237
238 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
239 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
240
241 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
242 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
243
244 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
245 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
246
247 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
248 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
249
250 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
251 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
252
253 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
254 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
255
256 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
257 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
258
259 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
260 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
261
262 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
263 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
264 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
265 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
266
267 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
268 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
269 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
270 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
271
272 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
273 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
274 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
275 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
276 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
277 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
278 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
279
280 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
281 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
282
283 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
284 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
285
286 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
287 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
288 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
289 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
290
291 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
292 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
293 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
294 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
295
296 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
297 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
298
299 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
300 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
301 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
302 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
303 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
304 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
305 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
306
307 setOperationAction(ISD::Constant, MVT::i32, Legal);
308 setOperationAction(ISD::Constant, MVT::i64, Legal);
309 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
310 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
311
312 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
313 setOperationAction(ISD::BRIND, MVT::Other, Expand);
314
315 // This is totally unsupported, just custom lower to produce an error.
316 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
317
318 // Library functions. These default to Expand, but we have instructions
319 // for them.
320 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
321 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
322 setOperationAction(ISD::FPOW, MVT::f32, Legal);
323 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
324 setOperationAction(ISD::FABS, MVT::f32, Legal);
325 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
326 setOperationAction(ISD::FRINT, MVT::f32, Legal);
327 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
328 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
329 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
330
331 setOperationAction(ISD::FROUND, MVT::f32, Custom);
332 setOperationAction(ISD::FROUND, MVT::f64, Custom);
333
334 setOperationAction(ISD::FLOG, MVT::f32, Custom);
335 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
336 setOperationAction(ISD::FEXP, MVT::f32, Custom);
337
338
339 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
340 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
341
342 setOperationAction(ISD::FREM, MVT::f16, Custom);
343 setOperationAction(ISD::FREM, MVT::f32, Custom);
344 setOperationAction(ISD::FREM, MVT::f64, Custom);
345
346 // Expand to fneg + fadd.
347 setOperationAction(ISD::FSUB, MVT::f64, Expand);
348
349 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
350 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
351 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
352 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
353 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
354 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
355 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
356 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
357 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
358 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
359 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
360 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
361 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
362 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
363 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
364 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
365 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
366 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
367 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
368 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
369 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
370 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
371 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
372 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
373 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
374 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
375 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
376 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
377 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
378 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
379 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
380 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
381 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
382 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
383 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
384 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
385 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
386 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
387 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
388 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
389 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
390 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
391 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
392 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
393
394 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
395 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
396 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
397
398 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
399 for (MVT VT : ScalarIntVTs) {
400 // These should use [SU]DIVREM, so set them to expand
401 setOperationAction(ISD::SDIV, VT, Expand);
402 setOperationAction(ISD::UDIV, VT, Expand);
403 setOperationAction(ISD::SREM, VT, Expand);
404 setOperationAction(ISD::UREM, VT, Expand);
405
406 // GPU does not have divrem function for signed or unsigned.
407 setOperationAction(ISD::SDIVREM, VT, Custom);
408 setOperationAction(ISD::UDIVREM, VT, Custom);
409
410 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
411 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
412 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
413
414 setOperationAction(ISD::BSWAP, VT, Expand);
415 setOperationAction(ISD::CTTZ, VT, Expand);
416 setOperationAction(ISD::CTLZ, VT, Expand);
417
418 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
419 setOperationAction(ISD::ADDC, VT, Legal);
420 setOperationAction(ISD::SUBC, VT, Legal);
421 setOperationAction(ISD::ADDE, VT, Legal);
422 setOperationAction(ISD::SUBE, VT, Legal);
423 }
424
425 // The hardware supports 32-bit FSHR, but not FSHL.
426 setOperationAction(ISD::FSHR, MVT::i32, Legal);
427
428 // The hardware supports 32-bit ROTR, but not ROTL.
429 setOperationAction(ISD::ROTL, MVT::i32, Expand);
430 setOperationAction(ISD::ROTL, MVT::i64, Expand);
431 setOperationAction(ISD::ROTR, MVT::i64, Expand);
432
433 setOperationAction(ISD::MULHU, MVT::i16, Expand);
434 setOperationAction(ISD::MULHS, MVT::i16, Expand);
435
436 setOperationAction(ISD::MUL, MVT::i64, Expand);
437 setOperationAction(ISD::MULHU, MVT::i64, Expand);
438 setOperationAction(ISD::MULHS, MVT::i64, Expand);
439 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
440 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
441 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
442 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
443 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
444
445 setOperationAction(ISD::SMIN, MVT::i32, Legal);
446 setOperationAction(ISD::UMIN, MVT::i32, Legal);
447 setOperationAction(ISD::SMAX, MVT::i32, Legal);
448 setOperationAction(ISD::UMAX, MVT::i32, Legal);
449
450 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
451 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
452 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
453 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
454
455 static const MVT::SimpleValueType VectorIntTypes[] = {
456 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
457
458 for (MVT VT : VectorIntTypes) {
459 // Expand the following operations for the current type by default.
460 setOperationAction(ISD::ADD, VT, Expand);
461 setOperationAction(ISD::AND, VT, Expand);
462 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
463 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
464 setOperationAction(ISD::MUL, VT, Expand);
465 setOperationAction(ISD::MULHU, VT, Expand);
466 setOperationAction(ISD::MULHS, VT, Expand);
467 setOperationAction(ISD::OR, VT, Expand);
468 setOperationAction(ISD::SHL, VT, Expand);
469 setOperationAction(ISD::SRA, VT, Expand);
470 setOperationAction(ISD::SRL, VT, Expand);
471 setOperationAction(ISD::ROTL, VT, Expand);
472 setOperationAction(ISD::ROTR, VT, Expand);
473 setOperationAction(ISD::SUB, VT, Expand);
474 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
475 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
476 setOperationAction(ISD::SDIV, VT, Expand);
477 setOperationAction(ISD::UDIV, VT, Expand);
478 setOperationAction(ISD::SREM, VT, Expand);
479 setOperationAction(ISD::UREM, VT, Expand);
480 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
481 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
482 setOperationAction(ISD::SDIVREM, VT, Expand);
483 setOperationAction(ISD::UDIVREM, VT, Expand);
484 setOperationAction(ISD::SELECT, VT, Expand);
485 setOperationAction(ISD::VSELECT, VT, Expand);
486 setOperationAction(ISD::SELECT_CC, VT, Expand);
487 setOperationAction(ISD::XOR, VT, Expand);
488 setOperationAction(ISD::BSWAP, VT, Expand);
489 setOperationAction(ISD::CTPOP, VT, Expand);
490 setOperationAction(ISD::CTTZ, VT, Expand);
491 setOperationAction(ISD::CTLZ, VT, Expand);
492 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
493 setOperationAction(ISD::SETCC, VT, Expand);
494 }
495
496 static const MVT::SimpleValueType FloatVectorTypes[] = {
497 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
498
499 for (MVT VT : FloatVectorTypes) {
500 setOperationAction(ISD::FABS, VT, Expand);
501 setOperationAction(ISD::FMINNUM, VT, Expand);
502 setOperationAction(ISD::FMAXNUM, VT, Expand);
503 setOperationAction(ISD::FADD, VT, Expand);
504 setOperationAction(ISD::FCEIL, VT, Expand);
505 setOperationAction(ISD::FCOS, VT, Expand);
506 setOperationAction(ISD::FDIV, VT, Expand);
507 setOperationAction(ISD::FEXP2, VT, Expand);
508 setOperationAction(ISD::FEXP, VT, Expand);
509 setOperationAction(ISD::FLOG2, VT, Expand);
510 setOperationAction(ISD::FREM, VT, Expand);
511 setOperationAction(ISD::FLOG, VT, Expand);
512 setOperationAction(ISD::FLOG10, VT, Expand);
513 setOperationAction(ISD::FPOW, VT, Expand);
514 setOperationAction(ISD::FFLOOR, VT, Expand);
515 setOperationAction(ISD::FTRUNC, VT, Expand);
516 setOperationAction(ISD::FMUL, VT, Expand);
517 setOperationAction(ISD::FMA, VT, Expand);
518 setOperationAction(ISD::FRINT, VT, Expand);
519 setOperationAction(ISD::FNEARBYINT, VT, Expand);
520 setOperationAction(ISD::FSQRT, VT, Expand);
521 setOperationAction(ISD::FSIN, VT, Expand);
522 setOperationAction(ISD::FSUB, VT, Expand);
523 setOperationAction(ISD::FNEG, VT, Expand);
524 setOperationAction(ISD::VSELECT, VT, Expand);
525 setOperationAction(ISD::SELECT_CC, VT, Expand);
526 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
527 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
528 setOperationAction(ISD::SETCC, VT, Expand);
529 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
530 }
531
532 // This causes using an unrolled select operation rather than expansion with
533 // bit operations. This is in general better, but the alternative using BFI
534 // instructions may be better if the select sources are SGPRs.
535 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
536 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
537
538 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
539 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
540
541 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
542 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
543
544 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
545 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
546
547 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
548 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
549
550 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
551 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
552
553 // There are no libcalls of any kind.
554 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
555 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
556
557 setSchedulingPreference(Sched::RegPressure);
558 setJumpIsExpensive(true);
559
560 // FIXME: This is only partially true. If we have to do vector compares, any
561 // SGPR pair can be a condition register. If we have a uniform condition, we
562 // are better off doing SALU operations, where there is only one SCC. For now,
563 // we don't have a way of knowing during instruction selection if a condition
564 // will be uniform and we always use vector compares. Assume we are using
565 // vector compares until that is fixed.
566 setHasMultipleConditionRegisters(true);
567
568 setMinCmpXchgSizeInBits(32);
569 setSupportsUnalignedAtomics(false);
570
571 PredictableSelectIsExpensive = false;
572
573 // We want to find all load dependencies for long chains of stores to enable
574 // merging into very wide vectors. The problem is with vectors with > 4
575 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
576 // vectors are a legal type, even though we have to split the loads
577 // usually. When we can more precisely specify load legality per address
578 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
579 // smarter so that they can figure out what to do in 2 iterations without all
580 // N > 4 stores on the same chain.
581 GatherAllAliasesMaxDepth = 16;
582
583 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
584 // about these during lowering.
585 MaxStoresPerMemcpy = 0xffffffff;
586 MaxStoresPerMemmove = 0xffffffff;
587 MaxStoresPerMemset = 0xffffffff;
588
589 // The expansion for 64-bit division is enormous.
590 if (AMDGPUBypassSlowDiv)
591 addBypassSlowDiv(64, 32);
592
593 setTargetDAGCombine(ISD::BITCAST);
594 setTargetDAGCombine(ISD::SHL);
595 setTargetDAGCombine(ISD::SRA);
596 setTargetDAGCombine(ISD::SRL);
597 setTargetDAGCombine(ISD::TRUNCATE);
598 setTargetDAGCombine(ISD::MUL);
599 setTargetDAGCombine(ISD::SMUL_LOHI);
600 setTargetDAGCombine(ISD::UMUL_LOHI);
601 setTargetDAGCombine(ISD::MULHU);
602 setTargetDAGCombine(ISD::MULHS);
603 setTargetDAGCombine(ISD::SELECT);
604 setTargetDAGCombine(ISD::SELECT_CC);
605 setTargetDAGCombine(ISD::STORE);
606 setTargetDAGCombine(ISD::FADD);
607 setTargetDAGCombine(ISD::FSUB);
608 setTargetDAGCombine(ISD::FNEG);
609 setTargetDAGCombine(ISD::FABS);
610 setTargetDAGCombine(ISD::AssertZext);
611 setTargetDAGCombine(ISD::AssertSext);
612 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
613}
614
615bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
616 if (getTargetMachine().Options.NoSignedZerosFPMath)
617 return true;
618
619 const auto Flags = Op.getNode()->getFlags();
620 if (Flags.hasNoSignedZeros())
621 return true;
622
623 return false;
624}
625
626//===----------------------------------------------------------------------===//
627// Target Information
628//===----------------------------------------------------------------------===//
629
630LLVM_READNONE
631static bool fnegFoldsIntoOp(unsigned Opc) {
632 switch (Opc) {
633 case ISD::FADD:
634 case ISD::FSUB:
635 case ISD::FMUL:
636 case ISD::FMA:
637 case ISD::FMAD:
638 case ISD::FMINNUM:
639 case ISD::FMAXNUM:
640 case ISD::FMINNUM_IEEE:
641 case ISD::FMAXNUM_IEEE:
642 case ISD::FSIN:
643 case ISD::FTRUNC:
644 case ISD::FRINT:
645 case ISD::FNEARBYINT:
646 case ISD::FCANONICALIZE:
647 case AMDGPUISD::RCP:
648 case AMDGPUISD::RCP_LEGACY:
649 case AMDGPUISD::RCP_IFLAG:
650 case AMDGPUISD::SIN_HW:
651 case AMDGPUISD::FMUL_LEGACY:
652 case AMDGPUISD::FMIN_LEGACY:
653 case AMDGPUISD::FMAX_LEGACY:
654 case AMDGPUISD::FMED3:
655 // TODO: handle llvm.amdgcn.fma.legacy
656 return true;
657 default:
658 return false;
659 }
660}
661
662/// \p returns true if the operation will definitely need to use a 64-bit
663/// encoding, and thus will use a VOP3 encoding regardless of the source
664/// modifiers.
665LLVM_READONLY
666static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
667 return N->getNumOperands() > 2 || VT == MVT::f64;
668}
669
670// Most FP instructions support source modifiers, but this could be refined
671// slightly.
672LLVM_READONLY
673static bool hasSourceMods(const SDNode *N) {
674 if (isa<MemSDNode>(N))
675 return false;
676
677 switch (N->getOpcode()) {
678 case ISD::CopyToReg:
679 case ISD::SELECT:
680 case ISD::FDIV:
681 case ISD::FREM:
682 case ISD::INLINEASM:
683 case ISD::INLINEASM_BR:
684 case AMDGPUISD::DIV_SCALE:
685 case ISD::INTRINSIC_W_CHAIN:
686
687 // TODO: Should really be looking at the users of the bitcast. These are
688 // problematic because bitcasts are used to legalize all stores to integer
689 // types.
690 case ISD::BITCAST:
691 return false;
692 case ISD::INTRINSIC_WO_CHAIN: {
693 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
694 case Intrinsic::amdgcn_interp_p1:
695 case Intrinsic::amdgcn_interp_p2:
696 case Intrinsic::amdgcn_interp_mov:
697 case Intrinsic::amdgcn_interp_p1_f16:
698 case Intrinsic::amdgcn_interp_p2_f16:
699 return false;
700 default:
701 return true;
702 }
703 }
704 default:
705 return true;
706 }
707}
708
709bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
710 unsigned CostThreshold) {
711 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
712 // it is truly free to use a source modifier in all cases. If there are
713 // multiple users but for each one will necessitate using VOP3, there will be
714 // a code size increase. Try to avoid increasing code size unless we know it
715 // will save on the instruction count.
716 unsigned NumMayIncreaseSize = 0;
717 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
718
719 // XXX - Should this limit number of uses to check?
720 for (const SDNode *U : N->uses()) {
721 if (!hasSourceMods(U))
722 return false;
723
724 if (!opMustUseVOP3Encoding(U, VT)) {
725 if (++NumMayIncreaseSize > CostThreshold)
726 return false;
727 }
728 }
729
730 return true;
731}
732
733EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
734 ISD::NodeType ExtendKind) const {
735 assert(!VT.isVector() && "only scalar expected");
736
737 // Round to the next multiple of 32-bits.
738 unsigned Size = VT.getSizeInBits();
739 if (Size <= 32)
740 return MVT::i32;
741 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
742}
743
744MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
745 return MVT::i32;
746}
747
748bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
749 return true;
750}
751
752// The backend supports 32 and 64 bit floating point immediates.
753// FIXME: Why are we reporting vectors of FP immediates as legal?
754bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
755 bool ForCodeSize) const {
756 EVT ScalarVT = VT.getScalarType();
757 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
758 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
759}
760
761// We don't want to shrink f64 / f32 constants.
762bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
763 EVT ScalarVT = VT.getScalarType();
764 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
765}
766
767bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
768 ISD::LoadExtType ExtTy,
769 EVT NewVT) const {
770 // TODO: This may be worth removing. Check regression tests for diffs.
771 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
772 return false;
773
774 unsigned NewSize = NewVT.getStoreSizeInBits();
775
776 // If we are reducing to a 32-bit load or a smaller multi-dword load,
777 // this is always better.
778 if (NewSize >= 32)
779 return true;
780
781 EVT OldVT = N->getValueType(0);
782 unsigned OldSize = OldVT.getStoreSizeInBits();
783
784 MemSDNode *MN = cast<MemSDNode>(N);
785 unsigned AS = MN->getAddressSpace();
786 // Do not shrink an aligned scalar load to sub-dword.
787 // Scalar engine cannot do sub-dword loads.
788 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
789 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
790 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
791 (isa<LoadSDNode>(N) &&
792 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
793 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
794 return false;
795
796 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
797 // extloads, so doing one requires using a buffer_load. In cases where we
798 // still couldn't use a scalar load, using the wider load shouldn't really
799 // hurt anything.
800
801 // If the old size already had to be an extload, there's no harm in continuing
802 // to reduce the width.
803 return (OldSize < 32);
804}
805
806bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
807 const SelectionDAG &DAG,
808 const MachineMemOperand &MMO) const {
809
810 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
811
812 if (LoadTy.getScalarType() == MVT::i32)
813 return false;
814
815 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
816 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
817
818 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
819 return false;
820
821 bool Fast = false;
822 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
823 CastTy, MMO, &Fast) &&
824 Fast;
825}
826
827// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
828// profitable with the expansion for 64-bit since it's generally good to
829// speculate things.
830// FIXME: These should really have the size as a parameter.
831bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
832 return true;
833}
834
835bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
836 return true;
837}
838
839bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
840 switch (N->getOpcode()) {
841 case ISD::EntryToken:
842 case ISD::TokenFactor:
843 return true;
844 case ISD::INTRINSIC_WO_CHAIN: {
845 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
846 switch (IntrID) {
847 case Intrinsic::amdgcn_readfirstlane:
848 case Intrinsic::amdgcn_readlane:
849 return true;
850 }
851 return false;
852 }
853 case ISD::LOAD:
854 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
855 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
856 return true;
857 return false;
858 }
859 return false;
860}
861
862SDValue AMDGPUTargetLowering::getNegatedExpression(
863 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
864 NegatibleCost &Cost, unsigned Depth) const {
865
866 switch (Op.getOpcode()) {
867 case ISD::FMA:
868 case ISD::FMAD: {
869 // Negating a fma is not free if it has users without source mods.
870 if (!allUsesHaveSourceMods(Op.getNode()))
871 return SDValue();
872 break;
873 }
874 default:
875 break;
876 }
877
878 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
879 ForCodeSize, Cost, Depth);
880}
881
882//===---------------------------------------------------------------------===//
883// Target Properties
884//===---------------------------------------------------------------------===//
885
886bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
887 assert(VT.isFloatingPoint());
888
889 // Packed operations do not have a fabs modifier.
890 return VT == MVT::f32 || VT == MVT::f64 ||
891 (Subtarget->has16BitInsts() && VT == MVT::f16);
892}
893
894bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
895 assert(VT.isFloatingPoint());
896 // Report this based on the end legalized type.
897 VT = VT.getScalarType();
898 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
899}
900
901bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
902 unsigned NumElem,
903 unsigned AS) const {
904 return true;
905}
906
907bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
908 // There are few operations which truly have vector input operands. Any vector
909 // operation is going to involve operations on each component, and a
910 // build_vector will be a copy per element, so it always makes sense to use a
911 // build_vector input in place of the extracted element to avoid a copy into a
912 // super register.
913 //
914 // We should probably only do this if all users are extracts only, but this
915 // should be the common case.
916 return true;
917}
918
919bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
920 // Truncate is just accessing a subregister.
921
922 unsigned SrcSize = Source.getSizeInBits();
923 unsigned DestSize = Dest.getSizeInBits();
924
925 return DestSize < SrcSize && DestSize % 32 == 0;
926}
927
928bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
929 // Truncate is just accessing a subregister.
930
931 unsigned SrcSize = Source->getScalarSizeInBits();
932 unsigned DestSize = Dest->getScalarSizeInBits();
933
934 if (DestSize == 16 && Subtarget->has16BitInsts())
935 return SrcSize >= 32;
936
937 return DestSize < SrcSize && DestSize % 32 == 0;
938}
939
940bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
941 unsigned SrcSize = Src->getScalarSizeInBits();
942 unsigned DestSize = Dest->getScalarSizeInBits();
943
944 if (SrcSize == 16 && Subtarget->has16BitInsts())
945 return DestSize >= 32;
946
947 return SrcSize == 32 && DestSize == 64;
948}
949
950bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
951 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
952 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
953 // this will enable reducing 64-bit operations to 32-bit, which is always
954 // good.
955
956 if (Src == MVT::i16)
957 return Dest == MVT::i32 || Dest == MVT::i64;
958
959 return Src == MVT::i32 && Dest == MVT::i64;
960}
961
962bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
963 return isZExtFree(Val.getValueType(), VT2);
964}
965
966bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
967 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
968 // limited number of native 64-bit operations. Shrinking an operation to fit
969 // in a single 32-bit register should always be helpful. As currently used,
970 // this is much less general than the name suggests, and is only used in
971 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
972 // not profitable, and may actually be harmful.
973 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
974}
975
976//===---------------------------------------------------------------------===//
977// TargetLowering Callbacks
978//===---------------------------------------------------------------------===//
979
980CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
981 bool IsVarArg) {
982 switch (CC) {
983 case CallingConv::AMDGPU_VS:
984 case CallingConv::AMDGPU_GS:
985 case CallingConv::AMDGPU_PS:
986 case CallingConv::AMDGPU_CS:
987 case CallingConv::AMDGPU_HS:
988 case CallingConv::AMDGPU_ES:
989 case CallingConv::AMDGPU_LS:
990 return CC_AMDGPU;
991 case CallingConv::C:
992 case CallingConv::Fast:
993 case CallingConv::Cold:
994 return CC_AMDGPU_Func;
995 case CallingConv::AMDGPU_Gfx:
996 return CC_SI_Gfx;
997 case CallingConv::AMDGPU_KERNEL:
998 case CallingConv::SPIR_KERNEL:
999 default:
1000 report_fatal_error("Unsupported calling convention for call");
1001 }
1002}
1003
1004CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1005 bool IsVarArg) {
1006 switch (CC) {
1007 case CallingConv::AMDGPU_KERNEL:
1008 case CallingConv::SPIR_KERNEL:
1009 llvm_unreachable("kernels should not be handled here")::llvm::llvm_unreachable_internal("kernels should not be handled here"
, "llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp", 1009)
;
1010 case CallingConv::AMDGPU_VS:
1011 case CallingConv::AMDGPU_GS:
1012 case CallingConv::AMDGPU_PS:
1013 case CallingConv::AMDGPU_CS:
1014 case CallingConv::AMDGPU_HS:
1015 case CallingConv::AMDGPU_ES:
1016 case CallingConv::AMDGPU_LS:
1017 return RetCC_SI_Shader;
1018 case CallingConv::AMDGPU_Gfx:
1019 return RetCC_SI_Gfx;
1020 case CallingConv::C:
1021 case CallingConv::Fast:
1022 case CallingConv::Cold:
1023 return RetCC_AMDGPU_Func;
1024 default:
1025 report_fatal_error("Unsupported calling convention.");
1026 }
1027}
1028
1029/// The SelectionDAGBuilder will automatically promote function arguments
1030/// with illegal types. However, this does not work for the AMDGPU targets
1031/// since the function arguments are stored in memory as these illegal types.
1032/// In order to handle this properly we need to get the original types sizes
1033/// from the LLVM IR Function and fixup the ISD:InputArg values before
1034/// passing them to AnalyzeFormalArguments()
1035
1036/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1037/// input values across multiple registers. Each item in the Ins array
1038/// represents a single value that will be stored in registers. Ins[x].VT is
1039/// the value type of the value that will be stored in the register, so
1040/// whatever SDNode we lower the argument to needs to be this type.
1041///
1042/// In order to correctly lower the arguments we need to know the size of each
1043/// argument. Since Ins[x].VT gives us the size of the register that will
1044/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1045/// for the original function argument so that we can deduce the correct memory
1046/// type to use for Ins[x]. In most cases the correct memory type will be
1047/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1048/// we have a kernel argument of type v8i8, this argument will be split into
1049/// 8 parts and each part will be represented by its own item in the Ins array.
1050/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1051/// the argument before it was split. From this, we deduce that the memory type
1052/// for each individual part is i8. We pass the memory type as LocVT to the
1053/// calling convention analysis function and the register type (Ins[x].VT) as
1054/// the ValVT.
1055void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1056 CCState &State,
1057 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1058 const MachineFunction &MF = State.getMachineFunction();
1059 const Function &Fn = MF.getFunction();
1060 LLVMContext &Ctx = Fn.getParent()->getContext();
1061 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1062 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1063 CallingConv::ID CC = Fn.getCallingConv();
1064
1065 Align MaxAlign = Align(1);
1066 uint64_t ExplicitArgOffset = 0;
1067 const DataLayout &DL = Fn.getParent()->getDataLayout();
1068
1069 unsigned InIndex = 0;
1070
1071 for (const Argument &Arg : Fn.args()) {
1072 const bool IsByRef = Arg.hasByRefAttr();
1073 Type *BaseArgTy = Arg.getType();
1074 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1075 MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
1076 if (!Alignment)
1077 Alignment = DL.getABITypeAlign(MemArgTy);
1078 MaxAlign = max(Alignment, MaxAlign);
1079 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1080
1081 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1082 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1083
1084 // We're basically throwing away everything passed into us and starting over
1085 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1086 // to us as computed in Ins.
1087 //
1088 // We also need to figure out what type legalization is trying to do to get
1089 // the correct memory offsets.
1090
1091 SmallVector<EVT, 16> ValueVTs;
1092 SmallVector<uint64_t, 16> Offsets;
1093 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1094
1095 for (unsigned Value = 0, NumValues = ValueVTs.size();
1096 Value != NumValues; ++Value) {
1097 uint64_t BasePartOffset = Offsets[Value];
1098
1099 EVT ArgVT = ValueVTs[Value];
1100 EVT MemVT = ArgVT;
1101 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1102 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1103
1104 if (NumRegs == 1) {
1105 // This argument is not split, so the IR type is the memory type.
1106 if (ArgVT.isExtended()) {
1107 // We have an extended type, like i24, so we should just use the
1108 // register type.
1109 MemVT = RegisterVT;
1110 } else {
1111 MemVT = ArgVT;
1112 }
1113 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1114 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1115 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1116 // We have a vector value which has been split into a vector with
1117 // the same scalar type, but fewer elements. This should handle
1118 // all the floating-point vector types.
1119 MemVT = RegisterVT;
1120 } else if (ArgVT.isVector() &&
1121 ArgVT.getVectorNumElements() == NumRegs) {
1122 // This arg has been split so that each element is stored in a separate
1123 // register.
1124 MemVT = ArgVT.getScalarType();
1125 } else if (ArgVT.isExtended()) {
1126 // We have an extended type, like i65.
1127 MemVT = RegisterVT;
1128 } else {
1129 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1130 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1131 if (RegisterVT.isInteger()) {
1132 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1133 } else if (RegisterVT.isVector()) {
1134 assert(!RegisterVT.getScalarType().isFloatingPoint());
1135 unsigned NumElements = RegisterVT.getVectorNumElements();
1136 assert(MemoryBits % NumElements == 0);
1137 // This vector type has been split into another vector type with
1138 // a different elements size.
1139 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1140 MemoryBits / NumElements);
1141 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1142 } else {
1143 llvm_unreachable("cannot deduce memory type.")::llvm::llvm_unreachable_internal("cannot deduce memory type."
, "llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp", 1143)
;
1144 }
1145 }
1146
1147 // Convert one element vectors to scalar.
1148 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1149 MemVT = MemVT.getScalarType();
1150
1151 // Round up vec3/vec5 argument.
1152 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1153 assert(MemVT.getVectorNumElements() == 3 ||
1154 MemVT.getVectorNumElements() == 5);
1155 MemVT = MemVT.getPow2VectorType(State.getContext());
1156 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1157 MemVT = MemVT.getRoundIntegerType(State.getContext());
1158 }
1159
1160 unsigned PartOffset = 0;
1161 for (unsigned i = 0; i != NumRegs; ++i) {
1162 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1163 BasePartOffset + PartOffset,
1164 MemVT.getSimpleVT(),
1165 CCValAssign::Full));
1166 PartOffset += MemVT.getStoreSize();
1167 }
1168 }
1169 }
1170}
1171
1172SDValue AMDGPUTargetLowering::LowerReturn(
1173 SDValue Chain, CallingConv::ID CallConv,
1174 bool isVarArg,
1175 const SmallVectorImpl<ISD::OutputArg> &Outs,
1176 const SmallVectorImpl<SDValue> &OutVals,
1177 const SDLoc &DL, SelectionDAG &DAG) const {
1178 // FIXME: Fails for r600 tests
1179 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1180 // "wave terminate should not have return values");
1181 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1182}
1183
1184//===---------------------------------------------------------------------===//
1185// Target specific lowering
1186//===---------------------------------------------------------------------===//
1187
1188/// Selects the correct CCAssignFn for a given CallingConvention value.
1189CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1190 bool IsVarArg) {
1191 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1192}
1193
1194CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1195 bool IsVarArg) {
1196 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1197}
1198
1199SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1200 SelectionDAG &DAG,
1201 MachineFrameInfo &MFI,
1202 int ClobberedFI) const {
1203 SmallVector<SDValue, 8> ArgChains;
1204 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1205 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1206
1207 // Include the original chain at the beginning of the list. When this is
1208 // used by target LowerCall hooks, this helps legalize find the
1209 // CALLSEQ_BEGIN node.
1210 ArgChains.push_back(Chain);
1211
1212 // Add a chain value for each stack argument corresponding
1213 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1214 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1215 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1216 if (FI->getIndex() < 0) {
1217 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1218 int64_t InLastByte = InFirstByte;
1219 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1220
1221 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1222 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1223 ArgChains.push_back(SDValue(L, 1));
1224 }
1225 }
1226 }
1227 }
1228
1229 // Build a tokenfactor for all the chains.
1230 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1231}
1232
1233SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1234 SmallVectorImpl<SDValue> &InVals,
1235 StringRef Reason) const {
1236 SDValue Callee = CLI.Callee;
1237 SelectionDAG &DAG = CLI.DAG;
1238
1239 const Function &Fn = DAG.getMachineFunction().getFunction();
1240
1241 StringRef FuncName("<unknown>");
1242
1243 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1244 FuncName = G->getSymbol();
1245 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1246 FuncName = G->getGlobal()->getName();
1247
1248 DiagnosticInfoUnsupported NoCalls(
1249 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1250 DAG.getContext()->diagnose(NoCalls);
1251
1252 if (!CLI.IsTailCall) {
1253 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1254 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1255 }
1256
1257 return DAG.getEntryNode();
1258}
1259
1260SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1261 SmallVectorImpl<SDValue> &InVals) const {
1262 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1263}
1264
1265SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1266 SelectionDAG &DAG) const {
1267 const Function &Fn = DAG.getMachineFunction().getFunction();
1268
1269 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1270 SDLoc(Op).getDebugLoc());
1271 DAG.getContext()->diagnose(NoDynamicAlloca);
1272 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1273 return DAG.getMergeValues(Ops, SDLoc());
1274}
1275
1276SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1277 SelectionDAG &DAG) const {
1278 switch (Op.getOpcode()) {
1279 default:
1280 Op->print(errs(), &DAG);
1281 llvm_unreachable("Custom lowering code for this "
1282                  "instruction is not implemented yet!");
1283 break;
1284 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1285 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1286 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1287 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1288 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1289 case ISD::FREM: return LowerFREM(Op, DAG);
1290 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1291 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1292 case ISD::FRINT: return LowerFRINT(Op, DAG);
1293 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1294 case ISD::FROUND: return LowerFROUND(Op, DAG);
1295 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1296 case ISD::FLOG:
1297 return LowerFLOG(Op, DAG, numbers::ln2f);
1298 case ISD::FLOG10:
1299 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1300 case ISD::FEXP:
1301 return lowerFEXP(Op, DAG);
1302 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1303 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1304 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1305 case ISD::FP_TO_SINT:
1306 case ISD::FP_TO_UINT:
1307 return LowerFP_TO_INT(Op, DAG);
1308 case ISD::CTTZ:
1309 case ISD::CTTZ_ZERO_UNDEF:
1310 case ISD::CTLZ:
1311 case ISD::CTLZ_ZERO_UNDEF:
1312 return LowerCTLZ_CTTZ(Op, DAG);
1313 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1314 }
1315 return Op;
1316}
1317
1318void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1319 SmallVectorImpl<SDValue> &Results,
1320 SelectionDAG &DAG) const {
1321 switch (N->getOpcode()) {
1322 case ISD::SIGN_EXTEND_INREG:
1323 // Different parts of legalization seem to interpret which type of
1324 // sign_extend_inreg is the one to check for custom lowering. The extended
1325 // from type is what really matters, but some places check for custom
1326 // lowering of the result type. This results in trying to use
1327 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1328 // nothing here and let the illegal result integer be handled normally.
1329 return;
1330 default:
1331 return;
1332 }
1333}
1334
1335SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1336 SDValue Op,
1337 SelectionDAG &DAG) const {
1338
1339 const DataLayout &DL = DAG.getDataLayout();
1340 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1341 const GlobalValue *GV = G->getGlobal();
1342
1343 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1344 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1345 if (!MFI->isModuleEntryFunction() &&
1346 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1347 SDLoc DL(Op);
1348 const Function &Fn = DAG.getMachineFunction().getFunction();
1349 DiagnosticInfoUnsupported BadLDSDecl(
1350 Fn, "local memory global used by non-kernel function",
1351 DL.getDebugLoc(), DS_Warning);
1352 DAG.getContext()->diagnose(BadLDSDecl);
1353
1354 // We currently don't have a way to correctly allocate LDS objects that
1355 // aren't directly associated with a kernel. We do force inlining of
1356 // functions that use local objects. However, if these dead functions are
1357 // not eliminated, we don't want a compile time error. Just emit a warning
1358 // and a trap, since there should be no callable path here.
1359 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1360 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1361 Trap, DAG.getRoot());
1362 DAG.setRoot(OutputChain);
1363 return DAG.getUNDEF(Op.getValueType());
1364 }
1365
1366 // XXX: What does the value of G->getOffset() mean?
1367 assert(G->getOffset() == 0 &&
1368        "Do not know what to do with an non-zero offset");
1369
1370 // TODO: We could emit code to handle the initialization somewhere.
1371 // We ignore the initializer for now and legalize it to allow selection.
1372 // The initializer will anyway get errored out during assembly emission.
1373 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1374 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1375 }
1376 return SDValue();
1377}
1378
1379SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1380 SelectionDAG &DAG) const {
1381 SmallVector<SDValue, 8> Args;
1382
1383 EVT VT = Op.getValueType();
1384 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1385 SDLoc SL(Op);
1386 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1387 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1388
1389 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1390 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1391 }
1392
1393 for (const SDUse &U : Op->ops())
1394 DAG.ExtractVectorElements(U.get(), Args);
1395
1396 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1397}
1398
1399SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1400 SelectionDAG &DAG) const {
1401
1402 SmallVector<SDValue, 8> Args;
1403 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1404 EVT VT = Op.getValueType();
1405 EVT SrcVT = Op.getOperand(0).getValueType();
1406
1407 // For these types, we have some TableGen patterns except if the index is 1
1408 if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1409 (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1410 Start != 1)
1411 return Op;
1412
1413 if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1414 (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1415 (Start == 0 || Start == 4))
1416 return Op;
1417
1418 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1419 VT.getVectorNumElements());
1420
1421 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1422}
1423
1424/// Generate Min/Max node
1425SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1426 SDValue LHS, SDValue RHS,
1427 SDValue True, SDValue False,
1428 SDValue CC,
1429 DAGCombinerInfo &DCI) const {
1430 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1431 return SDValue();
1432
1433 SelectionDAG &DAG = DCI.DAG;
1434 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1435 switch (CCOpcode) {
1436 case ISD::SETOEQ:
1437 case ISD::SETONE:
1438 case ISD::SETUNE:
1439 case ISD::SETNE:
1440 case ISD::SETUEQ:
1441 case ISD::SETEQ:
1442 case ISD::SETFALSE:
1443 case ISD::SETFALSE2:
1444 case ISD::SETTRUE:
1445 case ISD::SETTRUE2:
1446 case ISD::SETUO:
1447 case ISD::SETO:
1448 break;
1449 case ISD::SETULE:
1450 case ISD::SETULT: {
1451 if (LHS == True)
1452 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1453 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1454 }
1455 case ISD::SETOLE:
1456 case ISD::SETOLT:
1457 case ISD::SETLE:
1458 case ISD::SETLT: {
1459 // Ordered. Assume ordered for undefined.
1460
1461 // Only do this after legalization to avoid interfering with other combines
1462 // which might occur.
1463 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1464 !DCI.isCalledByLegalizer())
1465 return SDValue();
1466
1467 // We need to permute the operands to get the correct NaN behavior. The
1468 // selected operand is the second one based on the failing compare with NaN,
1469 // so permute it based on the compare type the hardware uses.
1470 if (LHS == True)
1471 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1472 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1473 }
1474 case ISD::SETUGE:
1475 case ISD::SETUGT: {
1476 if (LHS == True)
1477 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1478 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1479 }
1480 case ISD::SETGT:
1481 case ISD::SETGE:
1482 case ISD::SETOGE:
1483 case ISD::SETOGT: {
1484 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1485 !DCI.isCalledByLegalizer())
1486 return SDValue();
1487
1488 if (LHS == True)
1489 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1490 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1491 }
1492 case ISD::SETCC_INVALID:
1493 llvm_unreachable("Invalid setcc condcode!");
1494 }
1495 return SDValue();
1496}
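
// A scalar model (an assumption about the legacy min/max semantics, not taken
// from this file) of the NaN behavior relied on above: the compare fails on a
// NaN input, so the second operand is the one selected.
static float fminLegacySketch(float A, float B) { return A < B ? A : B; }
static float fmaxLegacySketch(float A, float B) { return A > B ? A : B; }
// For example, select(setolt(x, y), x, y) selects y on an unordered compare,
// which matches fminLegacySketch(x, y) with the operand order chosen above.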
1497
1498std::pair<SDValue, SDValue>
1499AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1500 SDLoc SL(Op);
1501
1502 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1503
1504 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1505 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1506
1507 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1508 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1509
1510 return std::make_pair(Lo, Hi);
1511}
1512
1513SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1514 SDLoc SL(Op);
1515
1516 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1517 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1518 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1519}
1520
1521SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1522 SDLoc SL(Op);
1523
1524 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1525 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1526 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1527}
1528
1529// Split a vector type into two parts. The first part is a power of two vector.
1530// The second part is whatever is left over, and is a scalar if it would
1531// otherwise be a 1-vector.
1532std::pair<EVT, EVT>
1533AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1534 EVT LoVT, HiVT;
1535 EVT EltVT = VT.getVectorElementType();
1536 unsigned NumElts = VT.getVectorNumElements();
1537 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1538 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1539 HiVT = NumElts - LoNumElts == 1
1540 ? EltVT
1541 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1542 return std::make_pair(LoVT, HiVT);
1543}
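
// A small sketch (hypothetical helper, assumes <utility>) of the element-count
// split used above: the low half gets PowerOf2Ceil((N + 1) / 2) elements and
// the high half gets the remainder, degenerating to a scalar for one element.
static std::pair<unsigned, unsigned> splitElementCounts(unsigned NumElts) {
  unsigned Lo = 1;
  while (Lo < (NumElts + 1) / 2)   // PowerOf2Ceil((NumElts + 1) / 2)
    Lo *= 2;
  return {Lo, NumElts - Lo};       // e.g. 3 -> {2, 1}, 5 -> {4, 1}, 8 -> {4, 4}
}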
1544
1545// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1546// scalar.
1547std::pair<SDValue, SDValue>
1548AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1549 const EVT &LoVT, const EVT &HiVT,
1550 SelectionDAG &DAG) const {
1551 assert(LoVT.getVectorNumElements() +
1552        (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1553        N.getValueType().getVectorNumElements() &&
1554        "More vector elements requested than available!");
1555 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1556 DAG.getVectorIdxConstant(0, DL));
1557 SDValue Hi = DAG.getNode(
1558 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1559 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1560 return std::make_pair(Lo, Hi);
1561}
1562
1563SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1564 SelectionDAG &DAG) const {
1565 LoadSDNode *Load = cast<LoadSDNode>(Op);
1566 EVT VT = Op.getValueType();
1567 SDLoc SL(Op);
1568
1569
1570 // If this is a 2 element vector, we really want to scalarize and not create
1571 // weird 1 element vectors.
1572 if (VT.getVectorNumElements() == 2) {
1573 SDValue Ops[2];
1574 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1575 return DAG.getMergeValues(Ops, SL);
1576 }
1577
1578 SDValue BasePtr = Load->getBasePtr();
1579 EVT MemVT = Load->getMemoryVT();
1580
1581 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1582
1583 EVT LoVT, HiVT;
1584 EVT LoMemVT, HiMemVT;
1585 SDValue Lo, Hi;
1586
1587 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1588 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1589 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1590
1591 unsigned Size = LoMemVT.getStoreSize();
1592 unsigned BaseAlign = Load->getAlignment();
1593 unsigned HiAlign = MinAlign(BaseAlign, Size);
1594
1595 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1596 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1597 BaseAlign, Load->getMemOperand()->getFlags());
1598 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1599 SDValue HiLoad =
1600 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1601 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1602 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1603
1604 SDValue Join;
1605 if (LoVT == HiVT) {
1606 // This is the case that the vector is power of two so was evenly split.
1607 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1608 } else {
1609 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1610 DAG.getVectorIdxConstant(0, SL));
1611 Join = DAG.getNode(
1612 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1613 VT, Join, HiLoad,
1614 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1615 }
1616
1617 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1618 LoLoad.getValue(1), HiLoad.getValue(1))};
1619
1620 return DAG.getMergeValues(Ops, SL);
1621}
1622
1623SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1624 SelectionDAG &DAG) const {
1625 LoadSDNode *Load = cast<LoadSDNode>(Op);
1626 EVT VT = Op.getValueType();
1627 SDValue BasePtr = Load->getBasePtr();
1628 EVT MemVT = Load->getMemoryVT();
1629 SDLoc SL(Op);
1630 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1631 unsigned BaseAlign = Load->getAlignment();
1632 unsigned NumElements = MemVT.getVectorNumElements();
1633
1634 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1635 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1636 if (NumElements != 3 ||
1637 (BaseAlign < 8 &&
1638 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1639 return SplitVectorLoad(Op, DAG);
1640
1641 assert(NumElements == 3);
1642
1643 EVT WideVT =
1644 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1645 EVT WideMemVT =
1646 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1647 SDValue WideLoad = DAG.getExtLoad(
1648 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1649 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1650 return DAG.getMergeValues(
1651 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1652 DAG.getVectorIdxConstant(0, SL)),
1653 WideLoad.getValue(1)},
1654 SL);
1655}
1656
1657SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1658 SelectionDAG &DAG) const {
1659 StoreSDNode *Store = cast<StoreSDNode>(Op);
1660 SDValue Val = Store->getValue();
1661 EVT VT = Val.getValueType();
1662
1663 // If this is a 2 element vector, we really want to scalarize and not create
1664 // weird 1 element vectors.
1665 if (VT.getVectorNumElements() == 2)
1666 return scalarizeVectorStore(Store, DAG);
1667
1668 EVT MemVT = Store->getMemoryVT();
1669 SDValue Chain = Store->getChain();
1670 SDValue BasePtr = Store->getBasePtr();
1671 SDLoc SL(Op);
1672
1673 EVT LoVT, HiVT;
1674 EVT LoMemVT, HiMemVT;
1675 SDValue Lo, Hi;
1676
1677 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1678 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1679 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1680
1681 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1682
1683 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1684 unsigned BaseAlign = Store->getAlignment();
1685 unsigned Size = LoMemVT.getStoreSize();
1686 unsigned HiAlign = MinAlign(BaseAlign, Size);
1687
1688 SDValue LoStore =
1689 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1690 Store->getMemOperand()->getFlags());
1691 SDValue HiStore =
1692 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1693 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1694
1695 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1696}
1697
1698// This is a shortcut for integer division because we have fast i32<->f32
1699// conversions, and fast f32 reciprocal instructions. The fractional part of a
1700// float is enough to accurately represent up to a 24-bit signed integer.
1701SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1702 bool Sign) const {
1703 SDLoc DL(Op);
1704 EVT VT = Op.getValueType();
1705 SDValue LHS = Op.getOperand(0);
1706 SDValue RHS = Op.getOperand(1);
1707 MVT IntVT = MVT::i32;
1708 MVT FltVT = MVT::f32;
1709
1710 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1711 if (LHSSignBits < 9)
1712 return SDValue();
1713
1714 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1715 if (RHSSignBits < 9)
1716 return SDValue();
1717
1718 unsigned BitSize = VT.getSizeInBits();
1719 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1720 unsigned DivBits = BitSize - SignBits;
1721 if (Sign)
1722 ++DivBits;
1723
1724 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1725 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1726
1727 SDValue jq = DAG.getConstant(1, DL, IntVT);
1728
1729 if (Sign) {
1730 // char|short jq = ia ^ ib;
1731 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1732
1733 // jq = jq >> (bitsize - 2)
1734 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1735 DAG.getConstant(BitSize - 2, DL, VT));
1736
1737 // jq = jq | 0x1
1738 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1739 }
1740
1741 // int ia = (int)LHS;
1742 SDValue ia = LHS;
1743
1744 // int ib = (int)RHS;
1745 SDValue ib = RHS;
1746
1747 // float fa = (float)ia;
1748 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1749
1750 // float fb = (float)ib;
1751 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1752
1753 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1754 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1755
1756 // fq = trunc(fq);
1757 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1758
1759 // float fqneg = -fq;
1760 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1761
1762 MachineFunction &MF = DAG.getMachineFunction();
1763 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1764
1765 // float fr = mad(fqneg, fb, fa);
1766 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1767 (unsigned)ISD::FMA :
1768 !MFI->getMode().allFP32Denormals() ?
1769 (unsigned)ISD::FMAD :
1770 (unsigned)AMDGPUISD::FMAD_FTZ;
1771 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1772
1773 // int iq = (int)fq;
1774 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1775
1776 // fr = fabs(fr);
1777 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1778
1779 // fb = fabs(fb);
1780 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1781
1782 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1783
1784 // int cv = fr >= fb;
1785 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1786
1787 // jq = (cv ? jq : 0);
1788 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1789
1790 // dst = iq + jq;
1791 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1792
1793 // Rem needs compensation; it's easier to recompute it.
1794 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1795 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1796
1797 // Truncate to number of bits this divide really is.
1798 if (Sign) {
1799 SDValue InRegSize
1800 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1801 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1802 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1803 } else {
1804 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1805 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1806 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1807 }
1808
1809 return DAG.getMergeValues({ Div, Rem }, DL);
1810}
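
// A standalone scalar model (hypothetical, assumes <cmath> and <cstdint>) of
// the 24-bit divide shortcut above. Inputs are assumed to pass the sign-bit
// checks (at most 24 significant bits), and the hardware rcp/FMA flavours are
// approximated with ordinary f32 arithmetic.
static void divrem24Sketch(int32_t LHS, int32_t RHS, bool Sign,
                           int32_t &Div, int32_t &Rem) {
  // jq is the +/-1 correction applied when the estimate is one step short;
  // for unsigned division it is simply 1.
  int32_t jq = Sign ? (((LHS ^ RHS) >> 30) | 1) : 1;
  float fa = float(LHS), fb = float(RHS);
  float fq = std::trunc(fa * (1.0f / fb));        // quotient estimate
  float fr = std::fabs(std::fma(-fq, fb, fa));    // |remainder| of the estimate
  Div = int32_t(fq) + ((fr >= std::fabs(fb)) ? jq : 0);
  Rem = LHS - Div * RHS;                          // recompute the remainder
}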
1811
1812void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1813 SelectionDAG &DAG,
1814 SmallVectorImpl<SDValue> &Results) const {
1815 SDLoc DL(Op);
1816 EVT VT = Op.getValueType();
1817
1818 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1819
1820 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1821
1822 SDValue One = DAG.getConstant(1, DL, HalfVT);
1823 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1824
1825 //HiLo split
1826 SDValue LHS = Op.getOperand(0);
1827 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1828 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1829
1830 SDValue RHS = Op.getOperand(1);
1831 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1832 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1833
1834 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1835 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1836
1837 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1838 LHS_Lo, RHS_Lo);
1839
1840 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1841 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1842
1843 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1844 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1845 return;
1846 }
1847
1848 if (isTypeLegal(MVT::i64)) {
1849 // The algorithm here is based on ideas from "Software Integer Division",
1850 // Tom Rodeheffer, August 2008.
1851
1852 MachineFunction &MF = DAG.getMachineFunction();
1853 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1854
1855 // Compute denominator reciprocal.
1856 unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1857 (unsigned)ISD::FMA :
1858 !MFI->getMode().allFP32Denormals() ?
1859 (unsigned)ISD::FMAD :
1860 (unsigned)AMDGPUISD::FMAD_FTZ;
1861
1862 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1863 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1864 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1865 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1866 Cvt_Lo);
1867 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1868 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1869 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1870 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1871 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1872 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1873 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1874 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1875 Mul1);
1876 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1877 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1878 SDValue Rcp64 = DAG.getBitcast(VT,
1879 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1880
1881 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1882 SDValue One64 = DAG.getConstant(1, DL, VT);
1883 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1884 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1885
1886 // First round of UNR (Unsigned integer Newton-Raphson).
1887 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1888 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1889 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1890 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1891 Zero);
1892 SDValue Mulhi1_Hi =
1893 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1894 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1895 Mulhi1_Lo, Zero1);
1896 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1897 Mulhi1_Hi, Add1_Lo.getValue(1));
1898 SDValue Add1 = DAG.getBitcast(VT,
1899 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1900
1901 // Second round of UNR.
1902 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1903 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1904 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1905 Zero);
1906 SDValue Mulhi2_Hi =
1907 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1908 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1909 Mulhi2_Lo, Zero1);
1910 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1911 Mulhi2_Hi, Add2_Lo.getValue(1));
1912 SDValue Add2 = DAG.getBitcast(VT,
1913 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1914
1915 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1916
1917 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1918
1919 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1920 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1921 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1922 Mul3_Lo, Zero1);
1923 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1924 Mul3_Hi, Sub1_Lo.getValue(1));
1925 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1926 SDValue Sub1 = DAG.getBitcast(VT,
1927 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1928
1929 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1930 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1931 ISD::SETUGE);
1932 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1933 ISD::SETUGE);
1934 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1935
1936 // TODO: Here and below portions of the code can be enclosed into if/endif.
1937 // Currently control flow is unconditional and we have 4 selects after
1938 // potential endif to substitute PHIs.
1939
1940 // if C3 != 0 ...
1941 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1942 RHS_Lo, Zero1);
1943 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1944 RHS_Hi, Sub1_Lo.getValue(1));
1945 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1946 Zero, Sub2_Lo.getValue(1));
1947 SDValue Sub2 = DAG.getBitcast(VT,
1948 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1949
1950 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1951
1952 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1953 ISD::SETUGE);
1954 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1955 ISD::SETUGE);
1956 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1957
1958 // if (C6 != 0)
1959 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1960
1961 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1962 RHS_Lo, Zero1);
1963 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1964 RHS_Hi, Sub2_Lo.getValue(1));
1965 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1966 Zero, Sub3_Lo.getValue(1));
1967 SDValue Sub3 = DAG.getBitcast(VT,
1968 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1969
1970 // endif C6
1971 // endif C3
1972
1973 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1974 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1975
1976 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1977 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1978
1979 Results.push_back(Div);
1980 Results.push_back(Rem);
1981
1982 return;
1983 }
1984
1985 // r600 expansion.
1986 // Get Speculative values
1987 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1988 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1989
1990 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1991 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1992 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1993
1994 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1995 SDValue DIV_Lo = Zero;
1996
1997 const unsigned halfBitWidth = HalfVT.getSizeInBits();
1998
1999 for (unsigned i = 0; i < halfBitWidth; ++i) {
2000 const unsigned bitPos = halfBitWidth - i - 1;
2001 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2002 // Get value of high bit
2003 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2004 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2005 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2006
2007 // Shift
2008 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2009 // Add LHS high bit
2010 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2011
2012 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2013 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2014
2015 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2016
2017 // Update REM
2018 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2019 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2020 }
2021
2022 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2023 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2024 Results.push_back(DIV);
2025 Results.push_back(REM);
2026}
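
// A standalone model (hypothetical, assumes <cstdint> and RHS != 0) of the
// r600 shift/subtract expansion above: divide the high half speculatively,
// then bring in the low 32 dividend bits one at a time with restoring
// division.
static void udivrem64Sketch(uint64_t LHS, uint64_t RHS,
                            uint64_t &Div, uint64_t &Rem) {
  uint32_t LHS_Lo = uint32_t(LHS), LHS_Hi = uint32_t(LHS >> 32);
  uint32_t RHS_Lo = uint32_t(RHS), RHS_Hi = uint32_t(RHS >> 32);
  // Speculative high-half results; only meaningful when RHS fits in 32 bits.
  uint32_t DIV_Hi = (RHS_Hi == 0) ? LHS_Hi / RHS_Lo : 0;
  Rem             = (RHS_Hi == 0) ? LHS_Hi % RHS_Lo : LHS_Hi;
  uint32_t DIV_Lo = 0;
  for (int BitPos = 31; BitPos >= 0; --BitPos) {
    Rem = (Rem << 1) | ((LHS_Lo >> BitPos) & 1);  // shift in the next bit
    if (Rem >= RHS) {                             // this quotient bit is set
      DIV_Lo |= uint32_t(1) << BitPos;
      Rem -= RHS;
    }
  }
  Div = (uint64_t(DIV_Hi) << 32) | DIV_Lo;
}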
2027
2028SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2029 SelectionDAG &DAG) const {
2030 SDLoc DL(Op);
2031 EVT VT = Op.getValueType();
2032
2033 if (VT == MVT::i64) {
2034 SmallVector<SDValue, 2> Results;
2035 LowerUDIVREM64(Op, DAG, Results);
2036 return DAG.getMergeValues(Results, DL);
2037 }
2038
2039 if (VT == MVT::i32) {
2040 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2041 return Res;
2042 }
2043
2044 SDValue X = Op.getOperand(0);
2045 SDValue Y = Op.getOperand(1);
2046
2047 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2048 // algorithm used here.
2049
2050 // Initial estimate of inv(y).
2051 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2052
2053 // One round of UNR.
2054 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2055 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2056 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2057 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2058
2059 // Quotient/remainder estimate.
2060 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2061 SDValue R =
2062 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2063
2064 // First quotient/remainder refinement.
2065 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2066 SDValue One = DAG.getConstant(1, DL, VT);
2067 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2068 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2069 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2070 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2071 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2072
2073 // Second quotient/remainder refinement.
2074 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2075 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2076 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2077 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2078 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2079
2080 return DAG.getMergeValues({Q, R}, DL);
2081}
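
// A scalar model (hypothetical, assumes <cstdint> and Y != 0) of the expansion
// above. URECIP is modeled here as 0xffffffff / Y, which is only an
// assumption; the hardware instruction is an approximation of 2^32 / Y that
// the Newton-Raphson round and the two fix-ups below are meant to absorb.
static uint32_t mulhu32(uint32_t A, uint32_t B) {
  return uint32_t((uint64_t(A) * B) >> 32);       // high half of the product
}
static void udivrem32Sketch(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  uint32_t Z = 0xffffffffu / Y;                   // initial estimate of 2^32/Y
  Z += mulhu32(Z, uint32_t(0) - Y * Z);           // one Newton-Raphson round
  Q = mulhu32(X, Z);                              // quotient estimate (<= X/Y)
  R = X - Q * Y;                                  // remainder estimate
  for (int I = 0; I < 2; ++I)                     // two fix-up rounds, as above
    if (R >= Y) { ++Q; R -= Y; }
}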
2082
2083SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2084 SelectionDAG &DAG) const {
2085 SDLoc DL(Op);
2086 EVT VT = Op.getValueType();
2087
2088 SDValue LHS = Op.getOperand(0);
2089 SDValue RHS = Op.getOperand(1);
2090
2091 SDValue Zero = DAG.getConstant(0, DL, VT);
2092 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2093
2094 if (VT == MVT::i32) {
2095 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2096 return Res;
2097 }
2098
2099 if (VT == MVT::i64 &&
2100 DAG.ComputeNumSignBits(LHS) > 32 &&
2101 DAG.ComputeNumSignBits(RHS) > 32) {
2102 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2103
2104 //HiLo split
2105 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2106 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2107 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2108 LHS_Lo, RHS_Lo);
2109 SDValue Res[2] = {
2110 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2111 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2112 };
2113 return DAG.getMergeValues(Res, DL);
2114 }
2115
2116 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2117 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2118 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2119 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2120
2121 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2122 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2123
2124 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2125 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2126
2127 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2128 SDValue Rem = Div.getValue(1);
2129
2130 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2131 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2132
2133 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2134 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2135
2136 SDValue Res[2] = {
2137 Div,
2138 Rem
2139 };
2140 return DAG.getMergeValues(Res, DL);
2141}
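
// A sketch (hypothetical, assumes <cstdint>) of the sign-fixup trick above:
// with S equal to 0 or all-ones, (X + S) ^ S conditionally negates X, and the
// same pattern restores the quotient and remainder signs afterwards.
static void sdivremSketch(int32_t LHS, int32_t RHS, int32_t &Div, int32_t &Rem) {
  uint32_t LSign = uint32_t(LHS >> 31);              // 0 or 0xffffffff
  uint32_t RSign = uint32_t(RHS >> 31);
  uint32_t DSign = LSign ^ RSign;                    // sign of the quotient
  uint32_t ULHS = (uint32_t(LHS) + LSign) ^ LSign;   // |LHS|
  uint32_t URHS = (uint32_t(RHS) + RSign) ^ RSign;   // |RHS|
  uint32_t Q = ULHS / URHS, R = ULHS % URHS;
  Div = int32_t((Q ^ DSign) - DSign);
  Rem = int32_t((R ^ LSign) - LSign);                // remainder follows LHS
}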
2142
2143// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2144SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2145 SDLoc SL(Op);
2146 EVT VT = Op.getValueType();
2147 auto Flags = Op->getFlags();
2148 SDValue X = Op.getOperand(0);
2149 SDValue Y = Op.getOperand(1);
2150
2151 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2152 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2153 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2154 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2155 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2156}
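
// The scalar equivalent of the expansion above (hypothetical helper, assumes
// <cmath>); its accuracy follows that of the fdiv, as in the DAG form.
static float fremSketch(float X, float Y) {
  float Q = std::trunc(X / Y);      // (ftrunc (fdiv x, y))
  return std::fma(-Q, Y, X);        // (fma (fneg q), y, x)
}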
2157
2158SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2159 SDLoc SL(Op);
2160 SDValue Src = Op.getOperand(0);
2161
2162 // result = trunc(src)
2163 // if (src > 0.0 && src != result)
2164 // result += 1.0
2165
2166 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2167
2168 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2169 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2170
2171 EVT SetCCVT =
2172 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2173
2174 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2175 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2176 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2177
2178 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2179 // TODO: Should this propagate fast-math-flags?
2180 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2181}
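
// A scalar model (hypothetical, assumes <cmath>) of the f64 ceil expansion
// above; LowerFFLOOR below is the mirror image using '<' and -1.0.
static double fceilSketch(double Src) {
  double Trunc = std::trunc(Src);
  if (Src > 0.0 && Src != Trunc)    // both compares are ordered, as above
    Trunc += 1.0;
  return Trunc;
}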
2182
2183static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2184 SelectionDAG &DAG) {
2185 const unsigned FractBits = 52;
2186 const unsigned ExpBits = 11;
2187
2188 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2189 Hi,
2190 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2191 DAG.getConstant(ExpBits, SL, MVT::i32));
2192 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2193 DAG.getConstant(1023, SL, MVT::i32));
2194
2195 return Exp;
2196}
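
// A bit-level sketch (hypothetical, assumes <cstdint>) of the BFE above:
// FractBits - 32 == 20, so the biased exponent occupies bits [30:20] of the
// high word of the f64.
static int32_t extractF64ExponentSketch(uint32_t Hi) {
  return int32_t((Hi >> 20) & 0x7ff) - 1023;      // unbias the 11-bit exponent
}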
2197
2198SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2199 SDLoc SL(Op);
2200 SDValue Src = Op.getOperand(0);
2201
2202 assert(Op.getValueType() == MVT::f64);
2203
2204 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2205
2206 // Extract the upper half, since this is where we will find the sign and
2207 // exponent.
2208 SDValue Hi = getHiHalf64(Src, DAG);
2209
2210 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2211
2212 const unsigned FractBits = 52;
2213
2214 // Extract the sign bit.
2215 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2216 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2217
2218 // Extend back to 64-bits.
2219 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2220 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2221
2222 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2223 const SDValue FractMask
2224     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2225
2226 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2227 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2228 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2229
2230 EVT SetCCVT =
2231 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2232
2233 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2234
2235 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2236 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2237
2238 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2239 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2240
2241 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2242}
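
// A standalone model (hypothetical, assumes <cstdint> and <cstring>) of the
// integer-only f64 trunc above: clear the fractional bits selected by the
// unbiased exponent, keep only the sign when |x| < 1, and pass the input
// through once no fractional bits remain.
static double ftruncSketch(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  int32_t Exp = int32_t((Bits >> 52) & 0x7ff) - 1023;
  uint64_t SignBit64 = Bits & (uint64_t(1) << 63);
  uint64_t FractMask = (uint64_t(1) << 52) - 1;
  uint64_t Result;
  if (Exp < 0)
    Result = SignBit64;                   // |x| < 1: the result is +/-0
  else if (Exp > 51)
    Result = Bits;                        // already integral (or inf/nan)
  else
    Result = Bits & ~(FractMask >> Exp);  // drop the 52 - Exp fraction bits
  std::memcpy(&X, &Result, sizeof(X));
  return X;
}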
2243
2244SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2245 SDLoc SL(Op);
2246 SDValue Src = Op.getOperand(0);
2247
2248 assert(Op.getValueType() == MVT::f64);
2249
2250 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2251 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2252 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2253
2254 // TODO: Should this propagate fast-math-flags?
2255
2256 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2257 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2258
2259 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2260
2261 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2262 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2263
2264 EVT SetCCVT =
2265 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2266 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2267
2268 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2269}
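
// A scalar model (hypothetical, assumes <cmath>, the default rounding mode and
// no fast-math contraction) of the trick above: adding and subtracting
// copysign(2^52, x) rounds to nearest even, and anything whose magnitude
// exceeds 0x1.fffffffffffffp+51 is already integral and passed through.
static double frintSketch(double Src) {
  double C = std::copysign(4503599627370496.0, Src);      // 2^52
  double Rounded = (Src + C) - C;
  return std::fabs(Src) > 4503599627370495.5 ? Src : Rounded;
}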
2270
2271SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2272 // FNEARBYINT and FRINT are the same, except in their handling of FP
2273 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2274 // rint, so just treat them as equivalent.
2275 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2276}
2277
2278// XXX - May require not supporting f32 denormals?
2279
2280// Don't handle v2f16. The extra instructions to scalarize and repack around the
2281// compare and vselect end up producing worse code than scalarizing the whole
2282// operation.
2283SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2284 SDLoc SL(Op);
2285 SDValue X = Op.getOperand(0);
2286 EVT VT = Op.getValueType();
2287
2288 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2289
2290 // TODO: Should this propagate fast-math-flags?
2291
2292 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2293
2294 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2295
2296 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2297 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2298 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2299
2300 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2301
2302 EVT SetCCVT =
2303 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2304
2305 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2306
2307 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2308
2309 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2310}
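
// A scalar model (hypothetical, assumes <cmath>) of the round-half-away-from-
// zero expansion above.
static float froundSketch(float X) {
  float T = std::trunc(X);
  float SignOne = std::copysign(1.0f, X);
  return T + ((std::fabs(X - T) >= 0.5f) ? SignOne : 0.0f);
}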
2311
2312SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2313 SDLoc SL(Op);
2314 SDValue Src = Op.getOperand(0);
2315
2316 // result = trunc(src);
2317 // if (src < 0.0 && src != result)
2318 // result += -1.0.
2319
2320 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2321
2322 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2323 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2324
2325 EVT SetCCVT =
2326 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2327
2328 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2329 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2330 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2331
2332 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2333 // TODO: Should this propagate fast-math-flags?
2334 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2335}
2336
2337SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2338 double Log2BaseInverted) const {
2339 EVT VT = Op.getValueType();
2340
2341 SDLoc SL(Op);
2342 SDValue Operand = Op.getOperand(0);
2343 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2344 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2345
2346 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2347}
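
// A scalar sketch (hypothetical, assumes <cmath>) of the base change above:
// log_b(x) == log2(x) * log_b(2), with log_b(2) passed in as Log2BaseInverted.
static float flogSketch(float X, float Log2BaseInverted) {
  return std::log2(X) * Log2BaseInverted;
}
// ln(x)    -> flogSketch(x, 0.6931472f)   // numbers::ln2f
// log10(x) -> flogSketch(x, 0.30103f)     // numbers::ln2f / numbers::ln10f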
2348
2349// exp2(M_LOG2E_F * f);
2350SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2351 EVT VT = Op.getValueType();
2352 SDLoc SL(Op);
2353 SDValue Src = Op.getOperand(0);
2354
2355 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2356 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2357 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2358}
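
// The scalar equivalent of the expansion above (hypothetical, assumes <cmath>).
static float fexpSketch(float X) {
  return std::exp2(X * 1.44269504f);      // numbers::log2e
}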
2359
2360static bool isCtlzOpc(unsigned Opc) {
2361 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2362}
2363
2364static bool isCttzOpc(unsigned Opc) {
2365 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2366}
2367
2368SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2369 SDLoc SL(Op);
2370 SDValue Src = Op.getOperand(0);
2371
2372 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2373 bool Ctlz = isCtlzOpc(Op.getOpcode());
2374 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2375
2376 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2377 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2378
2379 if (Src.getValueType() == MVT::i32) {
2380 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2381 // (cttz hi:lo) -> (umin (ffbl src), 32)
2382 // (ctlz_zero_undef src) -> (ffbh src)
2383 // (cttz_zero_undef src) -> (ffbl src)
2384 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2385 if (!ZeroUndef) {
2386 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2387 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2388 }
2389 return NewOpr;
2390 }
2391
2392 SDValue Lo, Hi;
2393 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2394
2395 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2396 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2397
2398 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2399 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2400 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2401 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2402
2403 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2404 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2405 if (Ctlz)
2406 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2407 else
2408 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2409
2410 SDValue NewOpr;
2411 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2412 if (!ZeroUndef) {
2413 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2414 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2415 }
2416
2417 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2418}
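
// A scalar model (hypothetical, assumes <cstdint> and <algorithm>) of the
// 64-bit ctlz formula above. ffbh_u32 is modeled as returning all-ones for a
// zero input, which is an assumption about the hardware instruction.
static uint32_t ffbhU32Sketch(uint32_t V) {
  uint32_t N = 0;
  for (uint32_t Bit = 0x80000000u; Bit != 0 && !(V & Bit); Bit >>= 1)
    ++N;                                            // count leading zeros
  return V ? N : 0xffffffffu;
}
static uint32_t ctlz64Sketch(uint64_t Src) {
  uint32_t Hi = uint32_t(Src >> 32), Lo = uint32_t(Src);
  uint32_t LoPart = ffbhU32Sketch(Lo);
  LoPart = LoPart > 0xffffffffu - 32 ? 0xffffffffu : LoPart + 32;  // uaddsat
  return std::min(std::min(ffbhU32Sketch(Hi), LoPart), 64u);       // umin3
}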
2419
2420SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2421 bool Signed) const {
2422 // The regular method converting a 64-bit integer to float roughly consists of
2423 // 2 steps: normalization and rounding. In fact, after normalization, the
2424 // conversion from a 64-bit integer to a float is essentially the same as the
2425 // one from a 32-bit integer. The only difference is that it has more
2426 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2427 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2428 // converted into the correct float number. The basic steps for the unsigned
2429 // conversion are illustrated in the following pseudo code:
2430 //
2431 // f32 uitofp(i64 u) {
2432 // i32 hi, lo = split(u);
2433 // // Only count the leading zeros in hi as we have native support of the
2434 // // conversion from i32 to f32. If hi is all 0s, the conversion is
2435 // // reduced to a 32-bit one automatically.
2436 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2437 // u <<= shamt;
2438 // hi, lo = split(u);
2439 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2440 // // convert it as a 32-bit integer and scale the result back.
2441 // return uitofp(hi) * 2^(32 - shamt);
2442 // }
2443 //
2444 // The signed one follows the same principle but uses 'ffbh_i32' to count its
2445 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2446 // converted instead, followed by negation based on its sign bit.
2447
2448 SDLoc SL(Op);
2449 SDValue Src = Op.getOperand(0);
2450
2451 SDValue Lo, Hi;
2452 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2453 SDValue Sign;
2454 SDValue ShAmt;
2455 if (Signed && Subtarget->isGCN()) {
2456 // We also need to consider the sign bit in Lo if Hi has just sign bits,
2457 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2458 // account. That is, the maximal shift is
2459 // - 32 if Lo and Hi have opposite signs;
2460 // - 33 if Lo and Hi have the same sign.
2461 //
2462 // Or, MaxShAmt = 33 + OppositeSign, where
2463 //
2464 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2465 // - -1 if Lo and Hi have opposite signs; and
2466 // - 0 otherwise.
2467 //
2468 // All in all, ShAmt is calculated as
2469 //
2470 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2471 //
2472 // or
2473 //
2474 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2475 //
2476 // to reduce the critical path.
2477 SDValue OppositeSign = DAG.getNode(
2478 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2479 DAG.getConstant(31, SL, MVT::i32));
2480 SDValue MaxShAmt =
2481 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2482 OppositeSign);
2483 // Count the leading sign bits.
2484 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2485 // Different from unsigned conversion, the shift should be one bit less to
2486 // preserve the sign bit.
2487 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2488 DAG.getConstant(1, SL, MVT::i32));
2489 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2490 } else {
2491 if (Signed) {
2492 // Without 'ffbh_i32', only leading zeros could be counted. Take the
2493 // absolute value first.
2494 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2495 DAG.getConstant(63, SL, MVT::i64));
2496 SDValue Abs =
2497 DAG.getNode(ISD::XOR, SL, MVT::i64,
2498 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2499 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2500 }
2501 // Count the leading zeros.
2502 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2503 // The shift amount for signed integers is [0, 32].
2504 }
2505 // Normalize the given 64-bit integer.
2506 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2507 // Split it again.
2508 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2509 // Calculate the adjust bit for rounding.
2510 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2511 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2512 DAG.getConstant(1, SL, MVT::i32), Lo);
2513 // Get the 32-bit normalized integer.
2514 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2515 // Convert the normalized 32-bit integer into f32.
2516 unsigned Opc =
2517 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2518 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2519
2520 // Finally, need to scale back the converted floating number as the original
2521 // 64-bit integer is converted as a 32-bit one.
2522 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2523 ShAmt);
2524 // On GCN, use LDEXP directly.
2525 if (Subtarget->isGCN())
2526 return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2527
2528 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2529 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2530 // exponent is enough to avoid overflowing into the sign bit.
2531 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2532 DAG.getConstant(23, SL, MVT::i32));
2533 SDValue IVal =
2534 DAG.getNode(ISD::ADD, SL, MVT::i32,
2535 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2536 if (Signed) {
2537 // Set the sign bit.
2538 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2539 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2540 DAG.getConstant(31, SL, MVT::i32));
2541 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2542 }
2543 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2544}
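
// A runnable version (hypothetical, assumes <cmath> and <cstdint>) of the
// unsigned pseudo code in the comment above.
static float uitofp64Sketch(uint64_t U) {
  uint32_t Hi = uint32_t(U >> 32);
  uint32_t ShAmt = 0;                       // clz(Hi); 32 when Hi is all zeros
  for (uint32_t Bit = 0x80000000u; Bit != 0 && !(Hi & Bit); Bit >>= 1)
    ++ShAmt;
  U <<= ShAmt;                              // normalize
  Hi = uint32_t(U >> 32);
  Hi |= (uint32_t(U) != 0) ? 1u : 0u;       // sticky bit for correct rounding
  return std::ldexp(float(Hi), int(32 - ShAmt));   // scale the result back
}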
2545
2546SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2547 bool Signed) const {
2548 SDLoc SL(Op);
2549 SDValue Src = Op.getOperand(0);
2550
2551 SDValue Lo, Hi;
2552 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2553
2554 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2555 SL, MVT::f64, Hi);
2556
2557 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2558
2559 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2560 DAG.getConstant(32, SL, MVT::i32));
2561 // TODO: Should this propagate fast-math-flags?
2562 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2563}
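
// The scalar equivalent (hypothetical, assumes <cmath> and <cstdint>) of the
// split conversion above: convert each 32-bit half separately and recombine
// them with ldexp.
static double itofp64Sketch(int64_t Src, bool Signed) {
  uint32_t Lo = uint32_t(uint64_t(Src));
  uint32_t Hi = uint32_t(uint64_t(Src) >> 32);
  double CvtHi = Signed ? double(int32_t(Hi)) : double(Hi);
  return std::ldexp(CvtHi, 32) + double(Lo);       // CvtHi * 2^32 + CvtLo
}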
2564
2565SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2566 SelectionDAG &DAG) const {
2567 // TODO: Factor out code common with LowerSINT_TO_FP.
2568 EVT DestVT = Op.getValueType();
2569 SDValue Src = Op.getOperand(0);
2570 EVT SrcVT = Src.getValueType();
2571
2572 if (SrcVT == MVT::i16) {
2573 if (DestVT == MVT::f16)
2574 return Op;
2575 SDLoc DL(Op);
2576
2577 // Promote src to i32
2578 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2579 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2580 }
2581
2582 assert(SrcVT == MVT::i64 && "operation should be legal");
2583
2584 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2585 SDLoc DL(Op);
2586
2587 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2588 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2589 SDValue FPRound =
2590 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2591
2592 return FPRound;
2593 }
2594
2595 if (DestVT == MVT::f32)
2596 return LowerINT_TO_FP32(Op, DAG, false);
2597
2598 assert(DestVT == MVT::f64);
2599 return LowerINT_TO_FP64(Op, DAG, false);
2600}
2601
2602SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2603 SelectionDAG &DAG) const {
2604 EVT DestVT = Op.getValueType();
2605
2606 SDValue Src = Op.getOperand(0);
2607 EVT SrcVT = Src.getValueType();
2608
2609 if (SrcVT == MVT::i16) {
2610 if (DestVT == MVT::f16)
2611 return Op;
2612
2613 SDLoc DL(Op);
2614 // Promote src to i32
2615 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2616 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2617 }
2618
2619 assert(SrcVT == MVT::i64 && "operation should be legal");
2620
2621 // TODO: Factor out code common with LowerUINT_TO_FP.
2622
2623 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2624 SDLoc DL(Op);
2625 SDValue Src = Op.getOperand(0);
2626
2627 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2628 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2629 SDValue FPRound =
2630 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2631
2632 return FPRound;
2633 }
2634
2635 if (DestVT == MVT::f32)
2636 return LowerINT_TO_FP32(Op, DAG, true);
2637
2638 assert(DestVT == MVT::f64);
2639 return LowerINT_TO_FP64(Op, DAG, true);
2640}
2641
2642SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2643 bool Signed) const {
2644 SDLoc SL(Op);
2645
2646 SDValue Src = Op.getOperand(0);
2647 EVT SrcVT = Src.getValueType();
2648
2649 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2650
2651 // The basic idea of converting a floating point number into a pair of 32-bit
2652 // integers is illustrated as follows:
2653 //
2654 // tf := trunc(val);
2655 // hif := floor(tf * 2^-32);
2656 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2657 // hi := fptoi(hif);
2658 // lo := fptoi(lof);
2659 //
2660 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2661 SDValue Sign;
2662 if (Signed && SrcVT == MVT::f32) {
2663 // However, a 32-bit floating point number has only 23 bits mantissa and
2664 // it's not enough to hold all the significant bits of `lof` if val is
2665 // negative. To avoid the loss of precision, We need to take the absolute
2666 // value after truncating and flip the result back based on the original
2667 // signedness.
2668 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2669 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2670 DAG.getConstant(31, SL, MVT::i32));
2671 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2672 }
2673
2674 SDValue K0, K1;
2675 if (SrcVT == MVT::f64) {
2676 K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2677 SL, SrcVT);
2678 K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2679 SL, SrcVT);
2680 } else {
2681 K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2682 SrcVT);
2683 K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2684 SrcVT);
2685 }
2686 // TODO: Should this propagate fast-math-flags?
2687 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2688
2689 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2690
2691 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2692
2693 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2694 : ISD::FP_TO_UINT,
2695 SL, MVT::i32, FloorMul);
2696 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2697
2698 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2699 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2700
2701 if (Signed && SrcVT == MVT::f32) {
2702 assert(Sign);
2703 // Flip the result based on the signedness, which is either all 0s or 1s.
2704 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2705 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2706 // r := xor(r, sign) - sign;
2707 Result =
2708 DAG.getNode(ISD::SUB, SL, MVT::i64,
2709 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2710 }
2711
2712 return Result;
2713}
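Editorial note: a scalar sketch (illustrative only) of the hif/lof decomposition described in the comment above, for the in-range, non-negative f64 -> u64 case; the two constants are 2^-32 and -2^32. The helper name f64ToU64Model is hypothetical.

// Illustrative scalar model of LowerFP_TO_INT64 (f64, unsigned, in-range input).
#include <cmath>
#include <cstdint>

static uint64_t f64ToU64Model(double Val) {
  double Tf  = std::trunc(Val);               // tf  := trunc(val)
  double Hif = std::floor(Tf * 0x1p-32);      // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1p32, Tf);    // lof := tf - hif * 2^32, >= 0
  uint32_t Hi = uint32_t(Hif);                // hi  := fptoui(hif)
  uint32_t Lo = uint32_t(Lof);                // lo  := fptoui(lof)
  return (uint64_t(Hi) << 32) | Lo;           // bitcast {lo, hi} back to i64
}

For the signed f32 path the code above additionally takes fabs after truncating and flips the final result by the saved sign, as the surrounding comments explain.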
2714
2715SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2716 SDLoc DL(Op);
2717 SDValue N0 = Op.getOperand(0);
2718
2719 // Convert to target node to get known bits
2720 if (N0.getValueType() == MVT::f32)
2721 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2722
2723 if (getTargetMachine().Options.UnsafeFPMath) {
2724 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2725 return SDValue();
2726 }
2727
2728 assert(N0.getSimpleValueType() == MVT::f64);
2729
2730 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2731 const unsigned ExpMask = 0x7ff;
2732 const unsigned ExpBiasf64 = 1023;
2733 const unsigned ExpBiasf16 = 15;
2734 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2735 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2736 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2737 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2738 DAG.getConstant(32, DL, MVT::i64));
2739 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2740 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2741 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2742 DAG.getConstant(20, DL, MVT::i64));
2743 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2744 DAG.getConstant(ExpMask, DL, MVT::i32));
2745 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2746 // add the f16 bias (15) to get the biased exponent for the f16 format.
2747 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2748 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2749
2750 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2751 DAG.getConstant(8, DL, MVT::i32));
2752 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2753 DAG.getConstant(0xffe, DL, MVT::i32));
2754
2755 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2756 DAG.getConstant(0x1ff, DL, MVT::i32));
2757 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2758
2759 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2760 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2761
2762 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2763 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2764 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2765 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2766
2767 // N = M | (E << 12);
2768 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2769 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2770 DAG.getConstant(12, DL, MVT::i32)));
2771
2772 // B = clamp(1-E, 0, 13);
2773 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2774 One, E);
2775 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2776 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2777 DAG.getConstant(13, DL, MVT::i32));
2778
2779 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2780 DAG.getConstant(0x1000, DL, MVT::i32));
2781
2782 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2783 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2784 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2785 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2786
2787 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2788 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2789 DAG.getConstant(0x7, DL, MVT::i32));
2790 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2791 DAG.getConstant(2, DL, MVT::i32));
2792 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2793 One, Zero, ISD::SETEQ);
2794 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2795 One, Zero, ISD::SETGT);
2796 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2797 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2798
2799 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2800 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2801 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2802 I, V, ISD::SETEQ);
2803
2804 // Extract the sign bit.
2805 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2806 DAG.getConstant(16, DL, MVT::i32));
2807 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2808 DAG.getConstant(0x8000, DL, MVT::i32));
2809
2810 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2811 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2812}
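Editorial note: a partial scalar sketch (illustrative only) of just the field-extraction and exponent-rebias step of the f64 -> f16 expansion above; the rounding, subnormal, and overflow handling that follows in the listing is deliberately omitted. The helper name f64FieldsForF16 is hypothetical.

// Illustrative extraction of the sign, rebased exponent, and f16-aligned
// mantissa bits, mirroring the UH/E/M/Sign computations above.
#include <cstdint>
#include <cstring>

static void f64FieldsForF16(double N0, uint32_t &Sign, int32_t &E, uint32_t &M) {
  uint64_t U;
  std::memcpy(&U, &N0, sizeof(U));            // bitcast f64 -> i64
  uint32_t UH = uint32_t(U >> 32);            // high 32 bits
  Sign = (UH >> 16) & 0x8000;                 // sign bit, already in f16 position
  E = int32_t((UH >> 20) & 0x7ff)             // biased f64 exponent
      - 1023 + 15;                            // rebias: -ExpBiasf64 + ExpBiasf16
  M = (UH >> 8) & 0xffe;                      // top mantissa bits, f16-aligned
}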
2813
2814SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2815 SelectionDAG &DAG) const {
2816 SDValue Src = Op.getOperand(0);
2817 unsigned OpOpcode = Op.getOpcode();
2818 EVT SrcVT = Src.getValueType();
2819 EVT DestVT = Op.getValueType();
2820
2821 // Will be selected natively
2822 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2823 return Op;
2824
2825 // Promote i16 to i32
2826 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2827 SDLoc DL(Op);
2828
2829 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2830 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2831 }
2832
2833 if (SrcVT == MVT::f16 ||
2834 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2835 SDLoc DL(Op);
2836
2837 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2838 unsigned Ext =
2839 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2840 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2841 }
2842
2843 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2844 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2845
2846 return SDValue();
2847}
2848
2849SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2850 SelectionDAG &DAG) const {
2851 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2852 MVT VT = Op.getSimpleValueType();
2853 MVT ScalarVT = VT.getScalarType();
2854
2855 assert(VT.isVector());
2856
2857 SDValue Src = Op.getOperand(0);
2858 SDLoc DL(Op);
2859
2860 // TODO: Don't scalarize on Evergreen?
2861 unsigned NElts = VT.getVectorNumElements();
2862 SmallVector<SDValue, 8> Args;
2863 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2864
2865 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2866 for (unsigned I = 0; I < NElts; ++I)
2867 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2868
2869 return DAG.getBuildVector(VT, DL, Args);
2870}
2871
2872//===----------------------------------------------------------------------===//
2873// Custom DAG optimizations
2874//===----------------------------------------------------------------------===//
2875
2876static bool isU24(SDValue Op, SelectionDAG &DAG) {
2877 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2878}
2879
2880static bool isI24(SDValue Op, SelectionDAG &DAG) {
2881 EVT VT = Op.getValueType();
2882 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2883 // as unsigned 24-bit values.
2884 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2885}
2886
2887static SDValue simplifyMul24(SDNode *Node24,
2888 TargetLowering::DAGCombinerInfo &DCI) {
2889 SelectionDAG &DAG = DCI.DAG;
2890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2891 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2892
2893 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2894 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2895 unsigned NewOpcode = Node24->getOpcode();
2896 if (IsIntrin) {
2897 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2898 switch (IID) {
2899 case Intrinsic::amdgcn_mul_i24:
2900 NewOpcode = AMDGPUISD::MUL_I24;
2901 break;
2902 case Intrinsic::amdgcn_mul_u24:
2903 NewOpcode = AMDGPUISD::MUL_U24;
2904 break;
2905 case Intrinsic::amdgcn_mulhi_i24:
2906 NewOpcode = AMDGPUISD::MULHI_I24;
2907 break;
2908 case Intrinsic::amdgcn_mulhi_u24:
2909 NewOpcode = AMDGPUISD::MULHI_U24;
2910 break;
2911 default:
2912 llvm_unreachable("Expected 24-bit mul intrinsic");
2913 }
2914 }
2915
2916 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2917
2918 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2919 // the operands to have other uses, but will only perform simplifications that
2920 // involve bypassing some nodes for this user.
2921 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2922 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2923 if (DemandedLHS || DemandedRHS)
2924 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2925 DemandedLHS ? DemandedLHS : LHS,
2926 DemandedRHS ? DemandedRHS : RHS);
2927
2928 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2929 // operands if this node is the only user.
2930 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2931 return SDValue(Node24, 0);
2932 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2933 return SDValue(Node24, 0);
2934
2935 return SDValue();
2936}
2937
2938template <typename IntTy>
2939static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2940 uint32_t Width, const SDLoc &DL) {
2941 if (Width + Offset < 32) {
2942 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2943 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2944 return DAG.getConstant(Result, DL, MVT::i32);
2945 }
2946
2947 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2948}
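Editorial note: a small worked example (illustrative only) of the constant fold above, showing how the shift-left-then-shift-right pair extracts a bitfield and how the signedness of IntTy decides between sign and zero extension. The helper name bfeConstantFoldExample and the concrete values are hypothetical.

// Extract the 4-bit field at offset 4 from 0xF0: Width + Offset = 8 < 32.
#include <cassert>
#include <cstdint>

static void bfeConstantFoldExample() {
  // Unsigned BFE: (0xF0u << 24) >> 28 == 0xF (logical right shift).
  uint32_t UnsignedField = (UINT32_C(0xF0) << 24) >> 28;
  // Signed BFE: the same shifts on int32_t sign-extend the field to -1.
  int32_t SignedField = int32_t(UINT32_C(0xF0) << 24) >> 28;
  assert(UnsignedField == 0xF && SignedField == -1);
  (void)UnsignedField;
  (void)SignedField;
}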
2949
2950static bool hasVolatileUser(SDNode *Val) {
2951 for (SDNode *U : Val->uses()) {
2952 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2953 if (M->isVolatile())
2954 return true;
2955 }
2956 }
2957
2958 return false;
2959}
2960
2961bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2962 // i32 vectors are the canonical memory type.
2963 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2964 return false;
2965
2966 if (!VT.isByteSized())
2967 return false;
2968
2969 unsigned Size = VT.getStoreSize();
2970
2971 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2972 return false;
2973
2974 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2975 return false;
2976
2977 return true;
2978}
2979
2980// Replace load of an illegal type with a store of a bitcast to a friendlier
2981// type.
2982SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2983 DAGCombinerInfo &DCI) const {
2984 if (!DCI.isBeforeLegalize())
2985 return SDValue();
2986
2987 LoadSDNode *LN = cast<LoadSDNode>(N);
2988 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2989 return SDValue();
2990
2991 SDLoc SL(N);
2992 SelectionDAG &DAG = DCI.DAG;
2993 EVT VT = LN->getMemoryVT();
2994
2995 unsigned Size = VT.getStoreSize();
2996 Align Alignment = LN->getAlign();
2997 if (Alignment < Size && isTypeLegal(VT)) {
2998 bool IsFast;
2999 unsigned AS = LN->getAddressSpace();
3000
3001 // Expand unaligned loads earlier than legalization. Due to visitation order
3002 // problems during legalization, the emitted instructions to pack and unpack
3003 // the bytes again are not eliminated in the case of an unaligned copy.
3004 if (!allowsMisalignedMemoryAccesses(
3005 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3006 SDValue Ops[2];
3007
3008 if (VT.isVector())
3009 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
3010 else
3011 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3012
3013 return DAG.getMergeValues(Ops, SDLoc(N));
3014 }
3015
3016 if (!IsFast)
3017 return SDValue();
3018 }
3019
3020 if (!shouldCombineMemoryType(VT))
3021 return SDValue();
3022
3023 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3024
3025 SDValue NewLoad
3026 = DAG.getLoad(NewVT, SL, LN->getChain(),
3027 LN->getBasePtr(), LN->getMemOperand());
3028
3029 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3030 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3031 return SDValue(N, 0);
3032}
3033
3034// Replace store of an illegal type with a store of a bitcast to a friendlier
3035// type.
3036SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3037 DAGCombinerInfo &DCI) const {
3038 if (!DCI.isBeforeLegalize())
3039 return SDValue();
3040
3041 StoreSDNode *SN = cast<StoreSDNode>(N);
3042 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3043 return SDValue();
3044
3045 EVT VT = SN->getMemoryVT();
3046 unsigned Size = VT.getStoreSize();
3047
3048 SDLoc SL(N);
3049 SelectionDAG &DAG = DCI.DAG;
3050 Align Alignment = SN->getAlign();
3051 if (Alignment < Size && isTypeLegal(VT)) {
3052 bool IsFast;
3053 unsigned AS = SN->getAddressSpace();
3054
3055 // Expand unaligned stores earlier than legalization. Due to visitation
3056 // order problems during legalization, the emitted instructions to pack and
3057 // unpack the bytes again are not eliminated in the case of an unaligned
3058 // copy.
3059 if (!allowsMisalignedMemoryAccesses(
3060 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3061 if (VT.isVector())
3062 return scalarizeVectorStore(SN, DAG);
3063
3064 return expandUnalignedStore(SN, DAG);
3065 }
3066
3067 if (!IsFast)
3068 return SDValue();
3069 }
3070
3071 if (!shouldCombineMemoryType(VT))
3072 return SDValue();
3073
3074 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3075 SDValue Val = SN->getValue();
3076
3077 //DCI.AddToWorklist(Val.getNode());
3078
3079 bool OtherUses = !Val.hasOneUse();
3080 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3081 if (OtherUses) {
3082 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3083 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3084 }
3085
3086 return DAG.getStore(SN->getChain(), SL, CastVal,
3087 SN->getBasePtr(), SN->getMemOperand());
3088}
3089
3090// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3091// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3092// issues.
3093SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3094 DAGCombinerInfo &DCI) const {
3095 SelectionDAG &DAG = DCI.DAG;
3096 SDValue N0 = N->getOperand(0);
3097
3098 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3099 // (vt2 (truncate (assertzext vt0:x, vt1)))
3100 if (N0.getOpcode() == ISD::TRUNCATE) {
3101 SDValue N1 = N->getOperand(1);
3102 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3103 SDLoc SL(N);
3104
3105 SDValue Src = N0.getOperand(0);
3106 EVT SrcVT = Src.getValueType();
3107 if (SrcVT.bitsGE(ExtVT)) {
3108 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3109 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3110 }
3111 }
3112
3113 return SDValue();
3114}
3115
3116SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3117 SDNode *N, DAGCombinerInfo &DCI) const {
3118 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3119 switch (IID) {
3120 case Intrinsic::amdgcn_mul_i24:
3121 case Intrinsic::amdgcn_mul_u24:
3122 case Intrinsic::amdgcn_mulhi_i24:
3123 case Intrinsic::amdgcn_mulhi_u24:
3124 return simplifyMul24(N, DCI);
3125 case Intrinsic::amdgcn_fract:
3126 case Intrinsic::amdgcn_rsq:
3127 case Intrinsic::amdgcn_rcp_legacy:
3128 case Intrinsic::amdgcn_rsq_legacy:
3129 case Intrinsic::amdgcn_rsq_clamp:
3130 case Intrinsic::amdgcn_ldexp: {
3131 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3132 SDValue Src = N->getOperand(1);
3133 return Src.isUndef() ? Src : SDValue();
3134 }
3135 default:
3136 return SDValue();
3137 }
3138}
3139
3140/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3141/// binary operation \p Opc to it with the corresponding constant operands.
3142SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3143 DAGCombinerInfo &DCI, const SDLoc &SL,
3144 unsigned Opc, SDValue LHS,
3145 uint32_t ValLo, uint32_t ValHi) const {
3146 SelectionDAG &DAG = DCI.DAG;
3147 SDValue Lo, Hi;
3148 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3149
3150 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3151 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3152
3153 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3154 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3155
3156 // Re-visit the ands. It's possible we eliminated one of them and it could
3157 // simplify the vector.
3158 DCI.AddToWorklist(Lo.getNode());
3159 DCI.AddToWorklist(Hi.getNode());
3160
3161 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3162 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3163}
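Editorial note: a scalar sketch (illustrative only) of the 64-bit split above: the 32-bit bitwise op is applied to each half with its own constant half and the results are reassembled, which is what the build_vector + bitcast expresses. Shown for AND; the helper name splitBitwiseAndModel is hypothetical.

// Illustrative scalar model of splitBinaryBitConstantOpImpl for ISD::AND.
#include <cstdint>

static uint64_t splitBitwiseAndModel(uint64_t LHS, uint32_t ValLo, uint32_t ValHi) {
  uint32_t Lo = uint32_t(LHS) & ValLo;        // low half with the low constant
  uint32_t Hi = uint32_t(LHS >> 32) & ValHi;  // high half with the high constant
  return (uint64_t(Hi) << 32) | Lo;           // reassemble the i64 result
}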
3164
3165SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3166 DAGCombinerInfo &DCI) const {
3167 EVT VT = N->getValueType(0);
3168
3169 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3170 if (!RHS)
3171 return SDValue();
3172
3173 SDValue LHS = N->getOperand(0);
3174 unsigned RHSVal = RHS->getZExtValue();
3175 if (!RHSVal)
3176 return LHS;
3177
3178 SDLoc SL(N);
3179 SelectionDAG &DAG = DCI.DAG;
3180
3181 switch (LHS->getOpcode()) {
3182 default:
3183 break;
3184 case ISD::ZERO_EXTEND:
3185 case ISD::SIGN_EXTEND:
3186 case ISD::ANY_EXTEND: {
3187 SDValue X = LHS->getOperand(0);
3188
3189 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3190 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3191 // Prefer build_vector as the canonical form if packed types are legal.
3192 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3193 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3194 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3195 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3196 }
3197
3198 // shl (ext x) => zext (shl x), if shift does not overflow int
3199 if (VT != MVT::i64)
3200 break;
3201 KnownBits Known = DAG.computeKnownBits(X);
3202 unsigned LZ = Known.countMinLeadingZeros();
3203 if (LZ < RHSVal)
3204 break;
3205 EVT XVT = X.getValueType();
3206 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3207 return DAG.getZExtOrTrunc(Shl, SL, VT);
3208 }
3209 }
3210
3211 if (VT != MVT::i64)
3212 return SDValue();
3213
3214 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3215
3216 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3217 // common case, splitting this into a move and a 32-bit shift is faster and
3218 // the same code size.
3219 if (RHSVal < 32)
3220 return SDValue();
3221
3222 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3223
3224 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3225 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3226
3227 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3228
3229 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3230 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3231}
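Editorial note: a scalar sketch (illustrative only) of the i64 shl rewrite above for shift amounts of 32 or more: the low result word is zero and the high word is the low input word shifted by C - 32, matching the build_pair the combine emits. The helper name shl64By32PlusModel is hypothetical.

// Illustrative scalar model of the i64 (shl x, C) split, 32 <= C < 64.
#include <cstdint>

static uint64_t shl64By32PlusModel(uint64_t X, unsigned C) {
  uint32_t NewShift = uint32_t(X) << (C - 32); // 32-bit shift of lo_32(x)
  return uint64_t(NewShift) << 32;             // build_pair 0, NewShift
}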
3232
3233SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3234 DAGCombinerInfo &DCI) const {
3235 if (N->getValueType(0) != MVT::i64)
3236 return SDValue();
3237
3238 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3239 if (!RHS)
3240 return SDValue();
3241
3242 SelectionDAG &DAG = DCI.DAG;
3243 SDLoc SL(N);
3244 unsigned RHSVal = RHS->getZExtValue();
3245
3246 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3247 if (RHSVal == 32) {
3248 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3249 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3250 DAG.getConstant(31, SL, MVT::i32));
3251
3252 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3253 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3254 }
3255
3256 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3257 if (RHSVal == 63) {
3258 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3259 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3260 DAG.getConstant(31, SL, MVT::i32));
3261 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3262 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3263 }
3264
3265 return SDValue();
3266}
3267
3268SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3269 DAGCombinerInfo &DCI) const {
3270 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3271 if (!RHS)
3272 return SDValue();
3273
3274 EVT VT = N->getValueType(0);
3275 SDValue LHS = N->getOperand(0);
3276 unsigned ShiftAmt = RHS->getZExtValue();
3277 SelectionDAG &DAG = DCI.DAG;
3278 SDLoc SL(N);
3279
3280 // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
3281 // this improves the ability to match BFE patterns in isel.
3282 if (LHS.getOpcode() == ISD::AND) {
3283 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3284 if (Mask->getAPIntValue().isShiftedMask() &&
3285 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3286 return DAG.getNode(
3287 ISD::AND, SL, VT,
3288 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3289 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3290 }
3291 }
3292 }
3293
3294 if (VT != MVT::i64)
3295 return SDValue();
3296
3297 if (ShiftAmt < 32)
3298 return SDValue();
3299
3300 // srl i64:x, C for C >= 32
3301 // =>
3302 // build_pair (srl hi_32(x), C - 32), 0
3303 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3304
3305 SDValue Hi = getHiHalf64(LHS, DAG);
3306
3307 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3308 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3309
3310 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3311
3312 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3313}
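Editorial note: the matching scalar sketch (illustrative only) for the srl case above: for shift amounts of 32 or more, the high result word is zero and the low word is hi_32(x) shifted right by C - 32. The helper name srl64By32PlusModel is hypothetical.

// Illustrative scalar model of the i64 (srl x, C) split, 32 <= C < 64.
#include <cstdint>

static uint64_t srl64By32PlusModel(uint64_t X, unsigned C) {
  uint32_t NewShift = uint32_t(X >> 32) >> (C - 32); // 32-bit shift of hi_32(x)
  return uint64_t(NewShift);                         // build_pair NewShift, 0
}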
3314
3315SDValue AMDGPUTargetLowering::performTruncateCombine(
3316 SDNode *N, DAGCombinerInfo &DCI) const {
3317 SDLoc SL(N);
3318 SelectionDAG &DAG = DCI.DAG;
3319 EVT VT = N->getValueType(0);
3320 SDValue Src = N->getOperand(0);
3321
3322 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3323 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3324 SDValue Vec = Src.getOperand(0);
3325 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3326 SDValue Elt0 = Vec.getOperand(0);
3327 EVT EltVT = Elt0.getValueType();
3328 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3329 if (EltVT.isFloatingPoint()) {
3330 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3331 EltVT.changeTypeToInteger(), Elt0);
3332 }
3333
3334 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3335 }
3336 }
3337 }
3338
3339 // Equivalent of above for accessing the high element of a vector as an
3340 // integer operation.
3341 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3342 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3343 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3344 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3345 SDValue BV = stripBitcast(Src.getOperand(0));
3346 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3347 BV.getValueType().getVectorNumElements() == 2) {
3348 SDValue SrcElt = BV.getOperand(1);
3349 EVT SrcEltVT = SrcElt.getValueType();
3350 if (SrcEltVT.isFloatingPoint()) {
3351 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3352 SrcEltVT.changeTypeToInteger(), SrcElt);
3353 }
3354
3355 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3356 }
3357 }
3358 }
3359 }
3360
3361 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3362 //
3363 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3364 // i16 (trunc (srl (i32 (trunc x), K)))
3365 if (VT.getScalarSizeInBits() < 32) {
3366 EVT SrcVT = Src.getValueType();
3367 if (SrcVT.getScalarSizeInBits() > 32 &&
3368 (Src.getOpcode() == ISD::SRL ||
3369 Src.getOpcode() == ISD::SRA ||
3370 Src.getOpcode() == ISD::SHL)) {
3371 SDValue Amt = Src.getOperand(1);
3372 KnownBits Known = DAG.computeKnownBits(Amt);
3373 unsigned Size = VT.getScalarSizeInBits();
3374 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3375 (Known.countMaxActiveBits() <= Log2_32(Size))) {
3376 EVT MidVT = VT.isVector() ?
3377 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3378 VT.getVectorNumElements()) : MVT::i32;
3379
3380 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3381 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3382 Src.getOperand(0));
3383 DCI.AddToWorklist(Trunc.getNode());
3384
3385 if (Amt.getValueType() != NewShiftVT) {
3386 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3387 DCI.AddToWorklist(Amt.getNode());
3388 }
3389
3390 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3391 Trunc, Amt);
3392 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3393 }
3394 }
3395 }
3396
3397 return SDValue();
3398}
3399
3400// We need to specifically handle i64 mul here to avoid unnecessary conversion
3401// instructions. If we only match on the legalized i64 mul expansion,
3402// SimplifyDemandedBits will be unable to remove them because there will be
3403// multiple uses due to the separate mul + mulh[su].
3404static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3405 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3406 if (Size <= 32) {
3407 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3408 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3409 }
3410
3411 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3412 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3413
3414 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3415 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3416
3417 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3418}
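Editorial note: a scalar sketch (illustrative only) of the 24-bit multiply expansion above for the unsigned, wider-than-32-bit case: the full 48-bit product is produced as a {mul_u24, mulhi_u24} pair. The helper name mulU24Model is hypothetical.

// Illustrative scalar model of getMul24 for MUL_U24 / MULHI_U24.
#include <cstdint>

static uint64_t mulU24Model(uint32_t N0, uint32_t N1) {
  uint64_t Product = uint64_t(N0 & 0xffffff) * (N1 & 0xffffff); // 24x24 -> 48 bits
  uint32_t MulLo = uint32_t(Product);         // MUL_U24 result (low 32 bits)
  uint32_t MulHi = uint32_t(Product >> 32);   // MULHI_U24 result (high 16 bits)
  return (uint64_t(MulHi) << 32) | MulLo;     // BUILD_PAIR MulLo, MulHi
}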
3419
3420SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3421 DAGCombinerInfo &DCI) const {
3422 EVT VT = N->getValueType(0);
3423
3424 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3425 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3426 // unnecessarily). isDivergent() is used as an approximation of whether the
3427 // value is in an SGPR.
3428 if (!N->isDivergent())
3429 return SDValue();
3430
3431 unsigned Size = VT.getSizeInBits();
3432 if (VT.isVector() || Size > 64)
3433 return SDValue();
3434
3435 // There are i16 integer mul/mad.
3436 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3437 return SDValue();
3438
3439 SelectionDAG &DAG = DCI.DAG;
3440 SDLoc DL(N);
3441
3442 SDValue N0 = N->getOperand(0);
3443 SDValue N1 = N->getOperand(1);
3444
3445 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3446 // in the source into any_extends if the result of the mul is truncated. Since
3447 // we can assume the high bits are whatever we want, use the underlying value
3448 // to keep the unknown high bits from interfering.
3449 if (N0.getOpcode() == ISD::ANY_EXTEND)
3450 N0 = N0.getOperand(0);
3451
3452 if (N1.getOpcode() == ISD::ANY_EXTEND)
3453 N1 = N1.getOperand(0);
3454
3455 SDValue Mul;
3456
3457 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3458 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3459 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3460 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3461 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3462 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3463 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3464 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3465 } else {
3466 return SDValue();
3467 }
3468
3469 // We need to use sext even for MUL_U24, because MUL_U24 is used
3470 // for signed multiply of 8 and 16-bit types.
3471 return DAG.getSExtOrTrunc(Mul, DL, VT);
3472}
3473
3474SDValue
3475AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3476 DAGCombinerInfo &DCI) const {
3477 if (N->getValueType(0) != MVT::i32)
3478 return SDValue();
3479
3480 SelectionDAG &DAG = DCI.DAG;
3481 SDLoc DL(N);
3482
3483 SDValue N0 = N->getOperand(0);
3484 SDValue N1 = N->getOperand(1);
3485
3486 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3487 // in the source into any_extends if the result of the mul is truncated. Since
3488 // we can assume the high bits are whatever we want, use the underlying value
3489 // to keep the unknown high bits from interfering.
3490 if (N0.getOpcode() == ISD::ANY_EXTEND)
3491 N0 = N0.getOperand(0);
3492 if (N1.getOpcode() == ISD::ANY_EXTEND)
3493 N1 = N1.getOperand(0);
3494
3495 // Try to use two fast 24-bit multiplies (one for each half of the result)
3496 // instead of one slow extending multiply.
3497 unsigned LoOpcode, HiOpcode;
3498 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3499 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3500 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3501 LoOpcode = AMDGPUISD::MUL_U24;
3502 HiOpcode = AMDGPUISD::MULHI_U24;
3503 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3504 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3505 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3506 LoOpcode = AMDGPUISD::MUL_I24;
3507 HiOpcode = AMDGPUISD::MULHI_I24;
3508 } else {
3509 return SDValue();
3510 }
3511
3512 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3513 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3514 DCI.CombineTo(N, Lo, Hi);
3515 return SDValue(N, 0);
3516}
3517
3518SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3519 DAGCombinerInfo &DCI) const {
3520 EVT VT = N->getValueType(0);
3521
3522 if (!Subtarget->hasMulI24() || VT.isVector())
3523 return SDValue();
3524
3525 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3526 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3527 // unnecessarily). isDivergent() is used as an approximation of whether the
3528 // value is in an SGPR.
3529 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3530 // valu op anyway)
3531 if (Subtarget->hasSMulHi() && !N->isDivergent())
3532 return SDValue();
3533
3534 SelectionDAG &DAG = DCI.DAG;
3535 SDLoc DL(N);
3536
3537 SDValue N0 = N->getOperand(0);
3538 SDValue N1 = N->getOperand(1);
3539
3540 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3541 return SDValue();
3542
3543 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3544 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3545
3546 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3547 DCI.AddToWorklist(Mulhi.getNode());
3548 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3549}
3550
3551SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3552 DAGCombinerInfo &DCI) const {
3553 EVT VT = N->getValueType(0);
3554
3555 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3556 return SDValue();
3557
3558 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3559 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3560 // unnecessarily). isDivergent() is used as an approximation of whether the
3561 // value is in an SGPR.
3562 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3563 // valu op anyway)
3564 if (Subtarget->hasSMulHi() && !N->isDivergent())
3565 return SDValue();
3566
3567 SelectionDAG &DAG = DCI.DAG;
3568 SDLoc DL(N);
3569
3570 SDValue N0 = N->getOperand(0);
3571 SDValue N1 = N->getOperand(1);
3572
3573 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3574 return SDValue();
3575
3576 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3577 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3578
3579 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3580 DCI.AddToWorklist(Mulhi.getNode());
3581 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3582}
3583
3584static bool isNegativeOne(SDValue Val) {
3585 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3586 return C->isAllOnes();
3587 return false;
3588}
3589
3590SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3591 SDValue Op,
3592 const SDLoc &DL,
3593 unsigned Opc) const {
3594 EVT VT = Op.getValueType();
3595 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3596 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3597 LegalVT != MVT::i16))
3598 return SDValue();
3599
3600 if (VT != MVT::i32)
3601 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3602
3603 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3604 if (VT != MVT::i32)
3605 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3606
3607 return FFBX;
3608}
3609
3610// The native instructions return -1 on 0 input. Optimize out a select that
3611// produces -1 on 0.
3612//
3613// TODO: If zero is not undef, we could also do this if the output is compared
3614// against the bitwidth.
3615//
3616// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3617SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3618 SDValue LHS, SDValue RHS,
3619 DAGCombinerInfo &DCI) const {
3620 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3621 if (!CmpRhs || !CmpRhs->isZero())
3622 return SDValue();
3623
3624 SelectionDAG &DAG = DCI.DAG;
3625 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3626 SDValue CmpLHS = Cond.getOperand(0);
3627
3628 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3629 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3630 if (CCOpcode == ISD::SETEQ &&
3631 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3632 RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3633 unsigned Opc =
3634 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3635 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3636 }
3637
3638 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3639 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3640 if (CCOpcode == ISD::SETNE &&
3641 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3642 LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3643 unsigned Opc =
3644 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3645
3646 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3647 }
3648
3649 return SDValue();
3650}
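Editorial note: a scalar sketch (illustrative only) of why the selects above fold away: a hardware-style ffbh/ffbl already returns -1 (all ones) on a zero input, which is exactly the value the select supplies for the x == 0 case. The helper name ffbhU32Model is hypothetical; a GCC/Clang-style __builtin_clz is assumed.

// Illustrative model of FFBH_U32 semantics: -1 on zero, clz otherwise.
#include <cstdint>

static uint32_t ffbhU32Model(uint32_t X) {
  if (X == 0)
    return UINT32_MAX;                        // native behavior on zero input
  return uint32_t(__builtin_clz(X));          // count of leading zero bits
}

// So select (setcc x, 0, eq), -1, (ctlz_zero_undef x) is simply ffbhU32Model(x).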
3651
3652static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3653 unsigned Op,
3654 const SDLoc &SL,
3655 SDValue Cond,
3656 SDValue N1,
3657 SDValue N2) {
3658 SelectionDAG &DAG = DCI.DAG;
3659 EVT VT = N1.getValueType();
3660
3661 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3662 N1.getOperand(0), N2.getOperand(0));
3663 DCI.AddToWorklist(NewSelect.getNode());
3664 return DAG.getNode(Op, SL, VT, NewSelect);
3665}
3666
3667// Pull a free FP operation out of a select so it may fold into uses.
3668//
3669// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3670// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3671//
3672// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3673// select c, (fabs x), +k -> fabs (select c, x, k)
3674static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3675 SDValue N) {
3676 SelectionDAG &DAG = DCI.DAG;
3677 SDValue Cond = N.getOperand(0);
3678 SDValue LHS = N.getOperand(1);
3679 SDValue RHS = N.getOperand(2);
3680
3681 EVT VT = N.getValueType();
3682 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3683 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3684 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3685 SDLoc(N), Cond, LHS, RHS);
3686 }
3687
3688 bool Inv = false;
3689 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3690 std::swap(LHS, RHS);
3691 Inv = true;
3692 }
3693
3694 // TODO: Support vector constants.
3695 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3696 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3697 SDLoc SL(N);
3698 // If one side is an fneg/fabs and the other is a constant, we can push the
3699 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3700 SDValue NewLHS = LHS.getOperand(0);
3701 SDValue NewRHS = RHS;
3702
3703 // Careful: if the neg can be folded up, don't try to pull it back down.
3704 bool ShouldFoldNeg = true;
3705
3706 if (NewLHS.hasOneUse()) {
3707 unsigned Opc = NewLHS.getOpcode();
3708 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3709 ShouldFoldNeg = false;
3710 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3711 ShouldFoldNeg = false;
3712 }
3713
3714 if (ShouldFoldNeg) {
3715 if (LHS.getOpcode() == ISD::FNEG)
3716 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3717 else if (CRHS->isNegative())
3718 return SDValue();
3719
3720 if (Inv)
3721 std::swap(NewLHS, NewRHS);
3722
3723 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3724 Cond, NewLHS, NewRHS);
3725 DCI.AddToWorklist(NewSelect.getNode());
3726 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3727 }
3728 }
3729
3730 return SDValue();
3731}
3732
3733
3734SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3735 DAGCombinerInfo &DCI) const {
3736 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3737 return Folded;
3738
3739 SDValue Cond = N->getOperand(0);
3740 if (Cond.getOpcode() != ISD::SETCC)
3741 return SDValue();
3742
3743 EVT VT = N->getValueType(0);
3744 SDValue LHS = Cond.getOperand(0);
3745 SDValue RHS = Cond.getOperand(1);
3746 SDValue CC = Cond.getOperand(2);
3747
3748 SDValue True = N->getOperand(1);
3749 SDValue False = N->getOperand(2);
3750
3751 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3752 SelectionDAG &DAG = DCI.DAG;
3753 if (DAG.isConstantValueOfAnyType(True) &&
3754 !DAG.isConstantValueOfAnyType(False)) {
3755 // Swap cmp + select pair to move constant to false input.
3756 // This will allow using VOPC cndmasks more often.
3757 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3758
3759 SDLoc SL(N);
3760 ISD::CondCode NewCC =
3761 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3762
3763 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3764 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3765 }
3766
3767 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3768 SDValue MinMax
3769 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3770 // Revisit this node so we can catch min3/max3/med3 patterns.
3771 //DCI.AddToWorklist(MinMax.getNode());
3772 return MinMax;
3773 }
3774 }
3775
3776 // There's no reason to not do this if the condition has other uses.
3777 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3778}
3779
3780static bool isInv2Pi(const APFloat &APF) {
3781 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3782 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3783 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3784
3785 return APF.bitwiseIsEqual(KF16) ||
3786 APF.bitwiseIsEqual(KF32) ||
3787 APF.bitwiseIsEqual(KF64);
3788}
3789
3790 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3791// additional cost to negate them.
3792bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3793 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3794 if (C->isZero() && !C->isNegative())
3795 return true;
3796
3797 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3798 return true;
3799 }
3800
3801 return false;
3802}
3803
3804static unsigned inverseMinMax(unsigned Opc) {
3805 switch (Opc) {
3806 case ISD::FMAXNUM:
3807 return ISD::FMINNUM;
3808 case ISD::FMINNUM:
3809 return ISD::FMAXNUM;
3810 case ISD::FMAXNUM_IEEE:
3811 return ISD::FMINNUM_IEEE;
3812 case ISD::FMINNUM_IEEE:
3813 return ISD::FMAXNUM_IEEE;
3814 case AMDGPUISD::FMAX_LEGACY:
3815 return AMDGPUISD::FMIN_LEGACY;
3816 case AMDGPUISD::FMIN_LEGACY:
3817 return AMDGPUISD::FMAX_LEGACY;
3818 default:
3819 llvm_unreachable("invalid min/max opcode");
3820 }
3821}
3822
3823SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3824 DAGCombinerInfo &DCI) const {
3825 SelectionDAG &DAG = DCI.DAG;
3826 SDValue N0 = N->getOperand(0);
3827 EVT VT = N->getValueType(0);
3828
3829 unsigned Opc = N0.getOpcode();
3830
3831 // If the input has multiple uses and we can either fold the negate down, or
3832 // the other uses cannot, give up. This both prevents unprofitable
3833 // transformations and infinite loops: we won't repeatedly try to fold around
3834 // a negate that has no 'good' form.
3835 if (N0.hasOneUse()) {
3836 // This may be able to fold into the source, but at a code size cost. Don't
3837 // fold if the fold into the user is free.
3838 if (allUsesHaveSourceMods(N, 0))
3839 return SDValue();
3840 } else {
3841 if (fnegFoldsIntoOp(Opc) &&
3842 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3843 return SDValue();
3844 }
3845
3846 SDLoc SL(N);
3847 switch (Opc) {
3848 case ISD::FADD: {
3849 if (!mayIgnoreSignedZero(N0))
3850 return SDValue();
3851
3852 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3853 SDValue LHS = N0.getOperand(0);
3854 SDValue RHS = N0.getOperand(1);
3855
3856 if (LHS.getOpcode() != ISD::FNEG)
3857 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3858 else
3859 LHS = LHS.getOperand(0);
3860
3861 if (RHS.getOpcode() != ISD::FNEG)
3862 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3863 else
3864 RHS = RHS.getOperand(0);
3865
3866 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3867 if (Res.getOpcode() != ISD::FADD)
3868 return SDValue(); // Op got folded away.
3869 if (!N0.hasOneUse())
3870 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3871 return Res;
3872 }
3873 case ISD::FMUL:
3874 case AMDGPUISD::FMUL_LEGACY: {
3875 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3876 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3877 SDValue LHS = N0.getOperand(0);
3878 SDValue RHS = N0.getOperand(1);
3879
3880 if (LHS.getOpcode() == ISD::FNEG)
3881 LHS = LHS.getOperand(0);
3882 else if (RHS.getOpcode() == ISD::FNEG)
3883 RHS = RHS.getOperand(0);
3884 else
3885 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3886
3887 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3888 if (Res.getOpcode() != Opc)
3889 return SDValue(); // Op got folded away.
3890 if (!N0.hasOneUse())
3891 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3892 return Res;
3893 }
3894 case ISD::FMA:
3895 case ISD::FMAD: {
3896 // TODO: handle llvm.amdgcn.fma.legacy
3897 if (!mayIgnoreSignedZero(N0))
3898 return SDValue();
3899
3900 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3901 SDValue LHS = N0.getOperand(0);
3902 SDValue MHS = N0.getOperand(1);
3903 SDValue RHS = N0.getOperand(2);
3904
3905 if (LHS.getOpcode() == ISD::FNEG)
3906 LHS = LHS.getOperand(0);
3907 else if (MHS.getOpcode() == ISD::FNEG)
3908 MHS = MHS.getOperand(0);
3909 else
3910 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3911
3912 if (RHS.getOpcode() != ISD::FNEG)
3913 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3914 else
3915 RHS = RHS.getOperand(0);
3916
3917 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3918 if (Res.getOpcode() != Opc)
3919 return SDValue(); // Op got folded away.
3920 if (!N0.hasOneUse())
3921 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3922 return Res;
3923 }
3924 case ISD::FMAXNUM:
3925 case ISD::FMINNUM:
3926 case ISD::FMAXNUM_IEEE:
3927 case ISD::FMINNUM_IEEE:
3928 case AMDGPUISD::FMAX_LEGACY:
3929 case AMDGPUISD::FMIN_LEGACY: {
3930 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3931 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3932 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3933 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3934
3935 SDValue LHS = N0.getOperand(0);
3936 SDValue RHS = N0.getOperand(1);
3937
3938 // 0 doesn't have a negated inline immediate.
3939 // TODO: This constant check should be generalized to other operations.
3940 if (isConstantCostlierToNegate(RHS))
3941 return SDValue();
3942
3943 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3944 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3945 unsigned Opposite = inverseMinMax(Opc);
3946
3947 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3948 if (Res.getOpcode() != Opposite)
3949 return SDValue(); // Op got folded away.
3950 if (!N0.hasOneUse())
3951 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3952 return Res;
3953 }
3954 case AMDGPUISD::FMED3: {
3955 SDValue Ops[3];
3956 for (unsigned I = 0; I < 3; ++I)
3957 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3958
3959 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3960 if (Res.getOpcode() != AMDGPUISD::FMED3)
3961 return SDValue(); // Op got folded away.
3962
3963 if (!N0.hasOneUse()) {
3964 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3965 DAG.ReplaceAllUsesWith(N0, Neg);
3966
3967 for (SDNode *U : Neg->uses())
3968 DCI.AddToWorklist(U);
3969 }
3970
3971 return Res;
3972 }
3973 case ISD::FP_EXTEND:
3974 case ISD::FTRUNC:
3975 case ISD::FRINT:
3976 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3977 case ISD::FSIN:
3978 case ISD::FCANONICALIZE:
3979 case AMDGPUISD::RCP:
3980 case AMDGPUISD::RCP_LEGACY:
3981 case AMDGPUISD::RCP_IFLAG:
3982 case AMDGPUISD::SIN_HW: {
3983 SDValue CvtSrc = N0.getOperand(0);
3984 if (CvtSrc.getOpcode() == ISD::FNEG) {
3985 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3986 // (fneg (rcp (fneg x))) -> (rcp x)
3987 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3988 }
3989
3990 if (!N0.hasOneUse())
3991 return SDValue();
3992
3993 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3994 // (fneg (rcp x)) -> (rcp (fneg x))
3995 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3996 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3997 }
3998 case ISD::FP_ROUND: {
3999 SDValue CvtSrc = N0.getOperand(0);
4000
4001 if (CvtSrc.getOpcode() == ISD::FNEG) {
4002 // (fneg (fp_round (fneg x))) -> (fp_round x)
4003 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4004 CvtSrc.getOperand(0), N0.getOperand(1));
4005 }
4006
4007 if (!N0.hasOneUse())
4008 return SDValue();
4009
4010 // (fneg (fp_round x)) -> (fp_round (fneg x))
4011 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4012 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4013 }
4014 case ISD::FP16_TO_FP: {
4015 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4016 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4017 // Put the fneg back as a legal source operation that can be matched later.
4018 SDLoc SL(N);
4019
4020 SDValue Src = N0.getOperand(0);
4021 EVT SrcVT = Src.getValueType();
4022
4023 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4024 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4025 DAG.getConstant(0x8000, SL, SrcVT));
4026 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4027 }
4028 default:
4029 return SDValue();
4030 }
4031}
4032
4033SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4034 DAGCombinerInfo &DCI) const {
4035 SelectionDAG &DAG = DCI.DAG;
4036 SDValue N0 = N->getOperand(0);
4037
4038 if (!N0.hasOneUse())
4039 return SDValue();
4040
4041 switch (N0.getOpcode()) {
4042 case ISD::FP16_TO_FP: {
4043 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4044 SDLoc SL(N);
4045 SDValue Src = N0.getOperand(0);
4046 EVT SrcVT = Src.getValueType();
4047
4048 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4049 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4050 DAG.getConstant(0x7fff, SL, SrcVT));
4051 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4052 }
4053 default:
4054 return SDValue();
4055 }
4056}
4057
4058SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4059 DAGCombinerInfo &DCI) const {
4060 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4061 if (!CFP)
4062 return SDValue();
4063
4064 // XXX - Should this flush denormals?
4065 const APFloat &Val = CFP->getValueAPF();
4066 APFloat One(Val.getSemantics(), "1.0");
4067 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4068}
4069
4070SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4071 DAGCombinerInfo &DCI) const {
4072 SelectionDAG &DAG = DCI.DAG;
4073 SDLoc DL(N);
4074
4075 switch(N->getOpcode()) {
4076 default:
4077 break;
4078 case ISD::BITCAST: {
4079 EVT DestVT = N->getValueType(0);
4080
4081 // Push casts through vector builds. This helps avoid emitting a large
4082 // number of copies when materializing floating point vector constants.
4083 //
4084 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4085 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4086 if (DestVT.isVector()) {
4087 SDValue Src = N->getOperand(0);
4088 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4089 EVT SrcVT = Src.getValueType();
4090 unsigned NElts = DestVT.getVectorNumElements();
4091
4092 if (SrcVT.getVectorNumElements() == NElts) {
4093 EVT DestEltVT = DestVT.getVectorElementType();
4094
4095 SmallVector<SDValue, 8> CastedElts;
4096 SDLoc SL(N);
4097 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4098 SDValue Elt = Src.getOperand(I);
4099 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4100 }
4101
4102 return DAG.getBuildVector(DestVT, SL, CastedElts);
4103 }
4104 }
4105 }
4106
4107 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4108 break;
4109
4110 // Fold bitcasts of constants.
4111 //
4112 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4113 // TODO: Generalize and move to DAGCombiner
4114 SDValue Src = N->getOperand(0);
4115 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4116 SDLoc SL(N);
4117 uint64_t CVal = C->getZExtValue();
4118 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4119 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4120 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4121 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4122 }
4123
4124 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4125 const APInt &Val = C->getValueAPF().bitcastToAPInt();
4126 SDLoc SL(N);
4127 uint64_t CVal = Val.getZExtValue();
4128 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4129 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4130 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4131
4132 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4133 }
4134
4135 break;
4136 }
4137 case ISD::SHL: {
4138 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4139 break;
4140
4141 return performShlCombine(N, DCI);
4142 }
4143 case ISD::SRL: {
4144 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4145 break;
4146
4147 return performSrlCombine(N, DCI);
4148 }
4149 case ISD::SRA: {
4150 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4151 break;
4152
4153 return performSraCombine(N, DCI);
4154 }
4155 case ISD::TRUNCATE:
4156 return performTruncateCombine(N, DCI);
4157 case ISD::MUL:
4158 return performMulCombine(N, DCI);
4159 case ISD::SMUL_LOHI:
4160 case ISD::UMUL_LOHI:
4161 return performMulLoHiCombine(N, DCI);
4162 case ISD::MULHS:
4163 return performMulhsCombine(N, DCI);
4164 case ISD::MULHU:
4165 return performMulhuCombine(N, DCI);
4166 case AMDGPUISD::MUL_I24:
4167 case AMDGPUISD::MUL_U24:
4168 case AMDGPUISD::MULHI_I24:
4169 case AMDGPUISD::MULHI_U24:
4170 return simplifyMul24(N, DCI);
4171 case ISD::SELECT:
4172 return performSelectCombine(N, DCI);
4173 case ISD::FNEG:
4174 return performFNegCombine(N, DCI);
4175 case ISD::FABS:
4176 return performFAbsCombine(N, DCI);
4177 case AMDGPUISD::BFE_I32:
4178 case AMDGPUISD::BFE_U32: {
4179 assert(!N->getValueType(0).isVector() &&
4180 "Vector handling of BFE not implemented");
4181 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4182 if (!Width)
4183 break;
4184
4185 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4186 if (WidthVal == 0)
4187 return DAG.getConstant(0, DL, MVT::i32);
4188
4189 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4190 if (!Offset)
4191 break;
4192
4193 SDValue BitsFrom = N->getOperand(0);
4194 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4195
4196 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4197
4198 if (OffsetVal == 0) {
4199 // This is already sign / zero extended, so try to fold away extra BFEs.
4200 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4201
4202 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4203 if (OpSignBits >= SignBits)
4204 return BitsFrom;
4205
4206 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4207 if (Signed) {
4208 // This is a sign_extend_inreg. Replace it to take advantage of existing
4209 // DAG Combines. If not eliminated, we will match back to BFE during
4210 // selection.
4211
4212 // TODO: The sext_inreg of extended types ends up here, although we could
4213 // handle them in a single BFE.
4214 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4215 DAG.getValueType(SmallVT));
4216 }
4217
4218 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4219 }
4220
4221 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4222 if (Signed) {
4223 return constantFoldBFE<int32_t>(DAG,
4224 CVal->getSExtValue(),
4225 OffsetVal,
4226 WidthVal,
4227 DL);
4228 }
4229
4230 return constantFoldBFE<uint32_t>(DAG,
4231 CVal->getZExtValue(),
4232 OffsetVal,
4233 WidthVal,
4234 DL);
4235 }
4236
4237 if ((OffsetVal + WidthVal) >= 32 &&
4238 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4239 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4240 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4241 BitsFrom, ShiftVal);
4242 }
4243
4244 if (BitsFrom.hasOneUse()) {
4245 APInt Demanded = APInt::getBitsSet(32,
4246 OffsetVal,
4247 OffsetVal + WidthVal);
4248
4249 KnownBits Known;
4250 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4251 !DCI.isBeforeLegalizeOps());
4252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4253 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4254 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4255 DCI.CommitTargetLoweringOpt(TLO);
4256 }
4257 }
4258
4259 break;
4260 }
4261 case ISD::LOAD:
4262 return performLoadCombine(N, DCI);
4263 case ISD::STORE:
4264 return performStoreCombine(N, DCI);
4265 case AMDGPUISD::RCP:
4266 case AMDGPUISD::RCP_IFLAG:
4267 return performRcpCombine(N, DCI);
4268 case ISD::AssertZext:
4269 case ISD::AssertSext:
4270 return performAssertSZExtCombine(N, DCI);
4271 case ISD::INTRINSIC_WO_CHAIN:
4272 return performIntrinsicWOChainCombine(N, DCI);
4273 }
4274 return SDValue();
4275}
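The BFE_I32/BFE_U32 case above folds constant offset/width operands and rewrites degenerate extracts into plain shifts or *_extend_inreg nodes. A hedged, standalone sketch of the signed extract semantics the combine relies on (this is not the in-tree constantFoldBFE, which is defined elsewhere in this file):

#include <cassert>
#include <cstdint>

// Extract 'Width' bits of 'Src' starting at 'Offset', sign-extending the result.
static int32_t bfeI32(int32_t Src, uint32_t Offset, uint32_t Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;                               // width 0 folds to the constant 0 above
  if (Offset + Width < 32) {
    uint32_t Shl = static_cast<uint32_t>(Src) << (32 - Offset - Width);
    return static_cast<int32_t>(Shl) >> (32 - Width); // arithmetic shift on mainstream targets
  }
  return Src >> Offset;                     // matches the SRA fallback above
}

int main() {
  assert(bfeI32(0xF0, 4, 4) == -1);  // bits [7:4] = 0b1111, sign-extended
  assert(bfeI32(0xF0, 4, 5) == 0xF); // bits [8:4] = 0b01111
  return 0;
}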
4276
4277//===----------------------------------------------------------------------===//
4278// Helper functions
4279//===----------------------------------------------------------------------===//
4280
4281SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4282 const TargetRegisterClass *RC,
4283 Register Reg, EVT VT,
4284 const SDLoc &SL,
4285 bool RawReg) const {
4286 MachineFunction &MF = DAG.getMachineFunction();
4287 MachineRegisterInfo &MRI = MF.getRegInfo();
4288 Register VReg;
4289
4290 if (!MRI.isLiveIn(Reg)) {
4291 VReg = MRI.createVirtualRegister(RC);
4292 MRI.addLiveIn(Reg, VReg);
4293 } else {
4294 VReg = MRI.getLiveInVirtReg(Reg);
4295 }
4296
4297 if (RawReg)
4298 return DAG.getRegister(VReg, VT);
4299
4300 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4301}
4302
4303// This may be called multiple times, and nothing prevents creating multiple
4304// objects at the same offset. See if we already defined this object.
4305static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4306 int64_t Offset) {
4307 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4308 if (MFI.getObjectOffset(I) == Offset) {
4309 assert(MFI.getObjectSize(I) == Size);
4310 return I;
4311 }
4312 }
4313
4314 return MFI.CreateFixedObject(Size, Offset, true);
4315}
4316
4317SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4318 EVT VT,
4319 const SDLoc &SL,
4320 int64_t Offset) const {
4321 MachineFunction &MF = DAG.getMachineFunction();
4322 MachineFrameInfo &MFI = MF.getFrameInfo();
4323 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4324
4325 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4326 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4327
4328 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4329 MachineMemOperand::MODereferenceable |
4330 MachineMemOperand::MOInvariant);
4331}
4332
4333SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4334 const SDLoc &SL,
4335 SDValue Chain,
4336 SDValue ArgVal,
4337 int64_t Offset) const {
4338 MachineFunction &MF = DAG.getMachineFunction();
4339 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4340 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4341
4342 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4343 // Stores to the argument stack area are relative to the stack pointer.
4344 SDValue SP =
4345 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4346 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4347 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4348 MachineMemOperand::MODereferenceable);
4349 return Store;
4350}
4351
4352SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4353 const TargetRegisterClass *RC,
4354 EVT VT, const SDLoc &SL,
4355 const ArgDescriptor &Arg) const {
4356 assert(Arg && "Attempting to load missing argument");
1. Assuming the condition is true
2. '?' condition is true
4357
4358 SDValue V = Arg.isRegister() ?
3. '?' condition is true
4359 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4360 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4361
4362 if (!Arg.isMasked())
4. Calling 'ArgDescriptor::isMasked'
7. Returning from 'ArgDescriptor::isMasked'
8. Taking false branch
4363 return V;
4364
4365 unsigned Mask = Arg.getMask();
4366 unsigned Shift = countTrailingZeros<unsigned>(Mask);
9. Calling 'countTrailingZeros<unsigned int>'
16. Returning from 'countTrailingZeros<unsigned int>'
17. 'Shift' initialized to 32
4367 V = DAG.getNode(ISD::SRL, SL, VT, V,
4368 DAG.getShiftAmountConstant(Shift, VT, SL));
4369 return DAG.getNode(ISD::AND, SL, VT, V,
4370 DAG.getConstant(Mask >> Shift, SL, VT));
18. The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4371}
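The path reported above reduces to: Arg.isMasked() is true because Mask != ~0u, while Mask itself is assumed to be 0, so countTrailingZeros<unsigned>(0) returns 32 (its ZB_Width default) and the C++ expression 'Mask >> Shift' shifts an unsigned int by its full width. A minimal, self-contained sketch of the same extraction with one possible guard; the helper names are hypothetical and the guard is illustrative only, not the upstream fix:

#include <cassert>

// Mirrors countTrailingZeros<unsigned> with ZB_Width: a zero input yields 32.
// (__builtin_ctz is the GCC/Clang builtin that MathExtras.h itself uses.)
static unsigned ctz32(unsigned Mask) {
  return Mask == 0 ? 32u : static_cast<unsigned>(__builtin_ctz(Mask));
}

// Scalar analogue of the masked-argument extraction above. The early return is
// one possible guard, shown for illustration; without it, 'Mask >> Shift' would
// shift an unsigned int by 32 when Mask == 0, which is the behavior the
// analyzer flags.
static unsigned extractMaskedArg(unsigned V, unsigned Mask) {
  unsigned Shift = ctz32(Mask);
  if (Shift == 32)
    return 0; // an all-zero mask selects no bits
  return (V >> Shift) & (Mask >> Shift);
}

int main() {
  assert(extractMaskedArg(0x12345678u, 0x3FFu << 10) == 0x115u);
  assert(extractMaskedArg(0x12345678u, 0u) == 0u); // guarded zero-mask case
  return 0;
}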
4372
4373uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4374 const MachineFunction &MF, const ImplicitParameter Param) const {
4375 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4376 const AMDGPUSubtarget &ST =
4377 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4378 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4379 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4380 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4381 ExplicitArgOffset;
4382 switch (Param) {
4383 case GRID_DIM:
4384 return ArgOffset;
4385 case GRID_OFFSET:
4386 return ArgOffset + 4;
4387 }
4388 llvm_unreachable("unexpected implicit parameter type");
4389}
4390
4391#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4392
4393const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4394 switch ((AMDGPUISD::NodeType)Opcode) {
4395 case AMDGPUISD::FIRST_NUMBER: break;
4396 // AMDIL DAG nodes
4397 NODE_NAME_CASE(UMUL);
4398 NODE_NAME_CASE(BRANCH_COND);
4399
4400 // AMDGPU DAG nodes
4401 NODE_NAME_CASE(IF)
4402 NODE_NAME_CASE(ELSE)
4403 NODE_NAME_CASE(LOOP)
4404 NODE_NAME_CASE(CALL)
4405 NODE_NAME_CASE(TC_RETURN)
4406 NODE_NAME_CASE(TRAP)
4407 NODE_NAME_CASE(RET_FLAG)
4408 NODE_NAME_CASE(RET_GFX_FLAG)
4409 NODE_NAME_CASE(RETURN_TO_EPILOG)
4410 NODE_NAME_CASE(ENDPGM)
4411 NODE_NAME_CASE(DWORDADDR)
4412 NODE_NAME_CASE(FRACT)
4413 NODE_NAME_CASE(SETCC)
4414 NODE_NAME_CASE(SETREG)
4415 NODE_NAME_CASE(DENORM_MODE)
4416 NODE_NAME_CASE(FMA_W_CHAIN)
4417 NODE_NAME_CASE(FMUL_W_CHAIN)
4418 NODE_NAME_CASE(CLAMP)
4419 NODE_NAME_CASE(COS_HW)
4420 NODE_NAME_CASE(SIN_HW)
4421 NODE_NAME_CASE(FMAX_LEGACY)
4422 NODE_NAME_CASE(FMIN_LEGACY)
4423 NODE_NAME_CASE(FMAX3)
4424 NODE_NAME_CASE(SMAX3)
4425 NODE_NAME_CASE(UMAX3)
4426 NODE_NAME_CASE(FMIN3)
4427 NODE_NAME_CASE(SMIN3)
4428 NODE_NAME_CASE(UMIN3)
4429 NODE_NAME_CASE(FMED3)
4430 NODE_NAME_CASE(SMED3)
4431 NODE_NAME_CASE(UMED3)
4432 NODE_NAME_CASE(FDOT2)
4433 NODE_NAME_CASE(URECIP)
4434 NODE_NAME_CASE(DIV_SCALE)
4435 NODE_NAME_CASE(DIV_FMAS)
4436 NODE_NAME_CASE(DIV_FIXUP)
4437 NODE_NAME_CASE(FMAD_FTZ)
4438 NODE_NAME_CASE(RCP)
4439 NODE_NAME_CASE(RSQ)
4440 NODE_NAME_CASE(RCP_LEGACY)
4441 NODE_NAME_CASE(RCP_IFLAG)
4442 NODE_NAME_CASE(FMUL_LEGACY)
4443 NODE_NAME_CASE(RSQ_CLAMP)
4444 NODE_NAME_CASE(LDEXP)
4445 NODE_NAME_CASE(FP_CLASS)
4446 NODE_NAME_CASE(DOT4)
4447 NODE_NAME_CASE(CARRY)
4448 NODE_NAME_CASE(BORROW)
4449 NODE_NAME_CASE(BFE_U32)
4450 NODE_NAME_CASE(BFE_I32)
4451 NODE_NAME_CASE(BFI)
4452 NODE_NAME_CASE(BFM)
4453 NODE_NAME_CASE(FFBH_U32)
4454 NODE_NAME_CASE(FFBH_I32)
4455 NODE_NAME_CASE(FFBL_B32)
4456 NODE_NAME_CASE(MUL_U24)
4457 NODE_NAME_CASE(MUL_I24)
4458 NODE_NAME_CASE(MULHI_U24)
4459 NODE_NAME_CASE(MULHI_I24)
4460 NODE_NAME_CASE(MAD_U24)
4461 NODE_NAME_CASE(MAD_I24)
4462 NODE_NAME_CASE(MAD_I64_I32)
4463 NODE_NAME_CASE(MAD_U64_U32)
4464 NODE_NAME_CASE(PERM)
4465 NODE_NAME_CASE(TEXTURE_FETCH)
4466 NODE_NAME_CASE(R600_EXPORT)
4467 NODE_NAME_CASE(CONST_ADDRESS)
4468 NODE_NAME_CASE(REGISTER_LOAD)
4469 NODE_NAME_CASE(REGISTER_STORE)
4470 NODE_NAME_CASE(SAMPLE)
4471 NODE_NAME_CASE(SAMPLEB)
4472 NODE_NAME_CASE(SAMPLED)
4473 NODE_NAME_CASE(SAMPLEL)
4474 NODE_NAME_CASE(CVT_F32_UBYTE0)
4475 NODE_NAME_CASE(CVT_F32_UBYTE1)
4476 NODE_NAME_CASE(CVT_F32_UBYTE2)
4477 NODE_NAME_CASE(CVT_F32_UBYTE3)
4478 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4479 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4480 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4481 NODE_NAME_CASE(CVT_PK_I16_I32)
4482 NODE_NAME_CASE(CVT_PK_U16_U32)
4483 NODE_NAME_CASE(FP_TO_FP16)
4484 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4485 NODE_NAME_CASE(CONST_DATA_PTR)
4486 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4487 NODE_NAME_CASE(LDS)
4488 NODE_NAME_CASE(DUMMY_CHAIN)
4489 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4490 NODE_NAME_CASE(LOAD_D16_HI)
4491 NODE_NAME_CASE(LOAD_D16_LO)
4492 NODE_NAME_CASE(LOAD_D16_HI_I8)
4493 NODE_NAME_CASE(LOAD_D16_HI_U8)
4494 NODE_NAME_CASE(LOAD_D16_LO_I8)
4495 NODE_NAME_CASE(LOAD_D16_LO_U8)
4496 NODE_NAME_CASE(STORE_MSKOR)
4497 NODE_NAME_CASE(LOAD_CONSTANT)
4498 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4499 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4500 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4501 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4502 NODE_NAME_CASE(DS_ORDERED_COUNT)
4503 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4504 NODE_NAME_CASE(ATOMIC_INC)
4505 NODE_NAME_CASE(ATOMIC_DEC)
4506 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4507 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4508 NODE_NAME_CASE(BUFFER_LOAD)
4509 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4510 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4511 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4512 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4513 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4514 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4515 NODE_NAME_CASE(SBUFFER_LOAD)
4516 NODE_NAME_CASE(BUFFER_STORE)
4517 NODE_NAME_CASE(BUFFER_STORE_BYTE)
4518 NODE_NAME_CASE(BUFFER_STORE_SHORT)
4519 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4520 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4521 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4522 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4523 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4524 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4525 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4526 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4527 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4528 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4529 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4530 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4531 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4532 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4533 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4534 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4535 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4536 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4537 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4538
4539 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4540 }
4541 return nullptr;
4542}
4543
4544SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4545 SelectionDAG &DAG, int Enabled,
4546 int &RefinementSteps,
4547 bool &UseOneConstNR,
4548 bool Reciprocal) const {
4549 EVT VT = Operand.getValueType();
4550
4551 if (VT == MVT::f32) {
4552 RefinementSteps = 0;
4553 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4554 }
4555
4556 // TODO: There is also an f64 rsq instruction, but the documentation is less
4557 // clear on its precision.
4558
4559 return SDValue();
4560}
4561
4562SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4563 SelectionDAG &DAG, int Enabled,
4564 int &RefinementSteps) const {
4565 EVT VT = Operand.getValueType();
4566
4567 if (VT == MVT::f32) {
4568 // Reciprocal, < 1 ulp error.
4569 //
4570 // This reciprocal approximation converges to < 0.5 ulp error with one
4571 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4572
4573 RefinementSteps = 0;
4574 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4575 }
4576
4577 // TODO: There is also an f64 rcp instruction, but the documentation is less
4578 // clear on its precision.
4579
4580 return SDValue();
4581}
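The comment above refers to one Newton-Raphson refinement of the reciprocal estimate, expressed as two FMAs. A standalone illustration of that textbook step (not necessarily the exact sequence the backend emits):

#include <cassert>
#include <cmath>

static float refineRcp(float A, float X0 /* initial estimate of 1/A */) {
  float E = std::fma(-A, X0, 1.0f); // residual: 1 - A*X0
  return std::fma(X0, E, X0);       // X1 = X0 + X0*E = X0*(2 - A*X0)
}

int main() {
  float A = 3.0f;
  float X0 = 0.33f;                 // crude estimate of 1/3
  float X1 = refineRcp(A, X0);
  assert(std::fabs(X1 - 1.0f / A) < 1e-4f); // error shrinks roughly quadratically
  return 0;
}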
4582
4583void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4584 const SDValue Op, KnownBits &Known,
4585 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4586
4587 Known.resetAll(); // Don't know anything.
4588
4589 unsigned Opc = Op.getOpcode();
4590
4591 switch (Opc) {
4592 default:
4593 break;
4594 case AMDGPUISD::CARRY:
4595 case AMDGPUISD::BORROW: {
4596 Known.Zero = APInt::getHighBitsSet(32, 31);
4597 break;
4598 }
4599
4600 case AMDGPUISD::BFE_I32:
4601 case AMDGPUISD::BFE_U32: {
4602 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4603 if (!CWidth)
4604 return;
4605
4606 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4607
4608 if (Opc == AMDGPUISD::BFE_U32)
4609 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4610
4611 break;
4612 }
4613 case AMDGPUISD::FP_TO_FP16: {
4614 unsigned BitWidth = Known.getBitWidth();
4615
4616 // High bits are zero.
4617 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4618 break;
4619 }
4620 case AMDGPUISD::MUL_U24:
4621 case AMDGPUISD::MUL_I24: {
4622 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4623 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4624 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4625 RHSKnown.countMinTrailingZeros();
4626 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4627 // Skip extra check if all bits are known zeros.
4628 if (TrailZ >= 32)
4629 break;
4630
4631 // Truncate to 24 bits.
4632 LHSKnown = LHSKnown.trunc(24);
4633 RHSKnown = RHSKnown.trunc(24);
4634
4635 if (Opc == AMDGPUISD::MUL_I24) {
4636 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4637 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4638 unsigned MaxValBits = LHSValBits + RHSValBits;
4639 if (MaxValBits > 32)
4640 break;
4641 unsigned SignBits = 32 - MaxValBits + 1;
4642 bool LHSNegative = LHSKnown.isNegative();
4643 bool LHSNonNegative = LHSKnown.isNonNegative();
4644 bool LHSPositive = LHSKnown.isStrictlyPositive();
4645 bool RHSNegative = RHSKnown.isNegative();
4646 bool RHSNonNegative = RHSKnown.isNonNegative();
4647 bool RHSPositive = RHSKnown.isStrictlyPositive();
4648
4649 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4650 Known.Zero.setHighBits(SignBits);
4651 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4652 Known.One.setHighBits(SignBits);
4653 } else {
4654 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4655 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4656 unsigned MaxValBits = LHSValBits + RHSValBits;
4657 if (MaxValBits >= 32)
4658 break;
4659 Known.Zero.setBitsFrom(MaxValBits);
4660 }
4661 break;
4662 }
4663 case AMDGPUISD::PERM: {
4664 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4665 if (!CMask)
4666 return;
4667
4668 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4669 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4670 unsigned Sel = CMask->getZExtValue();
4671
4672 for (unsigned I = 0; I < 32; I += 8) {
4673 unsigned SelBits = Sel & 0xff;
4674 if (SelBits < 4) {
4675 SelBits *= 8;
4676 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4677 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4678 } else if (SelBits < 7) {
4679 SelBits = (SelBits & 3) * 8;
4680 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4681 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4682 } else if (SelBits == 0x0c) {
4683 Known.Zero |= 0xFFull << I;
4684 } else if (SelBits > 0x0c) {
4685 Known.One |= 0xFFull << I;
4686 }
4687 Sel >>= 8;
4688 }
4689 break;
4690 }
4691 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4692 Known.Zero.setHighBits(24);
4693 break;
4694 }
4695 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4696 Known.Zero.setHighBits(16);
4697 break;
4698 }
4699 case AMDGPUISD::LDS: {
4700 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4701 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4702
4703 Known.Zero.setHighBits(16);
4704 Known.Zero.setLowBits(Log2(Alignment));
4705 break;
4706 }
4707 case ISD::INTRINSIC_WO_CHAIN: {
4708 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4709 switch (IID) {
4710 case Intrinsic::amdgcn_mbcnt_lo:
4711 case Intrinsic::amdgcn_mbcnt_hi: {
4712 const GCNSubtarget &ST =
4713 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4714 // These return at most the wavefront size - 1.
4715 unsigned Size = Op.getValueType().getSizeInBits();
4716 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4717 break;
4718 }
4719 default:
4720 break;
4721 }
4722 }
4723 }
4724}
4725
4726unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4727 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4728 unsigned Depth) const {
4729 switch (Op.getOpcode()) {
4730 case AMDGPUISD::BFE_I32: {
4731 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4732 if (!Width)
4733 return 1;
4734
4735 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4736 if (!isNullConstant(Op.getOperand(1)))
4737 return SignBits;
4738
4739 // TODO: Could probably figure something out with non-0 offsets.
4740 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4741 return std::max(SignBits, Op0SignBits);
4742 }
4743
4744 case AMDGPUISD::BFE_U32: {
4745 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4746 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4747 }
4748
4749 case AMDGPUISD::CARRY:
4750 case AMDGPUISD::BORROW:
4751 return 31;
4752 case AMDGPUISD::BUFFER_LOAD_BYTE:
4753 return 25;
4754 case AMDGPUISD::BUFFER_LOAD_SHORT:
4755 return 17;
4756 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4757 return 24;
4758 case AMDGPUISD::BUFFER_LOAD_USHORT:
4759 return 16;
4760 case AMDGPUISD::FP_TO_FP16:
4761 return 16;
4762 default:
4763 return 1;
4764 }
4765}
4766
4767unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4768 GISelKnownBits &Analysis, Register R,
4769 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4770 unsigned Depth) const {
4771 const MachineInstr *MI = MRI.getVRegDef(R);
4772 if (!MI)
4773 return 1;
4774
4775 // TODO: Check range metadata on MMO.
4776 switch (MI->getOpcode()) {
4777 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4778 return 25;
4779 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4780 return 17;
4781 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4782 return 24;
4783 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4784 return 16;
4785 default:
4786 return 1;
4787 }
4788}
4789
4790bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4791 const SelectionDAG &DAG,
4792 bool SNaN,
4793 unsigned Depth) const {
4794 unsigned Opcode = Op.getOpcode();
4795 switch (Opcode) {
4796 case AMDGPUISD::FMIN_LEGACY:
4797 case AMDGPUISD::FMAX_LEGACY: {
4798 if (SNaN)
4799 return true;
4800
4801 // TODO: Can check no nans on one of the operands for each one, but which
4802 // one?
4803 return false;
4804 }
4805 case AMDGPUISD::FMUL_LEGACY:
4806 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4807 if (SNaN)
4808 return true;
4809 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4810 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4811 }
4812 case AMDGPUISD::FMED3:
4813 case AMDGPUISD::FMIN3:
4814 case AMDGPUISD::FMAX3:
4815 case AMDGPUISD::FMAD_FTZ: {
4816 if (SNaN)
4817 return true;
4818 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4819 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4820 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4821 }
4822 case AMDGPUISD::CVT_F32_UBYTE0:
4823 case AMDGPUISD::CVT_F32_UBYTE1:
4824 case AMDGPUISD::CVT_F32_UBYTE2:
4825 case AMDGPUISD::CVT_F32_UBYTE3:
4826 return true;
4827
4828 case AMDGPUISD::RCP:
4829 case AMDGPUISD::RSQ:
4830 case AMDGPUISD::RCP_LEGACY:
4831 case AMDGPUISD::RSQ_CLAMP: {
4832 if (SNaN)
4833 return true;
4834
4835 // TODO: Needs an is-known-positive check.
4836 return false;
4837 }
4838 case AMDGPUISD::LDEXP:
4839 case AMDGPUISD::FRACT: {
4840 if (SNaN)
4841 return true;
4842 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4843 }
4844 case AMDGPUISD::DIV_SCALE:
4845 case AMDGPUISD::DIV_FMAS:
4846 case AMDGPUISD::DIV_FIXUP:
4847 // TODO: Refine on operands.
4848 return SNaN;
4849 case AMDGPUISD::SIN_HW:
4850 case AMDGPUISD::COS_HW: {
4851 // TODO: Need check for infinity
4852 return SNaN;
4853 }
4854 case ISD::INTRINSIC_WO_CHAIN: {
4855 unsigned IntrinsicID
4856 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4857 // TODO: Handle more intrinsics
4858 switch (IntrinsicID) {
4859 case Intrinsic::amdgcn_cubeid:
4860 return true;
4861
4862 case Intrinsic::amdgcn_frexp_mant: {
4863 if (SNaN)
4864 return true;
4865 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4866 }
4867 case Intrinsic::amdgcn_cvt_pkrtz: {
4868 if (SNaN)
4869 return true;
4870 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4871 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4872 }
4873 case Intrinsic::amdgcn_rcp:
4874 case Intrinsic::amdgcn_rsq:
4875 case Intrinsic::amdgcn_rcp_legacy:
4876 case Intrinsic::amdgcn_rsq_legacy:
4877 case Intrinsic::amdgcn_rsq_clamp: {
4878 if (SNaN)
4879 return true;
4880
4881 // TODO: Needs an is-known-positive check.
4882 return false;
4883 }
4884 case Intrinsic::amdgcn_trig_preop:
4885 case Intrinsic::amdgcn_fdot2:
4886 // TODO: Refine on operand
4887 return SNaN;
4888 case Intrinsic::amdgcn_fma_legacy:
4889 if (SNaN)
4890 return true;
4891 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4892 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4893 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4894 default:
4895 return false;
4896 }
4897 }
4898 default:
4899 return false;
4900 }
4901}
4902
4903TargetLowering::AtomicExpansionKind
4904AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4905 switch (RMW->getOperation()) {
4906 case AtomicRMWInst::Nand:
4907 case AtomicRMWInst::FAdd:
4908 case AtomicRMWInst::FSub:
4909 return AtomicExpansionKind::CmpXChg;
4910 default:
4911 return AtomicExpansionKind::None;
4912 }
4913}
4914
4915bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
4916 unsigned Opc, LLT Ty1, LLT Ty2) const {
4917 return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
4918 Ty2 == LLT::scalar(32);
4919}

/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
5. Assuming the condition is true
6. Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
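For a masked descriptor, Mask marks which bits of the register hold the value; the default ~0u means the whole register. A small self-contained sketch using plain unsigneds rather than ArgDescriptor itself (the 10-bit packing mirrors the WorkItemID comment further down and is purely illustrative):

#include <cassert>

int main() {
  // Hypothetical packed descriptor: a work-item ID in bits [10, 20) of v0.
  unsigned Mask = 0x3FFu << 10;
  unsigned Reg  = (7u << 10) | (3u << 20); // ID value 7, packed next to another ID
  assert(Mask != ~0u);                     // isMasked() would be true here
  unsigned Shift = 10;                     // countTrailingZeros(Mask)
  assert(((Reg >> Shift) & (Mask >> Shift)) == 7u);
  return 0;
}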
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPR inputs. For entry functions these are either v0, v1, and v2, or packed
147 // into v0, 10 bits per dimension, if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/build/llvm-toolchain-snapshot-14~++20220127100629+cd20e579df07/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
10.1. 'ZB' is not equal to ZB_Undefined
11. Assuming 'Val' is equal to 0
12. Taking true branch
117 return 32;
13. Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
10. Calling 'TrailingZerosCounter::count'
14. Returning from 'TrailingZerosCounter::count'
15. Returning the value 32
161}
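The ZB_Width default is the detail that feeds the report above: a zero input returns the bit width of the type, not the index of a set bit. A quick check, assuming LLVM's headers are available on the include path:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::countTrailingZeros(0u) == 32);          // ZB_Width default on a zero input
  assert(llvm::countTrailingZeros(uint64_t(0)) == 64);
  assert(llvm::countTrailingZeros(0x1000u) == 12);
  return 0;
}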
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
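Note that maskTrailingOnes avoids the same class of over-wide shift by special-casing N == 0. A couple of spot checks of that behavior, again assuming LLVM's include path is set up:

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::maskTrailingOnes<unsigned>(0) == 0u);      // no shift by 32
  assert(llvm::maskTrailingOnes<unsigned>(10) == 0x3FFu);
  assert(llvm::maskTrailingOnes<unsigned>(32) == ~0u);
  return 0;
}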
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N-bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast <bool> (N > 0 && N <= 64 &&
"integer width out of range") ? void (0) : __assert_fail ("N > 0 && N <= 64 && \"integer width out of range\""
, "llvm/include/llvm/Support/MathExtras.h", 440, __extension__
__PRETTY_FUNCTION__))
;
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast <bool> (N > 0 && N <= 64 &&
"integer width out of range") ? void (0) : __assert_fail ("N > 0 && N <= 64 && \"integer width out of range\""
, "llvm/include/llvm/Support/MathExtras.h", 447, __extension__
__PRETTY_FUNCTION__))
;
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
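The signed counterparts, sketched the same way (editorial illustration; the helper name is invented, assumes <cassert> and <cstdint>):

  void intNRangeExamples() {
    assert(llvm::minIntN(8) == -128 && llvm::maxIntN(8) == 127);
    assert(llvm::minIntN(64) == INT64_MIN && llvm::maxIntN(64) == INT64_MAX);
  }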
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
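A compile-time sketch contrasting isMask_32 and isShiftedMask_32 (editorial illustration, assuming this header is included):

  static_assert(llvm::isMask_32(0x0000FFFFu), "low 16 bits set");
  static_assert(!llvm::isMask_32(0x0000FF00u), "hole at the bottom");
  static_assert(llvm::isShiftedMask_32(0x0000FF00u), "one contiguous run, shifted");
  static_assert(!llvm::isShiftedMask_32(0x0000FF0Fu), "two separate runs");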
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
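A runtime sketch matching the documented examples (editorial illustration; the helper name is invented, assumes <cassert>):

  void countOnesExamples() {
    assert(llvm::countLeadingOnes(0xFF0FFF00u) == 8);   // eight ones at the top
    assert(llvm::countTrailingOnes(0x00FF00FFu) == 8);  // eight ones at the bottom
  }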
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
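A runtime sketch of countPopulation (editorial illustration; the helper name is invented, assumes <cassert>):

  void popCountExamples() {
    assert(llvm::countPopulation(0xF000F000u) == 8);
    assert(llvm::countPopulation(uint64_t(0)) == 0);
  }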
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
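A compile-time sketch of CTLog2 (editorial illustration, assuming this header is included):

  static_assert(llvm::CTLog2<1>() == 0, "log2(1) == 0");
  static_assert(llvm::CTLog2<64>() == 6, "log2(64) == 6");
  // CTLog2<48>() would be rejected at compile time: 48 is not a power of two.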
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
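A runtime sketch of the floor and ceiling variants (editorial illustration; the helper name is invented, assumes <cassert>):

  void log2Examples() {
    assert(llvm::Log2_32(32) == 5 && llvm::Log2_32(6) == 2);            // floor
    assert(llvm::Log2_32_Ceil(32) == 5 && llvm::Log2_32_Ceil(6) == 3);  // ceiling
  }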
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
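A runtime sketch of the bit-for-bit conversions (editorial illustration; the helper name is invented, assumes <cassert>):

  void bitCastExamples() {
    assert(llvm::FloatToBits(1.0f) == 0x3F800000u);
    assert(llvm::BitsToFloat(0x3F800000u) == 1.0f);
    assert(llvm::DoubleToBits(1.0) == 0x3FF0000000000000ull);
  }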
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
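A compile-time sketch of MinAlign (editorial illustration, assuming this header is included):

  // The largest power of two dividing both 8 and 12 is 4.
  static_assert(llvm::MinAlign(8, 12) == 4, "common power-of-two factor");
  // A zero offset preserves the other operand's alignment.
  static_assert(llvm::MinAlign(16, 0) == 16, "zero keeps the alignment");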
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
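A runtime sketch of NextPowerOf2's strictly-greater contract (editorial illustration; the helper name is invented, assumes <cassert>):

  void nextPow2Examples() {
    assert(llvm::NextPowerOf2(0) == 1);
    assert(llvm::NextPowerOf2(5) == 8);
    assert(llvm::NextPowerOf2(8) == 16);  // strictly greater, even for powers of two
  }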
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.");
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value that is
752/// \p Skew mod \p Align. \p Align must be non-zero.
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
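A runtime sketch of alignDown and the division helpers (editorial illustration; the helper name is invented, assumes <cassert>):

  void alignDownExamples() {
    assert(llvm::alignDown(17, 8) == 16);
    assert(llvm::alignDown(17, 8, /*Skew=*/1) == 17);  // 2 * 8 + 1
    assert(llvm::divideCeil(17, 8) == 3);
    assert(llvm::divideNearest(17, 8) == 2);
  }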
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
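A compile-time sketch of the templated sign extension (editorial illustration, assuming this header is included):

  // In 4 bits, 0xF is -1 and 0x7 is +7.
  static_assert(llvm::SignExtend32<4>(0xF) == -1, "sign bit set");
  static_assert(llvm::SignExtend32<4>(0x7) == 7, "sign bit clear");
  // The runtime forms behave the same way, e.g. llvm::SignExtend64(0x80, 8) == -128.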
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
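A runtime sketch of the saturating helpers at uint8_t width (editorial illustration; the helper name is invented, assumes <cassert>):

  void saturatingExamples() {
    bool Ov = false;
    assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Ov) == 255 && Ov);      // clamps
    assert(llvm::SaturatingMultiply<uint8_t>(15, 17, &Ov) == 255 && !Ov);  // exactly 255
    assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Ov) == 255 && Ov);   // 256 overflows
  }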
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned subtraction.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
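A runtime sketch of MulOverflow at int8_t width (editorial illustration; the helper name is invented, assumes <cassert>):

  void mulOverflowExample() {
    int8_t R;
    // 12 * 11 == 132 does not fit: overflow is reported and R holds the
    // truncated two's complement result (132 - 256 == -124).
    assert(llvm::MulOverflow<int8_t>(12, 11, R) && R == -124);
    // -8 * 16 == -128 is exactly representable: no overflow.
    assert(!llvm::MulOverflow<int8_t>(-8, 16, R) && R == -128);
  }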
959
960} // End llvm namespace
961
962#endif