Bug Summary

File:llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning:line 4316, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem 
/usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/IR/DiagnosticInfo.h"
23#include "llvm/IR/IntrinsicsAMDGPU.h"
24#include "llvm/Support/CommandLine.h"
25#include "llvm/Support/KnownBits.h"
26#include "llvm/Target/TargetMachine.h"
27
28using namespace llvm;
29
30#include "AMDGPUGenCallingConv.inc"
31
32static cl::opt<bool> AMDGPUBypassSlowDiv(
33 "amdgpu-bypass-slow-div",
34 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
35 cl::init(true));
36
37// Find a larger type to do a load / store of a vector with.
38EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
39 unsigned StoreSize = VT.getStoreSizeInBits();
40 if (StoreSize <= 32)
41 return EVT::getIntegerVT(Ctx, StoreSize);
42
43 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32")(static_cast<void> (0));
44 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
45}
46
47unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
48 EVT VT = Op.getValueType();
49 KnownBits Known = DAG.computeKnownBits(Op);
50 return VT.getSizeInBits() - Known.countMinLeadingZeros();
51}
52
53unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
54 EVT VT = Op.getValueType();
55
56 // In order for this to be a signed 24-bit value, bit 23, must
57 // be a sign bit.
58 return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
59}
60
61AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Lower floating point store/load to integer store/load to reduce the number
65 // of patterns in tablegen.
66 setOperationAction(ISD::LOAD, MVT::f32, Promote);
67 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
68
69 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
70 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
71
72 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
73 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
74
75 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
77
78 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
80
81 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
83
84 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
86
87 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
89
90 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
92
93 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
95
96 setOperationAction(ISD::LOAD, MVT::i64, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
98
99 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
101
102 setOperationAction(ISD::LOAD, MVT::f64, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
104
105 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
107
108 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
110
111 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
113
114 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
116
117 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
119
120 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
122
123 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
125
126 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
128
129 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
131
132 // There are no 64-bit extloads. These should be done as a 32-bit extload and
133 // an extension to 64-bit.
134 for (MVT VT : MVT::integer_valuetypes()) {
135 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
136 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
137 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
138 }
139
140 for (MVT VT : MVT::integer_valuetypes()) {
141 if (VT == MVT::i64)
142 continue;
143
144 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
145 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
146 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
147 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
148
149 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
150 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
151 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
152 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
153
154 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
155 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
156 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
158 }
159
160 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
161 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
162 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
163 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
164 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
165 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
166 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
167 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
168 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
169 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
172 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
173 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
174 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
175 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
176 }
177
178 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
183 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
184 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
185
186 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
188 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
189 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
190 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
191 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
192
193 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
195 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
196 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
197 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
198 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
199
200 setOperationAction(ISD::STORE, MVT::f32, Promote);
201 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
202
203 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
204 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
205
206 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
207 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
208
209 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
210 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
211
212 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
213 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
214
215 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
216 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
217
218 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
219 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
220
221 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
222 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
223
224 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
225 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
226
227 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
228 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
229
230 setOperationAction(ISD::STORE, MVT::i64, Promote);
231 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
232
233 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
234 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
235
236 setOperationAction(ISD::STORE, MVT::f64, Promote);
237 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
238
239 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
240 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
241
242 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
243 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
244
245 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
247
248 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
250
251 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
253
254 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
256
257 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
259
260 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
262
263 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
264 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
265
266 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
267 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
268 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
269 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
270
271 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
272 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
273 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
274 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
275
276 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
277 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
278 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
279 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
280 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
281 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
282 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
283
284 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
285 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
286
287 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
288 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
289
290 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
291 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
292 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
293 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
294
295 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
296 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
297 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
298 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
299
300 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
301 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
302
303 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
304 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
305 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
306 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
307 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
308 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
309 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
310
311 setOperationAction(ISD::Constant, MVT::i32, Legal);
312 setOperationAction(ISD::Constant, MVT::i64, Legal);
313 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
314 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
315
316 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
317 setOperationAction(ISD::BRIND, MVT::Other, Expand);
318
319 // This is totally unsupported, just custom lower to produce an error.
320 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
321
322 // Library functions. These default to Expand, but we have instructions
323 // for them.
324 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
325 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
326 setOperationAction(ISD::FPOW, MVT::f32, Legal);
327 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
328 setOperationAction(ISD::FABS, MVT::f32, Legal);
329 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
330 setOperationAction(ISD::FRINT, MVT::f32, Legal);
331 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
332 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
333 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
334
335 setOperationAction(ISD::FROUND, MVT::f32, Custom);
336 setOperationAction(ISD::FROUND, MVT::f64, Custom);
337
338 setOperationAction(ISD::FLOG, MVT::f32, Custom);
339 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
340 setOperationAction(ISD::FEXP, MVT::f32, Custom);
341
342
343 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
344 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
345
346 setOperationAction(ISD::FREM, MVT::f16, Custom);
347 setOperationAction(ISD::FREM, MVT::f32, Custom);
348 setOperationAction(ISD::FREM, MVT::f64, Custom);
349
350 // Expand to fneg + fadd.
351 setOperationAction(ISD::FSUB, MVT::f64, Expand);
352
353 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
354 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
355 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
356 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
357 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
358 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
359 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
360 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
361 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
362 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
363 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
364 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
365 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
366 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
367 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
368 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
369 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
370 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
371 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
372 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
373 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
374 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
375 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
376 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
377 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
378 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
379 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
380 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
381 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
382 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
383 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
384 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
385 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
386 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
387 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
388 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
389 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
390 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
391 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
392 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
393 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
394 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
395
396 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
397 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
398 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
399
400 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
401 for (MVT VT : ScalarIntVTs) {
402 // These should use [SU]DIVREM, so set them to expand
403 setOperationAction(ISD::SDIV, VT, Expand);
404 setOperationAction(ISD::UDIV, VT, Expand);
405 setOperationAction(ISD::SREM, VT, Expand);
406 setOperationAction(ISD::UREM, VT, Expand);
407
408 // GPU does not have divrem function for signed or unsigned.
409 setOperationAction(ISD::SDIVREM, VT, Custom);
410 setOperationAction(ISD::UDIVREM, VT, Custom);
411
412 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
413 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
414 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
415
416 setOperationAction(ISD::BSWAP, VT, Expand);
417 setOperationAction(ISD::CTTZ, VT, Expand);
418 setOperationAction(ISD::CTLZ, VT, Expand);
419
420 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
421 setOperationAction(ISD::ADDC, VT, Legal);
422 setOperationAction(ISD::SUBC, VT, Legal);
423 setOperationAction(ISD::ADDE, VT, Legal);
424 setOperationAction(ISD::SUBE, VT, Legal);
425 }
426
427 // The hardware supports 32-bit FSHR, but not FSHL.
428 setOperationAction(ISD::FSHR, MVT::i32, Legal);
429
430 // The hardware supports 32-bit ROTR, but not ROTL.
431 setOperationAction(ISD::ROTL, MVT::i32, Expand);
432 setOperationAction(ISD::ROTL, MVT::i64, Expand);
433 setOperationAction(ISD::ROTR, MVT::i64, Expand);
434
435 setOperationAction(ISD::MULHU, MVT::i16, Expand);
436 setOperationAction(ISD::MULHS, MVT::i16, Expand);
437
438 setOperationAction(ISD::MUL, MVT::i64, Expand);
439 setOperationAction(ISD::MULHU, MVT::i64, Expand);
440 setOperationAction(ISD::MULHS, MVT::i64, Expand);
441 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
442 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
443 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
444 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
445 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
446
447 setOperationAction(ISD::SMIN, MVT::i32, Legal);
448 setOperationAction(ISD::UMIN, MVT::i32, Legal);
449 setOperationAction(ISD::SMAX, MVT::i32, Legal);
450 setOperationAction(ISD::UMAX, MVT::i32, Legal);
451
452 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
453 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
454 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
455 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
456
457 static const MVT::SimpleValueType VectorIntTypes[] = {
458 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
459
460 for (MVT VT : VectorIntTypes) {
461 // Expand the following operations for the current type by default.
462 setOperationAction(ISD::ADD, VT, Expand);
463 setOperationAction(ISD::AND, VT, Expand);
464 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
465 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
466 setOperationAction(ISD::MUL, VT, Expand);
467 setOperationAction(ISD::MULHU, VT, Expand);
468 setOperationAction(ISD::MULHS, VT, Expand);
469 setOperationAction(ISD::OR, VT, Expand);
470 setOperationAction(ISD::SHL, VT, Expand);
471 setOperationAction(ISD::SRA, VT, Expand);
472 setOperationAction(ISD::SRL, VT, Expand);
473 setOperationAction(ISD::ROTL, VT, Expand);
474 setOperationAction(ISD::ROTR, VT, Expand);
475 setOperationAction(ISD::SUB, VT, Expand);
476 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
477 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
478 setOperationAction(ISD::SDIV, VT, Expand);
479 setOperationAction(ISD::UDIV, VT, Expand);
480 setOperationAction(ISD::SREM, VT, Expand);
481 setOperationAction(ISD::UREM, VT, Expand);
482 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
483 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
484 setOperationAction(ISD::SDIVREM, VT, Expand);
485 setOperationAction(ISD::UDIVREM, VT, Expand);
486 setOperationAction(ISD::SELECT, VT, Expand);
487 setOperationAction(ISD::VSELECT, VT, Expand);
488 setOperationAction(ISD::SELECT_CC, VT, Expand);
489 setOperationAction(ISD::XOR, VT, Expand);
490 setOperationAction(ISD::BSWAP, VT, Expand);
491 setOperationAction(ISD::CTPOP, VT, Expand);
492 setOperationAction(ISD::CTTZ, VT, Expand);
493 setOperationAction(ISD::CTLZ, VT, Expand);
494 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
495 setOperationAction(ISD::SETCC, VT, Expand);
496 }
497
498 static const MVT::SimpleValueType FloatVectorTypes[] = {
499 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
500
501 for (MVT VT : FloatVectorTypes) {
502 setOperationAction(ISD::FABS, VT, Expand);
503 setOperationAction(ISD::FMINNUM, VT, Expand);
504 setOperationAction(ISD::FMAXNUM, VT, Expand);
505 setOperationAction(ISD::FADD, VT, Expand);
506 setOperationAction(ISD::FCEIL, VT, Expand);
507 setOperationAction(ISD::FCOS, VT, Expand);
508 setOperationAction(ISD::FDIV, VT, Expand);
509 setOperationAction(ISD::FEXP2, VT, Expand);
510 setOperationAction(ISD::FEXP, VT, Expand);
511 setOperationAction(ISD::FLOG2, VT, Expand);
512 setOperationAction(ISD::FREM, VT, Expand);
513 setOperationAction(ISD::FLOG, VT, Expand);
514 setOperationAction(ISD::FLOG10, VT, Expand);
515 setOperationAction(ISD::FPOW, VT, Expand);
516 setOperationAction(ISD::FFLOOR, VT, Expand);
517 setOperationAction(ISD::FTRUNC, VT, Expand);
518 setOperationAction(ISD::FMUL, VT, Expand);
519 setOperationAction(ISD::FMA, VT, Expand);
520 setOperationAction(ISD::FRINT, VT, Expand);
521 setOperationAction(ISD::FNEARBYINT, VT, Expand);
522 setOperationAction(ISD::FSQRT, VT, Expand);
523 setOperationAction(ISD::FSIN, VT, Expand);
524 setOperationAction(ISD::FSUB, VT, Expand);
525 setOperationAction(ISD::FNEG, VT, Expand);
526 setOperationAction(ISD::VSELECT, VT, Expand);
527 setOperationAction(ISD::SELECT_CC, VT, Expand);
528 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
529 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
530 setOperationAction(ISD::SETCC, VT, Expand);
531 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
532 }
533
534 // This causes using an unrolled select operation rather than expansion with
535 // bit operations. This is in general better, but the alternative using BFI
536 // instructions may be better if the select sources are SGPRs.
537 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
538 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
539
540 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
541 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
542
543 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
544 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
545
546 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
547 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
548
549 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
550 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
551
552 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
553 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
554
555 // There are no libcalls of any kind.
556 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
557 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
558
559 setSchedulingPreference(Sched::RegPressure);
560 setJumpIsExpensive(true);
561
562 // FIXME: This is only partially true. If we have to do vector compares, any
563 // SGPR pair can be a condition register. If we have a uniform condition, we
564 // are better off doing SALU operations, where there is only one SCC. For now,
565 // we don't have a way of knowing during instruction selection if a condition
566 // will be uniform and we always use vector compares. Assume we are using
567 // vector compares until that is fixed.
568 setHasMultipleConditionRegisters(true);
569
570 setMinCmpXchgSizeInBits(32);
571 setSupportsUnalignedAtomics(false);
572
573 PredictableSelectIsExpensive = false;
574
575 // We want to find all load dependencies for long chains of stores to enable
576 // merging into very wide vectors. The problem is with vectors with > 4
577 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
578 // vectors are a legal type, even though we have to split the loads
579 // usually. When we can more precisely specify load legality per address
580 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
581 // smarter so that they can figure out what to do in 2 iterations without all
582 // N > 4 stores on the same chain.
583 GatherAllAliasesMaxDepth = 16;
584
585 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
586 // about these during lowering.
587 MaxStoresPerMemcpy = 0xffffffff;
588 MaxStoresPerMemmove = 0xffffffff;
589 MaxStoresPerMemset = 0xffffffff;
590
591 // The expansion for 64-bit division is enormous.
592 if (AMDGPUBypassSlowDiv)
593 addBypassSlowDiv(64, 32);
594
595 setTargetDAGCombine(ISD::BITCAST);
596 setTargetDAGCombine(ISD::SHL);
597 setTargetDAGCombine(ISD::SRA);
598 setTargetDAGCombine(ISD::SRL);
599 setTargetDAGCombine(ISD::TRUNCATE);
600 setTargetDAGCombine(ISD::MUL);
601 setTargetDAGCombine(ISD::MULHU);
602 setTargetDAGCombine(ISD::MULHS);
603 setTargetDAGCombine(ISD::SELECT);
604 setTargetDAGCombine(ISD::SELECT_CC);
605 setTargetDAGCombine(ISD::STORE);
606 setTargetDAGCombine(ISD::FADD);
607 setTargetDAGCombine(ISD::FSUB);
608 setTargetDAGCombine(ISD::FNEG);
609 setTargetDAGCombine(ISD::FABS);
610 setTargetDAGCombine(ISD::AssertZext);
611 setTargetDAGCombine(ISD::AssertSext);
612 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
613}
614
615bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
616 if (getTargetMachine().Options.NoSignedZerosFPMath)
617 return true;
618
619 const auto Flags = Op.getNode()->getFlags();
620 if (Flags.hasNoSignedZeros())
621 return true;
622
623 return false;
624}
625
626//===----------------------------------------------------------------------===//
627// Target Information
628//===----------------------------------------------------------------------===//
629
630LLVM_READNONE__attribute__((__const__))
631static bool fnegFoldsIntoOp(unsigned Opc) {
632 switch (Opc) {
633 case ISD::FADD:
634 case ISD::FSUB:
635 case ISD::FMUL:
636 case ISD::FMA:
637 case ISD::FMAD:
638 case ISD::FMINNUM:
639 case ISD::FMAXNUM:
640 case ISD::FMINNUM_IEEE:
641 case ISD::FMAXNUM_IEEE:
642 case ISD::FSIN:
643 case ISD::FTRUNC:
644 case ISD::FRINT:
645 case ISD::FNEARBYINT:
646 case ISD::FCANONICALIZE:
647 case AMDGPUISD::RCP:
648 case AMDGPUISD::RCP_LEGACY:
649 case AMDGPUISD::RCP_IFLAG:
650 case AMDGPUISD::SIN_HW:
651 case AMDGPUISD::FMUL_LEGACY:
652 case AMDGPUISD::FMIN_LEGACY:
653 case AMDGPUISD::FMAX_LEGACY:
654 case AMDGPUISD::FMED3:
655 // TODO: handle llvm.amdgcn.fma.legacy
656 return true;
657 default:
658 return false;
659 }
660}
661
662/// \p returns true if the operation will definitely need to use a 64-bit
663/// encoding, and thus will use a VOP3 encoding regardless of the source
664/// modifiers.
665LLVM_READONLY__attribute__((__pure__))
666static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
667 return N->getNumOperands() > 2 || VT == MVT::f64;
668}
669
670// Most FP instructions support source modifiers, but this could be refined
671// slightly.
672LLVM_READONLY__attribute__((__pure__))
673static bool hasSourceMods(const SDNode *N) {
674 if (isa<MemSDNode>(N))
675 return false;
676
677 switch (N->getOpcode()) {
678 case ISD::CopyToReg:
679 case ISD::SELECT:
680 case ISD::FDIV:
681 case ISD::FREM:
682 case ISD::INLINEASM:
683 case ISD::INLINEASM_BR:
684 case AMDGPUISD::DIV_SCALE:
685 case ISD::INTRINSIC_W_CHAIN:
686
687 // TODO: Should really be looking at the users of the bitcast. These are
688 // problematic because bitcasts are used to legalize all stores to integer
689 // types.
690 case ISD::BITCAST:
691 return false;
692 case ISD::INTRINSIC_WO_CHAIN: {
693 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
694 case Intrinsic::amdgcn_interp_p1:
695 case Intrinsic::amdgcn_interp_p2:
696 case Intrinsic::amdgcn_interp_mov:
697 case Intrinsic::amdgcn_interp_p1_f16:
698 case Intrinsic::amdgcn_interp_p2_f16:
699 return false;
700 default:
701 return true;
702 }
703 }
704 default:
705 return true;
706 }
707}
708
709bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
710 unsigned CostThreshold) {
711 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
712 // it is truly free to use a source modifier in all cases. If there are
713 // multiple users but for each one will necessitate using VOP3, there will be
714 // a code size increase. Try to avoid increasing code size unless we know it
715 // will save on the instruction count.
716 unsigned NumMayIncreaseSize = 0;
717 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
718
719 // XXX - Should this limit number of uses to check?
720 for (const SDNode *U : N->uses()) {
721 if (!hasSourceMods(U))
722 return false;
723
724 if (!opMustUseVOP3Encoding(U, VT)) {
725 if (++NumMayIncreaseSize > CostThreshold)
726 return false;
727 }
728 }
729
730 return true;
731}
732
733EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
734 ISD::NodeType ExtendKind) const {
735 assert(!VT.isVector() && "only scalar expected")(static_cast<void> (0));
736
737 // Round to the next multiple of 32-bits.
738 unsigned Size = VT.getSizeInBits();
739 if (Size <= 32)
740 return MVT::i32;
741 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
742}
743
744MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
745 return MVT::i32;
746}
747
748bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
749 return true;
750}
751
752// The backend supports 32 and 64 bit floating point immediates.
753// FIXME: Why are we reporting vectors of FP immediates as legal?
754bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
755 bool ForCodeSize) const {
756 EVT ScalarVT = VT.getScalarType();
757 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
758 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
759}
760
761// We don't want to shrink f64 / f32 constants.
762bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
763 EVT ScalarVT = VT.getScalarType();
764 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
765}
766
767bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
768 ISD::LoadExtType ExtTy,
769 EVT NewVT) const {
770 // TODO: This may be worth removing. Check regression tests for diffs.
771 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
772 return false;
773
774 unsigned NewSize = NewVT.getStoreSizeInBits();
775
776 // If we are reducing to a 32-bit load or a smaller multi-dword load,
777 // this is always better.
778 if (NewSize >= 32)
779 return true;
780
781 EVT OldVT = N->getValueType(0);
782 unsigned OldSize = OldVT.getStoreSizeInBits();
783
784 MemSDNode *MN = cast<MemSDNode>(N);
785 unsigned AS = MN->getAddressSpace();
786 // Do not shrink an aligned scalar load to sub-dword.
787 // Scalar engine cannot do sub-dword loads.
788 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
789 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
790 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
791 (isa<LoadSDNode>(N) &&
792 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
793 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
794 return false;
795
796 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
797 // extloads, so doing one requires using a buffer_load. In cases where we
798 // still couldn't use a scalar load, using the wider load shouldn't really
799 // hurt anything.
800
801 // If the old size already had to be an extload, there's no harm in continuing
802 // to reduce the width.
803 return (OldSize < 32);
804}
805
806bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
807 const SelectionDAG &DAG,
808 const MachineMemOperand &MMO) const {
809
810 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits())(static_cast<void> (0));
811
812 if (LoadTy.getScalarType() == MVT::i32)
813 return false;
814
815 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
816 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
817
818 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
819 return false;
820
821 bool Fast = false;
822 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
823 CastTy, MMO, &Fast) &&
824 Fast;
825}
826
827// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
828// profitable with the expansion for 64-bit since it's generally good to
829// speculate things.
830// FIXME: These should really have the size as a parameter.
831bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
832 return true;
833}
834
835bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
836 return true;
837}
838
839bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
840 switch (N->getOpcode()) {
841 case ISD::EntryToken:
842 case ISD::TokenFactor:
843 return true;
844 case ISD::INTRINSIC_WO_CHAIN: {
845 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
846 switch (IntrID) {
847 case Intrinsic::amdgcn_readfirstlane:
848 case Intrinsic::amdgcn_readlane:
849 return true;
850 }
851 return false;
852 }
853 case ISD::LOAD:
854 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
855 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
856 return true;
857 return false;
858 }
859 return false;
860}
861
862SDValue AMDGPUTargetLowering::getNegatedExpression(
863 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
864 NegatibleCost &Cost, unsigned Depth) const {
865
866 switch (Op.getOpcode()) {
867 case ISD::FMA:
868 case ISD::FMAD: {
869 // Negating a fma is not free if it has users without source mods.
870 if (!allUsesHaveSourceMods(Op.getNode()))
871 return SDValue();
872 break;
873 }
874 default:
875 break;
876 }
877
878 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
879 ForCodeSize, Cost, Depth);
880}
881
882//===---------------------------------------------------------------------===//
883// Target Properties
884//===---------------------------------------------------------------------===//
885
886bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
887 assert(VT.isFloatingPoint())(static_cast<void> (0));
888
889 // Packed operations do not have a fabs modifier.
890 return VT == MVT::f32 || VT == MVT::f64 ||
891 (Subtarget->has16BitInsts() && VT == MVT::f16);
892}
893
894bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
895 assert(VT.isFloatingPoint())(static_cast<void> (0));
896 // Report this based on the end legalized type.
897 VT = VT.getScalarType();
898 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
899}
900
901bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
902 unsigned NumElem,
903 unsigned AS) const {
904 return true;
905}
906
907bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
908 // There are few operations which truly have vector input operands. Any vector
909 // operation is going to involve operations on each component, and a
910 // build_vector will be a copy per element, so it always makes sense to use a
911 // build_vector input in place of the extracted element to avoid a copy into a
912 // super register.
913 //
914 // We should probably only do this if all users are extracts only, but this
915 // should be the common case.
916 return true;
917}
918
919bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
920 // Truncate is just accessing a subregister.
921
922 unsigned SrcSize = Source.getSizeInBits();
923 unsigned DestSize = Dest.getSizeInBits();
924
925 return DestSize < SrcSize && DestSize % 32 == 0 ;
926}
927
928bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
929 // Truncate is just accessing a subregister.
930
931 unsigned SrcSize = Source->getScalarSizeInBits();
932 unsigned DestSize = Dest->getScalarSizeInBits();
933
934 if (DestSize== 16 && Subtarget->has16BitInsts())
935 return SrcSize >= 32;
936
937 return DestSize < SrcSize && DestSize % 32 == 0;
938}
939
940bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
941 unsigned SrcSize = Src->getScalarSizeInBits();
942 unsigned DestSize = Dest->getScalarSizeInBits();
943
944 if (SrcSize == 16 && Subtarget->has16BitInsts())
945 return DestSize >= 32;
946
947 return SrcSize == 32 && DestSize == 64;
948}
949
950bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
951 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
952 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
953 // this will enable reducing 64-bit operations the 32-bit, which is always
954 // good.
955
956 if (Src == MVT::i16)
957 return Dest == MVT::i32 ||Dest == MVT::i64 ;
958
959 return Src == MVT::i32 && Dest == MVT::i64;
960}
961
962bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
963 return isZExtFree(Val.getValueType(), VT2);
964}
965
966bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
967 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
968 // limited number of native 64-bit operations. Shrinking an operation to fit
969 // in a single 32-bit register should always be helpful. As currently used,
970 // this is much less general than the name suggests, and is only used in
971 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
972 // not profitable, and may actually be harmful.
973 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
974}
975
976//===---------------------------------------------------------------------===//
977// TargetLowering Callbacks
978//===---------------------------------------------------------------------===//
979
980CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
981 bool IsVarArg) {
982 switch (CC) {
983 case CallingConv::AMDGPU_VS:
984 case CallingConv::AMDGPU_GS:
985 case CallingConv::AMDGPU_PS:
986 case CallingConv::AMDGPU_CS:
987 case CallingConv::AMDGPU_HS:
988 case CallingConv::AMDGPU_ES:
989 case CallingConv::AMDGPU_LS:
990 return CC_AMDGPU;
991 case CallingConv::C:
992 case CallingConv::Fast:
993 case CallingConv::Cold:
994 return CC_AMDGPU_Func;
995 case CallingConv::AMDGPU_Gfx:
996 return CC_SI_Gfx;
997 case CallingConv::AMDGPU_KERNEL:
998 case CallingConv::SPIR_KERNEL:
999 default:
1000 report_fatal_error("Unsupported calling convention for call");
1001 }
1002}
1003
1004CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1005 bool IsVarArg) {
1006 switch (CC) {
1007 case CallingConv::AMDGPU_KERNEL:
1008 case CallingConv::SPIR_KERNEL:
1009 llvm_unreachable("kernels should not be handled here")__builtin_unreachable();
1010 case CallingConv::AMDGPU_VS:
1011 case CallingConv::AMDGPU_GS:
1012 case CallingConv::AMDGPU_PS:
1013 case CallingConv::AMDGPU_CS:
1014 case CallingConv::AMDGPU_HS:
1015 case CallingConv::AMDGPU_ES:
1016 case CallingConv::AMDGPU_LS:
1017 return RetCC_SI_Shader;
1018 case CallingConv::AMDGPU_Gfx:
1019 return RetCC_SI_Gfx;
1020 case CallingConv::C:
1021 case CallingConv::Fast:
1022 case CallingConv::Cold:
1023 return RetCC_AMDGPU_Func;
1024 default:
1025 report_fatal_error("Unsupported calling convention.");
1026 }
1027}
1028
1029/// The SelectionDAGBuilder will automatically promote function arguments
1030/// with illegal types. However, this does not work for the AMDGPU targets
1031/// since the function arguments are stored in memory as these illegal types.
1032/// In order to handle this properly we need to get the original types sizes
1033/// from the LLVM IR Function and fixup the ISD:InputArg values before
1034/// passing them to AnalyzeFormalArguments()
1035
1036/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1037/// input values across multiple registers. Each item in the Ins array
1038/// represents a single value that will be stored in registers. Ins[x].VT is
1039/// the value type of the value that will be stored in the register, so
1040/// whatever SDNode we lower the argument to needs to be this type.
1041///
1042/// In order to correctly lower the arguments we need to know the size of each
1043/// argument. Since Ins[x].VT gives us the size of the register that will
1044/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1045/// for the orignal function argument so that we can deduce the correct memory
1046/// type to use for Ins[x]. In most cases the correct memory type will be
1047/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1048/// we have a kernel argument of type v8i8, this argument will be split into
1049/// 8 parts and each part will be represented by its own item in the Ins array.
1050/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1051/// the argument before it was split. From this, we deduce that the memory type
1052/// for each individual part is i8. We pass the memory type as LocVT to the
1053/// calling convention analysis function and the register type (Ins[x].VT) as
1054/// the ValVT.
1055void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1056 CCState &State,
1057 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1058 const MachineFunction &MF = State.getMachineFunction();
1059 const Function &Fn = MF.getFunction();
1060 LLVMContext &Ctx = Fn.getParent()->getContext();
1061 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1062 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1063 CallingConv::ID CC = Fn.getCallingConv();
1064
1065 Align MaxAlign = Align(1);
1066 uint64_t ExplicitArgOffset = 0;
1067 const DataLayout &DL = Fn.getParent()->getDataLayout();
1068
1069 unsigned InIndex = 0;
1070
1071 for (const Argument &Arg : Fn.args()) {
1072 const bool IsByRef = Arg.hasByRefAttr();
1073 Type *BaseArgTy = Arg.getType();
1074 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1075 MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
1076 if (!Alignment)
1077 Alignment = DL.getABITypeAlign(MemArgTy);
1078 MaxAlign = max(Alignment, MaxAlign);
1079 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1080
1081 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1082 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1083
1084 // We're basically throwing away everything passed into us and starting over
1085 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1086 // to us as computed in Ins.
1087 //
1088 // We also need to figure out what type legalization is trying to do to get
1089 // the correct memory offsets.
1090
1091 SmallVector<EVT, 16> ValueVTs;
1092 SmallVector<uint64_t, 16> Offsets;
1093 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1094
1095 for (unsigned Value = 0, NumValues = ValueVTs.size();
1096 Value != NumValues; ++Value) {
1097 uint64_t BasePartOffset = Offsets[Value];
1098
1099 EVT ArgVT = ValueVTs[Value];
1100 EVT MemVT = ArgVT;
1101 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1102 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1103
1104 if (NumRegs == 1) {
1105 // This argument is not split, so the IR type is the memory type.
1106 if (ArgVT.isExtended()) {
1107 // We have an extended type, like i24, so we should just use the
1108 // register type.
1109 MemVT = RegisterVT;
1110 } else {
1111 MemVT = ArgVT;
1112 }
1113 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1114 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1115 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements())(static_cast<void> (0));
1116 // We have a vector value which has been split into a vector with
1117 // the same scalar type, but fewer elements. This should handle
1118 // all the floating-point vector types.
1119 MemVT = RegisterVT;
1120 } else if (ArgVT.isVector() &&
1121 ArgVT.getVectorNumElements() == NumRegs) {
1122 // This arg has been split so that each element is stored in a separate
1123 // register.
1124 MemVT = ArgVT.getScalarType();
1125 } else if (ArgVT.isExtended()) {
1126 // We have an extended type, like i65.
1127 MemVT = RegisterVT;
1128 } else {
1129 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1130 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0)(static_cast<void> (0));
1131 if (RegisterVT.isInteger()) {
1132 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1133 } else if (RegisterVT.isVector()) {
1134 assert(!RegisterVT.getScalarType().isFloatingPoint())(static_cast<void> (0));
1135 unsigned NumElements = RegisterVT.getVectorNumElements();
1136 assert(MemoryBits % NumElements == 0)(static_cast<void> (0));
1137 // This vector type has been split into another vector type with
1138 // a different elements size.
1139 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1140 MemoryBits / NumElements);
1141 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1142 } else {
1143 llvm_unreachable("cannot deduce memory type.")__builtin_unreachable();
1144 }
1145 }
1146
1147 // Convert one element vectors to scalar.
1148 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1149 MemVT = MemVT.getScalarType();
1150
1151 // Round up vec3/vec5 argument.
1152 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1153 assert(MemVT.getVectorNumElements() == 3 ||(static_cast<void> (0))
1154 MemVT.getVectorNumElements() == 5)(static_cast<void> (0));
1155 MemVT = MemVT.getPow2VectorType(State.getContext());
1156 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1157 MemVT = MemVT.getRoundIntegerType(State.getContext());
1158 }
1159
1160 unsigned PartOffset = 0;
1161 for (unsigned i = 0; i != NumRegs; ++i) {
1162 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1163 BasePartOffset + PartOffset,
1164 MemVT.getSimpleVT(),
1165 CCValAssign::Full));
1166 PartOffset += MemVT.getStoreSize();
1167 }
1168 }
1169 }
1170}
1171
1172SDValue AMDGPUTargetLowering::LowerReturn(
1173 SDValue Chain, CallingConv::ID CallConv,
1174 bool isVarArg,
1175 const SmallVectorImpl<ISD::OutputArg> &Outs,
1176 const SmallVectorImpl<SDValue> &OutVals,
1177 const SDLoc &DL, SelectionDAG &DAG) const {
1178 // FIXME: Fails for r600 tests
1179 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1180 // "wave terminate should not have return values");
1181 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1182}
1183
1184//===---------------------------------------------------------------------===//
1185// Target specific lowering
1186//===---------------------------------------------------------------------===//
1187
1188/// Selects the correct CCAssignFn for a given CallingConvention value.
1189CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1190 bool IsVarArg) {
1191 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1192}
1193
1194CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1195 bool IsVarArg) {
1196 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1197}
1198
1199SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1200 SelectionDAG &DAG,
1201 MachineFrameInfo &MFI,
1202 int ClobberedFI) const {
1203 SmallVector<SDValue, 8> ArgChains;
1204 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1205 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1206
1207 // Include the original chain at the beginning of the list. When this is
1208 // used by target LowerCall hooks, this helps legalize find the
1209 // CALLSEQ_BEGIN node.
1210 ArgChains.push_back(Chain);
1211
1212 // Add a chain value for each stack argument corresponding
1213 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1214 UE = DAG.getEntryNode().getNode()->use_end();
1215 U != UE; ++U) {
1216 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1217 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1218 if (FI->getIndex() < 0) {
1219 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1220 int64_t InLastByte = InFirstByte;
1221 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1222
1223 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1224 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1225 ArgChains.push_back(SDValue(L, 1));
1226 }
1227 }
1228 }
1229 }
1230
1231 // Build a tokenfactor for all the chains.
1232 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1233}
1234
1235SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1236 SmallVectorImpl<SDValue> &InVals,
1237 StringRef Reason) const {
1238 SDValue Callee = CLI.Callee;
1239 SelectionDAG &DAG = CLI.DAG;
1240
1241 const Function &Fn = DAG.getMachineFunction().getFunction();
1242
1243 StringRef FuncName("<unknown>");
1244
1245 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1246 FuncName = G->getSymbol();
1247 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1248 FuncName = G->getGlobal()->getName();
1249
1250 DiagnosticInfoUnsupported NoCalls(
1251 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1252 DAG.getContext()->diagnose(NoCalls);
1253
1254 if (!CLI.IsTailCall) {
1255 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1256 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1257 }
1258
1259 return DAG.getEntryNode();
1260}
1261
1262SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1263 SmallVectorImpl<SDValue> &InVals) const {
1264 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1265}
1266
1267SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1268 SelectionDAG &DAG) const {
1269 const Function &Fn = DAG.getMachineFunction().getFunction();
1270
1271 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1272 SDLoc(Op).getDebugLoc());
1273 DAG.getContext()->diagnose(NoDynamicAlloca);
1274 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1275 return DAG.getMergeValues(Ops, SDLoc());
1276}
1277
1278SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1279 SelectionDAG &DAG) const {
1280 switch (Op.getOpcode()) {
1281 default:
1282 Op->print(errs(), &DAG);
1283 llvm_unreachable("Custom lowering code for this "__builtin_unreachable()
1284 "instruction is not implemented yet!")__builtin_unreachable();
1285 break;
1286 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1287 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1288 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1289 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1290 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1291 case ISD::FREM: return LowerFREM(Op, DAG);
1292 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1293 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1294 case ISD::FRINT: return LowerFRINT(Op, DAG);
1295 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1296 case ISD::FROUND: return LowerFROUND(Op, DAG);
1297 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1298 case ISD::FLOG:
1299 return LowerFLOG(Op, DAG, numbers::ln2f);
1300 case ISD::FLOG10:
1301 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1302 case ISD::FEXP:
1303 return lowerFEXP(Op, DAG);
1304 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1305 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1306 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1307 case ISD::FP_TO_SINT:
1308 case ISD::FP_TO_UINT:
1309 return LowerFP_TO_INT(Op, DAG);
1310 case ISD::CTTZ:
1311 case ISD::CTTZ_ZERO_UNDEF:
1312 case ISD::CTLZ:
1313 case ISD::CTLZ_ZERO_UNDEF:
1314 return LowerCTLZ_CTTZ(Op, DAG);
1315 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1316 }
1317 return Op;
1318}
1319
1320void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1321 SmallVectorImpl<SDValue> &Results,
1322 SelectionDAG &DAG) const {
1323 switch (N->getOpcode()) {
1324 case ISD::SIGN_EXTEND_INREG:
1325 // Different parts of legalization seem to interpret which type of
1326 // sign_extend_inreg is the one to check for custom lowering. The extended
1327 // from type is what really matters, but some places check for custom
1328 // lowering of the result type. This results in trying to use
1329 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1330 // nothing here and let the illegal result integer be handled normally.
1331 return;
1332 default:
1333 return;
1334 }
1335}
1336
1337bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1338 const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1339 if (!GVar || !GVar->hasInitializer())
1340 return false;
1341
1342 return !isa<UndefValue>(GVar->getInitializer());
1343}
1344
1345SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1346 SDValue Op,
1347 SelectionDAG &DAG) const {
1348
1349 const DataLayout &DL = DAG.getDataLayout();
1350 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1351 const GlobalValue *GV = G->getGlobal();
1352
1353 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1354 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1355 if (!MFI->isModuleEntryFunction() &&
1356 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1357 SDLoc DL(Op);
1358 const Function &Fn = DAG.getMachineFunction().getFunction();
1359 DiagnosticInfoUnsupported BadLDSDecl(
1360 Fn, "local memory global used by non-kernel function",
1361 DL.getDebugLoc(), DS_Warning);
1362 DAG.getContext()->diagnose(BadLDSDecl);
1363
1364 // We currently don't have a way to correctly allocate LDS objects that
1365 // aren't directly associated with a kernel. We do force inlining of
1366 // functions that use local objects. However, if these dead functions are
1367 // not eliminated, we don't want a compile time error. Just emit a warning
1368 // and a trap, since there should be no callable path here.
1369 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1370 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1371 Trap, DAG.getRoot());
1372 DAG.setRoot(OutputChain);
1373 return DAG.getUNDEF(Op.getValueType());
1374 }
1375
1376 // XXX: What does the value of G->getOffset() mean?
1377 assert(G->getOffset() == 0 &&(static_cast<void> (0))
1378 "Do not know what to do with an non-zero offset")(static_cast<void> (0));
1379
1380 // TODO: We could emit code to handle the initialization somewhere.
1381 if (!hasDefinedInitializer(GV)) {
1382 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1383 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1384 }
1385 }
1386
1387 const Function &Fn = DAG.getMachineFunction().getFunction();
1388 DiagnosticInfoUnsupported BadInit(
1389 Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1390 DAG.getContext()->diagnose(BadInit);
1391 return SDValue();
1392}
1393
1394SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1395 SelectionDAG &DAG) const {
1396 SmallVector<SDValue, 8> Args;
1397
1398 EVT VT = Op.getValueType();
1399 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1400 SDLoc SL(Op);
1401 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1402 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1403
1404 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1405 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1406 }
1407
1408 for (const SDUse &U : Op->ops())
1409 DAG.ExtractVectorElements(U.get(), Args);
1410
1411 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1412}
1413
1414SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1415 SelectionDAG &DAG) const {
1416
1417 SmallVector<SDValue, 8> Args;
1418 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1419 EVT VT = Op.getValueType();
1420 EVT SrcVT = Op.getOperand(0).getValueType();
1421
1422 // For these types, we have some TableGen patterns except if the index is 1
1423 if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1424 (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1425 Start != 1)
1426 return Op;
1427
1428 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1429 VT.getVectorNumElements());
1430
1431 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1432}
1433
1434/// Generate Min/Max node
1435SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1436 SDValue LHS, SDValue RHS,
1437 SDValue True, SDValue False,
1438 SDValue CC,
1439 DAGCombinerInfo &DCI) const {
1440 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1441 return SDValue();
1442
1443 SelectionDAG &DAG = DCI.DAG;
1444 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1445 switch (CCOpcode) {
1446 case ISD::SETOEQ:
1447 case ISD::SETONE:
1448 case ISD::SETUNE:
1449 case ISD::SETNE:
1450 case ISD::SETUEQ:
1451 case ISD::SETEQ:
1452 case ISD::SETFALSE:
1453 case ISD::SETFALSE2:
1454 case ISD::SETTRUE:
1455 case ISD::SETTRUE2:
1456 case ISD::SETUO:
1457 case ISD::SETO:
1458 break;
1459 case ISD::SETULE:
1460 case ISD::SETULT: {
1461 if (LHS == True)
1462 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1463 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1464 }
1465 case ISD::SETOLE:
1466 case ISD::SETOLT:
1467 case ISD::SETLE:
1468 case ISD::SETLT: {
1469 // Ordered. Assume ordered for undefined.
1470
1471 // Only do this after legalization to avoid interfering with other combines
1472 // which might occur.
1473 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1474 !DCI.isCalledByLegalizer())
1475 return SDValue();
1476
1477 // We need to permute the operands to get the correct NaN behavior. The
1478 // selected operand is the second one based on the failing compare with NaN,
1479 // so permute it based on the compare type the hardware uses.
1480 if (LHS == True)
1481 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1482 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1483 }
1484 case ISD::SETUGE:
1485 case ISD::SETUGT: {
1486 if (LHS == True)
1487 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1488 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1489 }
1490 case ISD::SETGT:
1491 case ISD::SETGE:
1492 case ISD::SETOGE:
1493 case ISD::SETOGT: {
1494 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1495 !DCI.isCalledByLegalizer())
1496 return SDValue();
1497
1498 if (LHS == True)
1499 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1500 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1501 }
1502 case ISD::SETCC_INVALID:
1503 llvm_unreachable("Invalid setcc condcode!")__builtin_unreachable();
1504 }
1505 return SDValue();
1506}
1507
1508std::pair<SDValue, SDValue>
1509AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1510 SDLoc SL(Op);
1511
1512 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1513
1514 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1515 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1516
1517 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1518 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1519
1520 return std::make_pair(Lo, Hi);
1521}
1522
1523SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1524 SDLoc SL(Op);
1525
1526 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1527 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1528 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1529}
1530
1531SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1532 SDLoc SL(Op);
1533
1534 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1535 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1536 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1537}
1538
1539// Split a vector type into two parts. The first part is a power of two vector.
1540// The second part is whatever is left over, and is a scalar if it would
1541// otherwise be a 1-vector.
1542std::pair<EVT, EVT>
1543AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1544 EVT LoVT, HiVT;
1545 EVT EltVT = VT.getVectorElementType();
1546 unsigned NumElts = VT.getVectorNumElements();
1547 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1548 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1549 HiVT = NumElts - LoNumElts == 1
1550 ? EltVT
1551 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1552 return std::make_pair(LoVT, HiVT);
1553}
1554
1555// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1556// scalar.
1557std::pair<SDValue, SDValue>
1558AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1559 const EVT &LoVT, const EVT &HiVT,
1560 SelectionDAG &DAG) const {
1561 assert(LoVT.getVectorNumElements() +(static_cast<void> (0))
1562 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=(static_cast<void> (0))
1563 N.getValueType().getVectorNumElements() &&(static_cast<void> (0))
1564 "More vector elements requested than available!")(static_cast<void> (0));
1565 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1566 DAG.getVectorIdxConstant(0, DL));
1567 SDValue Hi = DAG.getNode(
1568 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1569 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1570 return std::make_pair(Lo, Hi);
1571}
1572
1573SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1574 SelectionDAG &DAG) const {
1575 LoadSDNode *Load = cast<LoadSDNode>(Op);
1576 EVT VT = Op.getValueType();
1577 SDLoc SL(Op);
1578
1579
1580 // If this is a 2 element vector, we really want to scalarize and not create
1581 // weird 1 element vectors.
1582 if (VT.getVectorNumElements() == 2) {
1583 SDValue Ops[2];
1584 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1585 return DAG.getMergeValues(Ops, SL);
1586 }
1587
1588 SDValue BasePtr = Load->getBasePtr();
1589 EVT MemVT = Load->getMemoryVT();
1590
1591 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1592
1593 EVT LoVT, HiVT;
1594 EVT LoMemVT, HiMemVT;
1595 SDValue Lo, Hi;
1596
1597 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1598 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1599 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1600
1601 unsigned Size = LoMemVT.getStoreSize();
1602 unsigned BaseAlign = Load->getAlignment();
1603 unsigned HiAlign = MinAlign(BaseAlign, Size);
1604
1605 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1606 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1607 BaseAlign, Load->getMemOperand()->getFlags());
1608 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1609 SDValue HiLoad =
1610 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1611 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1612 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1613
1614 SDValue Join;
1615 if (LoVT == HiVT) {
1616 // This is the case that the vector is power of two so was evenly split.
1617 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1618 } else {
1619 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1620 DAG.getVectorIdxConstant(0, SL));
1621 Join = DAG.getNode(
1622 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1623 VT, Join, HiLoad,
1624 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1625 }
1626
1627 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1628 LoLoad.getValue(1), HiLoad.getValue(1))};
1629
1630 return DAG.getMergeValues(Ops, SL);
1631}
1632
1633SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1634 SelectionDAG &DAG) const {
1635 LoadSDNode *Load = cast<LoadSDNode>(Op);
1636 EVT VT = Op.getValueType();
1637 SDValue BasePtr = Load->getBasePtr();
1638 EVT MemVT = Load->getMemoryVT();
1639 SDLoc SL(Op);
1640 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1641 unsigned BaseAlign = Load->getAlignment();
1642 unsigned NumElements = MemVT.getVectorNumElements();
1643
1644 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1645 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1646 if (NumElements != 3 ||
1647 (BaseAlign < 8 &&
1648 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1649 return SplitVectorLoad(Op, DAG);
1650
1651 assert(NumElements == 3)(static_cast<void> (0));
1652
1653 EVT WideVT =
1654 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1655 EVT WideMemVT =
1656 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1657 SDValue WideLoad = DAG.getExtLoad(
1658 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1659 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1660 return DAG.getMergeValues(
1661 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1662 DAG.getVectorIdxConstant(0, SL)),
1663 WideLoad.getValue(1)},
1664 SL);
1665}
1666
1667SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1668 SelectionDAG &DAG) const {
1669 StoreSDNode *Store = cast<StoreSDNode>(Op);
1670 SDValue Val = Store->getValue();
1671 EVT VT = Val.getValueType();
1672
1673 // If this is a 2 element vector, we really want to scalarize and not create
1674 // weird 1 element vectors.
1675 if (VT.getVectorNumElements() == 2)
1676 return scalarizeVectorStore(Store, DAG);
1677
1678 EVT MemVT = Store->getMemoryVT();
1679 SDValue Chain = Store->getChain();
1680 SDValue BasePtr = Store->getBasePtr();
1681 SDLoc SL(Op);
1682
1683 EVT LoVT, HiVT;
1684 EVT LoMemVT, HiMemVT;
1685 SDValue Lo, Hi;
1686
1687 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1688 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1689 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1690
1691 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1692
1693 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1694 unsigned BaseAlign = Store->getAlignment();
1695 unsigned Size = LoMemVT.getStoreSize();
1696 unsigned HiAlign = MinAlign(BaseAlign, Size);
1697
1698 SDValue LoStore =
1699 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1700 Store->getMemOperand()->getFlags());
1701 SDValue HiStore =
1702 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1703 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1704
1705 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1706}
1707
1708// This is a shortcut for integer division because we have fast i32<->f32
1709// conversions, and fast f32 reciprocal instructions. The fractional part of a
1710// float is enough to accurately represent up to a 24-bit signed integer.
//
// Op's operand 0 is the dividend and operand 1 the divisor. Sign selects the
// signed variant (SINT_TO_FP / FP_TO_SINT conversions and sign-extension of
// the results) or the unsigned variant (UINT_TO_FP / FP_TO_UINT conversions
// and masking of the results).
//
// Returns the merged {Div, Rem} values, or an empty SDValue when either
// operand has fewer than 9 known sign bits.
1711SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1712 bool Sign) const {
1713 SDLoc DL(Op);
1714 EVT VT = Op.getValueType();
1715 SDValue LHS = Op.getOperand(0);
1716 SDValue RHS = Op.getOperand(1);
1717 MVT IntVT = MVT::i32;
1718 MVT FltVT = MVT::f32;
1719
// Bail out unless both operands have at least 9 known sign bits.
1720 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1721 if (LHSSignBits < 9)
1722 return SDValue();
1723
1724 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1725 if (RHSSignBits < 9)
1726 return SDValue();
1727
// DivBits is the bit width used to truncate the results at the end: the type
// width minus the sign bits common to both operands, plus one when signed.
1728 unsigned BitSize = VT.getSizeInBits();
1729 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1730 unsigned DivBits = BitSize - SignBits;
1731 if (Sign)
1732 ++DivBits;
1733
1734 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1735 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1736
// jq is the correction added to the truncated quotient when the remainder
// check below fires. It is 1 for the unsigned case.
1737 SDValue jq = DAG.getConstant(1, DL, IntVT);
1738
1739 if (Sign) {
// Signed case: derive the correction from the sign of LHS ^ RHS. Given the
// sign-bit precondition above this evaluates to +1 or -1.
1740 // char|short jq = ia ^ ib;
1741 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1742
1743 // jq = jq >> (bitsize - 2)
1744 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1745 DAG.getConstant(BitSize - 2, DL, VT));
1746
1747 // jq = jq | 0x1
1748 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1749 }
1750
1751 // int ia = (int)LHS;
1752 SDValue ia = LHS;
1753
1754 // int ib = (int)RHS;
1755 SDValue ib = RHS;
1756
1757 // float fa = (float)ia;
1758 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1759
1760 // float fb = (float)ib;
1761 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1762
// fq = fa * rcp(fb): floating-point estimate of the quotient.
1763 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1764 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1765
1766 // fq = trunc(fq);
1767 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1768
1769 // float fqneg = -fq;
1770 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1771
1772 MachineFunction &MF = DAG.getMachineFunction();
1773 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1774
// Pick the multiply-add opcode: FMA when the subtarget has no mad/mac f32
// instructions, FMAD when not all FP32 denormals are enabled, and FMAD_FTZ
// otherwise.
1775 // float fr = mad(fqneg, fb, fa);
1776 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1777 (unsigned)ISD::FMA :
1778 !MFI->getMode().allFP32Denormals() ?
1779 (unsigned)ISD::FMAD :
1780 (unsigned)AMDGPUISD::FMAD_FTZ;
1781 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1782
1783 // int iq = (int)fq;
1784 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1785
1786 // fr = fabs(fr);
1787 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1788
1789 // fb = fabs(fb);
1790 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1791
1792 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1793
1794 // int cv = fr >= fb;
1795 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1796
1797 // jq = (cv ? jq : 0);
1798 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1799
1800 // dst = iq + jq;
1801 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1802
1803 // Rem needs compensation, it's easier to recompute it
// Rem = LHS - Div * RHS.
1804 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1805 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1806
1807 // Truncate to number of bits this divide really is.
1808 if (Sign) {
// Signed: sign-extend both results from DivBits bits.
1809 SDValue InRegSize
1810 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1811 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1812 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1813 } else {
// Unsigned: keep only the low DivBits bits of both results.
1814 SDValue TruncMask = DAG.getConstant((UINT64_C(1)1UL << DivBits) - 1, DL, VT);
1815 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1816 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1817 }
1818
1819 return DAG.getMergeValues({ Div, Rem }, DL);
1820}
1821
// Expand an unsigned 64-bit divide/remainder of Op's operands (operand 0 is
// the dividend, operand 1 the divisor). The quotient and then the remainder
// are appended to Results. One of three strategies is used:
//   1. If the high 32 bits of both operands are known to be zero, emit a
//      32-bit UDIVREM on the low halves and pair each result with a zero
//      high half.
//   2. Otherwise, if i64 is a legal type, start from a floating-point
//      reciprocal estimate of the divisor, refine it twice in integer
//      arithmetic, and correct the resulting quotient/remainder up to twice.
//   3. Otherwise (the "r600 expansion"), divide the high halves speculatively
//      and compute the low quotient word one bit at a time.
1822void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1823 SelectionDAG &DAG,
1824 SmallVectorImpl<SDValue> &Results) const {
1825 SDLoc DL(Op);
1826 EVT VT = Op.getValueType();
1827
1828 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64")(static_cast<void> (0));
1829
1830 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1831
1832 SDValue One = DAG.getConstant(1, DL, HalfVT);
1833 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1834
1835 // HiLo split
1836 SDValue LHS = Op.getOperand(0);
1837 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1838 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1839
1840 SDValue RHS = Op.getOperand(1);
1841 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1842 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1843
// Strategy 1: both operands have their high 32 bits known zero, so a
// half-width UDIVREM on the low halves is sufficient.
1844 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1845 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1846
1847 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1848 LHS_Lo, RHS_Lo);
1849
// Rebuild 64-bit results with a zero high half.
1850 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1851 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1852
1853 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1854 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1855 return;
1856 }
1857
// Strategy 2: i64 is legal.
1858 if (isTypeLegal(MVT::i64)) {
1859 MachineFunction &MF = DAG.getMachineFunction();
1860 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1861
1862 // Compute denominator reciprocal.
// Multiply-add opcode: FMA when the subtarget has no mad/mac f32
// instructions, FMAD when not all FP32 denormals are enabled, and FMAD_FTZ
// otherwise.
1863 unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1864 (unsigned)ISD::FMA :
1865 !MFI->getMode().allFP32Denormals() ?
1866 (unsigned)ISD::FMAD :
1867 (unsigned)AMDGPUISD::FMAD_FTZ;
1868
// The f32 bit patterns used below decode as: 0x4f800000 = 2^32,
// 0x5f7ffffc = slightly below 2^64, 0x2f800000 = 2^-32, 0xcf800000 = -2^32.
// Mad1 = float(RHS_Hi) * 2^32 + float(RHS_Lo) approximates RHS as a float.
1869 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1870 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1871 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1872 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1873 Cvt_Lo);
1874 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1875 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1876 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
// Split the scaled reciprocal into a high word (Trunc) and a low word (Mad2).
1877 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1878 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1879 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1880 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1881 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1882 Mul1);
1883 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1884 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1885 SDValue Rcp64 = DAG.getBitcast(VT,
1886 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1887
1888 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1889 SDValue One64 = DAG.getConstant(1, DL, VT);
1890 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1891 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1892
// First refinement: Add1 = Rcp64 + mulhu(Rcp64, -RHS * Rcp64), computed on
// 32-bit halves with carry.
1893 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1894 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1895 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1896 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1897 Zero);
1898 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1899 One);
1900
1901 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1902 Mulhi1_Lo, Zero1);
1903 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1904 Mulhi1_Hi, Add1_Lo.getValue(1));
// High-half sum without the carry-in; the carry is applied in the second
// refinement below.
1905 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1906 SDValue Add1 = DAG.getBitcast(VT,
1907 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1908
// Second refinement: Add2 = Add1 + mulhu(Add1, -RHS * Add1).
1909 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1910 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1911 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1912 Zero);
1913 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1914 One);
1915
1916 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1917 Mulhi2_Lo, Zero1);
1918 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1919 Mulhi2_Hi, Add1_Lo.getValue(1));
1920 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1921 Zero, Add2_Lo.getValue(1));
1922 SDValue Add2 = DAG.getBitcast(VT,
1923 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
// Quotient estimate: Mulhi3 = mulhu(LHS, Add2).
1924 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1925
// Remainder estimate: Sub1 = LHS - RHS * Mulhi3, computed on halves with
// borrow.
1926 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1927
1928 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1929 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1930 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1931 Mul3_Lo, Zero1);
1932 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1933 Mul3_Hi, Sub1_Lo.getValue(1));
1934 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1935 SDValue Sub1 = DAG.getBitcast(VT,
1936 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1937
// C3 is all-ones when Sub1 >= RHS (unsigned): compare the low halves if the
// high halves are equal, otherwise compare the high halves.
1938 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1939 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1940 ISD::SETUGE);
1941 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1942 ISD::SETUGE);
1943 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1944
1945 // TODO: Here and below portions of the code can be enclosed into if/endif.
1946 // Currently control flow is unconditional and we have 4 selects after
1947 // potential endif to substitute PHIs.
1948
1949 // if C3 != 0 ...
// First correction candidate: Sub2 = Sub1 - RHS and Add3 = Mulhi3 + 1.
1950 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1951 RHS_Lo, Zero1);
1952 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1953 RHS_Hi, Sub1_Lo.getValue(1));
1954 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1955 Zero, Sub2_Lo.getValue(1));
1956 SDValue Sub2 = DAG.getBitcast(VT,
1957 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1958
1959 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1960
// C6 is all-ones when Sub2 >= RHS (unsigned), built the same way as C3.
1961 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1962 ISD::SETUGE);
1963 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1964 ISD::SETUGE);
1965 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1966
1967 // if (C6 != 0)
// Second correction candidate: Add4 = Add3 + 1 and Sub3 = Sub2 - RHS.
1968 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1969
1970 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1971 RHS_Lo, Zero1);
1972 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1973 RHS_Hi, Sub2_Lo.getValue(1));
1974 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1975 Zero, Sub3_Lo.getValue(1));
1976 SDValue Sub3 = DAG.getBitcast(VT,
1977 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1978
1979 // endif C6
1980 // endif C3
1981
// Pick the uncorrected, once-corrected or twice-corrected quotient and
// remainder according to C3 and C6.
1982 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1983 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1984
1985 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1986 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1987
1988 Results.push_back(Div);
1989 Results.push_back(Rem);
1990
1991 return;
1992 }
1993
1994 // r600 expansion.
1995 // Get Speculative values
1996 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1997 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1998
// When RHS_Hi == 0 the speculative high-half results are used; otherwise the
// running remainder starts as LHS_Hi and the high quotient word is zero.
1999 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2000 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2001 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2002
2003 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2004 SDValue DIV_Lo = Zero;
2005
2006 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2007
// Process LHS_Lo from its most significant bit down to bit 0: shift the next
// bit into REM, set the matching quotient bit when REM >= RHS, and subtract
// RHS from REM in that case.
2008 for (unsigned i = 0; i < halfBitWidth; ++i) {
2009 const unsigned bitPos = halfBitWidth - i - 1;
2010 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2011 // Get value of high bit
2012 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2013 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2014 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2015
2016 // Shift
2017 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2018 // Add LHS high bit
2019 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2020
2021 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2022 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2023
2024 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2025
2026 // Update REM
2027 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2028 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2029 }
2030
// Assemble the 64-bit quotient from its two words.
2031 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2032 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2033 Results.push_back(DIV);
2034 Results.push_back(REM);
2035}
2036
2037SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2038 SelectionDAG &DAG) const {
2039 SDLoc DL(Op);
2040 EVT VT = Op.getValueType();
2041
2042 if (VT == MVT::i64) {
2043 SmallVector<SDValue, 2> Results;
2044 LowerUDIVREM64(Op, DAG, Results);
2045 return DAG.getMergeValues(Results, DL);
2046 }
2047
2048 if (VT == MVT::i32) {
2049 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2050 return Res;
2051 }
2052
2053 SDValue X = Op.getOperand(0);
2054 SDValue Y = Op.getOperand(1);
2055
2056 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2057 // algorithm used here.
2058
2059 // Initial estimate of inv(y).
2060 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2061
2062 // One round of UNR.
2063 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2064 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2065 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2066 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2067
2068 // Quotient/remainder estimate.
2069 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2070 SDValue R =
2071 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2072
2073 // First quotient/remainder refinement.
2074 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2075 SDValue One = DAG.getConstant(1, DL, VT);
2076 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2077 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2078 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2079 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2080 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2081
2082 // Second quotient/remainder refinement.
2083 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2084 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2085 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2086 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2087 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2088
2089 return DAG.getMergeValues({Q, R}, DL);
2090}
2091
2092SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2093 SelectionDAG &DAG) const {
2094 SDLoc DL(Op);
2095 EVT VT = Op.getValueType();
2096
2097 SDValue LHS = Op.getOperand(0);
2098 SDValue RHS = Op.getOperand(1);
2099
2100 SDValue Zero = DAG.getConstant(0, DL, VT);
2101 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2102
2103 if (VT == MVT::i32) {
2104 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2105 return Res;
2106 }
2107
2108 if (VT == MVT::i64 &&
2109 DAG.ComputeNumSignBits(LHS) > 32 &&
2110 DAG.ComputeNumSignBits(RHS) > 32) {
2111 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2112
2113 //HiLo split
2114 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2115 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2116 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2117 LHS_Lo, RHS_Lo);
2118 SDValue Res[2] = {
2119 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2120 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2121 };
2122 return DAG.getMergeValues(Res, DL);
2123 }
2124
2125 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2126 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2127 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2128 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2129
2130 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2131 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2132
2133 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2134 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2135
2136 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2137 SDValue Rem = Div.getValue(1);
2138
2139 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2140 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2141
2142 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2143 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2144
2145 SDValue Res[2] = {
2146 Div,
2147 Rem
2148 };
2149 return DAG.getMergeValues(Res, DL);
2150}
2151
2152// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2153SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2154 SDLoc SL(Op);
2155 EVT VT = Op.getValueType();
2156 auto Flags = Op->getFlags();
2157 SDValue X = Op.getOperand(0);
2158 SDValue Y = Op.getOperand(1);
2159
2160 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2161 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2162 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2163 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2164 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2165}
2166
2167SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2168 SDLoc SL(Op);
2169 SDValue Src = Op.getOperand(0);
2170
2171 // result = trunc(src)
2172 // if (src > 0.0 && src != result)
2173 // result += 1.0
2174
2175 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2176
2177 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2178 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2179
2180 EVT SetCCVT =
2181 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2182
2183 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2184 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2185 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2186
2187 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2188 // TODO: Should this propagate fast-math-flags?
2189 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2190}
2191
2192static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2193 SelectionDAG &DAG) {
2194 const unsigned FractBits = 52;
2195 const unsigned ExpBits = 11;
2196
2197 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2198 Hi,
2199 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2200 DAG.getConstant(ExpBits, SL, MVT::i32));
2201 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2202 DAG.getConstant(1023, SL, MVT::i32));
2203
2204 return Exp;
2205}
2206
2207SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2208 SDLoc SL(Op);
2209 SDValue Src = Op.getOperand(0);
2210
2211 assert(Op.getValueType() == MVT::f64)(static_cast<void> (0));
2212
2213 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2214
2215 // Extract the upper half, since this is where we will find the sign and
2216 // exponent.
2217 SDValue Hi = getHiHalf64(Src, DAG);
2218
2219 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2220
2221 const unsigned FractBits = 52;
2222
2223 // Extract the sign bit.
2224 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1)1U << 31, SL, MVT::i32);
2225 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2226
2227 // Extend back to 64-bits.
2228 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2229 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2230
2231 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2232 const SDValue FractMask
2233 = DAG.getConstant((UINT64_C(1)1UL << FractBits) - 1, SL, MVT::i64);
2234
2235 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2236 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2237 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2238
2239 EVT SetCCVT =
2240 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2241
2242 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2243
2244 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2245 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2246
2247 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2248 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2249
2250 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2251}
2252
2253SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2254 SDLoc SL(Op);
2255 SDValue Src = Op.getOperand(0);
2256
2257 assert(Op.getValueType() == MVT::f64)(static_cast<void> (0));
2258
2259 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2260 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2261 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2262
2263 // TODO: Should this propagate fast-math-flags?
2264
2265 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2266 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2267
2268 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2269
2270 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2271 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2272
2273 EVT SetCCVT =
2274 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2275 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2276
2277 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2278}
2279
2280SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2281 // FNEARBYINT and FRINT are the same, except in their handling of FP
2282 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2283 // rint, so just treat them as equivalent.
2284 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2285}
2286
2287// XXX - May require not supporting f32 denormals?
2288
2289// Don't handle v2f16. The extra instructions to scalarize and repack around the
2290// compare and vselect end up producing worse code than scalarizing the whole
2291// operation.
2292SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2293 SDLoc SL(Op);
2294 SDValue X = Op.getOperand(0);
2295 EVT VT = Op.getValueType();
2296
2297 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2298
2299 // TODO: Should this propagate fast-math-flags?
2300
2301 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2302
2303 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2304
2305 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2306 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2307 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2308
2309 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2310
2311 EVT SetCCVT =
2312 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2313
2314 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2315
2316 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2317
2318 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2319}
2320
2321SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2322 SDLoc SL(Op);
2323 SDValue Src = Op.getOperand(0);
2324
2325 // result = trunc(src);
2326 // if (src < 0.0 && src != result)
2327 // result += -1.0.
2328
2329 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2330
2331 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2332 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2333
2334 EVT SetCCVT =
2335 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2336
2337 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2338 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2339 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2340
2341 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2342 // TODO: Should this propagate fast-math-flags?
2343 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2344}
2345
2346SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2347 double Log2BaseInverted) const {
2348 EVT VT = Op.getValueType();
2349
2350 SDLoc SL(Op);
2351 SDValue Operand = Op.getOperand(0);
2352 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2353 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2354
2355 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2356}
2357
2358// exp2(M_LOG2E_F * f);
2359SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2360 EVT VT = Op.getValueType();
2361 SDLoc SL(Op);
2362 SDValue Src = Op.getOperand(0);
2363
2364 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2365 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2366 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2367}
2368
2369static bool isCtlzOpc(unsigned Opc) {
2370 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2371}
2372
2373static bool isCttzOpc(unsigned Opc) {
2374 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2375}
2376
2377SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2378 SDLoc SL(Op);
2379 SDValue Src = Op.getOperand(0);
2380
2381 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()))(static_cast<void> (0));
2382 bool Ctlz = isCtlzOpc(Op.getOpcode());
2383 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2384
2385 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2386 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2387
2388 if (Src.getValueType() == MVT::i32) {
2389 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2390 // (cttz hi:lo) -> (umin (ffbl src), 32)
2391 // (ctlz_zero_undef src) -> (ffbh src)
2392 // (cttz_zero_undef src) -> (ffbl src)
2393 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2394 if (!ZeroUndef) {
2395 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2396 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2397 }
2398 return NewOpr;
2399 }
2400
2401 SDValue Lo, Hi;
2402 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2403
2404 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2405 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2406
2407 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2408 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2409 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2410 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2411
2412 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2413 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2414 if (Ctlz)
2415 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2416 else
2417 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2418
2419 SDValue NewOpr;
2420 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2421 if (!ZeroUndef) {
2422 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2423 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2424 }
2425
2426 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2427}
2428
2429SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2430 bool Signed) const {
2431 // The regular method coverting a 64-bit integer to float roughly consists of
2432 // 2 steps: normalization and rounding. In fact, after normalization, the
2433 // conversion from a 64-bit integer to a float is essentially the same as the
2434 // one from a 32-bit integer. The only difference is that it has more
2435 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2436 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2437 // converted into the correct float number. The basic steps for the unsigned
2438 // conversion are illustrated in the following pseudo code:
2439 //
2440 // f32 uitofp(i64 u) {
2441 // i32 hi, lo = split(u);
2442 // // Only count the leading zeros in hi as we have native support of the
2443 // // conversion from i32 to f32. If hi is all 0s, the conversion is
2444 // // reduced to a 32-bit one automatically.
2445 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2446 // u <<= shamt;
2447 // hi, lo = split(u);
2448 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2449 // // convert it as a 32-bit integer and scale the result back.
2450 // return uitofp(hi) * 2^(32 - shamt);
2451 // }
2452 //
2453 // The signed one follows the same principle but uses 'ffbh_i32' to count its
2454 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2455 // converted instead followed by negation based its sign bit.
2456
2457 SDLoc SL(Op);
2458 SDValue Src = Op.getOperand(0);
2459
2460 SDValue Lo, Hi;
2461 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2462 SDValue Sign;
2463 SDValue ShAmt;
2464 if (Signed && Subtarget->isGCN()) {
2465 // We also need to consider the sign bit in Lo if Hi has just sign bits,
2466 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2467 // account. That is, the maximal shift is
2468 // - 32 if Lo and Hi have opposite signs;
2469 // - 33 if Lo and Hi have the same sign.
2470 //
2471 // Or, MaxShAmt = 33 + OppositeSign, where
2472 //
2473 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2474 // - -1 if Lo and Hi have opposite signs; and
2475 // - 0 otherwise.
2476 //
2477 // All in all, ShAmt is calculated as
2478 //
2479 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2480 //
2481 // or
2482 //
2483 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2484 //
2485 // to reduce the critical path.
2486 SDValue OppositeSign = DAG.getNode(
2487 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2488 DAG.getConstant(31, SL, MVT::i32));
2489 SDValue MaxShAmt =
2490 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2491 OppositeSign);
2492 // Count the leading sign bits.
2493 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2494 // Different from unsigned conversion, the shift should be one bit less to
2495 // preserve the sign bit.
2496 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2497 DAG.getConstant(1, SL, MVT::i32));
2498 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2499 } else {
2500 if (Signed) {
2501 // Without 'ffbh_i32', only leading zeros could be counted. Take the
2502 // absolute value first.
2503 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2504 DAG.getConstant(63, SL, MVT::i64));
2505 SDValue Abs =
2506 DAG.getNode(ISD::XOR, SL, MVT::i64,
2507 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2508 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2509 }
2510 // Count the leading zeros.
2511 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2512 // The shift amount for signed integers is [0, 32].
2513 }
2514 // Normalize the given 64-bit integer.
2515 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2516 // Split it again.
2517 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2518 // Calculate the adjust bit for rounding.
2519 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2520 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2521 DAG.getConstant(1, SL, MVT::i32), Lo);
2522 // Get the 32-bit normalized integer.
2523 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2524 // Convert the normalized 32-bit integer into f32.
2525 unsigned Opc =
2526 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2527 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2528
2529 // Finally, need to scale back the converted floating number as the original
2530 // 64-bit integer is converted as a 32-bit one.
2531 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2532 ShAmt);
2533 // On GCN, use LDEXP directly.
2534 if (Subtarget->isGCN())
2535 return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2536
2537 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2538 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2539 // exponent is enough to avoid overflowing into the sign bit.
2540 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2541 DAG.getConstant(23, SL, MVT::i32));
2542 SDValue IVal =
2543 DAG.getNode(ISD::ADD, SL, MVT::i32,
2544 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2545 if (Signed) {
2546 // Set the sign bit.
2547 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2548 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2549 DAG.getConstant(31, SL, MVT::i32));
2550 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2551 }
2552 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2553}
2554
2555SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2556 bool Signed) const {
2557 SDLoc SL(Op);
2558 SDValue Src = Op.getOperand(0);
2559
2560 SDValue Lo, Hi;
2561 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2562
2563 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2564 SL, MVT::f64, Hi);
2565
2566 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2567
2568 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2569 DAG.getConstant(32, SL, MVT::i32));
2570 // TODO: Should this propagate fast-math-flags?
2571 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2572}
2573
2574SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2575 SelectionDAG &DAG) const {
2576 // TODO: Factor out code common with LowerSINT_TO_FP.
2577 EVT DestVT = Op.getValueType();
2578 SDValue Src = Op.getOperand(0);
2579 EVT SrcVT = Src.getValueType();
2580
2581 if (SrcVT == MVT::i16) {
2582 if (DestVT == MVT::f16)
2583 return Op;
2584 SDLoc DL(Op);
2585
2586 // Promote src to i32
2587 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2588 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2589 }
2590
2591 assert(SrcVT == MVT::i64 && "operation should be legal")(static_cast<void> (0));
2592
2593 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2594 SDLoc DL(Op);
2595
2596 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2597 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2598 SDValue FPRound =
2599 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2600
2601 return FPRound;
2602 }
2603
2604 if (DestVT == MVT::f32)
2605 return LowerINT_TO_FP32(Op, DAG, false);
2606
2607 assert(DestVT == MVT::f64)(static_cast<void> (0));
2608 return LowerINT_TO_FP64(Op, DAG, false);
2609}
2610
2611SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2612 SelectionDAG &DAG) const {
2613 EVT DestVT = Op.getValueType();
2614
2615 SDValue Src = Op.getOperand(0);
2616 EVT SrcVT = Src.getValueType();
2617
2618 if (SrcVT == MVT::i16) {
2619 if (DestVT == MVT::f16)
2620 return Op;
2621
2622 SDLoc DL(Op);
2623 // Promote src to i32
2624 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2625 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2626 }
2627
2628 assert(SrcVT == MVT::i64 && "operation should be legal")(static_cast<void> (0));
2629
2630 // TODO: Factor out code common with LowerUINT_TO_FP.
2631
2632 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2633 SDLoc DL(Op);
2634 SDValue Src = Op.getOperand(0);
2635
2636 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2637 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2638 SDValue FPRound =
2639 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2640
2641 return FPRound;
2642 }
2643
2644 if (DestVT == MVT::f32)
2645 return LowerINT_TO_FP32(Op, DAG, true);
2646
2647 assert(DestVT == MVT::f64)(static_cast<void> (0));
2648 return LowerINT_TO_FP64(Op, DAG, true);
2649}
2650
2651SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2652 bool Signed) const {
2653 SDLoc SL(Op);
2654
2655 SDValue Src = Op.getOperand(0);
2656 EVT SrcVT = Src.getValueType();
2657
2658 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64)(static_cast<void> (0));
2659
2660 // The basic idea of converting a floating point number into a pair of 32-bit
2661 // integers is illustrated as follows:
2662 //
2663 // tf := trunc(val);
2664 // hif := floor(tf * 2^-32);
2665 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2666 // hi := fptoi(hif);
2667 // lo := fptoi(lof);
2668 //
2669 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2670 SDValue Sign;
2671 if (Signed && SrcVT == MVT::f32) {
2672 // However, a 32-bit floating point number has only 23 bits mantissa and
2673 // it's not enough to hold all the significant bits of `lof` if val is
2674 // negative. To avoid the loss of precision, We need to take the absolute
2675 // value after truncating and flip the result back based on the original
2676 // signedness.
2677 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2678 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2679 DAG.getConstant(31, SL, MVT::i32));
2680 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2681 }
2682
2683 SDValue K0, K1;
2684 if (SrcVT == MVT::f64) {
2685 K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)0x3df0000000000000UL),
2686 SL, SrcVT);
2687 K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)0xc1f0000000000000UL),
2688 SL, SrcVT);
2689 } else {
2690 K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)0x2f800000U), SL,
2691 SrcVT);
2692 K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)0xcf800000U), SL,
2693 SrcVT);
2694 }
2695 // TODO: Should this propagate fast-math-flags?
2696 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2697
2698 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2699
2700 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2701
2702 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2703 : ISD::FP_TO_UINT,
2704 SL, MVT::i32, FloorMul);
2705 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2706
2707 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2708 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2709
2710 if (Signed && SrcVT == MVT::f32) {
2711 assert(Sign)(static_cast<void> (0));
2712 // Flip the result based on the signedness, which is either all 0s or 1s.
2713 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2714 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2715 // r := xor(r, sign) - sign;
2716 Result =
2717 DAG.getNode(ISD::SUB, SL, MVT::i64,
2718 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2719 }
2720
2721 return Result;
2722}
2723
2724SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2725 SDLoc DL(Op);
2726 SDValue N0 = Op.getOperand(0);
2727
2728 // Convert to target node to get known bits
2729 if (N0.getValueType() == MVT::f32)
2730 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2731
2732 if (getTargetMachine().Options.UnsafeFPMath) {
2733 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2734 return SDValue();
2735 }
2736
2737 assert(N0.getSimpleValueType() == MVT::f64)(static_cast<void> (0));
2738
2739 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2740 const unsigned ExpMask = 0x7ff;
2741 const unsigned ExpBiasf64 = 1023;
2742 const unsigned ExpBiasf16 = 15;
2743 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2744 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2745 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2746 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2747 DAG.getConstant(32, DL, MVT::i64));
2748 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2749 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2750 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2751 DAG.getConstant(20, DL, MVT::i64));
2752 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2753 DAG.getConstant(ExpMask, DL, MVT::i32));
2754 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2755 // add the f16 bias (15) to get the biased exponent for the f16 format.
2756 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2757 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2758
2759 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2760 DAG.getConstant(8, DL, MVT::i32));
2761 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2762 DAG.getConstant(0xffe, DL, MVT::i32));
2763
2764 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2765 DAG.getConstant(0x1ff, DL, MVT::i32));
2766 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2767
2768 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2769 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2770
2771 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2772 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2773 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2774 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2775
2776 // N = M | (E << 12);
2777 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2778 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2779 DAG.getConstant(12, DL, MVT::i32)));
2780
2781 // B = clamp(1-E, 0, 13);
2782 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2783 One, E);
2784 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2785 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2786 DAG.getConstant(13, DL, MVT::i32));
2787
2788 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2789 DAG.getConstant(0x1000, DL, MVT::i32));
2790
2791 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2792 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2793 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2794 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2795
2796 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2797 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2798 DAG.getConstant(0x7, DL, MVT::i32));
2799 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2800 DAG.getConstant(2, DL, MVT::i32));
2801 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2802 One, Zero, ISD::SETEQ);
2803 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2804 One, Zero, ISD::SETGT);
2805 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2806 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2807
2808 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2809 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2810 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2811 I, V, ISD::SETEQ);
2812
2813 // Extract the sign bit.
2814 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2815 DAG.getConstant(16, DL, MVT::i32));
2816 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2817 DAG.getConstant(0x8000, DL, MVT::i32));
2818
2819 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2820 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2821}
2822
2823SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2824 SelectionDAG &DAG) const {
2825 SDValue Src = Op.getOperand(0);
2826 unsigned OpOpcode = Op.getOpcode();
2827 EVT SrcVT = Src.getValueType();
2828 EVT DestVT = Op.getValueType();
2829
2830 // Will be selected natively
2831 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2832 return Op;
2833
2834 // Promote i16 to i32
2835 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2836 SDLoc DL(Op);
2837
2838 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2839 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2840 }
2841
2842 if (SrcVT == MVT::f16 ||
2843 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2844 SDLoc DL(Op);
2845
2846 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2847 unsigned Ext =
2848 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2849 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2850 }
2851
2852 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2853 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2854
2855 return SDValue();
2856}
2857
2858SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2859 SelectionDAG &DAG) const {
2860 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2861 MVT VT = Op.getSimpleValueType();
2862 MVT ScalarVT = VT.getScalarType();
2863
2864 assert(VT.isVector())(static_cast<void> (0));
2865
2866 SDValue Src = Op.getOperand(0);
2867 SDLoc DL(Op);
2868
2869 // TODO: Don't scalarize on Evergreen?
2870 unsigned NElts = VT.getVectorNumElements();
2871 SmallVector<SDValue, 8> Args;
2872 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2873
2874 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2875 for (unsigned I = 0; I < NElts; ++I)
2876 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2877
2878 return DAG.getBuildVector(VT, DL, Args);
2879}
2880
2881//===----------------------------------------------------------------------===//
2882// Custom DAG optimizations
2883//===----------------------------------------------------------------------===//
2884
2885static bool isU24(SDValue Op, SelectionDAG &DAG) {
2886 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2887}
2888
2889static bool isI24(SDValue Op, SelectionDAG &DAG) {
2890 EVT VT = Op.getValueType();
2891 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2892 // as unsigned 24-bit values.
2893 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2894}
2895
2896static SDValue simplifyMul24(SDNode *Node24,
2897 TargetLowering::DAGCombinerInfo &DCI) {
2898 SelectionDAG &DAG = DCI.DAG;
2899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2900 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2901
2902 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2903 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2904 unsigned NewOpcode = Node24->getOpcode();
2905 if (IsIntrin) {
2906 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2907 NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2908 AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2909 }
2910
2911 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2912
2913 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2914 // the operands to have other uses, but will only perform simplifications that
2915 // involve bypassing some nodes for this user.
2916 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2917 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2918 if (DemandedLHS || DemandedRHS)
2919 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2920 DemandedLHS ? DemandedLHS : LHS,
2921 DemandedRHS ? DemandedRHS : RHS);
2922
2923 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2924 // operands if this node is the only user.
2925 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2926 return SDValue(Node24, 0);
2927 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2928 return SDValue(Node24, 0);
2929
2930 return SDValue();
2931}
2932
2933template <typename IntTy>
2934static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2935 uint32_t Width, const SDLoc &DL) {
2936 if (Width + Offset < 32) {
2937 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2938 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2939 return DAG.getConstant(Result, DL, MVT::i32);
2940 }
2941
2942 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2943}
2944
2945static bool hasVolatileUser(SDNode *Val) {
2946 for (SDNode *U : Val->uses()) {
2947 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2948 if (M->isVolatile())
2949 return true;
2950 }
2951 }
2952
2953 return false;
2954}
2955
2956bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2957 // i32 vectors are the canonical memory type.
2958 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2959 return false;
2960
2961 if (!VT.isByteSized())
2962 return false;
2963
2964 unsigned Size = VT.getStoreSize();
2965
2966 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2967 return false;
2968
2969 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2970 return false;
2971
2972 return true;
2973}
2974
2975// Replace load of an illegal type with a store of a bitcast to a friendlier
2976// type.
2977SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2978 DAGCombinerInfo &DCI) const {
2979 if (!DCI.isBeforeLegalize())
2980 return SDValue();
2981
2982 LoadSDNode *LN = cast<LoadSDNode>(N);
2983 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2984 return SDValue();
2985
2986 SDLoc SL(N);
2987 SelectionDAG &DAG = DCI.DAG;
2988 EVT VT = LN->getMemoryVT();
2989
2990 unsigned Size = VT.getStoreSize();
2991 Align Alignment = LN->getAlign();
2992 if (Alignment < Size && isTypeLegal(VT)) {
2993 bool IsFast;
2994 unsigned AS = LN->getAddressSpace();
2995
2996 // Expand unaligned loads earlier than legalization. Due to visitation order
2997 // problems during legalization, the emitted instructions to pack and unpack
2998 // the bytes again are not eliminated in the case of an unaligned copy.
2999 if (!allowsMisalignedMemoryAccesses(
3000 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3001 SDValue Ops[2];
3002
3003 if (VT.isVector())
3004 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
3005 else
3006 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3007
3008 return DAG.getMergeValues(Ops, SDLoc(N));
3009 }
3010
3011 if (!IsFast)
3012 return SDValue();
3013 }
3014
3015 if (!shouldCombineMemoryType(VT))
3016 return SDValue();
3017
3018 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3019
3020 SDValue NewLoad
3021 = DAG.getLoad(NewVT, SL, LN->getChain(),
3022 LN->getBasePtr(), LN->getMemOperand());
3023
3024 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3025 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3026 return SDValue(N, 0);
3027}
3028
3029// Replace store of an illegal type with a store of a bitcast to a friendlier
3030// type.
3031SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3032 DAGCombinerInfo &DCI) const {
3033 if (!DCI.isBeforeLegalize())
3034 return SDValue();
3035
3036 StoreSDNode *SN = cast<StoreSDNode>(N);
3037 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3038 return SDValue();
3039
3040 EVT VT = SN->getMemoryVT();
3041 unsigned Size = VT.getStoreSize();
3042
3043 SDLoc SL(N);
3044 SelectionDAG &DAG = DCI.DAG;
3045 Align Alignment = SN->getAlign();
3046 if (Alignment < Size && isTypeLegal(VT)) {
3047 bool IsFast;
3048 unsigned AS = SN->getAddressSpace();
3049
3050 // Expand unaligned stores earlier than legalization. Due to visitation
3051 // order problems during legalization, the emitted instructions to pack and
3052 // unpack the bytes again are not eliminated in the case of an unaligned
3053 // copy.
3054 if (!allowsMisalignedMemoryAccesses(
3055 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3056 if (VT.isVector())
3057 return scalarizeVectorStore(SN, DAG);
3058
3059 return expandUnalignedStore(SN, DAG);
3060 }
3061
3062 if (!IsFast)
3063 return SDValue();
3064 }
3065
3066 if (!shouldCombineMemoryType(VT))
3067 return SDValue();
3068
3069 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3070 SDValue Val = SN->getValue();
3071
3072 //DCI.AddToWorklist(Val.getNode());
3073
3074 bool OtherUses = !Val.hasOneUse();
3075 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3076 if (OtherUses) {
3077 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3078 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3079 }
3080
3081 return DAG.getStore(SN->getChain(), SL, CastVal,
3082 SN->getBasePtr(), SN->getMemOperand());
3083}
3084
3085// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3086// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3087// issues.
3088SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3089 DAGCombinerInfo &DCI) const {
3090 SelectionDAG &DAG = DCI.DAG;
3091 SDValue N0 = N->getOperand(0);
3092
3093 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3094 // (vt2 (truncate (assertzext vt0:x, vt1)))
3095 if (N0.getOpcode() == ISD::TRUNCATE) {
3096 SDValue N1 = N->getOperand(1);
3097 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3098 SDLoc SL(N);
3099
3100 SDValue Src = N0.getOperand(0);
3101 EVT SrcVT = Src.getValueType();
3102 if (SrcVT.bitsGE(ExtVT)) {
3103 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3104 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3105 }
3106 }
3107
3108 return SDValue();
3109}
3110
3111SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3112 SDNode *N, DAGCombinerInfo &DCI) const {
3113 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3114 switch (IID) {
3115 case Intrinsic::amdgcn_mul_i24:
3116 case Intrinsic::amdgcn_mul_u24:
3117 return simplifyMul24(N, DCI);
3118 case Intrinsic::amdgcn_fract:
3119 case Intrinsic::amdgcn_rsq:
3120 case Intrinsic::amdgcn_rcp_legacy:
3121 case Intrinsic::amdgcn_rsq_legacy:
3122 case Intrinsic::amdgcn_rsq_clamp:
3123 case Intrinsic::amdgcn_ldexp: {
3124 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3125 SDValue Src = N->getOperand(1);
3126 return Src.isUndef() ? Src : SDValue();
3127 }
3128 default:
3129 return SDValue();
3130 }
3131}
3132
3133/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3134/// binary operation \p Opc to it with the corresponding constant operands.
3135SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3136 DAGCombinerInfo &DCI, const SDLoc &SL,
3137 unsigned Opc, SDValue LHS,
3138 uint32_t ValLo, uint32_t ValHi) const {
3139 SelectionDAG &DAG = DCI.DAG;
3140 SDValue Lo, Hi;
3141 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3142
3143 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3144 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3145
3146 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3147 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3148
3149 // Re-visit the ands. It's possible we eliminated one of them and it could
3150 // simplify the vector.
3151 DCI.AddToWorklist(Lo.getNode());
3152 DCI.AddToWorklist(Hi.getNode());
3153
3154 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3155 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3156}
3157
3158SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3159 DAGCombinerInfo &DCI) const {
3160 EVT VT = N->getValueType(0);
3161
3162 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3163 if (!RHS)
3164 return SDValue();
3165
3166 SDValue LHS = N->getOperand(0);
3167 unsigned RHSVal = RHS->getZExtValue();
3168 if (!RHSVal)
3169 return LHS;
3170
3171 SDLoc SL(N);
3172 SelectionDAG &DAG = DCI.DAG;
3173
3174 switch (LHS->getOpcode()) {
3175 default:
3176 break;
3177 case ISD::ZERO_EXTEND:
3178 case ISD::SIGN_EXTEND:
3179 case ISD::ANY_EXTEND: {
3180 SDValue X = LHS->getOperand(0);
3181
3182 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3183 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3184 // Prefer build_vector as the canonical form if packed types are legal.
3185 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3186 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3187 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3188 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3189 }
3190
3191 // shl (ext x) => zext (shl x), if shift does not overflow int
3192 if (VT != MVT::i64)
3193 break;
3194 KnownBits Known = DAG.computeKnownBits(X);
3195 unsigned LZ = Known.countMinLeadingZeros();
3196 if (LZ < RHSVal)
3197 break;
3198 EVT XVT = X.getValueType();
3199 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3200 return DAG.getZExtOrTrunc(Shl, SL, VT);
3201 }
3202 }
3203
3204 if (VT != MVT::i64)
3205 return SDValue();
3206
3207 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3208
3209 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3210 // common case, splitting this into a move and a 32-bit shift is faster and
3211 // the same code size.
3212 if (RHSVal < 32)
3213 return SDValue();
3214
3215 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3216
3217 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3218 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3219
3220 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3221
3222 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3223 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3224}
3225
3226SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3227 DAGCombinerInfo &DCI) const {
3228 if (N->getValueType(0) != MVT::i64)
3229 return SDValue();
3230
3231 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3232 if (!RHS)
3233 return SDValue();
3234
3235 SelectionDAG &DAG = DCI.DAG;
3236 SDLoc SL(N);
3237 unsigned RHSVal = RHS->getZExtValue();
3238
3239 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3240 if (RHSVal == 32) {
3241 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3242 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3243 DAG.getConstant(31, SL, MVT::i32));
3244
3245 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3246 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3247 }
3248
3249 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3250 if (RHSVal == 63) {
3251 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3252 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3253 DAG.getConstant(31, SL, MVT::i32));
3254 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3255 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3256 }
3257
3258 return SDValue();
3259}
3260
3261SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3262 DAGCombinerInfo &DCI) const {
3263 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3264 if (!RHS)
3265 return SDValue();
3266
3267 EVT VT = N->getValueType(0);
3268 SDValue LHS = N->getOperand(0);
3269 unsigned ShiftAmt = RHS->getZExtValue();
3270 SelectionDAG &DAG = DCI.DAG;
3271 SDLoc SL(N);
3272
3273 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3274 // this improves the ability to match BFE patterns in isel.
3275 if (LHS.getOpcode() == ISD::AND) {
3276 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3277 if (Mask->getAPIntValue().isShiftedMask() &&
3278 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3279 return DAG.getNode(
3280 ISD::AND, SL, VT,
3281 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3282 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3283 }
3284 }
3285 }
3286
3287 if (VT != MVT::i64)
3288 return SDValue();
3289
3290 if (ShiftAmt < 32)
3291 return SDValue();
3292
3293 // srl i64:x, C for C >= 32
3294 // =>
3295 // build_pair (srl hi_32(x), C - 32), 0
3296 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3297
3298 SDValue Hi = getHiHalf64(LHS, DAG);
3299
3300 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3301 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3302
3303 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3304
3305 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3306}
3307
3308SDValue AMDGPUTargetLowering::performTruncateCombine(
3309 SDNode *N, DAGCombinerInfo &DCI) const {
3310 SDLoc SL(N);
3311 SelectionDAG &DAG = DCI.DAG;
3312 EVT VT = N->getValueType(0);
3313 SDValue Src = N->getOperand(0);
3314
3315 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3316 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3317 SDValue Vec = Src.getOperand(0);
3318 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3319 SDValue Elt0 = Vec.getOperand(0);
3320 EVT EltVT = Elt0.getValueType();
3321 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3322 if (EltVT.isFloatingPoint()) {
3323 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3324 EltVT.changeTypeToInteger(), Elt0);
3325 }
3326
3327 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3328 }
3329 }
3330 }
3331
3332 // Equivalent of above for accessing the high element of a vector as an
3333 // integer operation.
3334 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3335 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3336 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3337 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3338 SDValue BV = stripBitcast(Src.getOperand(0));
3339 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3340 BV.getValueType().getVectorNumElements() == 2) {
3341 SDValue SrcElt = BV.getOperand(1);
3342 EVT SrcEltVT = SrcElt.getValueType();
3343 if (SrcEltVT.isFloatingPoint()) {
3344 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3345 SrcEltVT.changeTypeToInteger(), SrcElt);
3346 }
3347
3348 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3349 }
3350 }
3351 }
3352 }
3353
3354 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3355 //
3356 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3357 // i16 (trunc (srl (i32 (trunc x), K)))
3358 if (VT.getScalarSizeInBits() < 32) {
3359 EVT SrcVT = Src.getValueType();
3360 if (SrcVT.getScalarSizeInBits() > 32 &&
3361 (Src.getOpcode() == ISD::SRL ||
3362 Src.getOpcode() == ISD::SRA ||
3363 Src.getOpcode() == ISD::SHL)) {
3364 SDValue Amt = Src.getOperand(1);
3365 KnownBits Known = DAG.computeKnownBits(Amt);
3366 unsigned Size = VT.getScalarSizeInBits();
3367 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3368 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3369 EVT MidVT = VT.isVector() ?
3370 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3371 VT.getVectorNumElements()) : MVT::i32;
3372
3373 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3374 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3375 Src.getOperand(0));
3376 DCI.AddToWorklist(Trunc.getNode());
3377
3378 if (Amt.getValueType() != NewShiftVT) {
3379 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3380 DCI.AddToWorklist(Amt.getNode());
3381 }
3382
3383 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3384 Trunc, Amt);
3385 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3386 }
3387 }
3388 }
3389
3390 return SDValue();
3391}
3392
3393// We need to specifically handle i64 mul here to avoid unnecessary conversion
3394// instructions. If we only match on the legalized i64 mul expansion,
3395// SimplifyDemandedBits will be unable to remove them because there will be
3396// multiple uses due to the separate mul + mulh[su].
3397static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3398 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3399 if (Size <= 32) {
3400 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3401 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3402 }
3403
3404 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3405 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3406
3407 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3408 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3409
3410 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3411}
3412
3413SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3414 DAGCombinerInfo &DCI) const {
3415 EVT VT = N->getValueType(0);
3416
3417 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3418 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3419 // unnecessarily). isDivergent() is used as an approximation of whether the
3420 // value is in an SGPR.
3421 if (!N->isDivergent())
3422 return SDValue();
3423
3424 unsigned Size = VT.getSizeInBits();
3425 if (VT.isVector() || Size > 64)
3426 return SDValue();
3427
3428 // There are i16 integer mul/mad.
3429 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3430 return SDValue();
3431
3432 SelectionDAG &DAG = DCI.DAG;
3433 SDLoc DL(N);
3434
3435 SDValue N0 = N->getOperand(0);
3436 SDValue N1 = N->getOperand(1);
3437
3438 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3439 // in the source into any_extends if the result of the mul is truncated. Since
3440 // we can assume the high bits are whatever we want, use the underlying value
3441 // to avoid the unknown high bits from interfering.
3442 if (N0.getOpcode() == ISD::ANY_EXTEND)
3443 N0 = N0.getOperand(0);
3444
3445 if (N1.getOpcode() == ISD::ANY_EXTEND)
3446 N1 = N1.getOperand(0);
3447
3448 SDValue Mul;
3449
3450 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3451 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3452 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3453 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3454 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3455 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3456 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3457 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3458 } else {
3459 return SDValue();
3460 }
3461
3462 // We need to use sext even for MUL_U24, because MUL_U24 is used
3463 // for signed multiply of 8 and 16-bit types.
3464 return DAG.getSExtOrTrunc(Mul, DL, VT);
3465}
3466
3467SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3468 DAGCombinerInfo &DCI) const {
3469 EVT VT = N->getValueType(0);
3470
3471 if (!Subtarget->hasMulI24() || VT.isVector())
3472 return SDValue();
3473
3474 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3475 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3476 // unnecessarily). isDivergent() is used as an approximation of whether the
3477 // value is in an SGPR.
3478 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3479 // valu op anyway)
3480 if (Subtarget->hasSMulHi() && !N->isDivergent())
3481 return SDValue();
3482
3483 SelectionDAG &DAG = DCI.DAG;
3484 SDLoc DL(N);
3485
3486 SDValue N0 = N->getOperand(0);
3487 SDValue N1 = N->getOperand(1);
3488
3489 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3490 return SDValue();
3491
3492 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3493 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3494
3495 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3496 DCI.AddToWorklist(Mulhi.getNode());
3497 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3498}
3499
3500SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3501 DAGCombinerInfo &DCI) const {
3502 EVT VT = N->getValueType(0);
3503
3504 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3505 return SDValue();
3506
3507 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3508 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3509 // unnecessarily). isDivergent() is used as an approximation of whether the
3510 // value is in an SGPR.
3511 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3512 // valu op anyway)
3513 if (Subtarget->hasSMulHi() && !N->isDivergent())
3514 return SDValue();
3515
3516 SelectionDAG &DAG = DCI.DAG;
3517 SDLoc DL(N);
3518
3519 SDValue N0 = N->getOperand(0);
3520 SDValue N1 = N->getOperand(1);
3521
3522 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3523 return SDValue();
3524
3525 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3526 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3527
3528 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3529 DCI.AddToWorklist(Mulhi.getNode());
3530 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3531}
3532
3533static bool isNegativeOne(SDValue Val) {
3534 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3535 return C->isAllOnesValue();
3536 return false;
3537}
3538
3539SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3540 SDValue Op,
3541 const SDLoc &DL,
3542 unsigned Opc) const {
3543 EVT VT = Op.getValueType();
3544 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3545 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3546 LegalVT != MVT::i16))
3547 return SDValue();
3548
3549 if (VT != MVT::i32)
3550 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3551
3552 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3553 if (VT != MVT::i32)
3554 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3555
3556 return FFBX;
3557}
3558
3559// The native instructions return -1 on 0 input. Optimize out a select that
3560// produces -1 on 0.
3561//
3562// TODO: If zero is not undef, we could also do this if the output is compared
3563// against the bitwidth.
3564//
3565// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3566SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3567 SDValue LHS, SDValue RHS,
3568 DAGCombinerInfo &DCI) const {
3569 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3570 if (!CmpRhs || !CmpRhs->isNullValue())
3571 return SDValue();
3572
3573 SelectionDAG &DAG = DCI.DAG;
3574 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3575 SDValue CmpLHS = Cond.getOperand(0);
3576
3577 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3578 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3579 if (CCOpcode == ISD::SETEQ &&
3580 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3581 RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3582 unsigned Opc =
3583 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3584 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3585 }
3586
3587 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3588 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3589 if (CCOpcode == ISD::SETNE &&
3590 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3591 LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3592 unsigned Opc =
3593 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3594
3595 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3596 }
3597
3598 return SDValue();
3599}
3600
3601static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3602 unsigned Op,
3603 const SDLoc &SL,
3604 SDValue Cond,
3605 SDValue N1,
3606 SDValue N2) {
3607 SelectionDAG &DAG = DCI.DAG;
3608 EVT VT = N1.getValueType();
3609
3610 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3611 N1.getOperand(0), N2.getOperand(0));
3612 DCI.AddToWorklist(NewSelect.getNode());
3613 return DAG.getNode(Op, SL, VT, NewSelect);
3614}
3615
3616// Pull a free FP operation out of a select so it may fold into uses.
3617//
3618// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3619// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3620//
3621// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3622// select c, (fabs x), +k -> fabs (select c, x, k)
3623static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3624 SDValue N) {
3625 SelectionDAG &DAG = DCI.DAG;
3626 SDValue Cond = N.getOperand(0);
3627 SDValue LHS = N.getOperand(1);
3628 SDValue RHS = N.getOperand(2);
3629
3630 EVT VT = N.getValueType();
3631 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3632 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3633 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3634 SDLoc(N), Cond, LHS, RHS);
3635 }
3636
3637 bool Inv = false;
3638 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3639 std::swap(LHS, RHS);
3640 Inv = true;
3641 }
3642
3643 // TODO: Support vector constants.
3644 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3645 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3646 SDLoc SL(N);
3647 // If one side is an fneg/fabs and the other is a constant, we can push the
3648 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3649 SDValue NewLHS = LHS.getOperand(0);
3650 SDValue NewRHS = RHS;
3651
3652 // Careful: if the neg can be folded up, don't try to pull it back down.
3653 bool ShouldFoldNeg = true;
3654
3655 if (NewLHS.hasOneUse()) {
3656 unsigned Opc = NewLHS.getOpcode();
3657 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3658 ShouldFoldNeg = false;
3659 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3660 ShouldFoldNeg = false;
3661 }
3662
3663 if (ShouldFoldNeg) {
3664 if (LHS.getOpcode() == ISD::FNEG)
3665 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3666 else if (CRHS->isNegative())
3667 return SDValue();
3668
3669 if (Inv)
3670 std::swap(NewLHS, NewRHS);
3671
3672 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3673 Cond, NewLHS, NewRHS);
3674 DCI.AddToWorklist(NewSelect.getNode());
3675 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3676 }
3677 }
3678
3679 return SDValue();
3680}
3681
3682
3683SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3684 DAGCombinerInfo &DCI) const {
3685 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3686 return Folded;
3687
3688 SDValue Cond = N->getOperand(0);
3689 if (Cond.getOpcode() != ISD::SETCC)
3690 return SDValue();
3691
3692 EVT VT = N->getValueType(0);
3693 SDValue LHS = Cond.getOperand(0);
3694 SDValue RHS = Cond.getOperand(1);
3695 SDValue CC = Cond.getOperand(2);
3696
3697 SDValue True = N->getOperand(1);
3698 SDValue False = N->getOperand(2);
3699
3700 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3701 SelectionDAG &DAG = DCI.DAG;
3702 if (DAG.isConstantValueOfAnyType(True) &&
3703 !DAG.isConstantValueOfAnyType(False)) {
3704 // Swap cmp + select pair to move constant to false input.
3705 // This will allow using VOPC cndmasks more often.
3706 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3707
3708 SDLoc SL(N);
3709 ISD::CondCode NewCC =
3710 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3711
3712 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3713 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3714 }
3715
3716 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3717 SDValue MinMax
3718 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3719 // Revisit this node so we can catch min3/max3/med3 patterns.
3720 //DCI.AddToWorklist(MinMax.getNode());
3721 return MinMax;
3722 }
3723 }
3724
3725 // There's no reason to not do this if the condition has other uses.
3726 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3727}
3728
3729static bool isInv2Pi(const APFloat &APF) {
3730 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3731 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3732 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3733
3734 return APF.bitwiseIsEqual(KF16) ||
3735 APF.bitwiseIsEqual(KF32) ||
3736 APF.bitwiseIsEqual(KF64);
3737}
3738
3739// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
3740// additional cost to negate them.
3741bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3742 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3743 if (C->isZero() && !C->isNegative())
3744 return true;
3745
3746 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3747 return true;
3748 }
3749
3750 return false;
3751}
3752
3753static unsigned inverseMinMax(unsigned Opc) {
3754 switch (Opc) {
3755 case ISD::FMAXNUM:
3756 return ISD::FMINNUM;
3757 case ISD::FMINNUM:
3758 return ISD::FMAXNUM;
3759 case ISD::FMAXNUM_IEEE:
3760 return ISD::FMINNUM_IEEE;
3761 case ISD::FMINNUM_IEEE:
3762 return ISD::FMAXNUM_IEEE;
3763 case AMDGPUISD::FMAX_LEGACY:
3764 return AMDGPUISD::FMIN_LEGACY;
3765 case AMDGPUISD::FMIN_LEGACY:
3766 return AMDGPUISD::FMAX_LEGACY;
3767 default:
3768 llvm_unreachable("invalid min/max opcode")__builtin_unreachable();
3769 }
3770}
3771
3772SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3773 DAGCombinerInfo &DCI) const {
3774 SelectionDAG &DAG = DCI.DAG;
3775 SDValue N0 = N->getOperand(0);
3776 EVT VT = N->getValueType(0);
3777
3778 unsigned Opc = N0.getOpcode();
3779
3780 // If the input has multiple uses and we can either fold the negate down, or
3781 // the other uses cannot, give up. This both prevents unprofitable
3782 // transformations and infinite loops: we won't repeatedly try to fold around
3783 // a negate that has no 'good' form.
3784 if (N0.hasOneUse()) {
3785 // This may be able to fold into the source, but at a code size cost. Don't
3786 // fold if the fold into the user is free.
3787 if (allUsesHaveSourceMods(N, 0))
3788 return SDValue();
3789 } else {
3790 if (fnegFoldsIntoOp(Opc) &&
3791 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3792 return SDValue();
3793 }
3794
3795 SDLoc SL(N);
3796 switch (Opc) {
3797 case ISD::FADD: {
3798 if (!mayIgnoreSignedZero(N0))
3799 return SDValue();
3800
3801 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3802 SDValue LHS = N0.getOperand(0);
3803 SDValue RHS = N0.getOperand(1);
3804
3805 if (LHS.getOpcode() != ISD::FNEG)
3806 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3807 else
3808 LHS = LHS.getOperand(0);
3809
3810 if (RHS.getOpcode() != ISD::FNEG)
3811 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3812 else
3813 RHS = RHS.getOperand(0);
3814
3815 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3816 if (Res.getOpcode() != ISD::FADD)
3817 return SDValue(); // Op got folded away.
3818 if (!N0.hasOneUse())
3819 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3820 return Res;
3821 }
3822 case ISD::FMUL:
3823 case AMDGPUISD::FMUL_LEGACY: {
3824 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3825 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3826 SDValue LHS = N0.getOperand(0);
3827 SDValue RHS = N0.getOperand(1);
3828
3829 if (LHS.getOpcode() == ISD::FNEG)
3830 LHS = LHS.getOperand(0);
3831 else if (RHS.getOpcode() == ISD::FNEG)
3832 RHS = RHS.getOperand(0);
3833 else
3834 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3835
3836 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3837 if (Res.getOpcode() != Opc)
3838 return SDValue(); // Op got folded away.
3839 if (!N0.hasOneUse())
3840 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3841 return Res;
3842 }
3843 case ISD::FMA:
3844 case ISD::FMAD: {
3845 // TODO: handle llvm.amdgcn.fma.legacy
3846 if (!mayIgnoreSignedZero(N0))
3847 return SDValue();
3848
3849 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3850 SDValue LHS = N0.getOperand(0);
3851 SDValue MHS = N0.getOperand(1);
3852 SDValue RHS = N0.getOperand(2);
3853
3854 if (LHS.getOpcode() == ISD::FNEG)
3855 LHS = LHS.getOperand(0);
3856 else if (MHS.getOpcode() == ISD::FNEG)
3857 MHS = MHS.getOperand(0);
3858 else
3859 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3860
3861 if (RHS.getOpcode() != ISD::FNEG)
3862 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3863 else
3864 RHS = RHS.getOperand(0);
3865
3866 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3867 if (Res.getOpcode() != Opc)
3868 return SDValue(); // Op got folded away.
3869 if (!N0.hasOneUse())
3870 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3871 return Res;
3872 }
3873 case ISD::FMAXNUM:
3874 case ISD::FMINNUM:
3875 case ISD::FMAXNUM_IEEE:
3876 case ISD::FMINNUM_IEEE:
3877 case AMDGPUISD::FMAX_LEGACY:
3878 case AMDGPUISD::FMIN_LEGACY: {
3879 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3880 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3881 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3882 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3883
3884 SDValue LHS = N0.getOperand(0);
3885 SDValue RHS = N0.getOperand(1);
3886
3887 // 0 doesn't have a negated inline immediate.
3888 // TODO: This constant check should be generalized to other operations.
3889 if (isConstantCostlierToNegate(RHS))
3890 return SDValue();
3891
3892 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3893 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3894 unsigned Opposite = inverseMinMax(Opc);
3895
3896 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3897 if (Res.getOpcode() != Opposite)
3898 return SDValue(); // Op got folded away.
3899 if (!N0.hasOneUse())
3900 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3901 return Res;
3902 }
3903 case AMDGPUISD::FMED3: {
3904 SDValue Ops[3];
3905 for (unsigned I = 0; I < 3; ++I)
3906 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3907
3908 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3909 if (Res.getOpcode() != AMDGPUISD::FMED3)
3910 return SDValue(); // Op got folded away.
3911
3912 if (!N0.hasOneUse()) {
3913 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3914 DAG.ReplaceAllUsesWith(N0, Neg);
3915
3916 for (SDNode *U : Neg->uses())
3917 DCI.AddToWorklist(U);
3918 }
3919
3920 return Res;
3921 }
3922 case ISD::FP_EXTEND:
3923 case ISD::FTRUNC:
3924 case ISD::FRINT:
3925 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3926 case ISD::FSIN:
3927 case ISD::FCANONICALIZE:
3928 case AMDGPUISD::RCP:
3929 case AMDGPUISD::RCP_LEGACY:
3930 case AMDGPUISD::RCP_IFLAG:
3931 case AMDGPUISD::SIN_HW: {
3932 SDValue CvtSrc = N0.getOperand(0);
3933 if (CvtSrc.getOpcode() == ISD::FNEG) {
3934 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3935 // (fneg (rcp (fneg x))) -> (rcp x)
3936 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3937 }
3938
3939 if (!N0.hasOneUse())
3940 return SDValue();
3941
3942 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3943 // (fneg (rcp x)) -> (rcp (fneg x))
3944 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3945 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3946 }
3947 case ISD::FP_ROUND: {
3948 SDValue CvtSrc = N0.getOperand(0);
3949
3950 if (CvtSrc.getOpcode() == ISD::FNEG) {
3951 // (fneg (fp_round (fneg x))) -> (fp_round x)
3952 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3953 CvtSrc.getOperand(0), N0.getOperand(1));
3954 }
3955
3956 if (!N0.hasOneUse())
3957 return SDValue();
3958
3959 // (fneg (fp_round x)) -> (fp_round (fneg x))
3960 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3961 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3962 }
3963 case ISD::FP16_TO_FP: {
3964 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3965 // f16, but legalization of f16 fneg ends up pulling it out of the source.
3966 // Put the fneg back as a legal source operation that can be matched later.
3967 SDLoc SL(N);
3968
3969 SDValue Src = N0.getOperand(0);
3970 EVT SrcVT = Src.getValueType();
3971
3972 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3973 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3974 DAG.getConstant(0x8000, SL, SrcVT));
3975 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3976 }
3977 default:
3978 return SDValue();
3979 }
3980}
3981
3982SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3983 DAGCombinerInfo &DCI) const {
3984 SelectionDAG &DAG = DCI.DAG;
3985 SDValue N0 = N->getOperand(0);
3986
3987 if (!N0.hasOneUse())
3988 return SDValue();
3989
3990 switch (N0.getOpcode()) {
3991 case ISD::FP16_TO_FP: {
3992 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal")(static_cast<void> (0));
3993 SDLoc SL(N);
3994 SDValue Src = N0.getOperand(0);
3995 EVT SrcVT = Src.getValueType();
3996
3997 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3998 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3999 DAG.getConstant(0x7fff, SL, SrcVT));
4000 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4001 }
4002 default:
4003 return SDValue();
4004 }
4005}
4006
4007SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4008 DAGCombinerInfo &DCI) const {
4009 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4010 if (!CFP)
4011 return SDValue();
4012
4013 // XXX - Should this flush denormals?
4014 const APFloat &Val = CFP->getValueAPF();
4015 APFloat One(Val.getSemantics(), "1.0");
4016 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4017}
4018
4019SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4020 DAGCombinerInfo &DCI) const {
4021 SelectionDAG &DAG = DCI.DAG;
4022 SDLoc DL(N);
4023
4024 switch(N->getOpcode()) {
4025 default:
4026 break;
4027 case ISD::BITCAST: {
4028 EVT DestVT = N->getValueType(0);
4029
4030 // Push casts through vector builds. This helps avoid emitting a large
4031 // number of copies when materializing floating point vector constants.
4032 //
4033 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4034 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4035 if (DestVT.isVector()) {
4036 SDValue Src = N->getOperand(0);
4037 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4038 EVT SrcVT = Src.getValueType();
4039 unsigned NElts = DestVT.getVectorNumElements();
4040
4041 if (SrcVT.getVectorNumElements() == NElts) {
4042 EVT DestEltVT = DestVT.getVectorElementType();
4043
4044 SmallVector<SDValue, 8> CastedElts;
4045 SDLoc SL(N);
4046 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4047 SDValue Elt = Src.getOperand(I);
4048 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4049 }
4050
4051 return DAG.getBuildVector(DestVT, SL, CastedElts);
4052 }
4053 }
4054 }
4055
4056 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4057 break;
4058
4059 // Fold bitcasts of constants.
4060 //
4061 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4062 // TODO: Generalize and move to DAGCombiner
4063 SDValue Src = N->getOperand(0);
4064 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4065 SDLoc SL(N);
4066 uint64_t CVal = C->getZExtValue();
4067 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4068 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4069 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4070 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4071 }
4072
4073 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4074 const APInt &Val = C->getValueAPF().bitcastToAPInt();
4075 SDLoc SL(N);
4076 uint64_t CVal = Val.getZExtValue();
4077 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4078 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4079 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4080
4081 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4082 }
4083
4084 break;
4085 }
4086 case ISD::SHL: {
4087 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4088 break;
4089
4090 return performShlCombine(N, DCI);
4091 }
4092 case ISD::SRL: {
4093 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4094 break;
4095
4096 return performSrlCombine(N, DCI);
4097 }
4098 case ISD::SRA: {
4099 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4100 break;
4101
4102 return performSraCombine(N, DCI);
4103 }
4104 case ISD::TRUNCATE:
4105 return performTruncateCombine(N, DCI);
4106 case ISD::MUL:
4107 return performMulCombine(N, DCI);
4108 case ISD::MULHS:
4109 return performMulhsCombine(N, DCI);
4110 case ISD::MULHU:
4111 return performMulhuCombine(N, DCI);
4112 case AMDGPUISD::MUL_I24:
4113 case AMDGPUISD::MUL_U24:
4114 case AMDGPUISD::MULHI_I24:
4115 case AMDGPUISD::MULHI_U24:
4116 return simplifyMul24(N, DCI);
4117 case ISD::SELECT:
4118 return performSelectCombine(N, DCI);
4119 case ISD::FNEG:
4120 return performFNegCombine(N, DCI);
4121 case ISD::FABS:
4122 return performFAbsCombine(N, DCI);
4123 case AMDGPUISD::BFE_I32:
4124 case AMDGPUISD::BFE_U32: {
4125 assert(!N->getValueType(0).isVector() &&(static_cast<void> (0))
4126 "Vector handling of BFE not implemented")(static_cast<void> (0));
4127 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4128 if (!Width)
4129 break;
4130
4131 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4132 if (WidthVal == 0)
4133 return DAG.getConstant(0, DL, MVT::i32);
4134
4135 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4136 if (!Offset)
4137 break;
4138
4139 SDValue BitsFrom = N->getOperand(0);
4140 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4141
4142 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4143
4144 if (OffsetVal == 0) {
4145 // This is already sign / zero extended, so try to fold away extra BFEs.
4146 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4147
4148 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4149 if (OpSignBits >= SignBits)
4150 return BitsFrom;
4151
4152 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4153 if (Signed) {
4154 // This is a sign_extend_inreg. Replace it to take advantage of existing
4155 // DAG Combines. If not eliminated, we will match back to BFE during
4156 // selection.
4157
4158 // TODO: The sext_inreg of extended types ends, although we can could
4159 // handle them in a single BFE.
4160 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4161 DAG.getValueType(SmallVT));
4162 }
4163
4164 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4165 }
4166
4167 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4168 if (Signed) {
4169 return constantFoldBFE<int32_t>(DAG,
4170 CVal->getSExtValue(),
4171 OffsetVal,
4172 WidthVal,
4173 DL);
4174 }
4175
4176 return constantFoldBFE<uint32_t>(DAG,
4177 CVal->getZExtValue(),
4178 OffsetVal,
4179 WidthVal,
4180 DL);
4181 }
4182
4183 if ((OffsetVal + WidthVal) >= 32 &&
4184 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4185 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4186 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4187 BitsFrom, ShiftVal);
4188 }
4189
4190 if (BitsFrom.hasOneUse()) {
4191 APInt Demanded = APInt::getBitsSet(32,
4192 OffsetVal,
4193 OffsetVal + WidthVal);
4194
4195 KnownBits Known;
4196 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4197 !DCI.isBeforeLegalizeOps());
4198 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4199 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4200 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4201 DCI.CommitTargetLoweringOpt(TLO);
4202 }
4203 }
4204
4205 break;
4206 }
4207 case ISD::LOAD:
4208 return performLoadCombine(N, DCI);
4209 case ISD::STORE:
4210 return performStoreCombine(N, DCI);
4211 case AMDGPUISD::RCP:
4212 case AMDGPUISD::RCP_IFLAG:
4213 return performRcpCombine(N, DCI);
4214 case ISD::AssertZext:
4215 case ISD::AssertSext:
4216 return performAssertSZExtCombine(N, DCI);
4217 case ISD::INTRINSIC_WO_CHAIN:
4218 return performIntrinsicWOChainCombine(N, DCI);
4219 }
4220 return SDValue();
4221}
4222
4223//===----------------------------------------------------------------------===//
4224// Helper functions
4225//===----------------------------------------------------------------------===//
4226
4227SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4228 const TargetRegisterClass *RC,
4229 Register Reg, EVT VT,
4230 const SDLoc &SL,
4231 bool RawReg) const {
4232 MachineFunction &MF = DAG.getMachineFunction();
4233 MachineRegisterInfo &MRI = MF.getRegInfo();
4234 Register VReg;
4235
4236 if (!MRI.isLiveIn(Reg)) {
4237 VReg = MRI.createVirtualRegister(RC);
4238 MRI.addLiveIn(Reg, VReg);
4239 } else {
4240 VReg = MRI.getLiveInVirtReg(Reg);
4241 }
4242
4243 if (RawReg)
4244 return DAG.getRegister(VReg, VT);
4245
4246 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4247}
4248
4249// This may be called multiple times, and nothing prevents creating multiple
4250// objects at the same offset. See if we already defined this object.
4251static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4252 int64_t Offset) {
4253 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4254 if (MFI.getObjectOffset(I) == Offset) {
4255 assert(MFI.getObjectSize(I) == Size)(static_cast<void> (0));
4256 return I;
4257 }
4258 }
4259
4260 return MFI.CreateFixedObject(Size, Offset, true);
4261}
4262
4263SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4264 EVT VT,
4265 const SDLoc &SL,
4266 int64_t Offset) const {
4267 MachineFunction &MF = DAG.getMachineFunction();
4268 MachineFrameInfo &MFI = MF.getFrameInfo();
4269 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4270
4271 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4272 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4273
4274 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4275 MachineMemOperand::MODereferenceable |
4276 MachineMemOperand::MOInvariant);
4277}
4278
4279SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4280 const SDLoc &SL,
4281 SDValue Chain,
4282 SDValue ArgVal,
4283 int64_t Offset) const {
4284 MachineFunction &MF = DAG.getMachineFunction();
4285 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4286 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4287
4288 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4289 // Stores to the argument stack area are relative to the stack pointer.
4290 SDValue SP =
4291 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4292 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4293 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4294 MachineMemOperand::MODereferenceable);
4295 return Store;
4296}
4297
4298SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4299 const TargetRegisterClass *RC,
4300 EVT VT, const SDLoc &SL,
4301 const ArgDescriptor &Arg) const {
4302 assert(Arg && "Attempting to load missing argument")(static_cast<void> (0));
4303
4304 SDValue V = Arg.isRegister() ?
1
'?' condition is true
4305 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4306 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4307
4308 if (!Arg.isMasked())
2
Calling 'ArgDescriptor::isMasked'
5
Returning from 'ArgDescriptor::isMasked'
6
Taking false branch
4309 return V;
4310
4311 unsigned Mask = Arg.getMask();
4312 unsigned Shift = countTrailingZeros<unsigned>(Mask);
7
Calling 'countTrailingZeros<unsigned int>'
14
Returning from 'countTrailingZeros<unsigned int>'
15
'Shift' initialized to 32
4313 V = DAG.getNode(ISD::SRL, SL, VT, V,
4314 DAG.getShiftAmountConstant(Shift, VT, SL));
4315 return DAG.getNode(ISD::AND, SL, VT, V,
4316 DAG.getConstant(Mask >> Shift, SL, VT));
16
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4317}
4318
4319uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4320 const MachineFunction &MF, const ImplicitParameter Param) const {
4321 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4322 const AMDGPUSubtarget &ST =
4323 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4324 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4325 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4326 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4327 ExplicitArgOffset;
4328 switch (Param) {
4329 case GRID_DIM:
4330 return ArgOffset;
4331 case GRID_OFFSET:
4332 return ArgOffset + 4;
4333 }
4334 llvm_unreachable("unexpected implicit parameter type")__builtin_unreachable();
4335}
4336
4337#define NODE_NAME_CASE(node)case AMDGPUISD::node: return "node"; case AMDGPUISD::node: return #node;
4338
4339const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4340 switch ((AMDGPUISD::NodeType)Opcode) {
4341 case AMDGPUISD::FIRST_NUMBER: break;
4342 // AMDIL DAG nodes
4343 NODE_NAME_CASE(UMUL)case AMDGPUISD::UMUL: return "UMUL";;
4344 NODE_NAME_CASE(BRANCH_COND)case AMDGPUISD::BRANCH_COND: return "BRANCH_COND";;
4345
4346 // AMDGPU DAG nodes
4347 NODE_NAME_CASE(IF)case AMDGPUISD::IF: return "IF";
4348 NODE_NAME_CASE(ELSE)case AMDGPUISD::ELSE: return "ELSE";
4349 NODE_NAME_CASE(LOOP)case AMDGPUISD::LOOP: return "LOOP";
4350 NODE_NAME_CASE(CALL)case AMDGPUISD::CALL: return "CALL";
4351 NODE_NAME_CASE(TC_RETURN)case AMDGPUISD::TC_RETURN: return "TC_RETURN";
4352 NODE_NAME_CASE(TRAP)case AMDGPUISD::TRAP: return "TRAP";
4353 NODE_NAME_CASE(RET_FLAG)case AMDGPUISD::RET_FLAG: return "RET_FLAG";
4354 NODE_NAME_CASE(RETURN_TO_EPILOG)case AMDGPUISD::RETURN_TO_EPILOG: return "RETURN_TO_EPILOG";
4355 NODE_NAME_CASE(ENDPGM)case AMDGPUISD::ENDPGM: return "ENDPGM";
4356 NODE_NAME_CASE(DWORDADDR)case AMDGPUISD::DWORDADDR: return "DWORDADDR";
4357 NODE_NAME_CASE(FRACT)case AMDGPUISD::FRACT: return "FRACT";
4358 NODE_NAME_CASE(SETCC)case AMDGPUISD::SETCC: return "SETCC";
4359 NODE_NAME_CASE(SETREG)case AMDGPUISD::SETREG: return "SETREG";
4360 NODE_NAME_CASE(DENORM_MODE)case AMDGPUISD::DENORM_MODE: return "DENORM_MODE";
4361 NODE_NAME_CASE(FMA_W_CHAIN)case AMDGPUISD::FMA_W_CHAIN: return "FMA_W_CHAIN";
4362 NODE_NAME_CASE(FMUL_W_CHAIN)case AMDGPUISD::FMUL_W_CHAIN: return "FMUL_W_CHAIN";
4363 NODE_NAME_CASE(CLAMP)case AMDGPUISD::CLAMP: return "CLAMP";
4364 NODE_NAME_CASE(COS_HW)case AMDGPUISD::COS_HW: return "COS_HW";
4365 NODE_NAME_CASE(SIN_HW)case AMDGPUISD::SIN_HW: return "SIN_HW";
4366 NODE_NAME_CASE(FMAX_LEGACY)case AMDGPUISD::FMAX_LEGACY: return "FMAX_LEGACY";
4367 NODE_NAME_CASE(FMIN_LEGACY)case AMDGPUISD::FMIN_LEGACY: return "FMIN_LEGACY";
4368 NODE_NAME_CASE(FMAX3)case AMDGPUISD::FMAX3: return "FMAX3";
4369 NODE_NAME_CASE(SMAX3)case AMDGPUISD::SMAX3: return "SMAX3";
4370 NODE_NAME_CASE(UMAX3)case AMDGPUISD::UMAX3: return "UMAX3";
4371 NODE_NAME_CASE(FMIN3)case AMDGPUISD::FMIN3: return "FMIN3";
4372 NODE_NAME_CASE(SMIN3)case AMDGPUISD::SMIN3: return "SMIN3";
4373 NODE_NAME_CASE(UMIN3)case AMDGPUISD::UMIN3: return "UMIN3";
4374 NODE_NAME_CASE(FMED3)case AMDGPUISD::FMED3: return "FMED3";
4375 NODE_NAME_CASE(SMED3)case AMDGPUISD::SMED3: return "SMED3";
4376 NODE_NAME_CASE(UMED3)case AMDGPUISD::UMED3: return "UMED3";
4377 NODE_NAME_CASE(FDOT2)case AMDGPUISD::FDOT2: return "FDOT2";
4378 NODE_NAME_CASE(URECIP)case AMDGPUISD::URECIP: return "URECIP";
4379 NODE_NAME_CASE(DIV_SCALE)case AMDGPUISD::DIV_SCALE: return "DIV_SCALE";
4380 NODE_NAME_CASE(DIV_FMAS)case AMDGPUISD::DIV_FMAS: return "DIV_FMAS";
4381 NODE_NAME_CASE(DIV_FIXUP)case AMDGPUISD::DIV_FIXUP: return "DIV_FIXUP";
4382 NODE_NAME_CASE(FMAD_FTZ)case AMDGPUISD::FMAD_FTZ: return "FMAD_FTZ";
4383 NODE_NAME_CASE(RCP)case AMDGPUISD::RCP: return "RCP";
4384 NODE_NAME_CASE(RSQ)case AMDGPUISD::RSQ: return "RSQ";
4385 NODE_NAME_CASE(RCP_LEGACY)case AMDGPUISD::RCP_LEGACY: return "RCP_LEGACY";
4386 NODE_NAME_CASE(RCP_IFLAG)case AMDGPUISD::RCP_IFLAG: return "RCP_IFLAG";
4387 NODE_NAME_CASE(FMUL_LEGACY)case AMDGPUISD::FMUL_LEGACY: return "FMUL_LEGACY";
4388 NODE_NAME_CASE(RSQ_CLAMP)case AMDGPUISD::RSQ_CLAMP: return "RSQ_CLAMP";
4389 NODE_NAME_CASE(LDEXP)case AMDGPUISD::LDEXP: return "LDEXP";
4390 NODE_NAME_CASE(FP_CLASS)case AMDGPUISD::FP_CLASS: return "FP_CLASS";
4391 NODE_NAME_CASE(DOT4)case AMDGPUISD::DOT4: return "DOT4";
4392 NODE_NAME_CASE(CARRY)case AMDGPUISD::CARRY: return "CARRY";
4393 NODE_NAME_CASE(BORROW)case AMDGPUISD::BORROW: return "BORROW";
4394 NODE_NAME_CASE(BFE_U32)case AMDGPUISD::BFE_U32: return "BFE_U32";
4395 NODE_NAME_CASE(BFE_I32)case AMDGPUISD::BFE_I32: return "BFE_I32";
4396 NODE_NAME_CASE(BFI)case AMDGPUISD::BFI: return "BFI";
4397 NODE_NAME_CASE(BFM)case AMDGPUISD::BFM: return "BFM";
4398 NODE_NAME_CASE(FFBH_U32)case AMDGPUISD::FFBH_U32: return "FFBH_U32";
4399 NODE_NAME_CASE(FFBH_I32)case AMDGPUISD::FFBH_I32: return "FFBH_I32";
4400 NODE_NAME_CASE(FFBL_B32)case AMDGPUISD::FFBL_B32: return "FFBL_B32";
4401 NODE_NAME_CASE(MUL_U24)case AMDGPUISD::MUL_U24: return "MUL_U24";
4402 NODE_NAME_CASE(MUL_I24)case AMDGPUISD::MUL_I24: return "MUL_I24";
4403 NODE_NAME_CASE(MULHI_U24)case AMDGPUISD::MULHI_U24: return "MULHI_U24";
4404 NODE_NAME_CASE(MULHI_I24)case AMDGPUISD::MULHI_I24: return "MULHI_I24";
4405 NODE_NAME_CASE(MAD_U24)case AMDGPUISD::MAD_U24: return "MAD_U24";
4406 NODE_NAME_CASE(MAD_I24)case AMDGPUISD::MAD_I24: return "MAD_I24";
4407 NODE_NAME_CASE(MAD_I64_I32)case AMDGPUISD::MAD_I64_I32: return "MAD_I64_I32";
4408 NODE_NAME_CASE(MAD_U64_U32)case AMDGPUISD::MAD_U64_U32: return "MAD_U64_U32";
4409 NODE_NAME_CASE(PERM)case AMDGPUISD::PERM: return "PERM";
4410 NODE_NAME_CASE(TEXTURE_FETCH)case AMDGPUISD::TEXTURE_FETCH: return "TEXTURE_FETCH";
4411 NODE_NAME_CASE(R600_EXPORT)case AMDGPUISD::R600_EXPORT: return "R600_EXPORT";
4412 NODE_NAME_CASE(CONST_ADDRESS)case AMDGPUISD::CONST_ADDRESS: return "CONST_ADDRESS";
4413 NODE_NAME_CASE(REGISTER_LOAD)case AMDGPUISD::REGISTER_LOAD: return "REGISTER_LOAD";
4414 NODE_NAME_CASE(REGISTER_STORE)case AMDGPUISD::REGISTER_STORE: return "REGISTER_STORE";
4415 NODE_NAME_CASE(SAMPLE)case AMDGPUISD::SAMPLE: return "SAMPLE";
4416 NODE_NAME_CASE(SAMPLEB)case AMDGPUISD::SAMPLEB: return "SAMPLEB";
4417 NODE_NAME_CASE(SAMPLED)case AMDGPUISD::SAMPLED: return "SAMPLED";
4418 NODE_NAME_CASE(SAMPLEL)case AMDGPUISD::SAMPLEL: return "SAMPLEL";
4419 NODE_NAME_CASE(CVT_F32_UBYTE0)case AMDGPUISD::CVT_F32_UBYTE0: return "CVT_F32_UBYTE0";
4420 NODE_NAME_CASE(CVT_F32_UBYTE1)case AMDGPUISD::CVT_F32_UBYTE1: return "CVT_F32_UBYTE1";
4421 NODE_NAME_CASE(CVT_F32_UBYTE2)case AMDGPUISD::CVT_F32_UBYTE2: return "CVT_F32_UBYTE2";
4422 NODE_NAME_CASE(CVT_F32_UBYTE3)case AMDGPUISD::CVT_F32_UBYTE3: return "CVT_F32_UBYTE3";
4423 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)case AMDGPUISD::CVT_PKRTZ_F16_F32: return "CVT_PKRTZ_F16_F32"
;
4424 NODE_NAME_CASE(CVT_PKNORM_I16_F32)case AMDGPUISD::CVT_PKNORM_I16_F32: return "CVT_PKNORM_I16_F32"
;
4425 NODE_NAME_CASE(CVT_PKNORM_U16_F32)case AMDGPUISD::CVT_PKNORM_U16_F32: return "CVT_PKNORM_U16_F32"
;
4426 NODE_NAME_CASE(CVT_PK_I16_I32)case AMDGPUISD::CVT_PK_I16_I32: return "CVT_PK_I16_I32";
4427 NODE_NAME_CASE(CVT_PK_U16_U32)case AMDGPUISD::CVT_PK_U16_U32: return "CVT_PK_U16_U32";
4428 NODE_NAME_CASE(FP_TO_FP16)case AMDGPUISD::FP_TO_FP16: return "FP_TO_FP16";
4429 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)case AMDGPUISD::BUILD_VERTICAL_VECTOR: return "BUILD_VERTICAL_VECTOR"
;
4430 NODE_NAME_CASE(CONST_DATA_PTR)case AMDGPUISD::CONST_DATA_PTR: return "CONST_DATA_PTR";
4431 NODE_NAME_CASE(PC_ADD_REL_OFFSET)case AMDGPUISD::PC_ADD_REL_OFFSET: return "PC_ADD_REL_OFFSET"
;
4432 NODE_NAME_CASE(LDS)case AMDGPUISD::LDS: return "LDS";
4433 NODE_NAME_CASE(DUMMY_CHAIN)case AMDGPUISD::DUMMY_CHAIN: return "DUMMY_CHAIN";
4434 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4435 NODE_NAME_CASE(LOAD_D16_HI)case AMDGPUISD::LOAD_D16_HI: return "LOAD_D16_HI";
4436 NODE_NAME_CASE(LOAD_D16_LO)case AMDGPUISD::LOAD_D16_LO: return "LOAD_D16_LO";
4437 NODE_NAME_CASE(LOAD_D16_HI_I8)case AMDGPUISD::LOAD_D16_HI_I8: return "LOAD_D16_HI_I8";
4438 NODE_NAME_CASE(LOAD_D16_HI_U8)case AMDGPUISD::LOAD_D16_HI_U8: return "LOAD_D16_HI_U8";
4439 NODE_NAME_CASE(LOAD_D16_LO_I8)case AMDGPUISD::LOAD_D16_LO_I8: return "LOAD_D16_LO_I8";
4440 NODE_NAME_CASE(LOAD_D16_LO_U8)case AMDGPUISD::LOAD_D16_LO_U8: return "LOAD_D16_LO_U8";
4441 NODE_NAME_CASE(STORE_MSKOR)case AMDGPUISD::STORE_MSKOR: return "STORE_MSKOR";
4442 NODE_NAME_CASE(LOAD_CONSTANT)case AMDGPUISD::LOAD_CONSTANT: return "LOAD_CONSTANT";
4443 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)case AMDGPUISD::TBUFFER_STORE_FORMAT: return "TBUFFER_STORE_FORMAT"
;
4444 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)case AMDGPUISD::TBUFFER_STORE_FORMAT_D16: return "TBUFFER_STORE_FORMAT_D16"
;
4445 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)case AMDGPUISD::TBUFFER_LOAD_FORMAT: return "TBUFFER_LOAD_FORMAT"
;
4446 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)case AMDGPUISD::TBUFFER_LOAD_FORMAT_D16: return "TBUFFER_LOAD_FORMAT_D16"
;
4447 NODE_NAME_CASE(DS_ORDERED_COUNT)case AMDGPUISD::DS_ORDERED_COUNT: return "DS_ORDERED_COUNT";
4448 NODE_NAME_CASE(ATOMIC_CMP_SWAP)case AMDGPUISD::ATOMIC_CMP_SWAP: return "ATOMIC_CMP_SWAP";
4449 NODE_NAME_CASE(ATOMIC_INC)case AMDGPUISD::ATOMIC_INC: return "ATOMIC_INC";
4450 NODE_NAME_CASE(ATOMIC_DEC)case AMDGPUISD::ATOMIC_DEC: return "ATOMIC_DEC";
4451 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)case AMDGPUISD::ATOMIC_LOAD_FMIN: return "ATOMIC_LOAD_FMIN";
4452 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)case AMDGPUISD::ATOMIC_LOAD_FMAX: return "ATOMIC_LOAD_FMAX";
4453 NODE_NAME_CASE(BUFFER_LOAD)case AMDGPUISD::BUFFER_LOAD: return "BUFFER_LOAD";
4454 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)case AMDGPUISD::BUFFER_LOAD_UBYTE: return "BUFFER_LOAD_UBYTE"
;
4455 NODE_NAME_CASE(BUFFER_LOAD_USHORT)case AMDGPUISD::BUFFER_LOAD_USHORT: return "BUFFER_LOAD_USHORT"
;
4456 NODE_NAME_CASE(BUFFER_LOAD_BYTE)case AMDGPUISD::BUFFER_LOAD_BYTE: return "BUFFER_LOAD_BYTE";
4457 NODE_NAME_CASE(BUFFER_LOAD_SHORT)case AMDGPUISD::BUFFER_LOAD_SHORT: return "BUFFER_LOAD_SHORT"
;
4458 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)case AMDGPUISD::BUFFER_LOAD_FORMAT: return "BUFFER_LOAD_FORMAT"
;
4459 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)case AMDGPUISD::BUFFER_LOAD_FORMAT_D16: return "BUFFER_LOAD_FORMAT_D16"
;
4460 NODE_NAME_CASE(SBUFFER_LOAD)case AMDGPUISD::SBUFFER_LOAD: return "SBUFFER_LOAD";
4461 NODE_NAME_CASE(BUFFER_STORE)case AMDGPUISD::BUFFER_STORE: return "BUFFER_STORE";
4462 NODE_NAME_CASE(BUFFER_STORE_BYTE)case AMDGPUISD::BUFFER_STORE_BYTE: return "BUFFER_STORE_BYTE"
;
4463 NODE_NAME_CASE(BUFFER_STORE_SHORT)case AMDGPUISD::BUFFER_STORE_SHORT: return "BUFFER_STORE_SHORT"
;
4464 NODE_NAME_CASE(BUFFER_STORE_FORMAT)case AMDGPUISD::BUFFER_STORE_FORMAT: return "BUFFER_STORE_FORMAT"
;
4465 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)case AMDGPUISD::BUFFER_STORE_FORMAT_D16: return "BUFFER_STORE_FORMAT_D16"
;
4466 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)case AMDGPUISD::BUFFER_ATOMIC_SWAP: return "BUFFER_ATOMIC_SWAP"
;
4467 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)case AMDGPUISD::BUFFER_ATOMIC_ADD: return "BUFFER_ATOMIC_ADD"
;
4468 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)case AMDGPUISD::BUFFER_ATOMIC_SUB: return "BUFFER_ATOMIC_SUB"
;
4469 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)case AMDGPUISD::BUFFER_ATOMIC_SMIN: return "BUFFER_ATOMIC_SMIN"
;
4470 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)case AMDGPUISD::BUFFER_ATOMIC_UMIN: return "BUFFER_ATOMIC_UMIN"
;
4471 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)case AMDGPUISD::BUFFER_ATOMIC_SMAX: return "BUFFER_ATOMIC_SMAX"
;
4472 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)case AMDGPUISD::BUFFER_ATOMIC_UMAX: return "BUFFER_ATOMIC_UMAX"
;
4473 NODE_NAME_CASE(BUFFER_ATOMIC_AND)case AMDGPUISD::BUFFER_ATOMIC_AND: return "BUFFER_ATOMIC_AND"
;
4474 NODE_NAME_CASE(BUFFER_ATOMIC_OR)case AMDGPUISD::BUFFER_ATOMIC_OR: return "BUFFER_ATOMIC_OR";
4475 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)case AMDGPUISD::BUFFER_ATOMIC_XOR: return "BUFFER_ATOMIC_XOR"
;
4476 NODE_NAME_CASE(BUFFER_ATOMIC_INC)case AMDGPUISD::BUFFER_ATOMIC_INC: return "BUFFER_ATOMIC_INC"
;
4477 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)case AMDGPUISD::BUFFER_ATOMIC_DEC: return "BUFFER_ATOMIC_DEC"
;
4478 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: return "BUFFER_ATOMIC_CMPSWAP"
;
4479 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)case AMDGPUISD::BUFFER_ATOMIC_CSUB: return "BUFFER_ATOMIC_CSUB"
;
4480 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)case AMDGPUISD::BUFFER_ATOMIC_FADD: return "BUFFER_ATOMIC_FADD"
;
4481 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)case AMDGPUISD::BUFFER_ATOMIC_FMIN: return "BUFFER_ATOMIC_FMIN"
;
4482 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)case AMDGPUISD::BUFFER_ATOMIC_FMAX: return "BUFFER_ATOMIC_FMAX"
;
4483
4484 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4485 }
4486 return nullptr;
4487}
4488
4489SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4490 SelectionDAG &DAG, int Enabled,
4491 int &RefinementSteps,
4492 bool &UseOneConstNR,
4493 bool Reciprocal) const {
4494 EVT VT = Operand.getValueType();
4495
4496 if (VT == MVT::f32) {
4497 RefinementSteps = 0;
4498 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4499 }
4500
4501 // TODO: There is also f64 rsq instruction, but the documentation is less
4502 // clear on its precision.
4503
4504 return SDValue();
4505}
4506
4507SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4508 SelectionDAG &DAG, int Enabled,
4509 int &RefinementSteps) const {
4510 EVT VT = Operand.getValueType();
4511
4512 if (VT == MVT::f32) {
4513 // Reciprocal, < 1 ulp error.
4514 //
4515 // This reciprocal approximation converges to < 0.5 ulp error with one
4516 // newton rhapson performed with two fused multiple adds (FMAs).
4517
4518 RefinementSteps = 0;
4519 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4520 }
4521
4522 // TODO: There is also f64 rcp instruction, but the documentation is less
4523 // clear on its precision.
4524
4525 return SDValue();
4526}
4527
4528void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4529 const SDValue Op, KnownBits &Known,
4530 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4531
4532 Known.resetAll(); // Don't know anything.
4533
4534 unsigned Opc = Op.getOpcode();
4535
4536 switch (Opc) {
4537 default:
4538 break;
4539 case AMDGPUISD::CARRY:
4540 case AMDGPUISD::BORROW: {
4541 Known.Zero = APInt::getHighBitsSet(32, 31);
4542 break;
4543 }
4544
4545 case AMDGPUISD::BFE_I32:
4546 case AMDGPUISD::BFE_U32: {
4547 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4548 if (!CWidth)
4549 return;
4550
4551 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4552
4553 if (Opc == AMDGPUISD::BFE_U32)
4554 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4555
4556 break;
4557 }
4558 case AMDGPUISD::FP_TO_FP16: {
4559 unsigned BitWidth = Known.getBitWidth();
4560
4561 // High bits are zero.
4562 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4563 break;
4564 }
4565 case AMDGPUISD::MUL_U24:
4566 case AMDGPUISD::MUL_I24: {
4567 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4568 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4569 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4570 RHSKnown.countMinTrailingZeros();
4571 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4572 // Skip extra check if all bits are known zeros.
4573 if (TrailZ >= 32)
4574 break;
4575
4576 // Truncate to 24 bits.
4577 LHSKnown = LHSKnown.trunc(24);
4578 RHSKnown = RHSKnown.trunc(24);
4579
4580 if (Opc == AMDGPUISD::MUL_I24) {
4581 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
4582 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
4583 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4584 if (MaxValBits >= 32)
4585 break;
4586 bool LHSNegative = LHSKnown.isNegative();
4587 bool LHSNonNegative = LHSKnown.isNonNegative();
4588 bool LHSPositive = LHSKnown.isStrictlyPositive();
4589 bool RHSNegative = RHSKnown.isNegative();
4590 bool RHSNonNegative = RHSKnown.isNonNegative();
4591 bool RHSPositive = RHSKnown.isStrictlyPositive();
4592
4593 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4594 Known.Zero.setHighBits(32 - MaxValBits);
4595 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4596 Known.One.setHighBits(32 - MaxValBits);
4597 } else {
4598 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
4599 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
4600 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4601 if (MaxValBits >= 32)
4602 break;
4603 Known.Zero.setHighBits(32 - MaxValBits);
4604 }
4605 break;
4606 }
4607 case AMDGPUISD::PERM: {
4608 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4609 if (!CMask)
4610 return;
4611
4612 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4613 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4614 unsigned Sel = CMask->getZExtValue();
4615
4616 for (unsigned I = 0; I < 32; I += 8) {
4617 unsigned SelBits = Sel & 0xff;
4618 if (SelBits < 4) {
4619 SelBits *= 8;
4620 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4621 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4622 } else if (SelBits < 7) {
4623 SelBits = (SelBits & 3) * 8;
4624 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4625 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4626 } else if (SelBits == 0x0c) {
4627 Known.Zero |= 0xFFull << I;
4628 } else if (SelBits > 0x0c) {
4629 Known.One |= 0xFFull << I;
4630 }
4631 Sel >>= 8;
4632 }
4633 break;
4634 }
4635 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4636 Known.Zero.setHighBits(24);
4637 break;
4638 }
4639 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4640 Known.Zero.setHighBits(16);
4641 break;
4642 }
4643 case AMDGPUISD::LDS: {
4644 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4645 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4646
4647 Known.Zero.setHighBits(16);
4648 Known.Zero.setLowBits(Log2(Alignment));
4649 break;
4650 }
4651 case ISD::INTRINSIC_WO_CHAIN: {
4652 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4653 switch (IID) {
4654 case Intrinsic::amdgcn_mbcnt_lo:
4655 case Intrinsic::amdgcn_mbcnt_hi: {
4656 const GCNSubtarget &ST =
4657 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4658 // These return at most the wavefront size - 1.
4659 unsigned Size = Op.getValueType().getSizeInBits();
4660 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4661 break;
4662 }
4663 default:
4664 break;
4665 }
4666 }
4667 }
4668}
4669
4670unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4671 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4672 unsigned Depth) const {
4673 switch (Op.getOpcode()) {
4674 case AMDGPUISD::BFE_I32: {
4675 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4676 if (!Width)
4677 return 1;
4678
4679 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4680 if (!isNullConstant(Op.getOperand(1)))
4681 return SignBits;
4682
4683 // TODO: Could probably figure something out with non-0 offsets.
4684 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4685 return std::max(SignBits, Op0SignBits);
4686 }
4687
4688 case AMDGPUISD::BFE_U32: {
4689 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4690 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4691 }
4692
4693 case AMDGPUISD::CARRY:
4694 case AMDGPUISD::BORROW:
4695 return 31;
4696 case AMDGPUISD::BUFFER_LOAD_BYTE:
4697 return 25;
4698 case AMDGPUISD::BUFFER_LOAD_SHORT:
4699 return 17;
4700 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4701 return 24;
4702 case AMDGPUISD::BUFFER_LOAD_USHORT:
4703 return 16;
4704 case AMDGPUISD::FP_TO_FP16:
4705 return 16;
4706 default:
4707 return 1;
4708 }
4709}
4710
4711unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4712 GISelKnownBits &Analysis, Register R,
4713 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4714 unsigned Depth) const {
4715 const MachineInstr *MI = MRI.getVRegDef(R);
4716 if (!MI)
4717 return 1;
4718
4719 // TODO: Check range metadata on MMO.
4720 switch (MI->getOpcode()) {
4721 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4722 return 25;
4723 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4724 return 17;
4725 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4726 return 24;
4727 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4728 return 16;
4729 default:
4730 return 1;
4731 }
4732}
4733
4734bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4735 const SelectionDAG &DAG,
4736 bool SNaN,
4737 unsigned Depth) const {
4738 unsigned Opcode = Op.getOpcode();
4739 switch (Opcode) {
4740 case AMDGPUISD::FMIN_LEGACY:
4741 case AMDGPUISD::FMAX_LEGACY: {
4742 if (SNaN)
4743 return true;
4744
4745 // TODO: Can check no nans on one of the operands for each one, but which
4746 // one?
4747 return false;
4748 }
4749 case AMDGPUISD::FMUL_LEGACY:
4750 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4751 if (SNaN)
4752 return true;
4753 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4754 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4755 }
4756 case AMDGPUISD::FMED3:
4757 case AMDGPUISD::FMIN3:
4758 case AMDGPUISD::FMAX3:
4759 case AMDGPUISD::FMAD_FTZ: {
4760 if (SNaN)
4761 return true;
4762 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4763 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4764 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4765 }
4766 case AMDGPUISD::CVT_F32_UBYTE0:
4767 case AMDGPUISD::CVT_F32_UBYTE1:
4768 case AMDGPUISD::CVT_F32_UBYTE2:
4769 case AMDGPUISD::CVT_F32_UBYTE3:
4770 return true;
4771
4772 case AMDGPUISD::RCP:
4773 case AMDGPUISD::RSQ:
4774 case AMDGPUISD::RCP_LEGACY:
4775 case AMDGPUISD::RSQ_CLAMP: {
4776 if (SNaN)
4777 return true;
4778
4779 // TODO: Need is known positive check.
4780 return false;
4781 }
4782 case AMDGPUISD::LDEXP:
4783 case AMDGPUISD::FRACT: {
4784 if (SNaN)
4785 return true;
4786 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4787 }
4788 case AMDGPUISD::DIV_SCALE:
4789 case AMDGPUISD::DIV_FMAS:
4790 case AMDGPUISD::DIV_FIXUP:
4791 // TODO: Refine on operands.
4792 return SNaN;
4793 case AMDGPUISD::SIN_HW:
4794 case AMDGPUISD::COS_HW: {
4795 // TODO: Need check for infinity
4796 return SNaN;
4797 }
4798 case ISD::INTRINSIC_WO_CHAIN: {
4799 unsigned IntrinsicID
4800 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4801 // TODO: Handle more intrinsics
4802 switch (IntrinsicID) {
4803 case Intrinsic::amdgcn_cubeid:
4804 return true;
4805
4806 case Intrinsic::amdgcn_frexp_mant: {
4807 if (SNaN)
4808 return true;
4809 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4810 }
4811 case Intrinsic::amdgcn_cvt_pkrtz: {
4812 if (SNaN)
4813 return true;
4814 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4815 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4816 }
4817 case Intrinsic::amdgcn_rcp:
4818 case Intrinsic::amdgcn_rsq:
4819 case Intrinsic::amdgcn_rcp_legacy:
4820 case Intrinsic::amdgcn_rsq_legacy:
4821 case Intrinsic::amdgcn_rsq_clamp: {
4822 if (SNaN)
4823 return true;
4824
4825 // TODO: Need is known positive check.
4826 return false;
4827 }
4828 case Intrinsic::amdgcn_trig_preop:
4829 case Intrinsic::amdgcn_fdot2:
4830 // TODO: Refine on operand
4831 return SNaN;
4832 case Intrinsic::amdgcn_fma_legacy:
4833 if (SNaN)
4834 return true;
4835 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4836 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4837 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4838 default:
4839 return false;
4840 }
4841 }
4842 default:
4843 return false;
4844 }
4845}
4846
4847TargetLowering::AtomicExpansionKind
4848AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4849 switch (RMW->getOperation()) {
4850 case AtomicRMWInst::Nand:
4851 case AtomicRMWInst::FAdd:
4852 case AtomicRMWInst::FSub:
4853 return AtomicExpansionKind::CmpXChg;
4854 default:
4855 return AtomicExpansionKind::None;
4856 }
4857}
4858
4859bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal(
4860 unsigned Opc, LLT Ty1, LLT Ty2) const {
4861 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
4862}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack)(static_cast<void> (0));
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack)(static_cast<void> (0));
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
3
Assuming the condition is true
4
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
147 // into v0, 10 bits per dimension if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145769P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306dc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__4) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB
8.1
'ZB' is not equal to ZB_Undefined
8.1
'ZB' is not equal to ZB_Undefined
8.1
'ZB' is not equal to ZB_Undefined
!= ZB_Undefined && Val == 0)
9
Assuming 'Val' is equal to 0
10
Taking true branch
117 return 32;
11
Returning the value 32
118
119#if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
8
Calling 'TrailingZerosCounter::count'
12
Returning from 'TrailingZerosCounter::count'
13
Returning the value 32
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__4) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz)1 || defined(__GNUC__4)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT8 * sizeof(T);
251 assert(N <= Bits && "Invalid bit index")(static_cast<void> (0));
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)1
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)1
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)1
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)1
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)1L<<(N-1)) <= x && x < (INT64_C(1)1L<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1)1UL << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX(18446744073709551615UL) >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
441
442 return UINT64_C(1)1UL + ~(UINT64_C(1)1UL << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1)1UL << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__4)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__4)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
/// Rounds \p Value up (mod 2**64) to the nearest integer that is a multiple
/// of \p Align. \p Align must be non-zero.
///
/// When a non-zero \p Skew is given, the result is instead the smallest
/// integer that is greater than or equal to \p Value and has the form
/// \p Align * N + \p Skew for some integer N. A \p Skew larger than \p Align
/// is first reduced to '\p Skew mod \p Align'.
///
/// Examples:
/// \code
///   alignTo(5, 8) = 8
///   alignTo(17, 8) = 24
///   alignTo(~0LL, 8) = 0
///   alignTo(321, 255) = 510
///
///   alignTo(5, 8, 7) = 7
///   alignTo(17, 8, 1) = 17
///   alignTo(~0LL, 8, 3) = 3
///   alignTo(321, 255, 42) = 552
/// \endcode
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  const uint64_t Offset = Skew % Align;
  // Number of whole Align-sized steps needed to reach Value once the skew has
  // been removed; all arithmetic intentionally wraps modulo 2**64.
  const uint64_t Steps = (Value + Align - 1 - Offset) / Align;
  return Steps * Align + Offset;
}
733
/// Rounds \p Value up (mod 2**64) to the nearest multiple of the compile-time
/// constant \c Align. \c Align must be non-zero, which is enforced at compile
/// time.
template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
  static_assert(Align != 0u, "Align must be non-zero");
  return ((Value + (Align - 1)) / Align) * Align;
}
740
/// Returns the integer ceil(Numerator / Denominator).
/// \p Denominator must be non-zero.
///
/// The result is exact for every 64-bit \p Numerator: it is derived from the
/// quotient and remainder instead of first rounding the numerator up to a
/// multiple of the denominator, which would wrap modulo 2**64 for numerators
/// close to the maximum value.
inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  assert(Denominator != 0u && "Denominator can't be 0.");
  const uint64_t Quotient = Numerator / Denominator;
  const uint64_t Remainder = Numerator % Denominator;
  // Any non-zero remainder means the exact quotient has a fractional part, so
  // round up by one.
  return Quotient + (Remainder != 0 ? 1 : 0);
}
745
/// Returns the integer nearest(Numerator / Denominator), with exact halves
/// rounded up. \p Denominator must be non-zero.
///
/// The result is computed from the quotient and remainder so that it stays
/// correct for every 64-bit \p Numerator; adding half the denominator to the
/// numerator first would wrap modulo 2**64 for large inputs.
inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
  assert(Denominator != 0u && "Denominator can't be 0.");
  const uint64_t Quotient = Numerator / Denominator;
  const uint64_t Remainder = Numerator % Denominator;
  // Round up when Remainder >= ceil(Denominator / 2), which for integers is
  // the same as Remainder > (Denominator - 1) / 2.
  return Quotient + (Remainder > (Denominator - 1) / 2 ? 1 : 0);
}
750
/// Returns the largest uint64_t that is less than or equal to \p Value and
/// congruent to \p Skew modulo \p Align. \p Align must be non-zero.
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  const uint64_t Offset = Skew % Align;
  // Remove the skew, drop whatever exceeds a whole multiple of Align, then
  // put the skew back.
  const uint64_t Unskewed = Value - Offset;
  return Unskewed - (Unskewed % Align) + Offset;
}
758
/// Treats the low \c B bits of \p X as a signed B-bit number and sign-extends
/// it to a 32-bit integer. Requires 0 < B <= 32 (checked at compile time).
template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 32, "Bit width out of range.");
  // Move the B-bit field to the top of the word, then shift it back down as
  // a signed value.
  constexpr unsigned Shift = 32 - B;
  return int32_t(X << Shift) >> Shift;
}
766
/// Treats the low \p B bits of \p X as a signed B-bit number and sign-extends
/// it to a 32-bit integer. Requires 0 < B <= 32 (checked by assertion only).
inline int32_t SignExtend32(uint32_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 32 && "Bit width out of range.");
  // Move the B-bit field to the top of the word, then shift it back down as
  // a signed value.
  const unsigned Shift = 32 - B;
  return int32_t(X << Shift) >> Shift;
}
774
/// Treats the low \c B bits of \p x as a signed B-bit number and sign-extends
/// it to a 64-bit integer. Requires 0 < B <= 64 (checked at compile time).
template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 64, "Bit width out of range.");
  // Move the B-bit field to the top of the word, then shift it back down as
  // a signed value.
  constexpr unsigned Shift = 64 - B;
  return int64_t(x << Shift) >> Shift;
}
782
/// Treats the low \p B bits of \p X as a signed B-bit number and sign-extends
/// it to a 64-bit integer. Requires 0 < B <= 64 (checked by assertion only).
inline int64_t SignExtend64(uint64_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 64 && "Bit width out of range.");
  // Move the B-bit field to the top of the word, then shift it back down as
  // a signed value.
  const unsigned Shift = 64 - B;
  return int64_t(X << Shift) >> Shift;
}
790
/// Returns |X - Y| for two unsigned integers \p X and \p Y of type T. The
/// smaller operand is always subtracted from the larger one.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
  if (X > Y)
    return X - Y;
  return Y - X;
}
797
/// Adds two unsigned integers \p X and \p Y of type T, clamping the result to
/// the maximum representable value of T when the true sum does not fit.
/// If \p ResultOverflowed is non-null, it is set to whether the true sum is
/// larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
  // Hacker's Delight, p. 29: a wrapped unsigned sum is smaller than either
  // operand.
  const T Sum = X + Y;
  const bool Wrapped = (Sum < X || Sum < Y);
  if (ResultOverflowed)
    *ResultOverflowed = Wrapped;
  return Wrapped ? std::numeric_limits<T>::max() : Sum;
}
814
/// Multiplies two unsigned integers \p X and \p Y of type T, clamping the
/// result to the maximum representable value of T when the true product does
/// not fit. If \p ResultOverflowed is non-null, it is set to whether the true
/// product is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Scratch;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Scratch;
  Overflowed = false;

  // Hacker's Delight, p. 30 has a different algorithm, but it is not used
  // here because it fails for uint16_t (where multiplication can have
  // undefined behavior due to promotion to int), and requires a division in
  // addition to the multiplication.

  // Log2(Z) would be either Log2Z or Log2Z + 1.
  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
  // will necessarily be less than Log2Max as desired.
  const T Max = std::numeric_limits<T>::max();
  const int Log2Max = Log2_64(Max);
  const int Log2Z = Log2_64(X) + Log2_64(Y);

  // The product certainly does not fit.
  if (Log2Z > Log2Max) {
    Overflowed = true;
    return Max;
  }
  // The product certainly fits.
  if (Log2Z < Log2Max)
    return X * Y;

  // Borderline case: the product uses the top bit and may spill one bit past
  // it. Multiply all but the bottom bit of X, then add that part back at the
  // end.
  T Z = (X >> 1) * Y;
  if (Z & ~(Max >> 1)) {
    // Doubling Z below would lose its top bit.
    Overflowed = true;
    return Max;
  }
  Z <<= 1;
  return (X & 1) ? SaturatingAdd(Z, Y, ResultOverflowed) : Z;
}
859
/// Computes \p X * \p Y + \p A on unsigned integers of type T, clamping the
/// result to the maximum representable value of T on overflow. If
/// \p ResultOverflowed is non-null, it is set to whether the true result is
/// larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool Scratch;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Scratch;

  const T Product = SaturatingMultiply(X, Y, &Overflowed);
  // A saturated product is already the final answer; otherwise the addition
  // decides the overflow flag.
  return Overflowed ? Product : SaturatingAdd(A, Product, &Overflowed);
}
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
/// Adds two signed integers \p X and \p Y, storing the two's complement
/// truncated sum in \p Result. The return value is non-zero (true) if the
/// addition overflowed.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Do the addition in the unsigned domain, where wrap-around is well
  // defined, then convert the wrapped value back to the signed type.
  using U = std::make_unsigned_t<T>;
  const U Wrapped = static_cast<U>(X) + static_cast<U>(Y);
  Result = static_cast<T>(Wrapped);

  const bool BothPositive = X > 0 && Y > 0;
  const bool BothNegative = X < 0 && Y < 0;
  // Two positive operands must give a positive sum.
  if (BothPositive)
    return Result <= 0;
  // Two negative operands must give a negative sum.
  if (BothNegative)
    return Result >= 0;
  // Operands of mixed sign (or a zero operand) never overflow.
  return false;
#endif
}
906
/// Subtracts the signed integer \p Y from \p X, storing the two's complement
/// truncated difference in \p Result. The return value is non-zero (true) if
/// the subtraction overflowed.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Do the subtraction in the unsigned domain, where wrap-around is well
  // defined, then convert the wrapped value back to the signed type.
  using U = std::make_unsigned_t<T>;
  const U Wrapped = static_cast<U>(X) - static_cast<U>(Y);
  Result = static_cast<T>(Wrapped);

  // Subtracting a positive number from a non-positive one must give a
  // negative result.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Subtracting a negative number from a non-negative one must give a
  // positive result.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  // All remaining sign combinations cannot overflow.
  return false;
#endif
}
932
/// Multiplies two signed integers \p X and \p Y, storing the two's complement
/// truncated product in \p Result. The return value is non-zero (true) if the
/// multiplication overflowed.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
  using U = std::make_unsigned_t<T>;
  const bool XIsNegative = X < 0;
  const bool YIsNegative = Y < 0;

  // Multiply the magnitudes in the unsigned domain, where wrap-around is
  // well defined.
  const U AbsX = XIsNegative ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U AbsY = YIsNegative ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U AbsProduct = AbsX * AbsY;

  // Re-apply the sign: the product is negative iff exactly one operand is.
  const bool IsNegative = XIsNegative != YIsNegative;
  Result = IsNegative ? (0 - AbsProduct) : AbsProduct;

  // A zero operand gives a zero product, which never overflows (and must not
  // reach the division below).
  if (AbsX == 0 || AbsY == 0)
    return false;

  // Both magnitudes are now in [1, max(T) + 1]. The largest magnitude the
  // product may have is max(T) + 1 when negative and max(T) otherwise; the
  // multiplication overflows iff AbsX exceeds that limit divided by AbsY.
  const U Limit = IsNegative
                      ? static_cast<U>(std::numeric_limits<T>::max()) + U(1)
                      : static_cast<U>(std::numeric_limits<T>::max());
  return AbsX > Limit / AbsY;
}
959
960} // End llvm namespace
961
962#endif