Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4170, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
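In C++, shifting an unsigned int by an amount greater than or equal to its bit width (32 here) is undefined behavior, which is what this checker reports. The snippet below is a minimal, hypothetical sketch of this class of bug, not the code at line 4170 (that line lies outside the excerpt shown below); the helper name, parameters, and guard are assumptions made for illustration only.

#include <cstdint>

// Hypothetical helper: return the top 'Count' bits of 'x'.
// If 'Count' can be 0, the shift amount '32 - Count' reaches 32, and shifting
// a 32-bit unsigned int by 32 is undefined behavior -- the situation this
// checker reports.
uint32_t topBits(uint32_t x, unsigned Count) {
  if (Count == 0) // guard keeps the shift amount within [0, 31]
    return 0;
  return x >> (32u - Count);
}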

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/include -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2019-12-09-002921-48462-1 -x c++ /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUCallLowering.h"
18#include "AMDGPUFrameLowering.h"
19#include "AMDGPURegisterInfo.h"
20#include "AMDGPUSubtarget.h"
21#include "AMDGPUTargetMachine.h"
22#include "Utils/AMDGPUBaseInfo.h"
23#include "R600MachineFunctionInfo.h"
24#include "SIInstrInfo.h"
25#include "SIMachineFunctionInfo.h"
26#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
27#include "llvm/CodeGen/Analysis.h"
28#include "llvm/CodeGen/CallingConvLower.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineRegisterInfo.h"
31#include "llvm/CodeGen/SelectionDAG.h"
32#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
33#include "llvm/IR/DataLayout.h"
34#include "llvm/IR/DiagnosticInfo.h"
35#include "llvm/Support/KnownBits.h"
36#include "llvm/Support/MathExtras.h"
37using namespace llvm;
38
39#include "AMDGPUGenCallingConv.inc"
40
41// Find a larger type to do a load / store of a vector with.
42EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
43 unsigned StoreSize = VT.getStoreSizeInBits();
44 if (StoreSize <= 32)
45 return EVT::getIntegerVT(Ctx, StoreSize);
46
47 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
48 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
49}
50
51unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 EVT VT = Op.getValueType();
53 KnownBits Known = DAG.computeKnownBits(Op);
54 return VT.getSizeInBits() - Known.countMinLeadingZeros();
55}
56
57unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
58 EVT VT = Op.getValueType();
59
60 // In order for this to be a signed 24-bit value, bit 23 must
61 // be a sign bit.
62 return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
63}
64
65AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
66 const AMDGPUSubtarget &STI)
67 : TargetLowering(TM), Subtarget(&STI) {
68 // Lower floating point store/load to integer store/load to reduce the number
69 // of patterns in tablegen.
70 setOperationAction(ISD::LOAD, MVT::f32, Promote);
71 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
72
73 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
74 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
75
76 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
77 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
78
79 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
80 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
81
82 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
83 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
84
85 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
86 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
87
88 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
89 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
90
91 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
92 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
93
94 setOperationAction(ISD::LOAD, MVT::i64, Promote);
95 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
96
97 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
98 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
99
100 setOperationAction(ISD::LOAD, MVT::f64, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
102
103 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
105
106 // There are no 64-bit extloads. These should be done as a 32-bit extload and
107 // an extension to 64-bit.
108 for (MVT VT : MVT::integer_valuetypes()) {
109 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
110 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
111 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
112 }
113
114 for (MVT VT : MVT::integer_valuetypes()) {
115 if (VT == MVT::i64)
116 continue;
117
118 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
119 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
120 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
121 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
122
123 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
124 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
125 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
126 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
127
128 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
129 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
130 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
131 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
132 }
133
134 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
135 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
137 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
138 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
139 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
141 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
143 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
145 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
146 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
148 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
149 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
150 }
151
152 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
153 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
154 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
155 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
156 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
157 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
158 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
159
160 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
161 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
162 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
163 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
164
165 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
166 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
167 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
168 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
169
170 setOperationAction(ISD::STORE, MVT::f32, Promote);
171 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
172
173 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
174 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
175
176 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
177 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
178
179 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
180 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
181
182 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
183 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
184
185 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
186 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
187
188 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
189 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
190
191 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
192 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
193
194 setOperationAction(ISD::STORE, MVT::i64, Promote);
195 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
196
197 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
198 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
199
200 setOperationAction(ISD::STORE, MVT::f64, Promote);
201 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
202
203 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
204 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
205
206 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
207 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
208 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210
211 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
212 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
213 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
214 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
215
216 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
217 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
218 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
219 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
220 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
221 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
222 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
223
224 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
225 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
226
227 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
228 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
229
230 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
231 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
232
233 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
234 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
235
236
237 setOperationAction(ISD::Constant, MVT::i32, Legal);
238 setOperationAction(ISD::Constant, MVT::i64, Legal);
239 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
240 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
241
242 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
243 setOperationAction(ISD::BRIND, MVT::Other, Expand);
244
245 // This is totally unsupported, just custom lower to produce an error.
246 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
247
248 // Library functions. These default to Expand, but we have instructions
249 // for them.
250 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
251 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
252 setOperationAction(ISD::FPOW, MVT::f32, Legal);
253 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
254 setOperationAction(ISD::FABS, MVT::f32, Legal);
255 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
256 setOperationAction(ISD::FRINT, MVT::f32, Legal);
257 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
258 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
259 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
260
261 setOperationAction(ISD::FROUND, MVT::f32, Custom);
262 setOperationAction(ISD::FROUND, MVT::f64, Custom);
263
264 setOperationAction(ISD::FLOG, MVT::f32, Custom);
265 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
266 setOperationAction(ISD::FEXP, MVT::f32, Custom);
267
268
269 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
270 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
271
272 setOperationAction(ISD::FREM, MVT::f32, Custom);
273 setOperationAction(ISD::FREM, MVT::f64, Custom);
274
275 // Expand to fneg + fadd.
276 setOperationAction(ISD::FSUB, MVT::f64, Expand);
277
278 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
279 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
280 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
281 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
282 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
283 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
284 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
285 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
286 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
287 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
288 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
289 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
290 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
291 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
292 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
293 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
294 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
295 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
296 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
297 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
298 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
299 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
300
301 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
302 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
303 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
304
305 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
306 for (MVT VT : ScalarIntVTs) {
307 // These should use [SU]DIVREM, so set them to expand
308 setOperationAction(ISD::SDIV, VT, Expand);
309 setOperationAction(ISD::UDIV, VT, Expand);
310 setOperationAction(ISD::SREM, VT, Expand);
311 setOperationAction(ISD::UREM, VT, Expand);
312
313 // GPU does not have divrem function for signed or unsigned.
314 setOperationAction(ISD::SDIVREM, VT, Custom);
315 setOperationAction(ISD::UDIVREM, VT, Custom);
316
317 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
318 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
319 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
320
321 setOperationAction(ISD::BSWAP, VT, Expand);
322 setOperationAction(ISD::CTTZ, VT, Expand);
323 setOperationAction(ISD::CTLZ, VT, Expand);
324
325 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
326 setOperationAction(ISD::ADDC, VT, Legal);
327 setOperationAction(ISD::SUBC, VT, Legal);
328 setOperationAction(ISD::ADDE, VT, Legal);
329 setOperationAction(ISD::SUBE, VT, Legal);
330 }
331
332 // The hardware supports 32-bit ROTR, but not ROTL.
333 setOperationAction(ISD::ROTL, MVT::i32, Expand);
334 setOperationAction(ISD::ROTL, MVT::i64, Expand);
335 setOperationAction(ISD::ROTR, MVT::i64, Expand);
336
337 setOperationAction(ISD::MUL, MVT::i64, Expand);
338 setOperationAction(ISD::MULHU, MVT::i64, Expand);
339 setOperationAction(ISD::MULHS, MVT::i64, Expand);
340 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
341 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
342 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
343 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
344 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
345
346 setOperationAction(ISD::SMIN, MVT::i32, Legal);
347 setOperationAction(ISD::UMIN, MVT::i32, Legal);
348 setOperationAction(ISD::SMAX, MVT::i32, Legal);
349 setOperationAction(ISD::UMAX, MVT::i32, Legal);
350
351 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
352 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
353 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
354 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
355
356 static const MVT::SimpleValueType VectorIntTypes[] = {
357 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
358 };
359
360 for (MVT VT : VectorIntTypes) {
361 // Expand the following operations for the current type by default.
362 setOperationAction(ISD::ADD, VT, Expand);
363 setOperationAction(ISD::AND, VT, Expand);
364 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
365 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
366 setOperationAction(ISD::MUL, VT, Expand);
367 setOperationAction(ISD::MULHU, VT, Expand);
368 setOperationAction(ISD::MULHS, VT, Expand);
369 setOperationAction(ISD::OR, VT, Expand);
370 setOperationAction(ISD::SHL, VT, Expand);
371 setOperationAction(ISD::SRA, VT, Expand);
372 setOperationAction(ISD::SRL, VT, Expand);
373 setOperationAction(ISD::ROTL, VT, Expand);
374 setOperationAction(ISD::ROTR, VT, Expand);
375 setOperationAction(ISD::SUB, VT, Expand);
376 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
377 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
378 setOperationAction(ISD::SDIV, VT, Expand);
379 setOperationAction(ISD::UDIV, VT, Expand);
380 setOperationAction(ISD::SREM, VT, Expand);
381 setOperationAction(ISD::UREM, VT, Expand);
382 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
383 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
384 setOperationAction(ISD::SDIVREM, VT, Custom);
385 setOperationAction(ISD::UDIVREM, VT, Expand);
386 setOperationAction(ISD::SELECT, VT, Expand);
387 setOperationAction(ISD::VSELECT, VT, Expand);
388 setOperationAction(ISD::SELECT_CC, VT, Expand);
389 setOperationAction(ISD::XOR, VT, Expand);
390 setOperationAction(ISD::BSWAP, VT, Expand);
391 setOperationAction(ISD::CTPOP, VT, Expand);
392 setOperationAction(ISD::CTTZ, VT, Expand);
393 setOperationAction(ISD::CTLZ, VT, Expand);
394 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
395 setOperationAction(ISD::SETCC, VT, Expand);
396 }
397
398 static const MVT::SimpleValueType FloatVectorTypes[] = {
399 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
400 };
401
402 for (MVT VT : FloatVectorTypes) {
403 setOperationAction(ISD::FABS, VT, Expand);
404 setOperationAction(ISD::FMINNUM, VT, Expand);
405 setOperationAction(ISD::FMAXNUM, VT, Expand);
406 setOperationAction(ISD::FADD, VT, Expand);
407 setOperationAction(ISD::FCEIL, VT, Expand);
408 setOperationAction(ISD::FCOS, VT, Expand);
409 setOperationAction(ISD::FDIV, VT, Expand);
410 setOperationAction(ISD::FEXP2, VT, Expand);
411 setOperationAction(ISD::FEXP, VT, Expand);
412 setOperationAction(ISD::FLOG2, VT, Expand);
413 setOperationAction(ISD::FREM, VT, Expand);
414 setOperationAction(ISD::FLOG, VT, Expand);
415 setOperationAction(ISD::FLOG10, VT, Expand);
416 setOperationAction(ISD::FPOW, VT, Expand);
417 setOperationAction(ISD::FFLOOR, VT, Expand);
418 setOperationAction(ISD::FTRUNC, VT, Expand);
419 setOperationAction(ISD::FMUL, VT, Expand);
420 setOperationAction(ISD::FMA, VT, Expand);
421 setOperationAction(ISD::FRINT, VT, Expand);
422 setOperationAction(ISD::FNEARBYINT, VT, Expand);
423 setOperationAction(ISD::FSQRT, VT, Expand);
424 setOperationAction(ISD::FSIN, VT, Expand);
425 setOperationAction(ISD::FSUB, VT, Expand);
426 setOperationAction(ISD::FNEG, VT, Expand);
427 setOperationAction(ISD::VSELECT, VT, Expand);
428 setOperationAction(ISD::SELECT_CC, VT, Expand);
429 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
430 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
431 setOperationAction(ISD::SETCC, VT, Expand);
432 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
433 }
434
435 // This causes using an unrolled select operation rather than expansion with
436 // bit operations. This is in general better, but the alternative using BFI
437 // instructions may be better if the select sources are SGPRs.
438 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
439 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
440
441 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
442 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
443
444 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
445 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
446
447 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
448 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
449
450 // There are no libcalls of any kind.
451 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
452 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
453
454 setBooleanContents(ZeroOrNegativeOneBooleanContent);
455 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
456
457 setSchedulingPreference(Sched::RegPressure);
458 setJumpIsExpensive(true);
459
460 // FIXME: This is only partially true. If we have to do vector compares, any
461 // SGPR pair can be a condition register. If we have a uniform condition, we
462 // are better off doing SALU operations, where there is only one SCC. For now,
463 // we don't have a way of knowing during instruction selection if a condition
464 // will be uniform and we always use vector compares. Assume we are using
465 // vector compares until that is fixed.
466 setHasMultipleConditionRegisters(true);
467
468 setMinCmpXchgSizeInBits(32);
469 setSupportsUnalignedAtomics(false);
470
471 PredictableSelectIsExpensive = false;
472
473 // We want to find all load dependencies for long chains of stores to enable
474 // merging into very wide vectors. The problem is with vectors with > 4
475 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
476 // vectors are a legal type, even though we have to split the loads
477 // usually. When we can more precisely specify load legality per address
478 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
479 // smarter so that they can figure out what to do in 2 iterations without all
480 // N > 4 stores on the same chain.
481 GatherAllAliasesMaxDepth = 16;
482
483 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
484 // about these during lowering.
485 MaxStoresPerMemcpy = 0xffffffff;
486 MaxStoresPerMemmove = 0xffffffff;
487 MaxStoresPerMemset = 0xffffffff;
488
489 setTargetDAGCombine(ISD::BITCAST);
490 setTargetDAGCombine(ISD::SHL);
491 setTargetDAGCombine(ISD::SRA);
492 setTargetDAGCombine(ISD::SRL);
493 setTargetDAGCombine(ISD::TRUNCATE);
494 setTargetDAGCombine(ISD::MUL);
495 setTargetDAGCombine(ISD::MULHU);
496 setTargetDAGCombine(ISD::MULHS);
497 setTargetDAGCombine(ISD::SELECT);
498 setTargetDAGCombine(ISD::SELECT_CC);
499 setTargetDAGCombine(ISD::STORE);
500 setTargetDAGCombine(ISD::FADD);
501 setTargetDAGCombine(ISD::FSUB);
502 setTargetDAGCombine(ISD::FNEG);
503 setTargetDAGCombine(ISD::FABS);
504 setTargetDAGCombine(ISD::AssertZext);
505 setTargetDAGCombine(ISD::AssertSext);
506 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
507}
508
509//===----------------------------------------------------------------------===//
510// Target Information
511//===----------------------------------------------------------------------===//
512
513LLVM_READNONE
514static bool fnegFoldsIntoOp(unsigned Opc) {
515 switch (Opc) {
516 case ISD::FADD:
517 case ISD::FSUB:
518 case ISD::FMUL:
519 case ISD::FMA:
520 case ISD::FMAD:
521 case ISD::FMINNUM:
522 case ISD::FMAXNUM:
523 case ISD::FMINNUM_IEEE:
524 case ISD::FMAXNUM_IEEE:
525 case ISD::FSIN:
526 case ISD::FTRUNC:
527 case ISD::FRINT:
528 case ISD::FNEARBYINT:
529 case ISD::FCANONICALIZE:
530 case AMDGPUISD::RCP:
531 case AMDGPUISD::RCP_LEGACY:
532 case AMDGPUISD::RCP_IFLAG:
533 case AMDGPUISD::SIN_HW:
534 case AMDGPUISD::FMUL_LEGACY:
535 case AMDGPUISD::FMIN_LEGACY:
536 case AMDGPUISD::FMAX_LEGACY:
537 case AMDGPUISD::FMED3:
538 return true;
539 default:
540 return false;
541 }
542}
543
544/// \returns true if the operation will definitely need to use a 64-bit
545/// encoding, and thus will use a VOP3 encoding regardless of the source
546/// modifiers.
547LLVM_READONLY
548static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
549 return N->getNumOperands() > 2 || VT == MVT::f64;
550}
551
552// Most FP instructions support source modifiers, but this could be refined
553// slightly.
554LLVM_READONLY
555static bool hasSourceMods(const SDNode *N) {
556 if (isa<MemSDNode>(N))
557 return false;
558
559 switch (N->getOpcode()) {
560 case ISD::CopyToReg:
561 case ISD::SELECT:
562 case ISD::FDIV:
563 case ISD::FREM:
564 case ISD::INLINEASM:
565 case ISD::INLINEASM_BR:
566 case AMDGPUISD::DIV_SCALE:
567 case ISD::INTRINSIC_W_CHAIN:
568
569 // TODO: Should really be looking at the users of the bitcast. These are
570 // problematic because bitcasts are used to legalize all stores to integer
571 // types.
572 case ISD::BITCAST:
573 return false;
574 case ISD::INTRINSIC_WO_CHAIN: {
575 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
576 case Intrinsic::amdgcn_interp_p1:
577 case Intrinsic::amdgcn_interp_p2:
578 case Intrinsic::amdgcn_interp_mov:
579 case Intrinsic::amdgcn_interp_p1_f16:
580 case Intrinsic::amdgcn_interp_p2_f16:
581 return false;
582 default:
583 return true;
584 }
585 }
586 default:
587 return true;
588 }
589}
590
591bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
592 unsigned CostThreshold) {
593 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
594 // it is truly free to use a source modifier in all cases. If there are
595 // multiple users, each of which will necessitate using VOP3, there will be
596 // a code size increase. Try to avoid increasing code size unless we know it
597 // will save on the instruction count.
598 unsigned NumMayIncreaseSize = 0;
599 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
600
601 // XXX - Should this limit number of uses to check?
602 for (const SDNode *U : N->uses()) {
603 if (!hasSourceMods(U))
604 return false;
605
606 if (!opMustUseVOP3Encoding(U, VT)) {
607 if (++NumMayIncreaseSize > CostThreshold)
608 return false;
609 }
610 }
611
612 return true;
613}
614
615MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
616 return MVT::i32;
617}
618
619bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
620 return true;
621}
622
623// The backend supports 32 and 64 bit floating point immediates.
624// FIXME: Why are we reporting vectors of FP immediates as legal?
625bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
626 bool ForCodeSize) const {
627 EVT ScalarVT = VT.getScalarType();
628 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
629 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
630}
631
632// We don't want to shrink f64 / f32 constants.
633bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
634 EVT ScalarVT = VT.getScalarType();
635 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
636}
637
638bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
639 ISD::LoadExtType ExtTy,
640 EVT NewVT) const {
641 // TODO: This may be worth removing. Check regression tests for diffs.
642 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
643 return false;
644
645 unsigned NewSize = NewVT.getStoreSizeInBits();
646
647 // If we are reducing to a 32-bit load, this is always better.
648 if (NewSize == 32)
649 return true;
650
651 EVT OldVT = N->getValueType(0);
652 unsigned OldSize = OldVT.getStoreSizeInBits();
653
654 MemSDNode *MN = cast<MemSDNode>(N);
655 unsigned AS = MN->getAddressSpace();
656 // Do not shrink an aligned scalar load to sub-dword.
657 // Scalar engine cannot do sub-dword loads.
658 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
659 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
660 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
661 (isa<LoadSDNode>(N) &&
662 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
663 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
664 return false;
665
666 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
667 // extloads, so doing one requires using a buffer_load. In cases where we
668 // still couldn't use a scalar load, using the wider load shouldn't really
669 // hurt anything.
670
671 // If the old size already had to be an extload, there's no harm in continuing
672 // to reduce the width.
673 return (OldSize < 32);
674}
675
676bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
677 const SelectionDAG &DAG,
678 const MachineMemOperand &MMO) const {
679
680 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
681
682 if (LoadTy.getScalarType() == MVT::i32)
683 return false;
684
685 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
686 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
687
688 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
689 return false;
690
691 bool Fast = false;
692 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
693 CastTy, MMO, &Fast) &&
694 Fast;
695}
696
697// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
698// profitable with the expansion for 64-bit since it's generally good to
699// speculate things.
700// FIXME: These should really have the size as a parameter.
701bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
702 return true;
703}
704
705bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
706 return true;
707}
708
709bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
710 switch (N->getOpcode()) {
711 default:
712 return false;
713 case ISD::EntryToken:
714 case ISD::TokenFactor:
715 return true;
716 case ISD::INTRINSIC_WO_CHAIN:
717 {
718 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
719 switch (IntrID) {
720 default:
721 return false;
722 case Intrinsic::amdgcn_readfirstlane:
723 case Intrinsic::amdgcn_readlane:
724 return true;
725 }
726 }
727 break;
728 case ISD::LOAD:
729 {
730 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
731 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
732 return true;
733 return false;
734 }
735 break;
736 }
737}
738
739//===---------------------------------------------------------------------===//
740// Target Properties
741//===---------------------------------------------------------------------===//
742
743bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
744 assert(VT.isFloatingPoint());
745
746 // Packed operations do not have a fabs modifier.
747 return VT == MVT::f32 || VT == MVT::f64 ||
748 (Subtarget->has16BitInsts() && VT == MVT::f16);
749}
750
751bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
752 assert(VT.isFloatingPoint());
753 return VT == MVT::f32 || VT == MVT::f64 ||
754 (Subtarget->has16BitInsts() && VT == MVT::f16) ||
755 (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
756}
757
758bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
759 unsigned NumElem,
760 unsigned AS) const {
761 return true;
762}
763
764bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
765 // There are few operations which truly have vector input operands. Any vector
766 // operation is going to involve operations on each component, and a
767 // build_vector will be a copy per element, so it always makes sense to use a
768 // build_vector input in place of the extracted element to avoid a copy into a
769 // super register.
770 //
771 // We should probably only do this if all users are extracts only, but this
772 // should be the common case.
773 return true;
774}
775
776bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
777 // Truncate is just accessing a subregister.
778
779 unsigned SrcSize = Source.getSizeInBits();
780 unsigned DestSize = Dest.getSizeInBits();
781
782 return DestSize < SrcSize && DestSize % 32 == 0;
783}
784
785bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
786 // Truncate is just accessing a subregister.
787
788 unsigned SrcSize = Source->getScalarSizeInBits();
789 unsigned DestSize = Dest->getScalarSizeInBits();
790
791 if (DestSize == 16 && Subtarget->has16BitInsts())
792 return SrcSize >= 32;
793
794 return DestSize < SrcSize && DestSize % 32 == 0;
795}
796
797bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
798 unsigned SrcSize = Src->getScalarSizeInBits();
799 unsigned DestSize = Dest->getScalarSizeInBits();
800
801 if (SrcSize == 16 && Subtarget->has16BitInsts())
802 return DestSize >= 32;
803
804 return SrcSize == 32 && DestSize == 64;
805}
806
807bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
808 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
809 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
810 // this will enable reducing 64-bit operations to 32-bit, which is always
811 // good.
812
813 if (Src == MVT::i16)
814 return Dest == MVT::i32 || Dest == MVT::i64;
815
816 return Src == MVT::i32 && Dest == MVT::i64;
817}
818
819bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
820 return isZExtFree(Val.getValueType(), VT2);
821}
822
823bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
824 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
825 // limited number of native 64-bit operations. Shrinking an operation to fit
826 // in a single 32-bit register should always be helpful. As currently used,
827 // this is much less general than the name suggests, and is only used in
828 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
829 // not profitable, and may actually be harmful.
830 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
831}
832
833//===---------------------------------------------------------------------===//
834// TargetLowering Callbacks
835//===---------------------------------------------------------------------===//
836
837CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
838 bool IsVarArg) {
839 switch (CC) {
840 case CallingConv::AMDGPU_VS:
841 case CallingConv::AMDGPU_GS:
842 case CallingConv::AMDGPU_PS:
843 case CallingConv::AMDGPU_CS:
844 case CallingConv::AMDGPU_HS:
845 case CallingConv::AMDGPU_ES:
846 case CallingConv::AMDGPU_LS:
847 return CC_AMDGPU;
848 case CallingConv::C:
849 case CallingConv::Fast:
850 case CallingConv::Cold:
851 return CC_AMDGPU_Func;
852 case CallingConv::AMDGPU_KERNEL:
853 case CallingConv::SPIR_KERNEL:
854 default:
855 report_fatal_error("Unsupported calling convention for call");
856 }
857}
858
859CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
860 bool IsVarArg) {
861 switch (CC) {
862 case CallingConv::AMDGPU_KERNEL:
863 case CallingConv::SPIR_KERNEL:
864 llvm_unreachable("kernels should not be handled here");
865 case CallingConv::AMDGPU_VS:
866 case CallingConv::AMDGPU_GS:
867 case CallingConv::AMDGPU_PS:
868 case CallingConv::AMDGPU_CS:
869 case CallingConv::AMDGPU_HS:
870 case CallingConv::AMDGPU_ES:
871 case CallingConv::AMDGPU_LS:
872 return RetCC_SI_Shader;
873 case CallingConv::C:
874 case CallingConv::Fast:
875 case CallingConv::Cold:
876 return RetCC_AMDGPU_Func;
877 default:
878 report_fatal_error("Unsupported calling convention.");
879 }
880}
881
882/// The SelectionDAGBuilder will automatically promote function arguments
883/// with illegal types. However, this does not work for the AMDGPU targets
884/// since the function arguments are stored in memory as these illegal types.
885 /// In order to handle this properly we need to get the original type sizes
886 /// from the LLVM IR Function and fix up the ISD::InputArg values before
887/// passing them to AnalyzeFormalArguments()
888
889/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
890/// input values across multiple registers. Each item in the Ins array
891/// represents a single value that will be stored in registers. Ins[x].VT is
892/// the value type of the value that will be stored in the register, so
893/// whatever SDNode we lower the argument to needs to be this type.
894///
895/// In order to correctly lower the arguments we need to know the size of each
896/// argument. Since Ins[x].VT gives us the size of the register that will
897/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
898 /// for the original function argument so that we can deduce the correct memory
899/// type to use for Ins[x]. In most cases the correct memory type will be
900/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
901/// we have a kernel argument of type v8i8, this argument will be split into
902/// 8 parts and each part will be represented by its own item in the Ins array.
903/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
904/// the argument before it was split. From this, we deduce that the memory type
905/// for each individual part is i8. We pass the memory type as LocVT to the
906/// calling convention analysis function and the register type (Ins[x].VT) as
907/// the ValVT.
908void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
909 CCState &State,
910 const SmallVectorImpl<ISD::InputArg> &Ins) const {
911 const MachineFunction &MF = State.getMachineFunction();
912 const Function &Fn = MF.getFunction();
913 LLVMContext &Ctx = Fn.getParent()->getContext();
914 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
915 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
916 CallingConv::ID CC = Fn.getCallingConv();
917
918 unsigned MaxAlign = 1;
919 uint64_t ExplicitArgOffset = 0;
920 const DataLayout &DL = Fn.getParent()->getDataLayout();
921
922 unsigned InIndex = 0;
923
924 for (const Argument &Arg : Fn.args()) {
925 Type *BaseArgTy = Arg.getType();
926 unsigned Align = DL.getABITypeAlignment(BaseArgTy);
927 MaxAlign = std::max(Align, MaxAlign);
928 unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
929
930 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
931 ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
932
933 // We're basically throwing away everything passed into us and starting over
934 // to get accurate in-memory offsets. The "PartOffset" is completely useless
935 // to us as computed in Ins.
936 //
937 // We also need to figure out what type legalization is trying to do to get
938 // the correct memory offsets.
939
940 SmallVector<EVT, 16> ValueVTs;
941 SmallVector<uint64_t, 16> Offsets;
942 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
943
944 for (unsigned Value = 0, NumValues = ValueVTs.size();
945 Value != NumValues; ++Value) {
946 uint64_t BasePartOffset = Offsets[Value];
947
948 EVT ArgVT = ValueVTs[Value];
949 EVT MemVT = ArgVT;
950 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
951 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
952
953 if (NumRegs == 1) {
954 // This argument is not split, so the IR type is the memory type.
955 if (ArgVT.isExtended()) {
956 // We have an extended type, like i24, so we should just use the
957 // register type.
958 MemVT = RegisterVT;
959 } else {
960 MemVT = ArgVT;
961 }
962 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
963 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
964 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
965 // We have a vector value which has been split into a vector with
966 // the same scalar type, but fewer elements. This should handle
967 // all the floating-point vector types.
968 MemVT = RegisterVT;
969 } else if (ArgVT.isVector() &&
970 ArgVT.getVectorNumElements() == NumRegs) {
971 // This arg has been split so that each element is stored in a separate
972 // register.
973 MemVT = ArgVT.getScalarType();
974 } else if (ArgVT.isExtended()) {
975 // We have an extended type, like i65.
976 MemVT = RegisterVT;
977 } else {
978 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
979 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
980 if (RegisterVT.isInteger()) {
981 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
982 } else if (RegisterVT.isVector()) {
983 assert(!RegisterVT.getScalarType().isFloatingPoint());
984 unsigned NumElements = RegisterVT.getVectorNumElements();
985 assert(MemoryBits % NumElements == 0);
986 // This vector type has been split into another vector type with
987 // a different elements size.
988 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
989 MemoryBits / NumElements);
990 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
991 } else {
992 llvm_unreachable("cannot deduce memory type.");
993 }
994 }
995
996 // Convert one element vectors to scalar.
997 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
998 MemVT = MemVT.getScalarType();
999
1000 // Round up vec3/vec5 argument.
1001 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1002 assert(MemVT.getVectorNumElements() == 3 ||
1003 MemVT.getVectorNumElements() == 5);
1004 MemVT = MemVT.getPow2VectorType(State.getContext());
1005 }
1006
1007 unsigned PartOffset = 0;
1008 for (unsigned i = 0; i != NumRegs; ++i) {
1009 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1010 BasePartOffset + PartOffset,
1011 MemVT.getSimpleVT(),
1012 CCValAssign::Full));
1013 PartOffset += MemVT.getStoreSize();
1014 }
1015 }
1016 }
1017}
1018
1019SDValue AMDGPUTargetLowering::LowerReturn(
1020 SDValue Chain, CallingConv::ID CallConv,
1021 bool isVarArg,
1022 const SmallVectorImpl<ISD::OutputArg> &Outs,
1023 const SmallVectorImpl<SDValue> &OutVals,
1024 const SDLoc &DL, SelectionDAG &DAG) const {
1025 // FIXME: Fails for r600 tests
1026 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1027 // "wave terminate should not have return values");
1028 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1029}
1030
1031//===---------------------------------------------------------------------===//
1032// Target specific lowering
1033//===---------------------------------------------------------------------===//
1034
1035/// Selects the correct CCAssignFn for a given CallingConvention value.
1036CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1037 bool IsVarArg) {
1038 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1039}
1040
1041CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1042 bool IsVarArg) {
1043 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1044}
1045
1046SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1047 SelectionDAG &DAG,
1048 MachineFrameInfo &MFI,
1049 int ClobberedFI) const {
1050 SmallVector<SDValue, 8> ArgChains;
1051 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1052 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1053
1054 // Include the original chain at the beginning of the list. When this is
1055 // used by target LowerCall hooks, this helps legalize find the
1056 // CALLSEQ_BEGIN node.
1057 ArgChains.push_back(Chain);
1058
1059 // Add a chain value for each stack argument corresponding to the clobbered object.
1060 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1061 UE = DAG.getEntryNode().getNode()->use_end();
1062 U != UE; ++U) {
1063 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1064 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1065 if (FI->getIndex() < 0) {
1066 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1067 int64_t InLastByte = InFirstByte;
1068 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1069
1070 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1071 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1072 ArgChains.push_back(SDValue(L, 1));
1073 }
1074 }
1075 }
1076 }
1077
1078 // Build a tokenfactor for all the chains.
1079 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1080}
1081
1082SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1083 SmallVectorImpl<SDValue> &InVals,
1084 StringRef Reason) const {
1085 SDValue Callee = CLI.Callee;
1086 SelectionDAG &DAG = CLI.DAG;
1087
1088 const Function &Fn = DAG.getMachineFunction().getFunction();
1089
1090 StringRef FuncName("<unknown>");
1091
1092 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1093 FuncName = G->getSymbol();
1094 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1095 FuncName = G->getGlobal()->getName();
1096
1097 DiagnosticInfoUnsupported NoCalls(
1098 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1099 DAG.getContext()->diagnose(NoCalls);
1100
1101 if (!CLI.IsTailCall) {
1102 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1103 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1104 }
1105
1106 return DAG.getEntryNode();
1107}
1108
1109SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1110 SmallVectorImpl<SDValue> &InVals) const {
1111 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1112}
1113
1114SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1115 SelectionDAG &DAG) const {
1116 const Function &Fn = DAG.getMachineFunction().getFunction();
1117
1118 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1119 SDLoc(Op).getDebugLoc());
1120 DAG.getContext()->diagnose(NoDynamicAlloca);
1121 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1122 return DAG.getMergeValues(Ops, SDLoc());
1123}
1124
1125SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1126 SelectionDAG &DAG) const {
1127 switch (Op.getOpcode()) {
1128 default:
1129 Op->print(errs(), &DAG);
1130 llvm_unreachable("Custom lowering code for this"
1131 "instruction is not implemented yet!");
1132 break;
1133 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1134 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1135 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1136 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1137 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1138 case ISD::FREM: return LowerFREM(Op, DAG);
1139 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1140 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1141 case ISD::FRINT: return LowerFRINT(Op, DAG);
1142 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1143 case ISD::FROUND: return LowerFROUND(Op, DAG);
1144 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1145 case ISD::FLOG:
1146 return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
1147 case ISD::FLOG10:
1148 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1149 case ISD::FEXP:
1150 return lowerFEXP(Op, DAG);
1151 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1152 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1153 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1154 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1155 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1156 case ISD::CTTZ:
1157 case ISD::CTTZ_ZERO_UNDEF:
1158 case ISD::CTLZ:
1159 case ISD::CTLZ_ZERO_UNDEF:
1160 return LowerCTLZ_CTTZ(Op, DAG);
1161 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1162 }
1163 return Op;
1164}
1165
1166void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1167 SmallVectorImpl<SDValue> &Results,
1168 SelectionDAG &DAG) const {
1169 switch (N->getOpcode()) {
1170 case ISD::SIGN_EXTEND_INREG:
1171 // Different parts of legalization seem to interpret which type of
1172 // sign_extend_inreg is the one to check for custom lowering. The extended
1173 // from type is what really matters, but some places check for custom
1174 // lowering of the result type. This results in trying to use
1175 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1176 // nothing here and let the illegal result integer be handled normally.
1177 return;
1178 default:
1179 return;
1180 }
1181}
1182
1183bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1184 const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1185 if (!GVar || !GVar->hasInitializer())
1186 return false;
1187
1188 return !isa<UndefValue>(GVar->getInitializer());
1189}
1190
1191SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1192 SDValue Op,
1193 SelectionDAG &DAG) const {
1194
1195 const DataLayout &DL = DAG.getDataLayout();
1196 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1197 const GlobalValue *GV = G->getGlobal();
1198
1199 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1200 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1201 if (!MFI->isEntryFunction()) {
1202 const Function &Fn = DAG.getMachineFunction().getFunction();
1203 DiagnosticInfoUnsupported BadLDSDecl(
1204 Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
1205 DAG.getContext()->diagnose(BadLDSDecl);
1206 }
1207
1208 // XXX: What does the value of G->getOffset() mean?
1209 assert(G->getOffset() == 0 &&
1210 "Do not know what to do with an non-zero offset");
1211
1212 // TODO: We could emit code to handle the initialization somewhere.
1213 if (!hasDefinedInitializer(GV)) {
1214 unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1215 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1216 }
1217 }
1218
1219 const Function &Fn = DAG.getMachineFunction().getFunction();
1220 DiagnosticInfoUnsupported BadInit(
1221 Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1222 DAG.getContext()->diagnose(BadInit);
1223 return SDValue();
1224}
1225
1226SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1227 SelectionDAG &DAG) const {
1228 SmallVector<SDValue, 8> Args;
1229
1230 EVT VT = Op.getValueType();
1231 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1232 SDLoc SL(Op);
1233 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1234 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1235
1236 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1237 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1238 }
1239
1240 for (const SDUse &U : Op->ops())
1241 DAG.ExtractVectorElements(U.get(), Args);
1242
1243 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1244}
1245
1246SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1247 SelectionDAG &DAG) const {
1248
1249 SmallVector<SDValue, 8> Args;
1250 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1251 EVT VT = Op.getValueType();
1252 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1253 VT.getVectorNumElements());
1254
1255 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1256}
1257
1258/// Generate Min/Max node
1259SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1260 SDValue LHS, SDValue RHS,
1261 SDValue True, SDValue False,
1262 SDValue CC,
1263 DAGCombinerInfo &DCI) const {
1264 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1265 return SDValue();
1266
1267 SelectionDAG &DAG = DCI.DAG;
1268 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1269 switch (CCOpcode) {
1270 case ISD::SETOEQ:
1271 case ISD::SETONE:
1272 case ISD::SETUNE:
1273 case ISD::SETNE:
1274 case ISD::SETUEQ:
1275 case ISD::SETEQ:
1276 case ISD::SETFALSE:
1277 case ISD::SETFALSE2:
1278 case ISD::SETTRUE:
1279 case ISD::SETTRUE2:
1280 case ISD::SETUO:
1281 case ISD::SETO:
1282 break;
1283 case ISD::SETULE:
1284 case ISD::SETULT: {
1285 if (LHS == True)
1286 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1287 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1288 }
1289 case ISD::SETOLE:
1290 case ISD::SETOLT:
1291 case ISD::SETLE:
1292 case ISD::SETLT: {
1293 // Ordered. Assume ordered for undefined.
1294
1295 // Only do this after legalization to avoid interfering with other combines
1296 // which might occur.
1297 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1298 !DCI.isCalledByLegalizer())
1299 return SDValue();
1300
1301 // We need to permute the operands to get the correct NaN behavior. The
1302 // selected operand is the second one based on the failing compare with NaN,
1303 // so permute it based on the compare type the hardware uses.
1304 if (LHS == True)
1305 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1306 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1307 }
1308 case ISD::SETUGE:
1309 case ISD::SETUGT: {
1310 if (LHS == True)
1311 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1312 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1313 }
1314 case ISD::SETGT:
1315 case ISD::SETGE:
1316 case ISD::SETOGE:
1317 case ISD::SETOGT: {
1318 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1319 !DCI.isCalledByLegalizer())
1320 return SDValue();
1321
1322 if (LHS == True)
1323 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1324 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1325 }
1326 case ISD::SETCC_INVALID:
1327 llvm_unreachable("Invalid setcc condcode!")::llvm::llvm_unreachable_internal("Invalid setcc condcode!", "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 1327)
;
1328 }
1329 return SDValue();
1330}
1331
1332std::pair<SDValue, SDValue>
1333AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1334 SDLoc SL(Op);
1335
1336 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1337
1338 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1339 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1340
1341 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1342 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1343
1344 return std::make_pair(Lo, Hi);
1345}
1346
1347SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1348 SDLoc SL(Op);
1349
1350 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1351 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1352 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1353}
1354
1355SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1356 SDLoc SL(Op);
1357
1358 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1359 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1360 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1361}
1362
1363// Split a vector type into two parts. The first part is a power of two vector.
1364// The second part is whatever is left over, and is a scalar if it would
1365// otherwise be a 1-vector.
1366std::pair<EVT, EVT>
1367AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1368 EVT LoVT, HiVT;
1369 EVT EltVT = VT.getVectorElementType();
1370 unsigned NumElts = VT.getVectorNumElements();
1371 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1372 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1373 HiVT = NumElts - LoNumElts == 1
1374 ? EltVT
1375 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1376 return std::make_pair(LoVT, HiVT);
1377}
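// Editor's sketch (not part of the annotated file): the element-count
// arithmetic getSplitDestVTs performs, worked through for two inputs.
//
//   v3i32:  LoNumElts = PowerOf2Ceil((3 + 1) / 2) = 2  ->  LoVT = v2i32
//           HiNumElts = 3 - 2 = 1                       ->  HiVT = i32 (scalar)
//   v8i16:  LoNumElts = PowerOf2Ceil((8 + 1) / 2) = 4  ->  LoVT = v4i16
//           HiNumElts = 8 - 4 = 4                       ->  HiVT = v4i16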
1378
1379// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1380// scalar.
1381std::pair<SDValue, SDValue>
1382AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1383 const EVT &LoVT, const EVT &HiVT,
1384 SelectionDAG &DAG) const {
1385 assert(LoVT.getVectorNumElements() +
1386 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1387 N.getValueType().getVectorNumElements() &&
1388 "More vector elements requested than available!");
1389 auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1390 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1391 DAG.getConstant(0, DL, IdxTy));
1392 SDValue Hi = DAG.getNode(
1393 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1394 HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
1395 return std::make_pair(Lo, Hi);
1396}
1397
1398SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1399 SelectionDAG &DAG) const {
1400 LoadSDNode *Load = cast<LoadSDNode>(Op);
1401 EVT VT = Op.getValueType();
1402
1403
1404 // If this is a 2 element vector, we really want to scalarize and not create
1405 // weird 1 element vectors.
1406 if (VT.getVectorNumElements() == 2)
1407 return scalarizeVectorLoad(Load, DAG);
1408
1409 SDValue BasePtr = Load->getBasePtr();
1410 EVT MemVT = Load->getMemoryVT();
1411 SDLoc SL(Op);
1412
1413 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1414
1415 EVT LoVT, HiVT;
1416 EVT LoMemVT, HiMemVT;
1417 SDValue Lo, Hi;
1418
1419 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1420 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1421 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1422
1423 unsigned Size = LoMemVT.getStoreSize();
1424 unsigned BaseAlign = Load->getAlignment();
1425 unsigned HiAlign = MinAlign(BaseAlign, Size);
1426
1427 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1428 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1429 BaseAlign, Load->getMemOperand()->getFlags());
1430 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
1431 SDValue HiLoad =
1432 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1433 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1434 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1435
1436 auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1437 SDValue Join;
1438 if (LoVT == HiVT) {
1439 // This is the case that the vector is power of two so was evenly split.
1440 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1441 } else {
1442 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1443 DAG.getConstant(0, SL, IdxTy));
1444 Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
1445 : ISD::INSERT_VECTOR_ELT,
1446 SL, VT, Join, HiLoad,
1447 DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
1448 }
1449
1450 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1451 LoLoad.getValue(1), HiLoad.getValue(1))};
1452
1453 return DAG.getMergeValues(Ops, SL);
1454}
1455
1456// Widen a vector load from vec3 to vec4.
1457SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
1458 SelectionDAG &DAG) const {
1459 LoadSDNode *Load = cast<LoadSDNode>(Op);
1460 EVT VT = Op.getValueType();
1461 assert(VT.getVectorNumElements() == 3);
1462 SDValue BasePtr = Load->getBasePtr();
1463 EVT MemVT = Load->getMemoryVT();
1464 SDLoc SL(Op);
1465 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1466 unsigned BaseAlign = Load->getAlignment();
1467
1468 EVT WideVT =
1469 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1470 EVT WideMemVT =
1471 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1472 SDValue WideLoad = DAG.getExtLoad(
1473 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1474 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1475 return DAG.getMergeValues(
1476 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1477 DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
1478 WideLoad.getValue(1)},
1479 SL);
1480}
1481
1482SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1483 SelectionDAG &DAG) const {
1484 StoreSDNode *Store = cast<StoreSDNode>(Op);
1485 SDValue Val = Store->getValue();
1486 EVT VT = Val.getValueType();
1487
1488 // If this is a 2 element vector, we really want to scalarize and not create
1489 // weird 1 element vectors.
1490 if (VT.getVectorNumElements() == 2)
1491 return scalarizeVectorStore(Store, DAG);
1492
1493 EVT MemVT = Store->getMemoryVT();
1494 SDValue Chain = Store->getChain();
1495 SDValue BasePtr = Store->getBasePtr();
1496 SDLoc SL(Op);
1497
1498 EVT LoVT, HiVT;
1499 EVT LoMemVT, HiMemVT;
1500 SDValue Lo, Hi;
1501
1502 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1503 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1504 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1505
1506 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1507
1508 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1509 unsigned BaseAlign = Store->getAlignment();
1510 unsigned Size = LoMemVT.getStoreSize();
1511 unsigned HiAlign = MinAlign(BaseAlign, Size);
1512
1513 SDValue LoStore =
1514 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1515 Store->getMemOperand()->getFlags());
1516 SDValue HiStore =
1517 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1518 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1519
1520 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1521}
1522
1523// This is a shortcut for integer division because we have fast i32<->f32
1524// conversions, and fast f32 reciprocal instructions. The fractional part of a
1525// float is enough to accurately represent up to a 24-bit signed integer.
1526SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1527 bool Sign) const {
1528 SDLoc DL(Op);
1529 EVT VT = Op.getValueType();
1530 SDValue LHS = Op.getOperand(0);
1531 SDValue RHS = Op.getOperand(1);
1532 MVT IntVT = MVT::i32;
1533 MVT FltVT = MVT::f32;
1534
1535 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1536 if (LHSSignBits < 9)
1537 return SDValue();
1538
1539 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1540 if (RHSSignBits < 9)
1541 return SDValue();
1542
1543 unsigned BitSize = VT.getSizeInBits();
1544 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1545 unsigned DivBits = BitSize - SignBits;
1546 if (Sign)
1547 ++DivBits;
1548
1549 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1550 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1551
1552 SDValue jq = DAG.getConstant(1, DL, IntVT);
1553
1554 if (Sign) {
1555 // char|short jq = ia ^ ib;
1556 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1557
1558 // jq = jq >> (bitsize - 2)
1559 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1560 DAG.getConstant(BitSize - 2, DL, VT));
1561
1562 // jq = jq | 0x1
1563 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1564 }
1565
1566 // int ia = (int)LHS;
1567 SDValue ia = LHS;
1568
1569 // int ib = (int)RHS;
1570 SDValue ib = RHS;
1571
1572 // float fa = (float)ia;
1573 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1574
1575 // float fb = (float)ib;
1576 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1577
1578 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1579 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1580
1581 // fq = trunc(fq);
1582 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1583
1584 // float fqneg = -fq;
1585 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1586
1587 // float fr = mad(fqneg, fb, fa);
1588 unsigned OpCode = Subtarget->hasFP32Denormals() ?
1589 (unsigned)AMDGPUISD::FMAD_FTZ :
1590 (unsigned)ISD::FMAD;
1591 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1592
1593 // int iq = (int)fq;
1594 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1595
1596 // fr = fabs(fr);
1597 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1598
1599 // fb = fabs(fb);
1600 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1601
1602 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1603
1604 // int cv = fr >= fb;
1605 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1606
1607 // jq = (cv ? jq : 0);
1608 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1609
1610 // dst = iq + jq;
1611 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1612
1613 // Rem needs compensation, it's easier to recompute it
1614 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1615 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1616
1617 // Truncate to number of bits this divide really is.
1618 if (Sign) {
1619 SDValue InRegSize
1620 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1621 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1622 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1623 } else {
1624 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1625 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1626 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1627 }
1628
1629 return DAG.getMergeValues({ Div, Rem }, DL);
1630}
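// Editor's sketch (not part of the annotated file): the unsigned variant of
// the LowerDIVREM24 sequence above, written as plain C. It assumes both
// operands already fit in 24 bits, and uses the C library's truncf/fabsf/fmaf
// as stand-ins for the FTRUNC/FABS/FMAD_FTZ (or RCP) nodes.
#include <math.h>
static unsigned div24(unsigned a, unsigned b) {
  float fa = (float)a, fb = (float)b;
  float fq = truncf(fa * (1.0f / fb)); // RCP + FMUL + FTRUNC
  float fr = fabsf(fmaf(-fq, fb, fa)); // fr = fabs(mad(fqneg, fb, fa))
  unsigned q = (unsigned)fq;
  if (fr >= fabsf(fb))                 // jq = (cv ? jq : 0), with jq == 1 when unsigned
    ++q;                               // dst = iq + jq
  return q;                            // the remainder is recomputed as a - q * b
}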
1631
1632void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1633 SelectionDAG &DAG,
1634 SmallVectorImpl<SDValue> &Results) const {
1635 SDLoc DL(Op);
1636 EVT VT = Op.getValueType();
1637
1638 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1639
1640 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1641
1642 SDValue One = DAG.getConstant(1, DL, HalfVT);
1643 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1644
1645 //HiLo split
1646 SDValue LHS = Op.getOperand(0);
1647 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1648 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1649
1650 SDValue RHS = Op.getOperand(1);
1651 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1652 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1653
1654 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1655 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1656
1657 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1658 LHS_Lo, RHS_Lo);
1659
1660 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1661 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1662
1663 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1664 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1665 return;
1666 }
1667
1668 if (isTypeLegal(MVT::i64)) {
1669 // Compute denominator reciprocal.
1670 unsigned FMAD = Subtarget->hasFP32Denormals() ?
1671 (unsigned)AMDGPUISD::FMAD_FTZ :
1672 (unsigned)ISD::FMAD;
1673
1674 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1675 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1676 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1677 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1678 Cvt_Lo);
1679 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1680 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1681 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1682 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1683 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1684 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1685 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1686 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1687 Mul1);
1688 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1689 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1690 SDValue Rcp64 = DAG.getBitcast(VT,
1691 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1692
1693 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1694 SDValue One64 = DAG.getConstant(1, DL, VT);
1695 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1696 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1697
1698 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1699 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1700 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1701 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1702 Zero);
1703 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1704 One);
1705
1706 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1707 Mulhi1_Lo, Zero1);
1708 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1709 Mulhi1_Hi, Add1_Lo.getValue(1));
1710 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1711 SDValue Add1 = DAG.getBitcast(VT,
1712 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1713
1714 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1715 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1716 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1717 Zero);
1718 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1719 One);
1720
1721 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1722 Mulhi2_Lo, Zero1);
1723 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1724 Mulhi2_Hi, Add1_Lo.getValue(1));
1725 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1726 Zero, Add2_Lo.getValue(1));
1727 SDValue Add2 = DAG.getBitcast(VT,
1728 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1729 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1730
1731 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1732
1733 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1734 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1735 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1736 Mul3_Lo, Zero1);
1737 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1738 Mul3_Hi, Sub1_Lo.getValue(1));
1739 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1740 SDValue Sub1 = DAG.getBitcast(VT,
1741 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1742
1743 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1744 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1745 ISD::SETUGE);
1746 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1747 ISD::SETUGE);
1748 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1749
1750 // TODO: Here and below portions of the code can be enclosed into if/endif.
1751 // Currently control flow is unconditional and we have 4 selects after
1752 // potential endif to substitute PHIs.
1753
1754 // if C3 != 0 ...
1755 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1756 RHS_Lo, Zero1);
1757 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1758 RHS_Hi, Sub1_Lo.getValue(1));
1759 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1760 Zero, Sub2_Lo.getValue(1));
1761 SDValue Sub2 = DAG.getBitcast(VT,
1762 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1763
1764 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1765
1766 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1767 ISD::SETUGE);
1768 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1769 ISD::SETUGE);
1770 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1771
1772 // if (C6 != 0)
1773 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1774
1775 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1776 RHS_Lo, Zero1);
1777 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1778 RHS_Hi, Sub2_Lo.getValue(1));
1779 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1780 Zero, Sub3_Lo.getValue(1));
1781 SDValue Sub3 = DAG.getBitcast(VT,
1782 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1783
1784 // endif C6
1785 // endif C3
1786
1787 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1788 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1789
1790 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1791 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1792
1793 Results.push_back(Div);
1794 Results.push_back(Rem);
1795
1796 return;
1797 }
1798
1799 // r600 expansion.
1800 // Get Speculative values
1801 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1802 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1803
1804 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1805 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1806 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1807
1808 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1809 SDValue DIV_Lo = Zero;
1810
1811 const unsigned halfBitWidth = HalfVT.getSizeInBits();
1812
1813 for (unsigned i = 0; i < halfBitWidth; ++i) {
1814 const unsigned bitPos = halfBitWidth - i - 1;
1815 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1816 // Get value of high bit
1817 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1818 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1819 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1820
1821 // Shift
1822 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1823 // Add LHS high bit
1824 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1825
1826 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1827 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1828
1829 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1830
1831 // Update REM
1832 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1833 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1834 }
1835
1836 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1837 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1838 Results.push_back(DIV);
1839 Results.push_back(REM);
1840}
1841
1842SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1843 SelectionDAG &DAG) const {
1844 SDLoc DL(Op);
1845 EVT VT = Op.getValueType();
1846
1847 if (VT == MVT::i64) {
1848 SmallVector<SDValue, 2> Results;
1849 LowerUDIVREM64(Op, DAG, Results);
1850 return DAG.getMergeValues(Results, DL);
1851 }
1852
1853 if (VT == MVT::i32) {
1854 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1855 return Res;
1856 }
1857
1858 SDValue Num = Op.getOperand(0);
1859 SDValue Den = Op.getOperand(1);
1860
1861 // RCP = URECIP(Den) = 2^32 / Den + e
1862 // e is rounding error.
1863 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1864
1865 // RCP_LO = mul(RCP, Den)
1866 SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1867
1868 // RCP_HI = mulhu(RCP, Den)
1869 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1870
1871 // NEG_RCP_LO = -RCP_LO
1872 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1873 RCP_LO);
1874
1875 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1876 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1877 NEG_RCP_LO, RCP_LO,
1878 ISD::SETEQ);
1879 // Calculate the rounding error from the URECIP instruction
1880 // E = mulhu(ABS_RCP_LO, RCP)
1881 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1882
1883 // RCP_A_E = RCP + E
1884 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1885
1886 // RCP_S_E = RCP - E
1887 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1888
1889 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1890 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1891 RCP_A_E, RCP_S_E,
1892 ISD::SETEQ);
1893 // Quotient = mulhu(Tmp0, Num)
1894 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1895
1896 // Num_S_Remainder = Quotient * Den
1897 SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1898
1899 // Remainder = Num - Num_S_Remainder
1900 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1901
1902 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1903 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1904 DAG.getConstant(-1, DL, VT),
1905 DAG.getConstant(0, DL, VT),
1906 ISD::SETUGE);
1907 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1908 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1909 Num_S_Remainder,
1910 DAG.getConstant(-1, DL, VT),
1911 DAG.getConstant(0, DL, VT),
1912 ISD::SETUGE);
1913 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1914 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1915 Remainder_GE_Zero);
1916
1917 // Calculate Division result:
1918
1919 // Quotient_A_One = Quotient + 1
1920 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1921 DAG.getConstant(1, DL, VT));
1922
1923 // Quotient_S_One = Quotient - 1
1924 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1925 DAG.getConstant(1, DL, VT));
1926
1927 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1928 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1929 Quotient, Quotient_A_One, ISD::SETEQ);
1930
1931 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1932 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1933 Quotient_S_One, Div, ISD::SETEQ);
1934
1935 // Calculate Rem result:
1936
1937 // Remainder_S_Den = Remainder - Den
1938 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1939
1940 // Remainder_A_Den = Remainder + Den
1941 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1942
1943 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1944 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1945 Remainder, Remainder_S_Den, ISD::SETEQ);
1946
1947 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1948 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1949 Remainder_A_Den, Rem, ISD::SETEQ);
1950 SDValue Ops[2] = {
1951 Div,
1952 Rem
1953 };
1954 return DAG.getMergeValues(Ops, DL);
1955}
1956
1957SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1958 SelectionDAG &DAG) const {
1959 SDLoc DL(Op);
1960 EVT VT = Op.getValueType();
1961
1962 SDValue LHS = Op.getOperand(0);
1963 SDValue RHS = Op.getOperand(1);
1964
1965 SDValue Zero = DAG.getConstant(0, DL, VT);
1966 SDValue NegOne = DAG.getConstant(-1, DL, VT);
1967
1968 if (VT == MVT::i32) {
1969 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1970 return Res;
1971 }
1972
1973 if (VT == MVT::i64 &&
1974 DAG.ComputeNumSignBits(LHS) > 32 &&
1975 DAG.ComputeNumSignBits(RHS) > 32) {
1976 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1977
1978 //HiLo split
1979 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1980 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1981 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1982 LHS_Lo, RHS_Lo);
1983 SDValue Res[2] = {
1984 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1985 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1986 };
1987 return DAG.getMergeValues(Res, DL);
1988 }
1989
1990 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1991 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1992 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1993 SDValue RSign = LHSign; // Remainder sign is the same as LHS
1994
1995 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1996 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1997
1998 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1999 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2000
2001 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2002 SDValue Rem = Div.getValue(1);
2003
2004 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2005 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2006
2007 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2008 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2009
2010 SDValue Res[2] = {
2011 Div,
2012 Rem
2013 };
2014 return DAG.getMergeValues(Res, DL);
2015}
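// Editor's sketch (not part of the annotated file): the sign-folding trick
// LowerSDIVREM applies around the unsigned UDIVREM, shown for 32-bit scalars;
// sdivrem32 is a hypothetical helper name.
static void sdivrem32(int lhs, int rhs, int *quot, int *rem) {
  unsigned ls = (unsigned)(lhs >> 31);     // all-ones if lhs < 0 (LHSign)
  unsigned rs = (unsigned)(rhs >> 31);     // all-ones if rhs < 0 (RHSign)
  unsigned ds = ls ^ rs;                   // quotient sign (DSign)
  unsigned ul = ((unsigned)lhs + ls) ^ ls; // (lhs + sign) ^ sign == |lhs|
  unsigned ur = ((unsigned)rhs + rs) ^ rs; // |rhs|
  unsigned q = ul / ur, r = ul % ur;       // unsigned UDIVREM
  *quot = (int)((q ^ ds) - ds);            // (Div ^ DSign) - DSign restores the sign
  *rem = (int)((r ^ ls) - ls);             // remainder takes the sign of lhs (RSign)
}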
2016
2017// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
2018SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2019 SDLoc SL(Op);
2020 EVT VT = Op.getValueType();
2021 SDValue X = Op.getOperand(0);
2022 SDValue Y = Op.getOperand(1);
2023
2024 // TODO: Should this propagate fast-math-flags?
2025
2026 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
2027 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
2028 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
2029
2030 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
2031}
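// Editor's sketch (not part of the annotated file): the LowerFREM expansion
// above as a one-liner; FTRUNC (round toward zero) gives fmod-style semantics
// rather than a floor-based remainder. truncf comes from <math.h>.
static float frem_expanded(float x, float y) { return x - truncf(x / y) * y; }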
2032
2033SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2034 SDLoc SL(Op);
2035 SDValue Src = Op.getOperand(0);
2036
2037 // result = trunc(src)
2038 // if (src > 0.0 && src != result)
2039 // result += 1.0
2040
2041 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2042
2043 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2044 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2045
2046 EVT SetCCVT =
2047 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2048
2049 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2050 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2051 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2052
2053 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2054 // TODO: Should this propagate fast-math-flags?
2055 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2056}
2057
2058static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2059 SelectionDAG &DAG) {
2060 const unsigned FractBits = 52;
2061 const unsigned ExpBits = 11;
2062
2063 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2064 Hi,
2065 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2066 DAG.getConstant(ExpBits, SL, MVT::i32));
2067 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2068 DAG.getConstant(1023, SL, MVT::i32));
2069
2070 return Exp;
2071}
2072
2073SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2074 SDLoc SL(Op);
2075 SDValue Src = Op.getOperand(0);
2076
2077 assert(Op.getValueType() == MVT::f64);
2078
2079 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2080 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2081
2082 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2083
2084 // Extract the upper half, since this is where we will find the sign and
2085 // exponent.
2086 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2087
2088 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2089
2090 const unsigned FractBits = 52;
2091
2092 // Extract the sign bit.
2093 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2094 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2095
2096 // Extend back to 64-bits.
2097 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2098 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2099
2100 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2101 const SDValue FractMask
2102 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2103
2104 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2105 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2106 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2107
2108 EVT SetCCVT =
2109 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2110
2111 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2112
2113 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2114 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2115
2116 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2117 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2118
2119 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2120}
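// Editor's sketch (not part of the annotated file): what LowerFTRUNC computes,
// expressed with ordinary integer bit manipulation on the f64 encoding;
// dtrunc is a hypothetical helper name.
#include <string.h>
static double dtrunc(double x) {
  unsigned long long bits;
  memcpy(&bits, &x, 8);
  int exp = (int)((bits >> 52) & 0x7ff) - 1023;  // extractF64Exponent
  if (exp < 0)
    bits &= 1ULL << 63;                          // |x| < 1: keep only the sign bit
  else if (exp <= 51)
    bits &= ~((1ULL << (52 - exp)) - 1);         // clear the sub-integer fraction bits
  // exp > 51: the value is already an integer, leave it unchanged
  double r;
  memcpy(&r, &bits, 8);
  return r;
}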
2121
2122SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2123 SDLoc SL(Op);
2124 SDValue Src = Op.getOperand(0);
2125
2126 assert(Op.getValueType() == MVT::f64);
2127
2128 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2129 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2130 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2131
2132 // TODO: Should this propagate fast-math-flags?
2133
2134 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2135 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2136
2137 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2138
2139 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2140 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2141
2142 EVT SetCCVT =
2143 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2144 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2145
2146 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2147}
2148
2149SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2150 // FNEARBYINT and FRINT are the same, except in their handling of FP
2151 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2152 // rint, so just treat them as equivalent.
2153 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2154}
2155
2156// XXX - May require not supporting f32 denormals?
2157
2158// Don't handle v2f16. The extra instructions to scalarize and repack around the
2159// compare and vselect end up producing worse code than scalarizing the whole
2160// operation.
2161SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
2162 SDLoc SL(Op);
2163 SDValue X = Op.getOperand(0);
2164 EVT VT = Op.getValueType();
2165
2166 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2167
2168 // TODO: Should this propagate fast-math-flags?
2169
2170 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2171
2172 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2173
2174 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2175 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2176 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2177
2178 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2179
2180 EVT SetCCVT =
2181 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2182
2183 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2184
2185 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2186
2187 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2188}
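// Editor's sketch (not part of the annotated file): the round-half-away-from-
// zero expansion above for a single f32, with <math.h> truncf/fabsf/copysignf
// standing in for the FTRUNC/FABS/FCOPYSIGN nodes.
static float fround_expanded(float x) {
  float t = truncf(x);                 // T
  float sel = (fabsf(x - t) >= 0.5f)   // Cmp = AbsDiff >= 0.5
                  ? copysignf(1.0f, x) // SignOne
                  : 0.0f;
  return t + sel;                      // T + Sel
}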
2189
2190SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
2191 SDLoc SL(Op);
2192 SDValue X = Op.getOperand(0);
2193
2194 SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
2195
2196 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2197 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2198 const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
2199 const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
2200 EVT SetCCVT =
2201 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2202
2203 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2204
2205 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
2206
2207 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2208
2209 const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
2210 MVT::i64);
2211
2212 SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
2213 SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
2214 DAG.getConstant(INT64_C(0x0008000000000000), SL,
2215 MVT::i64),
2216 Exp);
2217
2218 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
2219 SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
2220 DAG.getConstant(0, SL, MVT::i64), Tmp0,
2221 ISD::SETNE);
2222
2223 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
2224 D, DAG.getConstant(0, SL, MVT::i64));
2225 SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
2226
2227 K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
2228 K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
2229
2230 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2231 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2232 SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
2233
2234 SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
2235 ExpEqNegOne,
2236 DAG.getConstantFP(1.0, SL, MVT::f64),
2237 DAG.getConstantFP(0.0, SL, MVT::f64));
2238
2239 SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
2240
2241 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
2242 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
2243
2244 return K;
2245}
2246
2247SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2248 EVT VT = Op.getValueType();
2249
2250 if (VT == MVT::f32 || VT == MVT::f16)
2251 return LowerFROUND32_16(Op, DAG);
2252
2253 if (VT == MVT::f64)
2254 return LowerFROUND64(Op, DAG);
2255
2256 llvm_unreachable("unhandled type")::llvm::llvm_unreachable_internal("unhandled type", "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2256)
;
2257}
2258
2259SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2260 SDLoc SL(Op);
2261 SDValue Src = Op.getOperand(0);
2262
2263 // result = trunc(src);
2264 // if (src < 0.0 && src != result)
2265 // result += -1.0.
2266
2267 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2268
2269 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2270 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2271
2272 EVT SetCCVT =
2273 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2274
2275 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2276 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2277 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2278
2279 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2280 // TODO: Should this propagate fast-math-flags?
2281 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2282}
2283
2284SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2285 double Log2BaseInverted) const {
2286 EVT VT = Op.getValueType();
2287
2288 SDLoc SL(Op);
2289 SDValue Operand = Op.getOperand(0);
2290 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2291 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2292
2293 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2294}
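// Editor's note (not part of the annotated file): LowerFLOG is the
// change-of-base identity log_b(x) = log2(x) * (1 / log2(b)); the caller
// supplies 1 / log2(b) as Log2BaseInverted, so only an FLOG2 and an FMUL are
// emitted.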
2295
2296// exp2(M_LOG2E_F * f);
2297SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2298 EVT VT = Op.getValueType();
2299 SDLoc SL(Op);
2300 SDValue Src = Op.getOperand(0);
2301
2302 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2303 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2304 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2305}
2306
2307static bool isCtlzOpc(unsigned Opc) {
2308 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2309}
2310
2311static bool isCttzOpc(unsigned Opc) {
2312 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2313}
2314
2315SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2316 SDLoc SL(Op);
2317 SDValue Src = Op.getOperand(0);
2318 bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2319 Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2320
2321 unsigned ISDOpc, NewOpc;
2322 if (isCtlzOpc(Op.getOpcode())) {
2323 ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2324 NewOpc = AMDGPUISD::FFBH_U32;
2325 } else if (isCttzOpc(Op.getOpcode())) {
2326 ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2327 NewOpc = AMDGPUISD::FFBL_B32;
2328 } else
2329 llvm_unreachable("Unexpected OPCode!!!")::llvm::llvm_unreachable_internal("Unexpected OPCode!!!", "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2329)
;
2330
2331
2332 if (ZeroUndef && Src.getValueType() == MVT::i32)
2333 return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2334
2335 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2336
2337 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2338 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2339
2340 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2341 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2342
2343 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2344 *DAG.getContext(), MVT::i32);
2345
2346 SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2347 SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2348
2349 SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2350 SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2351
2352 const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2353 SDValue Add, NewOpr;
2354 if (isCtlzOpc(Op.getOpcode())) {
2355 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2356 // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2357 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2358 } else {
2359 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2360 // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2361 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2362 }
2363
2364 if (!ZeroUndef) {
2365 // Test if the full 64-bit input is zero.
2366
2367 // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2368 // which we probably don't want.
2369 SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2370 SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2371 SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2372
2373 // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2374 // with the same cycles, otherwise it is slower.
2375 // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2376 // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2377
2378 const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2379
2380 // The instruction returns -1 for 0 input, but the defined intrinsic
2381 // behavior is to return the number of bits.
2382 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2383 SrcIsZero, Bits32, NewOpr);
2384 }
2385
2386 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2387}
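// Editor's sketch (not part of the annotated file): the 64-bit ctlz path of
// the lowering above for a scalar, with ffbh32 standing in for the hardware
// FFBH_U32 instruction (count leading zeros, result undefined for 0).
static unsigned ctlz64(unsigned long long x, unsigned (*ffbh32)(unsigned)) {
  unsigned lo = (unsigned)x, hi = (unsigned)(x >> 32);
  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
  unsigned r = (hi == 0) ? ffbh32(lo) + 32 : ffbh32(hi);
  return (lo == 0 && hi == 0) ? 64 : r; // non-ZERO_UNDEF case: define ctlz(0) = 64
}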
2388
2389SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2390 bool Signed) const {
2391 // Unsigned
2392 // cul2f(ulong u)
2393 //{
2394 // uint lz = clz(u);
2395 // uint e = (u != 0) ? 127U + 63U - lz : 0;
2396 // u = (u << lz) & 0x7fffffffffffffffUL;
2397 // ulong t = u & 0xffffffffffUL;
2398 // uint v = (e << 23) | (uint)(u >> 40);
2399 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2400 // return as_float(v + r);
2401 //}
2402 // Signed
2403 // cl2f(long l)
2404 //{
2405 // long s = l >> 63;
2406 // float r = cul2f((l + s) ^ s);
2407 // return s ? -r : r;
2408 //}
2409
2410 SDLoc SL(Op);
2411 SDValue Src = Op.getOperand(0);
2412 SDValue L = Src;
2413
2414 SDValue S;
2415 if (Signed) {
2416 const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2417 S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2418
2419 SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2420 L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2421 }
2422
2423 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2424 *DAG.getContext(), MVT::f32);
2425
2426
2427 SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2428 SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2429 SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2430 LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2431
2432 SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2433 SDValue E = DAG.getSelect(SL, MVT::i32,
2434 DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2435 DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2436 ZeroI32);
2437
2438 SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2439 DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2440 DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2441
2442 SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2443 DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2444
2445 SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2446 U, DAG.getConstant(40, SL, MVT::i64));
2447
2448 SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2449 DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2450 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2451
2452 SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2453 SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2454 SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2455
2456 SDValue One = DAG.getConstant(1, SL, MVT::i32);
2457
2458 SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2459
2460 SDValue R = DAG.getSelect(SL, MVT::i32,
2461 RCmp,
2462 One,
2463 DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2464 R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2465 R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2466
2467 if (!Signed)
2468 return R;
2469
2470 SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2471 return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2472}
2473
2474SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2475 bool Signed) const {
2476 SDLoc SL(Op);
2477 SDValue Src = Op.getOperand(0);
2478
2479 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2480
2481 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2482 DAG.getConstant(0, SL, MVT::i32));
2483 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2484 DAG.getConstant(1, SL, MVT::i32));
2485
2486 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2487 SL, MVT::f64, Hi);
2488
2489 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2490
2491 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2492 DAG.getConstant(32, SL, MVT::i32));
2493 // TODO: Should this propagate fast-math-flags?
2494 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2495}
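// Editor's sketch (not part of the annotated file): LowerINT_TO_FP64 above is
// the identity x == hi * 2^32 + lo, with each 32-bit half converted exactly;
// for the signed case only the high half is converted as signed.
#include <math.h>
static double u64_to_f64(unsigned long long x) {
  unsigned lo = (unsigned)x, hi = (unsigned)(x >> 32);
  return ldexp((double)hi, 32) + (double)lo; // LDEXP(CvtHi, 32) + CvtLo
}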
2496
2497SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2498 SelectionDAG &DAG) const {
2499 assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2500 "operation should be legal");
2501
2502 // TODO: Factor out code common with LowerSINT_TO_FP.
2503
2504 EVT DestVT = Op.getValueType();
2505 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2506 SDLoc DL(Op);
2507 SDValue Src = Op.getOperand(0);
2508
2509 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2510 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2511 SDValue FPRound =
2512 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2513
2514 return FPRound;
2515 }
2516
2517 if (DestVT == MVT::f32)
2518 return LowerINT_TO_FP32(Op, DAG, false);
2519
2520 assert(DestVT == MVT::f64);
2521 return LowerINT_TO_FP64(Op, DAG, false);
2522}
2523
2524SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2525 SelectionDAG &DAG) const {
2526 assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2527 "operation should be legal");
2528
2529 // TODO: Factor out code common with LowerUINT_TO_FP.
2530
2531 EVT DestVT = Op.getValueType();
2532 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2533 SDLoc DL(Op);
2534 SDValue Src = Op.getOperand(0);
2535
2536 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2537 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2538 SDValue FPRound =
2539 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2540
2541 return FPRound;
2542 }
2543
2544 if (DestVT == MVT::f32)
2545 return LowerINT_TO_FP32(Op, DAG, true);
2546
2547 assert(DestVT == MVT::f64);
2548 return LowerINT_TO_FP64(Op, DAG, true);
2549}
2550
2551SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2552 bool Signed) const {
2553 SDLoc SL(Op);
2554
2555 SDValue Src = Op.getOperand(0);
2556
2557 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2558
2559 SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2560 MVT::f64);
2561 SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2562 MVT::f64);
2563 // TODO: Should this propagate fast-math-flags?
2564 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2565
2566 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2567
2568
2569 SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2570
2571 SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2572 MVT::i32, FloorMul);
2573 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2574
2575 SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2576
2577 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2578}
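// Editor's sketch (not part of the annotated file): the unsigned f64 -> i64
// split above as plain C. K0 (0x3df0...) is 2^-32 and K1 (0xc1f0...) is -2^32,
// so the high word is floor(trunc(x) * 2^-32) and the low word is recovered
// with an fma against -2^32.
#include <math.h>
static unsigned long long f64_to_u64(double x) {
  double t = trunc(x);             // FTRUNC
  double hi = floor(t * 0x1p-32);  // FFLOOR(FMUL(Trunc, K0))
  double lo = fma(hi, -0x1p+32, t); // FMA(FloorMul, K1, Trunc)
  return ((unsigned long long)(unsigned)hi << 32) | (unsigned)lo;
}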
2579
2580SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2581 SDLoc DL(Op);
2582 SDValue N0 = Op.getOperand(0);
2583
2584 // Convert to target node to get known bits
2585 if (N0.getValueType() == MVT::f32)
2586 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2587
2588 if (getTargetMachine().Options.UnsafeFPMath) {
2589 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2590 return SDValue();
2591 }
2592
2593  assert(N0.getSimpleValueType() == MVT::f64);
2594
2595 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2596 const unsigned ExpMask = 0x7ff;
2597 const unsigned ExpBiasf64 = 1023;
2598 const unsigned ExpBiasf16 = 15;
2599 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2600 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2601 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2602 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2603 DAG.getConstant(32, DL, MVT::i64));
2604 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2605 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2606 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2607 DAG.getConstant(20, DL, MVT::i64));
2608 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2609 DAG.getConstant(ExpMask, DL, MVT::i32));
2610 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2611 // add the f16 bias (15) to get the biased exponent for the f16 format.
2612 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2613 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2614
2615 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2616 DAG.getConstant(8, DL, MVT::i32));
2617 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2618 DAG.getConstant(0xffe, DL, MVT::i32));
2619
2620 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2621 DAG.getConstant(0x1ff, DL, MVT::i32));
2622 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2623
2624 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2625 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2626
2627 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2628 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2629 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2630 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2631
2632 // N = M | (E << 12);
2633 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2634 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2635 DAG.getConstant(12, DL, MVT::i32)));
2636
2637 // B = clamp(1-E, 0, 13);
2638 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2639 One, E);
2640 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2641 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2642 DAG.getConstant(13, DL, MVT::i32));
2643
2644 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2645 DAG.getConstant(0x1000, DL, MVT::i32));
2646
2647 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2648 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2649 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2650 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2651
2652 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2653 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2654 DAG.getConstant(0x7, DL, MVT::i32));
2655 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2656 DAG.getConstant(2, DL, MVT::i32));
2657 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2658 One, Zero, ISD::SETEQ);
2659 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2660 One, Zero, ISD::SETGT);
2661 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2662 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2663
2664 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2665 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2666 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2667 I, V, ISD::SETEQ);
2668
2669 // Extract the sign bit.
2670 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2671 DAG.getConstant(16, DL, MVT::i32));
2672 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2673 DAG.getConstant(0x8000, DL, MVT::i32));
2674
2675 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2676 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2677}
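
For readers following the bit manipulation above, here is a partial scalar sketch (an editorial illustration, not from the source) of the initial field extraction; the rounding, subnormal handling, and clamping performed by the later selects are omitted:

    // Extract the sign, rebias the exponent (1023 -> 15), and form the mantissa
    // head plus a sticky bit from the discarded low bits, mirroring the
    // UH/U/E/M computations in the listing above.
    #include <cstdint>
    #include <cstring>

    void f64ToF16Fields(double D, uint32_t &Sign, int32_t &E, uint32_t &M) {
      uint64_t U;
      std::memcpy(&U, &D, sizeof U);
      uint32_t UH = (uint32_t)(U >> 32), UL = (uint32_t)U;
      Sign = (UH >> 16) & 0x8000;                      // f16 sign bit in place
      E = (int32_t)((UH >> 20) & 0x7ff) - 1023 + 15;   // rebias f64 -> f16
      M = (UH >> 8) & 0xffe;                           // top mantissa bits
      if (((UH & 0x1ff) | UL) != 0)                    // sticky from low bits
        M |= 1;
    }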
2678
2679SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2680 SelectionDAG &DAG) const {
2681 SDValue Src = Op.getOperand(0);
2682
2683 // TODO: Factor out code common with LowerFP_TO_UINT.
2684
2685 EVT SrcVT = Src.getValueType();
2686 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2687 SDLoc DL(Op);
2688
2689 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2690 SDValue FpToInt32 =
2691 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2692
2693 return FpToInt32;
2694 }
2695
2696 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2697 return LowerFP64_TO_INT(Op, DAG, true);
2698
2699 return SDValue();
2700}
2701
2702SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2703 SelectionDAG &DAG) const {
2704 SDValue Src = Op.getOperand(0);
2705
2706 // TODO: Factor out code common with LowerFP_TO_SINT.
2707
2708 EVT SrcVT = Src.getValueType();
2709 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2710 SDLoc DL(Op);
2711
2712 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2713 SDValue FpToInt32 =
2714 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2715
2716 return FpToInt32;
2717 }
2718
2719 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2720 return LowerFP64_TO_INT(Op, DAG, false);
2721
2722 return SDValue();
2723}
2724
2725SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2726 SelectionDAG &DAG) const {
2727 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2728 MVT VT = Op.getSimpleValueType();
2729 MVT ScalarVT = VT.getScalarType();
2730
2731  assert(VT.isVector());
2732
2733 SDValue Src = Op.getOperand(0);
2734 SDLoc DL(Op);
2735
2736 // TODO: Don't scalarize on Evergreen?
2737 unsigned NElts = VT.getVectorNumElements();
2738 SmallVector<SDValue, 8> Args;
2739 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2740
2741 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2742 for (unsigned I = 0; I < NElts; ++I)
2743 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2744
2745 return DAG.getBuildVector(VT, DL, Args);
2746}
2747
2748//===----------------------------------------------------------------------===//
2749// Custom DAG optimizations
2750//===----------------------------------------------------------------------===//
2751
2752static bool isU24(SDValue Op, SelectionDAG &DAG) {
2753 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2754}
2755
2756static bool isI24(SDValue Op, SelectionDAG &DAG) {
2757 EVT VT = Op.getValueType();
2758 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2759 // as unsigned 24-bit values.
2760 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2761}
2762
2763static SDValue simplifyI24(SDNode *Node24,
2764 TargetLowering::DAGCombinerInfo &DCI) {
2765 SelectionDAG &DAG = DCI.DAG;
2766 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2767
2768 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2769 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2770 unsigned NewOpcode = Node24->getOpcode();
2771 if (IsIntrin) {
2772 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2773 NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2774 AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2775 }
2776
2777 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2778
2779 // First try to simplify using GetDemandedBits which allows the operands to
2780 // have other uses, but will only perform simplifications that involve
2781 // bypassing some nodes for this user.
2782 SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
2783 SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
2784 if (DemandedLHS || DemandedRHS)
2785 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2786 DemandedLHS ? DemandedLHS : LHS,
2787 DemandedRHS ? DemandedRHS : RHS);
2788
2789 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2790 // operands if this node is the only user.
2791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2792 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2793 return SDValue(Node24, 0);
2794 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2795 return SDValue(Node24, 0);
2796
2797 return SDValue();
2798}
2799
2800template <typename IntTy>
2801static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2802 uint32_t Width, const SDLoc &DL) {
2803 if (Width + Offset < 32) {
2804 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2805 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2806 return DAG.getConstant(Result, DL, MVT::i32);
2807 }
2808
2809 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2810}
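
The fold above relies on the classic shift-pair idiom for bitfield extraction. A standalone instantiation of the signed 32-bit case (an editorial sketch; the identifier is mine):

    // Extract `Width` bits starting at `Offset` and sign-extend the result,
    // mirroring constantFoldBFE<int32_t> above. Assumes Offset + Width <= 32.
    #include <cstdint>

    int32_t bfeI32(int32_t Src, uint32_t Offset, uint32_t Width) {
      if (Width + Offset < 32) {
        uint32_t Shl = (uint32_t)Src << (32 - Offset - Width);
        return (int32_t)Shl >> (32 - Width);   // arithmetic shift sign-extends
      }
      return Src >> Offset;
    }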
2811
2812static bool hasVolatileUser(SDNode *Val) {
2813 for (SDNode *U : Val->uses()) {
2814 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2815 if (M->isVolatile())
2816 return true;
2817 }
2818 }
2819
2820 return false;
2821}
2822
2823bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2824 // i32 vectors are the canonical memory type.
2825 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2826 return false;
2827
2828 if (!VT.isByteSized())
2829 return false;
2830
2831 unsigned Size = VT.getStoreSize();
2832
2833 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2834 return false;
2835
2836 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2837 return false;
2838
2839 return true;
2840}
2841
2842// Replace a load of an illegal type with a load of a friendlier type,
2843// bitcast back to the original type.
2844SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2845 DAGCombinerInfo &DCI) const {
2846 if (!DCI.isBeforeLegalize())
2847 return SDValue();
2848
2849 LoadSDNode *LN = cast<LoadSDNode>(N);
2850 if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2851 return SDValue();
2852
2853 SDLoc SL(N);
2854 SelectionDAG &DAG = DCI.DAG;
2855 EVT VT = LN->getMemoryVT();
2856
2857 unsigned Size = VT.getStoreSize();
2858 unsigned Align = LN->getAlignment();
2859 if (Align < Size && isTypeLegal(VT)) {
2860 bool IsFast;
2861 unsigned AS = LN->getAddressSpace();
2862
2863 // Expand unaligned loads earlier than legalization. Due to visitation order
2864 // problems during legalization, the emitted instructions to pack and unpack
2865 // the bytes again are not eliminated in the case of an unaligned copy.
2866 if (!allowsMisalignedMemoryAccesses(
2867 VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
2868 if (VT.isVector())
2869 return scalarizeVectorLoad(LN, DAG);
2870
2871 SDValue Ops[2];
2872 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2873 return DAG.getMergeValues(Ops, SDLoc(N));
2874 }
2875
2876 if (!IsFast)
2877 return SDValue();
2878 }
2879
2880 if (!shouldCombineMemoryType(VT))
2881 return SDValue();
2882
2883 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2884
2885 SDValue NewLoad
2886 = DAG.getLoad(NewVT, SL, LN->getChain(),
2887 LN->getBasePtr(), LN->getMemOperand());
2888
2889 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2890 DCI.CombineTo(N, BC, NewLoad.getValue(1));
2891 return SDValue(N, 0);
2892}
2893
2894// Replace store of an illegal type with a store of a bitcast to a friendlier
2895// type.
2896SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2897 DAGCombinerInfo &DCI) const {
2898 if (!DCI.isBeforeLegalize())
2899 return SDValue();
2900
2901 StoreSDNode *SN = cast<StoreSDNode>(N);
2902 if (SN->isVolatile() || !ISD::isNormalStore(SN))
2903 return SDValue();
2904
2905 EVT VT = SN->getMemoryVT();
2906 unsigned Size = VT.getStoreSize();
2907
2908 SDLoc SL(N);
2909 SelectionDAG &DAG = DCI.DAG;
2910 unsigned Align = SN->getAlignment();
2911 if (Align < Size && isTypeLegal(VT)) {
2912 bool IsFast;
2913 unsigned AS = SN->getAddressSpace();
2914
2915 // Expand unaligned stores earlier than legalization. Due to visitation
2916 // order problems during legalization, the emitted instructions to pack and
2917 // unpack the bytes again are not eliminated in the case of an unaligned
2918 // copy.
2919 if (!allowsMisalignedMemoryAccesses(
2920 VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
2921 if (VT.isVector())
2922 return scalarizeVectorStore(SN, DAG);
2923
2924 return expandUnalignedStore(SN, DAG);
2925 }
2926
2927 if (!IsFast)
2928 return SDValue();
2929 }
2930
2931 if (!shouldCombineMemoryType(VT))
2932 return SDValue();
2933
2934 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2935 SDValue Val = SN->getValue();
2936
2937 //DCI.AddToWorklist(Val.getNode());
2938
2939 bool OtherUses = !Val.hasOneUse();
2940 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2941 if (OtherUses) {
2942 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2943 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2944 }
2945
2946 return DAG.getStore(SN->getChain(), SL, CastVal,
2947 SN->getBasePtr(), SN->getMemOperand());
2948}
2949
2950// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2951// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2952// issues.
2953SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2954 DAGCombinerInfo &DCI) const {
2955 SelectionDAG &DAG = DCI.DAG;
2956 SDValue N0 = N->getOperand(0);
2957
2958 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2959 // (vt2 (truncate (assertzext vt0:x, vt1)))
2960 if (N0.getOpcode() == ISD::TRUNCATE) {
2961 SDValue N1 = N->getOperand(1);
2962 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2963 SDLoc SL(N);
2964
2965 SDValue Src = N0.getOperand(0);
2966 EVT SrcVT = Src.getValueType();
2967 if (SrcVT.bitsGE(ExtVT)) {
2968 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2969 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2970 }
2971 }
2972
2973 return SDValue();
2974}
2975
2976SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
2977 SDNode *N, DAGCombinerInfo &DCI) const {
2978 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2979 switch (IID) {
2980 case Intrinsic::amdgcn_mul_i24:
2981 case Intrinsic::amdgcn_mul_u24:
2982 return simplifyI24(N, DCI);
2983 default:
2984 return SDValue();
2985 }
2986}
2987
2988/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2989/// binary operation \p Opc to it with the corresponding constant operands.
2990SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2991 DAGCombinerInfo &DCI, const SDLoc &SL,
2992 unsigned Opc, SDValue LHS,
2993 uint32_t ValLo, uint32_t ValHi) const {
2994 SelectionDAG &DAG = DCI.DAG;
2995 SDValue Lo, Hi;
2996 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2997
2998 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2999 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3000
3001 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3002 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3003
3004 // Re-visit the ands. It's possible we eliminated one of them and it could
3005 // simplify the vector.
3006 DCI.AddToWorklist(Lo.getNode());
3007 DCI.AddToWorklist(Hi.getNode());
3008
3009 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3010 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3011}
3012
3013SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3014 DAGCombinerInfo &DCI) const {
3015 EVT VT = N->getValueType(0);
3016
3017 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3018 if (!RHS)
3019 return SDValue();
3020
3021 SDValue LHS = N->getOperand(0);
3022 unsigned RHSVal = RHS->getZExtValue();
3023 if (!RHSVal)
3024 return LHS;
3025
3026 SDLoc SL(N);
3027 SelectionDAG &DAG = DCI.DAG;
3028
3029 switch (LHS->getOpcode()) {
3030 default:
3031 break;
3032 case ISD::ZERO_EXTEND:
3033 case ISD::SIGN_EXTEND:
3034 case ISD::ANY_EXTEND: {
3035 SDValue X = LHS->getOperand(0);
3036
3037 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3038 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3039 // Prefer build_vector as the canonical form if packed types are legal.
3040      // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3041 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3042 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3043 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3044 }
3045
3046 // shl (ext x) => zext (shl x), if shift does not overflow int
3047 if (VT != MVT::i64)
3048 break;
3049 KnownBits Known = DAG.computeKnownBits(X);
3050 unsigned LZ = Known.countMinLeadingZeros();
3051 if (LZ < RHSVal)
3052 break;
3053 EVT XVT = X.getValueType();
3054 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3055 return DAG.getZExtOrTrunc(Shl, SL, VT);
3056 }
3057 }
3058
3059 if (VT != MVT::i64)
3060 return SDValue();
3061
3062 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3063
3064 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3065 // common case, splitting this into a move and a 32-bit shift is faster and
3066 // the same code size.
3067 if (RHSVal < 32)
3068 return SDValue();
3069
3070 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3071
3072 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3073 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3074
3075 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3076
3077 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3078 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3079}
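
The i64 split at the end of this combine amounts to the following scalar identity (editor's sketch, illustrative only): for a shift amount C with 32 <= C < 64, the low 32 bits of the result are zero and the high 32 bits are the low half of x shifted by C - 32.

    // Scalar model of (i64 shl x, C) -> build_pair 0, (shl lo_32(x), C - 32).
    #include <cstdint>

    uint64_t shl64Split(uint64_t X, unsigned C) {   // requires 32 <= C < 64
      uint32_t Hi = (uint32_t)X << (C - 32);        // low half of X, shifted
      return (uint64_t)Hi << 32;                    // low 32 bits are zero
    }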
3080
3081SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3082 DAGCombinerInfo &DCI) const {
3083 if (N->getValueType(0) != MVT::i64)
3084 return SDValue();
3085
3086 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3087 if (!RHS)
3088 return SDValue();
3089
3090 SelectionDAG &DAG = DCI.DAG;
3091 SDLoc SL(N);
3092 unsigned RHSVal = RHS->getZExtValue();
3093
3094 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3095 if (RHSVal == 32) {
3096 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3097 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3098 DAG.getConstant(31, SL, MVT::i32));
3099
3100 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3101 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3102 }
3103
3104 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3105 if (RHSVal == 63) {
3106 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3107 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3108 DAG.getConstant(31, SL, MVT::i32));
3109 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3110 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3111 }
3112
3113 return SDValue();
3114}
3115
3116SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3117 DAGCombinerInfo &DCI) const {
3118 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3119 if (!RHS)
3120 return SDValue();
3121
3122 EVT VT = N->getValueType(0);
3123 SDValue LHS = N->getOperand(0);
3124 unsigned ShiftAmt = RHS->getZExtValue();
3125 SelectionDAG &DAG = DCI.DAG;
3126 SDLoc SL(N);
3127
3128  // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3129 // this improves the ability to match BFE patterns in isel.
3130 if (LHS.getOpcode() == ISD::AND) {
3131 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3132 if (Mask->getAPIntValue().isShiftedMask() &&
3133 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3134 return DAG.getNode(
3135 ISD::AND, SL, VT,
3136 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3137 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3138 }
3139 }
3140 }
3141
3142 if (VT != MVT::i64)
3143 return SDValue();
3144
3145 if (ShiftAmt < 32)
3146 return SDValue();
3147
3148 // srl i64:x, C for C >= 32
3149 // =>
3150 // build_pair (srl hi_32(x), C - 32), 0
3151 SDValue One = DAG.getConstant(1, SL, MVT::i32);
3152 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3153
3154 SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3155 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3156
3157 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3158 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3159
3160 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3161
3162 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3163}
3164
3165SDValue AMDGPUTargetLowering::performTruncateCombine(
3166 SDNode *N, DAGCombinerInfo &DCI) const {
3167 SDLoc SL(N);
3168 SelectionDAG &DAG = DCI.DAG;
3169 EVT VT = N->getValueType(0);
3170 SDValue Src = N->getOperand(0);
3171
3172 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3173 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3174 SDValue Vec = Src.getOperand(0);
3175 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3176 SDValue Elt0 = Vec.getOperand(0);
3177 EVT EltVT = Elt0.getValueType();
3178 if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
3179 if (EltVT.isFloatingPoint()) {
3180 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3181 EltVT.changeTypeToInteger(), Elt0);
3182 }
3183
3184 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3185 }
3186 }
3187 }
3188
3189 // Equivalent of above for accessing the high element of a vector as an
3190 // integer operation.
3191 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3192 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3193 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3194 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3195 SDValue BV = stripBitcast(Src.getOperand(0));
3196 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3197 BV.getValueType().getVectorNumElements() == 2) {
3198 SDValue SrcElt = BV.getOperand(1);
3199 EVT SrcEltVT = SrcElt.getValueType();
3200 if (SrcEltVT.isFloatingPoint()) {
3201 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3202 SrcEltVT.changeTypeToInteger(), SrcElt);
3203 }
3204
3205 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3206 }
3207 }
3208 }
3209 }
3210
3211 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3212 //
3213 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3214 // i16 (trunc (srl (i32 (trunc x), K)))
3215 if (VT.getScalarSizeInBits() < 32) {
3216 EVT SrcVT = Src.getValueType();
3217 if (SrcVT.getScalarSizeInBits() > 32 &&
3218 (Src.getOpcode() == ISD::SRL ||
3219 Src.getOpcode() == ISD::SRA ||
3220 Src.getOpcode() == ISD::SHL)) {
3221 SDValue Amt = Src.getOperand(1);
3222 KnownBits Known = DAG.computeKnownBits(Amt);
3223 unsigned Size = VT.getScalarSizeInBits();
3224 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3225 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3226 EVT MidVT = VT.isVector() ?
3227 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3228 VT.getVectorNumElements()) : MVT::i32;
3229
3230 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3231 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3232 Src.getOperand(0));
3233 DCI.AddToWorklist(Trunc.getNode());
3234
3235 if (Amt.getValueType() != NewShiftVT) {
3236 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3237 DCI.AddToWorklist(Amt.getNode());
3238 }
3239
3240 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3241 Trunc, Amt);
3242 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3243 }
3244 }
3245 }
3246
3247 return SDValue();
3248}
3249
3250// We need to specifically handle i64 mul here to avoid unnecessary conversion
3251// instructions. If we only match on the legalized i64 mul expansion,
3252// SimplifyDemandedBits will be unable to remove them because there will be
3253// multiple uses due to the separate mul + mulh[su].
3254static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3255 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3256 if (Size <= 32) {
3257 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3258 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3259 }
3260
3261 // Because we want to eliminate extension instructions before the
3262 // operation, we need to create a single user here (i.e. not the separate
3263 // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3264
3265 unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3266
3267 SDValue Mul = DAG.getNode(MulOpc, SL,
3268 DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3269
3270 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3271 Mul.getValue(0), Mul.getValue(1));
3272}
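
A scalar picture of the multiply being formed here (an editorial sketch with made-up names): when both operands fit in 24 bits, the full product fits in 48 bits, so the lo/hi halves of a single 32x32 multiply reassemble the exact i64 result.

    // Model of the MUL_LOHI_U24 + BUILD_PAIR combination above for operands
    // known to be less than 2^24.
    #include <cstdint>

    uint64_t mulU24(uint32_t A, uint32_t B) {       // assumes A, B < (1u << 24)
      uint64_t P  = (uint64_t)A * B;                // at most 48 significant bits
      uint32_t Lo = (uint32_t)P, Hi = (uint32_t)(P >> 32);
      return ((uint64_t)Hi << 32) | Lo;             // reassembled as build_pair
    }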
3273
3274SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3275 DAGCombinerInfo &DCI) const {
3276 EVT VT = N->getValueType(0);
3277
3278 unsigned Size = VT.getSizeInBits();
3279 if (VT.isVector() || Size > 64)
3280 return SDValue();
3281
3282 // There are i16 integer mul/mad.
3283 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3284 return SDValue();
3285
3286 SelectionDAG &DAG = DCI.DAG;
3287 SDLoc DL(N);
3288
3289 SDValue N0 = N->getOperand(0);
3290 SDValue N1 = N->getOperand(1);
3291
3292 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3293 // in the source into any_extends if the result of the mul is truncated. Since
3294 // we can assume the high bits are whatever we want, use the underlying value
3295 // to avoid the unknown high bits from interfering.
3296 if (N0.getOpcode() == ISD::ANY_EXTEND)
3297 N0 = N0.getOperand(0);
3298
3299 if (N1.getOpcode() == ISD::ANY_EXTEND)
3300 N1 = N1.getOperand(0);
3301
3302 SDValue Mul;
3303
3304 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3305 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3306 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3307 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3308 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3309 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3310 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3311 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3312 } else {
3313 return SDValue();
3314 }
3315
3316 // We need to use sext even for MUL_U24, because MUL_U24 is used
3317 // for signed multiply of 8 and 16-bit types.
3318 return DAG.getSExtOrTrunc(Mul, DL, VT);
3319}
3320
3321SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3322 DAGCombinerInfo &DCI) const {
3323 EVT VT = N->getValueType(0);
3324
3325 if (!Subtarget->hasMulI24() || VT.isVector())
3326 return SDValue();
3327
3328 SelectionDAG &DAG = DCI.DAG;
3329 SDLoc DL(N);
3330
3331 SDValue N0 = N->getOperand(0);
3332 SDValue N1 = N->getOperand(1);
3333
3334 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3335 return SDValue();
3336
3337 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3338 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3339
3340 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3341 DCI.AddToWorklist(Mulhi.getNode());
3342 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3343}
3344
3345SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3346 DAGCombinerInfo &DCI) const {
3347 EVT VT = N->getValueType(0);
3348
3349 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3350 return SDValue();
3351
3352 SelectionDAG &DAG = DCI.DAG;
3353 SDLoc DL(N);
3354
3355 SDValue N0 = N->getOperand(0);
3356 SDValue N1 = N->getOperand(1);
3357
3358 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3359 return SDValue();
3360
3361 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3362 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3363
3364 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3365 DCI.AddToWorklist(Mulhi.getNode());
3366 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3367}
3368
3369SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3370 SDNode *N, DAGCombinerInfo &DCI) const {
3371 SelectionDAG &DAG = DCI.DAG;
3372
3373 // Simplify demanded bits before splitting into multiple users.
3374 if (SDValue V = simplifyI24(N, DCI))
3375 return V;
3376
3377 SDValue N0 = N->getOperand(0);
3378 SDValue N1 = N->getOperand(1);
3379
3380 bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3381
3382 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3383 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3384
3385 SDLoc SL(N);
3386
3387 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3388 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3389 return DAG.getMergeValues({ MulLo, MulHi }, SL);
3390}
3391
3392static bool isNegativeOne(SDValue Val) {
3393 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3394 return C->isAllOnesValue();
3395 return false;
3396}
3397
3398SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3399 SDValue Op,
3400 const SDLoc &DL,
3401 unsigned Opc) const {
3402 EVT VT = Op.getValueType();
3403 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3404 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3405 LegalVT != MVT::i16))
3406 return SDValue();
3407
3408 if (VT != MVT::i32)
3409 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3410
3411 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3412 if (VT != MVT::i32)
3413 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3414
3415 return FFBX;
3416}
3417
3418// The native instructions return -1 on 0 input. Optimize out a select that
3419// produces -1 on 0.
3420//
3421// TODO: If zero is not undef, we could also do this if the output is compared
3422// against the bitwidth.
3423//
3424// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3425SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3426 SDValue LHS, SDValue RHS,
3427 DAGCombinerInfo &DCI) const {
3428 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3429 if (!CmpRhs || !CmpRhs->isNullValue())
3430 return SDValue();
3431
3432 SelectionDAG &DAG = DCI.DAG;
3433 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3434 SDValue CmpLHS = Cond.getOperand(0);
3435
3436 unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3437 AMDGPUISD::FFBH_U32;
3438
3439 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3440 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3441 if (CCOpcode == ISD::SETEQ &&
3442 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3443 RHS.getOperand(0) == CmpLHS &&
3444 isNegativeOne(LHS)) {
3445 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3446 }
3447
3448 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3449 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3450 if (CCOpcode == ISD::SETNE &&
3451 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3452 LHS.getOperand(0) == CmpLHS &&
3453 isNegativeOne(RHS)) {
3454 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3455 }
3456
3457 return SDValue();
3458}
3459
3460static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3461 unsigned Op,
3462 const SDLoc &SL,
3463 SDValue Cond,
3464 SDValue N1,
3465 SDValue N2) {
3466 SelectionDAG &DAG = DCI.DAG;
3467 EVT VT = N1.getValueType();
3468
3469 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3470 N1.getOperand(0), N2.getOperand(0));
3471 DCI.AddToWorklist(NewSelect.getNode());
3472 return DAG.getNode(Op, SL, VT, NewSelect);
3473}
3474
3475// Pull a free FP operation out of a select so it may fold into uses.
3476//
3477// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3478// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3479//
3480// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3481// select c, (fabs x), +k -> fabs (select c, x, k)
3482static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3483 SDValue N) {
3484 SelectionDAG &DAG = DCI.DAG;
3485 SDValue Cond = N.getOperand(0);
3486 SDValue LHS = N.getOperand(1);
3487 SDValue RHS = N.getOperand(2);
3488
3489 EVT VT = N.getValueType();
3490 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3491 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3492 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3493 SDLoc(N), Cond, LHS, RHS);
3494 }
3495
3496 bool Inv = false;
3497 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3498 std::swap(LHS, RHS);
3499 Inv = true;
3500 }
3501
3502 // TODO: Support vector constants.
3503 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3504 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3505 SDLoc SL(N);
3506 // If one side is an fneg/fabs and the other is a constant, we can push the
3507 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3508 SDValue NewLHS = LHS.getOperand(0);
3509 SDValue NewRHS = RHS;
3510
3511 // Careful: if the neg can be folded up, don't try to pull it back down.
3512 bool ShouldFoldNeg = true;
3513
3514 if (NewLHS.hasOneUse()) {
3515 unsigned Opc = NewLHS.getOpcode();
3516 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3517 ShouldFoldNeg = false;
3518 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3519 ShouldFoldNeg = false;
3520 }
3521
3522 if (ShouldFoldNeg) {
3523 if (LHS.getOpcode() == ISD::FNEG)
3524 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3525 else if (CRHS->isNegative())
3526 return SDValue();
3527
3528 if (Inv)
3529 std::swap(NewLHS, NewRHS);
3530
3531 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3532 Cond, NewLHS, NewRHS);
3533 DCI.AddToWorklist(NewSelect.getNode());
3534 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3535 }
3536 }
3537
3538 return SDValue();
3539}
3540
3541
3542SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3543 DAGCombinerInfo &DCI) const {
3544 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3545 return Folded;
3546
3547 SDValue Cond = N->getOperand(0);
3548 if (Cond.getOpcode() != ISD::SETCC)
3549 return SDValue();
3550
3551 EVT VT = N->getValueType(0);
3552 SDValue LHS = Cond.getOperand(0);
3553 SDValue RHS = Cond.getOperand(1);
3554 SDValue CC = Cond.getOperand(2);
3555
3556 SDValue True = N->getOperand(1);
3557 SDValue False = N->getOperand(2);
3558
3559 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3560 SelectionDAG &DAG = DCI.DAG;
3561 if (DAG.isConstantValueOfAnyType(True) &&
3562 !DAG.isConstantValueOfAnyType(False)) {
3563 // Swap cmp + select pair to move constant to false input.
3564 // This will allow using VOPC cndmasks more often.
3565 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3566
3567 SDLoc SL(N);
3568 ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3569 LHS.getValueType().isInteger());
3570
3571 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3572 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3573 }
3574
3575 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3576 SDValue MinMax
3577 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3578 // Revisit this node so we can catch min3/max3/med3 patterns.
3579 //DCI.AddToWorklist(MinMax.getNode());
3580 return MinMax;
3581 }
3582 }
3583
3584 // There's no reason to not do this if the condition has other uses.
3585 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3586}
3587
3588static bool isInv2Pi(const APFloat &APF) {
3589 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3590 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3591 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3592
3593 return APF.bitwiseIsEqual(KF16) ||
3594 APF.bitwiseIsEqual(KF32) ||
3595 APF.bitwiseIsEqual(KF64);
3596}
3597
3598// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3599// additional cost to negate them.
3600bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3601 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3602 if (C->isZero() && !C->isNegative())
3603 return true;
3604
3605 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3606 return true;
3607 }
3608
3609 return false;
3610}
3611
3612static unsigned inverseMinMax(unsigned Opc) {
3613 switch (Opc) {
3614 case ISD::FMAXNUM:
3615 return ISD::FMINNUM;
3616 case ISD::FMINNUM:
3617 return ISD::FMAXNUM;
3618 case ISD::FMAXNUM_IEEE:
3619 return ISD::FMINNUM_IEEE;
3620 case ISD::FMINNUM_IEEE:
3621 return ISD::FMAXNUM_IEEE;
3622 case AMDGPUISD::FMAX_LEGACY:
3623 return AMDGPUISD::FMIN_LEGACY;
3624 case AMDGPUISD::FMIN_LEGACY:
3625 return AMDGPUISD::FMAX_LEGACY;
3626 default:
3627    llvm_unreachable("invalid min/max opcode");
3628 }
3629}
3630
3631SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3632 DAGCombinerInfo &DCI) const {
3633 SelectionDAG &DAG = DCI.DAG;
3634 SDValue N0 = N->getOperand(0);
3635 EVT VT = N->getValueType(0);
3636
3637 unsigned Opc = N0.getOpcode();
3638
3639 // If the input has multiple uses and we can either fold the negate down, or
3640 // the other uses cannot, give up. This both prevents unprofitable
3641 // transformations and infinite loops: we won't repeatedly try to fold around
3642 // a negate that has no 'good' form.
3643 if (N0.hasOneUse()) {
3644 // This may be able to fold into the source, but at a code size cost. Don't
3645 // fold if the fold into the user is free.
3646 if (allUsesHaveSourceMods(N, 0))
3647 return SDValue();
3648 } else {
3649 if (fnegFoldsIntoOp(Opc) &&
3650 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3651 return SDValue();
3652 }
3653
3654 SDLoc SL(N);
3655 switch (Opc) {
3656 case ISD::FADD: {
3657 if (!mayIgnoreSignedZero(N0))
3658 return SDValue();
3659
3660 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3661 SDValue LHS = N0.getOperand(0);
3662 SDValue RHS = N0.getOperand(1);
3663
3664 if (LHS.getOpcode() != ISD::FNEG)
3665 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3666 else
3667 LHS = LHS.getOperand(0);
3668
3669 if (RHS.getOpcode() != ISD::FNEG)
3670 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3671 else
3672 RHS = RHS.getOperand(0);
3673
3674 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3675 if (Res.getOpcode() != ISD::FADD)
3676 return SDValue(); // Op got folded away.
3677 if (!N0.hasOneUse())
3678 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3679 return Res;
3680 }
3681 case ISD::FMUL:
3682 case AMDGPUISD::FMUL_LEGACY: {
3683 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3684 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3685 SDValue LHS = N0.getOperand(0);
3686 SDValue RHS = N0.getOperand(1);
3687
3688 if (LHS.getOpcode() == ISD::FNEG)
3689 LHS = LHS.getOperand(0);
3690 else if (RHS.getOpcode() == ISD::FNEG)
3691 RHS = RHS.getOperand(0);
3692 else
3693 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3694
3695 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3696 if (Res.getOpcode() != Opc)
3697 return SDValue(); // Op got folded away.
3698 if (!N0.hasOneUse())
3699 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3700 return Res;
3701 }
3702 case ISD::FMA:
3703 case ISD::FMAD: {
3704 if (!mayIgnoreSignedZero(N0))
3705 return SDValue();
3706
3707 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3708 SDValue LHS = N0.getOperand(0);
3709 SDValue MHS = N0.getOperand(1);
3710 SDValue RHS = N0.getOperand(2);
3711
3712 if (LHS.getOpcode() == ISD::FNEG)
3713 LHS = LHS.getOperand(0);
3714 else if (MHS.getOpcode() == ISD::FNEG)
3715 MHS = MHS.getOperand(0);
3716 else
3717 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3718
3719 if (RHS.getOpcode() != ISD::FNEG)
3720 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3721 else
3722 RHS = RHS.getOperand(0);
3723
3724 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3725 if (Res.getOpcode() != Opc)
3726 return SDValue(); // Op got folded away.
3727 if (!N0.hasOneUse())
3728 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3729 return Res;
3730 }
3731 case ISD::FMAXNUM:
3732 case ISD::FMINNUM:
3733 case ISD::FMAXNUM_IEEE:
3734 case ISD::FMINNUM_IEEE:
3735 case AMDGPUISD::FMAX_LEGACY:
3736 case AMDGPUISD::FMIN_LEGACY: {
3737 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3738 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3739 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3740 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3741
3742 SDValue LHS = N0.getOperand(0);
3743 SDValue RHS = N0.getOperand(1);
3744
3745 // 0 doesn't have a negated inline immediate.
3746 // TODO: This constant check should be generalized to other operations.
3747 if (isConstantCostlierToNegate(RHS))
3748 return SDValue();
3749
3750 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3751 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3752 unsigned Opposite = inverseMinMax(Opc);
3753
3754 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3755 if (Res.getOpcode() != Opposite)
3756 return SDValue(); // Op got folded away.
3757 if (!N0.hasOneUse())
3758 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3759 return Res;
3760 }
3761 case AMDGPUISD::FMED3: {
3762 SDValue Ops[3];
3763 for (unsigned I = 0; I < 3; ++I)
3764 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3765
3766 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3767 if (Res.getOpcode() != AMDGPUISD::FMED3)
3768 return SDValue(); // Op got folded away.
3769 if (!N0.hasOneUse())
3770 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3771 return Res;
3772 }
3773 case ISD::FP_EXTEND:
3774 case ISD::FTRUNC:
3775 case ISD::FRINT:
3776 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3777 case ISD::FSIN:
3778 case ISD::FCANONICALIZE:
3779 case AMDGPUISD::RCP:
3780 case AMDGPUISD::RCP_LEGACY:
3781 case AMDGPUISD::RCP_IFLAG:
3782 case AMDGPUISD::SIN_HW: {
3783 SDValue CvtSrc = N0.getOperand(0);
3784 if (CvtSrc.getOpcode() == ISD::FNEG) {
3785 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3786 // (fneg (rcp (fneg x))) -> (rcp x)
3787 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3788 }
3789
3790 if (!N0.hasOneUse())
3791 return SDValue();
3792
3793 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3794 // (fneg (rcp x)) -> (rcp (fneg x))
3795 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3796 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3797 }
3798 case ISD::FP_ROUND: {
3799 SDValue CvtSrc = N0.getOperand(0);
3800
3801 if (CvtSrc.getOpcode() == ISD::FNEG) {
3802 // (fneg (fp_round (fneg x))) -> (fp_round x)
3803 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3804 CvtSrc.getOperand(0), N0.getOperand(1));
3805 }
3806
3807 if (!N0.hasOneUse())
3808 return SDValue();
3809
3810 // (fneg (fp_round x)) -> (fp_round (fneg x))
3811 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3812 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3813 }
3814 case ISD::FP16_TO_FP: {
3815 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3816 // f16, but legalization of f16 fneg ends up pulling it out of the source.
3817 // Put the fneg back as a legal source operation that can be matched later.
3818 SDLoc SL(N);
3819
3820 SDValue Src = N0.getOperand(0);
3821 EVT SrcVT = Src.getValueType();
3822
3823 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3824 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3825 DAG.getConstant(0x8000, SL, SrcVT));
3826 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3827 }
3828 default:
3829 return SDValue();
3830 }
3831}
3832
3833SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3834 DAGCombinerInfo &DCI) const {
3835 SelectionDAG &DAG = DCI.DAG;
3836 SDValue N0 = N->getOperand(0);
3837
3838 if (!N0.hasOneUse())
3839 return SDValue();
3840
3841 switch (N0.getOpcode()) {
3842 case ISD::FP16_TO_FP: {
3843    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3844 SDLoc SL(N);
3845 SDValue Src = N0.getOperand(0);
3846 EVT SrcVT = Src.getValueType();
3847
3848 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3849 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3850 DAG.getConstant(0x7fff, SL, SrcVT));
3851 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3852 }
3853 default:
3854 return SDValue();
3855 }
3856}
3857
3858SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3859 DAGCombinerInfo &DCI) const {
3860 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3861 if (!CFP)
3862 return SDValue();
3863
3864 // XXX - Should this flush denormals?
3865 const APFloat &Val = CFP->getValueAPF();
3866 APFloat One(Val.getSemantics(), "1.0");
3867 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3868}
3869
3870SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3871 DAGCombinerInfo &DCI) const {
3872 SelectionDAG &DAG = DCI.DAG;
3873 SDLoc DL(N);
3874
3875 switch(N->getOpcode()) {
3876 default:
3877 break;
3878 case ISD::BITCAST: {
3879 EVT DestVT = N->getValueType(0);
3880
3881 // Push casts through vector builds. This helps avoid emitting a large
3882 // number of copies when materializing floating point vector constants.
3883 //
3884 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3885 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3886 if (DestVT.isVector()) {
3887 SDValue Src = N->getOperand(0);
3888 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3889 EVT SrcVT = Src.getValueType();
3890 unsigned NElts = DestVT.getVectorNumElements();
3891
3892 if (SrcVT.getVectorNumElements() == NElts) {
3893 EVT DestEltVT = DestVT.getVectorElementType();
3894
3895 SmallVector<SDValue, 8> CastedElts;
3896 SDLoc SL(N);
3897 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3898 SDValue Elt = Src.getOperand(I);
3899 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3900 }
3901
3902 return DAG.getBuildVector(DestVT, SL, CastedElts);
3903 }
3904 }
3905 }
3906
3907 if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3908 break;
3909
3910 // Fold bitcasts of constants.
3911 //
3912 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3913 // TODO: Generalize and move to DAGCombiner
3914 SDValue Src = N->getOperand(0);
3915 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3916 if (Src.getValueType() == MVT::i64) {
3917 SDLoc SL(N);
3918 uint64_t CVal = C->getZExtValue();
3919 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3920 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3921 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3922 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
3923 }
3924 }
3925
3926 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3927 const APInt &Val = C->getValueAPF().bitcastToAPInt();
3928 SDLoc SL(N);
3929 uint64_t CVal = Val.getZExtValue();
3930 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3931 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3932 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3933
3934 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3935 }
3936
3937 break;
3938 }
3939 case ISD::SHL: {
3940 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3941 break;
3942
3943 return performShlCombine(N, DCI);
3944 }
3945 case ISD::SRL: {
3946 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3947 break;
3948
3949 return performSrlCombine(N, DCI);
3950 }
3951 case ISD::SRA: {
3952 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3953 break;
3954
3955 return performSraCombine(N, DCI);
3956 }
3957 case ISD::TRUNCATE:
3958 return performTruncateCombine(N, DCI);
3959 case ISD::MUL:
3960 return performMulCombine(N, DCI);
3961 case ISD::MULHS:
3962 return performMulhsCombine(N, DCI);
3963 case ISD::MULHU:
3964 return performMulhuCombine(N, DCI);
3965 case AMDGPUISD::MUL_I24:
3966 case AMDGPUISD::MUL_U24:
3967 case AMDGPUISD::MULHI_I24:
3968 case AMDGPUISD::MULHI_U24: {
3969 if (SDValue V = simplifyI24(N, DCI))
3970 return V;
3971 return SDValue();
3972 }
3973 case AMDGPUISD::MUL_LOHI_I24:
3974 case AMDGPUISD::MUL_LOHI_U24:
3975 return performMulLoHi24Combine(N, DCI);
3976 case ISD::SELECT:
3977 return performSelectCombine(N, DCI);
3978 case ISD::FNEG:
3979 return performFNegCombine(N, DCI);
3980 case ISD::FABS:
3981 return performFAbsCombine(N, DCI);
3982 case AMDGPUISD::BFE_I32:
3983 case AMDGPUISD::BFE_U32: {
3984    assert(!N->getValueType(0).isVector() &&
3985           "Vector handling of BFE not implemented");
3986 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3987 if (!Width)
3988 break;
3989
3990 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3991 if (WidthVal == 0)
3992 return DAG.getConstant(0, DL, MVT::i32);
3993
3994 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3995 if (!Offset)
3996 break;
3997
3998 SDValue BitsFrom = N->getOperand(0);
3999 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4000
4001 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4002
4003 if (OffsetVal == 0) {
4004 // This is already sign / zero extended, so try to fold away extra BFEs.
4005 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4006
4007 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4008 if (OpSignBits >= SignBits)
4009 return BitsFrom;
4010
4011 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4012 if (Signed) {
4013 // This is a sign_extend_inreg. Replace it to take advantage of existing
4014 // DAG Combines. If not eliminated, we will match back to BFE during
4015 // selection.
4016
4017      // TODO: The sext_inreg of extended types ends, although we could
4018 // handle them in a single BFE.
4019 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4020 DAG.getValueType(SmallVT));
4021 }
4022
4023 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4024 }
4025
4026 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4027 if (Signed) {
4028 return constantFoldBFE<int32_t>(DAG,
4029 CVal->getSExtValue(),
4030 OffsetVal,
4031 WidthVal,
4032 DL);
4033 }
4034
4035 return constantFoldBFE<uint32_t>(DAG,
4036 CVal->getZExtValue(),
4037 OffsetVal,
4038 WidthVal,
4039 DL);
4040 }
4041
4042 if ((OffsetVal + WidthVal) >= 32 &&
4043 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4044 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4045 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4046 BitsFrom, ShiftVal);
4047 }
4048
4049 if (BitsFrom.hasOneUse()) {
4050 APInt Demanded = APInt::getBitsSet(32,
4051 OffsetVal,
4052 OffsetVal + WidthVal);
4053
4054 KnownBits Known;
4055 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4056 !DCI.isBeforeLegalizeOps());
4057 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4058 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4059 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4060 DCI.CommitTargetLoweringOpt(TLO);
4061 }
4062 }
4063
4064 break;
4065 }
4066 case ISD::LOAD:
4067 return performLoadCombine(N, DCI);
4068 case ISD::STORE:
4069 return performStoreCombine(N, DCI);
4070 case AMDGPUISD::RCP:
4071 case AMDGPUISD::RCP_IFLAG:
4072 return performRcpCombine(N, DCI);
4073 case ISD::AssertZext:
4074 case ISD::AssertSext:
4075 return performAssertSZExtCombine(N, DCI);
4076 case ISD::INTRINSIC_WO_CHAIN:
4077 return performIntrinsicWOChainCombine(N, DCI);
4078 }
4079 return SDValue();
4080}
4081
4082//===----------------------------------------------------------------------===//
4083// Helper functions
4084//===----------------------------------------------------------------------===//
4085
4086SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4087 const TargetRegisterClass *RC,
4088 unsigned Reg, EVT VT,
4089 const SDLoc &SL,
4090 bool RawReg) const {
4091 MachineFunction &MF = DAG.getMachineFunction();
4092 MachineRegisterInfo &MRI = MF.getRegInfo();
4093 unsigned VReg;
4094
4095 if (!MRI.isLiveIn(Reg)) {
4096 VReg = MRI.createVirtualRegister(RC);
4097 MRI.addLiveIn(Reg, VReg);
4098 } else {
4099 VReg = MRI.getLiveInVirtReg(Reg);
4100 }
4101
4102 if (RawReg)
4103 return DAG.getRegister(VReg, VT);
4104
4105 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4106}
4107
4108// This may be called multiple times, and nothing prevents creating multiple
4109// objects at the same offset. See if we already defined this object.
4110static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4111 int64_t Offset) {
4112 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4113 if (MFI.getObjectOffset(I) == Offset) {
4114      assert(MFI.getObjectSize(I) == Size);
4115 return I;
4116 }
4117 }
4118
4119 return MFI.CreateFixedObject(Size, Offset, true);
4120}
4121
4122SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4123 EVT VT,
4124 const SDLoc &SL,
4125 int64_t Offset) const {
4126 MachineFunction &MF = DAG.getMachineFunction();
4127 MachineFrameInfo &MFI = MF.getFrameInfo();
4128 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4129
4130 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4131 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4132
4133 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
4134 MachineMemOperand::MODereferenceable |
4135 MachineMemOperand::MOInvariant);
4136}
4137
4138SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4139 const SDLoc &SL,
4140 SDValue Chain,
4141 SDValue ArgVal,
4142 int64_t Offset) const {
4143 MachineFunction &MF = DAG.getMachineFunction();
4144 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4145
4146 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4147 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
4148 MachineMemOperand::MODereferenceable);
4149 return Store;
4150}
4151
4152SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4153 const TargetRegisterClass *RC,
4154 EVT VT, const SDLoc &SL,
4155 const ArgDescriptor &Arg) const {
4156  assert(Arg && "Attempting to load missing argument");
1. Assuming the condition is true
2. '?' condition is true
4157
4158 SDValue V = Arg.isRegister() ?
3. '?' condition is true
4159 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4160 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4161
4162 if (!Arg.isMasked())
4. Calling 'ArgDescriptor::isMasked'
7. Returning from 'ArgDescriptor::isMasked'
8. Taking false branch
4163 return V;
4164
4165 unsigned Mask = Arg.getMask();
4166 unsigned Shift = countTrailingZeros<unsigned>(Mask);
9. Calling 'countTrailingZeros<unsigned int>'
16. Returning from 'countTrailingZeros<unsigned int>'
17. 'Shift' initialized to 32
4167 V = DAG.getNode(ISD::SRL, SL, VT, V,
4168 DAG.getShiftAmountConstant(Shift, VT, SL));
4169 return DAG.getNode(ISD::AND, SL, VT, V,
4170 DAG.getConstant(Mask >> Shift, SL, VT));
18. The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4171}
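
Note on the warning above: the path only triggers when Arg.getMask() is zero, because countTrailingZeros then returns the full width (32) under its default ZB_Width behavior, and that value is used both as the SRL amount and in 'Mask >> Shift'. A minimal sketch of one way to make the shift well defined, assuming a zero mask is never a legitimate masked argument (illustrative only, not the in-tree fix):

    // Hypothetical guard inside loadInputValue: reject an empty mask before
    // computing the shift amount, so Shift is guaranteed to be < 32.
    unsigned Mask = Arg.getMask();
    assert(Mask != 0 && "masked ArgDescriptor must have a non-zero mask");
    unsigned Shift = countTrailingZeros<unsigned>(Mask);
    V = DAG.getNode(ISD::SRL, SL, VT, V,
                    DAG.getShiftAmountConstant(Shift, VT, SL));
    return DAG.getNode(ISD::AND, SL, VT, V,
                       DAG.getConstant(Mask >> Shift, SL, VT)); // Shift < 32 here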
4172
4173uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4174 const MachineFunction &MF, const ImplicitParameter Param) const {
4175 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4176 const AMDGPUSubtarget &ST =
4177 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4178 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4179 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4180 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4181 ExplicitArgOffset;
4182 switch (Param) {
4183 case GRID_DIM:
4184 return ArgOffset;
4185 case GRID_OFFSET:
4186 return ArgOffset + 4;
4187 }
4188 llvm_unreachable("unexpected implicit parameter type");
4189}
4190
4191#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4192
4193const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4194 switch ((AMDGPUISD::NodeType)Opcode) {
4195 case AMDGPUISD::FIRST_NUMBER: break;
4196 // AMDIL DAG nodes
4197 NODE_NAME_CASE(UMUL)case AMDGPUISD::UMUL: return "UMUL";;
4198 NODE_NAME_CASE(BRANCH_COND)case AMDGPUISD::BRANCH_COND: return "BRANCH_COND";;
4199
4200 // AMDGPU DAG nodes
4201 NODE_NAME_CASE(IF)case AMDGPUISD::IF: return "IF";
4202 NODE_NAME_CASE(ELSE)case AMDGPUISD::ELSE: return "ELSE";
4203 NODE_NAME_CASE(LOOP)case AMDGPUISD::LOOP: return "LOOP";
4204 NODE_NAME_CASE(CALL)case AMDGPUISD::CALL: return "CALL";
4205 NODE_NAME_CASE(TC_RETURN)case AMDGPUISD::TC_RETURN: return "TC_RETURN";
4206 NODE_NAME_CASE(TRAP)case AMDGPUISD::TRAP: return "TRAP";
4207 NODE_NAME_CASE(RET_FLAG)case AMDGPUISD::RET_FLAG: return "RET_FLAG";
4208 NODE_NAME_CASE(RETURN_TO_EPILOG)case AMDGPUISD::RETURN_TO_EPILOG: return "RETURN_TO_EPILOG";
4209 NODE_NAME_CASE(ENDPGM)case AMDGPUISD::ENDPGM: return "ENDPGM";
4210 NODE_NAME_CASE(DWORDADDR)case AMDGPUISD::DWORDADDR: return "DWORDADDR";
4211 NODE_NAME_CASE(FRACT)case AMDGPUISD::FRACT: return "FRACT";
4212 NODE_NAME_CASE(SETCC)case AMDGPUISD::SETCC: return "SETCC";
4213 NODE_NAME_CASE(SETREG)case AMDGPUISD::SETREG: return "SETREG";
4214 NODE_NAME_CASE(DENORM_MODE)case AMDGPUISD::DENORM_MODE: return "DENORM_MODE";
4215 NODE_NAME_CASE(FMA_W_CHAIN)case AMDGPUISD::FMA_W_CHAIN: return "FMA_W_CHAIN";
4216 NODE_NAME_CASE(FMUL_W_CHAIN)case AMDGPUISD::FMUL_W_CHAIN: return "FMUL_W_CHAIN";
4217 NODE_NAME_CASE(CLAMP)case AMDGPUISD::CLAMP: return "CLAMP";
4218 NODE_NAME_CASE(COS_HW)case AMDGPUISD::COS_HW: return "COS_HW";
4219 NODE_NAME_CASE(SIN_HW)case AMDGPUISD::SIN_HW: return "SIN_HW";
4220 NODE_NAME_CASE(FMAX_LEGACY)case AMDGPUISD::FMAX_LEGACY: return "FMAX_LEGACY";
4221 NODE_NAME_CASE(FMIN_LEGACY)case AMDGPUISD::FMIN_LEGACY: return "FMIN_LEGACY";
4222 NODE_NAME_CASE(FMAX3)case AMDGPUISD::FMAX3: return "FMAX3";
4223 NODE_NAME_CASE(SMAX3)case AMDGPUISD::SMAX3: return "SMAX3";
4224 NODE_NAME_CASE(UMAX3)case AMDGPUISD::UMAX3: return "UMAX3";
4225 NODE_NAME_CASE(FMIN3)case AMDGPUISD::FMIN3: return "FMIN3";
4226 NODE_NAME_CASE(SMIN3)case AMDGPUISD::SMIN3: return "SMIN3";
4227 NODE_NAME_CASE(UMIN3)case AMDGPUISD::UMIN3: return "UMIN3";
4228 NODE_NAME_CASE(FMED3)case AMDGPUISD::FMED3: return "FMED3";
4229 NODE_NAME_CASE(SMED3)case AMDGPUISD::SMED3: return "SMED3";
4230 NODE_NAME_CASE(UMED3)case AMDGPUISD::UMED3: return "UMED3";
4231 NODE_NAME_CASE(FDOT2)case AMDGPUISD::FDOT2: return "FDOT2";
4232 NODE_NAME_CASE(URECIP)case AMDGPUISD::URECIP: return "URECIP";
4233 NODE_NAME_CASE(DIV_SCALE)case AMDGPUISD::DIV_SCALE: return "DIV_SCALE";
4234 NODE_NAME_CASE(DIV_FMAS)case AMDGPUISD::DIV_FMAS: return "DIV_FMAS";
4235 NODE_NAME_CASE(DIV_FIXUP)case AMDGPUISD::DIV_FIXUP: return "DIV_FIXUP";
4236 NODE_NAME_CASE(FMAD_FTZ)case AMDGPUISD::FMAD_FTZ: return "FMAD_FTZ";
4237 NODE_NAME_CASE(TRIG_PREOP)case AMDGPUISD::TRIG_PREOP: return "TRIG_PREOP";
4238 NODE_NAME_CASE(RCP)case AMDGPUISD::RCP: return "RCP";
4239 NODE_NAME_CASE(RSQ)case AMDGPUISD::RSQ: return "RSQ";
4240 NODE_NAME_CASE(RCP_LEGACY)case AMDGPUISD::RCP_LEGACY: return "RCP_LEGACY";
4241 NODE_NAME_CASE(RSQ_LEGACY)case AMDGPUISD::RSQ_LEGACY: return "RSQ_LEGACY";
4242 NODE_NAME_CASE(RCP_IFLAG)case AMDGPUISD::RCP_IFLAG: return "RCP_IFLAG";
4243 NODE_NAME_CASE(FMUL_LEGACY)case AMDGPUISD::FMUL_LEGACY: return "FMUL_LEGACY";
4244 NODE_NAME_CASE(RSQ_CLAMP)case AMDGPUISD::RSQ_CLAMP: return "RSQ_CLAMP";
4245 NODE_NAME_CASE(LDEXP)case AMDGPUISD::LDEXP: return "LDEXP";
4246 NODE_NAME_CASE(FP_CLASS)case AMDGPUISD::FP_CLASS: return "FP_CLASS";
4247 NODE_NAME_CASE(DOT4)case AMDGPUISD::DOT4: return "DOT4";
4248 NODE_NAME_CASE(CARRY)case AMDGPUISD::CARRY: return "CARRY";
4249 NODE_NAME_CASE(BORROW)case AMDGPUISD::BORROW: return "BORROW";
4250 NODE_NAME_CASE(BFE_U32)case AMDGPUISD::BFE_U32: return "BFE_U32";
4251 NODE_NAME_CASE(BFE_I32)case AMDGPUISD::BFE_I32: return "BFE_I32";
4252 NODE_NAME_CASE(BFI)case AMDGPUISD::BFI: return "BFI";
4253 NODE_NAME_CASE(BFM)case AMDGPUISD::BFM: return "BFM";
4254 NODE_NAME_CASE(FFBH_U32)case AMDGPUISD::FFBH_U32: return "FFBH_U32";
4255 NODE_NAME_CASE(FFBH_I32)case AMDGPUISD::FFBH_I32: return "FFBH_I32";
4256 NODE_NAME_CASE(FFBL_B32)case AMDGPUISD::FFBL_B32: return "FFBL_B32";
4257 NODE_NAME_CASE(MUL_U24)case AMDGPUISD::MUL_U24: return "MUL_U24";
4258 NODE_NAME_CASE(MUL_I24)case AMDGPUISD::MUL_I24: return "MUL_I24";
4259 NODE_NAME_CASE(MULHI_U24)case AMDGPUISD::MULHI_U24: return "MULHI_U24";
4260 NODE_NAME_CASE(MULHI_I24)case AMDGPUISD::MULHI_I24: return "MULHI_I24";
4261 NODE_NAME_CASE(MUL_LOHI_U24)case AMDGPUISD::MUL_LOHI_U24: return "MUL_LOHI_U24";
4262 NODE_NAME_CASE(MUL_LOHI_I24)case AMDGPUISD::MUL_LOHI_I24: return "MUL_LOHI_I24";
4263 NODE_NAME_CASE(MAD_U24)case AMDGPUISD::MAD_U24: return "MAD_U24";
4264 NODE_NAME_CASE(MAD_I24)case AMDGPUISD::MAD_I24: return "MAD_I24";
4265 NODE_NAME_CASE(MAD_I64_I32)case AMDGPUISD::MAD_I64_I32: return "MAD_I64_I32";
4266 NODE_NAME_CASE(MAD_U64_U32)case AMDGPUISD::MAD_U64_U32: return "MAD_U64_U32";
4267 NODE_NAME_CASE(PERM)case AMDGPUISD::PERM: return "PERM";
4268 NODE_NAME_CASE(TEXTURE_FETCH)case AMDGPUISD::TEXTURE_FETCH: return "TEXTURE_FETCH";
4269 NODE_NAME_CASE(EXPORT)case AMDGPUISD::EXPORT: return "EXPORT";
4270 NODE_NAME_CASE(EXPORT_DONE)case AMDGPUISD::EXPORT_DONE: return "EXPORT_DONE";
4271 NODE_NAME_CASE(R600_EXPORT)case AMDGPUISD::R600_EXPORT: return "R600_EXPORT";
4272 NODE_NAME_CASE(CONST_ADDRESS)case AMDGPUISD::CONST_ADDRESS: return "CONST_ADDRESS";
4273 NODE_NAME_CASE(REGISTER_LOAD)case AMDGPUISD::REGISTER_LOAD: return "REGISTER_LOAD";
4274 NODE_NAME_CASE(REGISTER_STORE)case AMDGPUISD::REGISTER_STORE: return "REGISTER_STORE";
4275 NODE_NAME_CASE(SAMPLE)case AMDGPUISD::SAMPLE: return "SAMPLE";
4276 NODE_NAME_CASE(SAMPLEB)case AMDGPUISD::SAMPLEB: return "SAMPLEB";
4277 NODE_NAME_CASE(SAMPLED)case AMDGPUISD::SAMPLED: return "SAMPLED";
4278 NODE_NAME_CASE(SAMPLEL)case AMDGPUISD::SAMPLEL: return "SAMPLEL";
4279 NODE_NAME_CASE(CVT_F32_UBYTE0)case AMDGPUISD::CVT_F32_UBYTE0: return "CVT_F32_UBYTE0";
4280 NODE_NAME_CASE(CVT_F32_UBYTE1)case AMDGPUISD::CVT_F32_UBYTE1: return "CVT_F32_UBYTE1";
4281 NODE_NAME_CASE(CVT_F32_UBYTE2)case AMDGPUISD::CVT_F32_UBYTE2: return "CVT_F32_UBYTE2";
4282 NODE_NAME_CASE(CVT_F32_UBYTE3)case AMDGPUISD::CVT_F32_UBYTE3: return "CVT_F32_UBYTE3";
4283 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)case AMDGPUISD::CVT_PKRTZ_F16_F32: return "CVT_PKRTZ_F16_F32";
4284 NODE_NAME_CASE(CVT_PKNORM_I16_F32)case AMDGPUISD::CVT_PKNORM_I16_F32: return "CVT_PKNORM_I16_F32";
4285 NODE_NAME_CASE(CVT_PKNORM_U16_F32)case AMDGPUISD::CVT_PKNORM_U16_F32: return "CVT_PKNORM_U16_F32";
4286 NODE_NAME_CASE(CVT_PK_I16_I32)case AMDGPUISD::CVT_PK_I16_I32: return "CVT_PK_I16_I32";
4287 NODE_NAME_CASE(CVT_PK_U16_U32)case AMDGPUISD::CVT_PK_U16_U32: return "CVT_PK_U16_U32";
4288 NODE_NAME_CASE(FP_TO_FP16)case AMDGPUISD::FP_TO_FP16: return "FP_TO_FP16";
4289 NODE_NAME_CASE(FP16_ZEXT)case AMDGPUISD::FP16_ZEXT: return "FP16_ZEXT";
4290 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)case AMDGPUISD::BUILD_VERTICAL_VECTOR: return "BUILD_VERTICAL_VECTOR";
4291 NODE_NAME_CASE(CONST_DATA_PTR)case AMDGPUISD::CONST_DATA_PTR: return "CONST_DATA_PTR";
4292 NODE_NAME_CASE(PC_ADD_REL_OFFSET)case AMDGPUISD::PC_ADD_REL_OFFSET: return "PC_ADD_REL_OFFSET";
4293 NODE_NAME_CASE(LDS)case AMDGPUISD::LDS: return "LDS";
4294 NODE_NAME_CASE(KILL)case AMDGPUISD::KILL: return "KILL";
4295 NODE_NAME_CASE(DUMMY_CHAIN)case AMDGPUISD::DUMMY_CHAIN: return "DUMMY_CHAIN";
4296 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4297 NODE_NAME_CASE(INTERP_P1LL_F16)case AMDGPUISD::INTERP_P1LL_F16: return "INTERP_P1LL_F16";
4298 NODE_NAME_CASE(INTERP_P1LV_F16)case AMDGPUISD::INTERP_P1LV_F16: return "INTERP_P1LV_F16";
4299 NODE_NAME_CASE(INTERP_P2_F16)case AMDGPUISD::INTERP_P2_F16: return "INTERP_P2_F16";
4300 NODE_NAME_CASE(LOAD_D16_HI)case AMDGPUISD::LOAD_D16_HI: return "LOAD_D16_HI";
4301 NODE_NAME_CASE(LOAD_D16_LO)case AMDGPUISD::LOAD_D16_LO: return "LOAD_D16_LO";
4302 NODE_NAME_CASE(LOAD_D16_HI_I8)case AMDGPUISD::LOAD_D16_HI_I8: return "LOAD_D16_HI_I8";
4303 NODE_NAME_CASE(LOAD_D16_HI_U8)case AMDGPUISD::LOAD_D16_HI_U8: return "LOAD_D16_HI_U8";
4304 NODE_NAME_CASE(LOAD_D16_LO_I8)case AMDGPUISD::LOAD_D16_LO_I8: return "LOAD_D16_LO_I8";
4305 NODE_NAME_CASE(LOAD_D16_LO_U8)case AMDGPUISD::LOAD_D16_LO_U8: return "LOAD_D16_LO_U8";
4306 NODE_NAME_CASE(STORE_MSKOR)case AMDGPUISD::STORE_MSKOR: return "STORE_MSKOR";
4307 NODE_NAME_CASE(LOAD_CONSTANT)case AMDGPUISD::LOAD_CONSTANT: return "LOAD_CONSTANT";
4308 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)case AMDGPUISD::TBUFFER_STORE_FORMAT: return "TBUFFER_STORE_FORMAT";
4309 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)case AMDGPUISD::TBUFFER_STORE_FORMAT_D16: return "TBUFFER_STORE_FORMAT_D16";
4310 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)case AMDGPUISD::TBUFFER_LOAD_FORMAT: return "TBUFFER_LOAD_FORMAT";
4311 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)case AMDGPUISD::TBUFFER_LOAD_FORMAT_D16: return "TBUFFER_LOAD_FORMAT_D16";
4312 NODE_NAME_CASE(DS_ORDERED_COUNT)case AMDGPUISD::DS_ORDERED_COUNT: return "DS_ORDERED_COUNT";
4313 NODE_NAME_CASE(ATOMIC_CMP_SWAP)case AMDGPUISD::ATOMIC_CMP_SWAP: return "ATOMIC_CMP_SWAP";
4314 NODE_NAME_CASE(ATOMIC_INC)case AMDGPUISD::ATOMIC_INC: return "ATOMIC_INC";
4315 NODE_NAME_CASE(ATOMIC_DEC)case AMDGPUISD::ATOMIC_DEC: return "ATOMIC_DEC";
4316 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)case AMDGPUISD::ATOMIC_LOAD_FMIN: return "ATOMIC_LOAD_FMIN";
4317 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)case AMDGPUISD::ATOMIC_LOAD_FMAX: return "ATOMIC_LOAD_FMAX";
4318 NODE_NAME_CASE(BUFFER_LOAD)case AMDGPUISD::BUFFER_LOAD: return "BUFFER_LOAD";
4319 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)case AMDGPUISD::BUFFER_LOAD_UBYTE: return "BUFFER_LOAD_UBYTE";
4320 NODE_NAME_CASE(BUFFER_LOAD_USHORT)case AMDGPUISD::BUFFER_LOAD_USHORT: return "BUFFER_LOAD_USHORT";
4321 NODE_NAME_CASE(BUFFER_LOAD_BYTE)case AMDGPUISD::BUFFER_LOAD_BYTE: return "BUFFER_LOAD_BYTE";
4322 NODE_NAME_CASE(BUFFER_LOAD_SHORT)case AMDGPUISD::BUFFER_LOAD_SHORT: return "BUFFER_LOAD_SHORT";
4323 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)case AMDGPUISD::BUFFER_LOAD_FORMAT: return "BUFFER_LOAD_FORMAT";
4324 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)case AMDGPUISD::BUFFER_LOAD_FORMAT_D16: return "BUFFER_LOAD_FORMAT_D16";
4325 NODE_NAME_CASE(SBUFFER_LOAD)case AMDGPUISD::SBUFFER_LOAD: return "SBUFFER_LOAD";
4326 NODE_NAME_CASE(BUFFER_STORE)case AMDGPUISD::BUFFER_STORE: return "BUFFER_STORE";
4327 NODE_NAME_CASE(BUFFER_STORE_BYTE)case AMDGPUISD::BUFFER_STORE_BYTE: return "BUFFER_STORE_BYTE";
4328 NODE_NAME_CASE(BUFFER_STORE_SHORT)case AMDGPUISD::BUFFER_STORE_SHORT: return "BUFFER_STORE_SHORT";
4329 NODE_NAME_CASE(BUFFER_STORE_FORMAT)case AMDGPUISD::BUFFER_STORE_FORMAT: return "BUFFER_STORE_FORMAT";
4330 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)case AMDGPUISD::BUFFER_STORE_FORMAT_D16: return "BUFFER_STORE_FORMAT_D16";
4331 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)case AMDGPUISD::BUFFER_ATOMIC_SWAP: return "BUFFER_ATOMIC_SWAP";
4332 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)case AMDGPUISD::BUFFER_ATOMIC_ADD: return "BUFFER_ATOMIC_ADD";
4333 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)case AMDGPUISD::BUFFER_ATOMIC_SUB: return "BUFFER_ATOMIC_SUB";
4334 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)case AMDGPUISD::BUFFER_ATOMIC_SMIN: return "BUFFER_ATOMIC_SMIN";
4335 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)case AMDGPUISD::BUFFER_ATOMIC_UMIN: return "BUFFER_ATOMIC_UMIN";
4336 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)case AMDGPUISD::BUFFER_ATOMIC_SMAX: return "BUFFER_ATOMIC_SMAX";
4337 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)case AMDGPUISD::BUFFER_ATOMIC_UMAX: return "BUFFER_ATOMIC_UMAX";
4338 NODE_NAME_CASE(BUFFER_ATOMIC_AND)case AMDGPUISD::BUFFER_ATOMIC_AND: return "BUFFER_ATOMIC_AND";
4339 NODE_NAME_CASE(BUFFER_ATOMIC_OR)case AMDGPUISD::BUFFER_ATOMIC_OR: return "BUFFER_ATOMIC_OR";
4340 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)case AMDGPUISD::BUFFER_ATOMIC_XOR: return "BUFFER_ATOMIC_XOR";
4341 NODE_NAME_CASE(BUFFER_ATOMIC_INC)case AMDGPUISD::BUFFER_ATOMIC_INC: return "BUFFER_ATOMIC_INC";
4342 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)case AMDGPUISD::BUFFER_ATOMIC_DEC: return "BUFFER_ATOMIC_DEC";
4343 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: return "BUFFER_ATOMIC_CMPSWAP";
4344 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)case AMDGPUISD::BUFFER_ATOMIC_FADD: return "BUFFER_ATOMIC_FADD";
4345 NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)case AMDGPUISD::BUFFER_ATOMIC_PK_FADD: return "BUFFER_ATOMIC_PK_FADD";
4346 NODE_NAME_CASE(ATOMIC_PK_FADD)case AMDGPUISD::ATOMIC_PK_FADD: return "ATOMIC_PK_FADD";
4347
4348 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4349 }
4350 return nullptr;
4351}
4352
4353SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4354 SelectionDAG &DAG, int Enabled,
4355 int &RefinementSteps,
4356 bool &UseOneConstNR,
4357 bool Reciprocal) const {
4358 EVT VT = Operand.getValueType();
4359
4360 if (VT == MVT::f32) {
4361 RefinementSteps = 0;
4362 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4363 }
4364
4365 // TODO: There is also an f64 rsq instruction, but the documentation is less
4366 // clear on its precision.
4367
4368 return SDValue();
4369}
4370
4371SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4372 SelectionDAG &DAG, int Enabled,
4373 int &RefinementSteps) const {
4374 EVT VT = Operand.getValueType();
4375
4376 if (VT == MVT::f32) {
4377 // Reciprocal, < 1 ulp error.
4378 //
4379 // This reciprocal approximation converges to < 0.5 ulp error with one
4380 // Newton-Raphson step performed with two fused multiply-adds (FMAs).
4381
4382 RefinementSteps = 0;
4383 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4384 }
4385
4386 // TODO: There is also an f64 rcp instruction, but the documentation is less
4387 // clear on its precision.
4388
4389 return SDValue();
4390}
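
For reference, the refinement the comment above alludes to is the standard Newton-Raphson step for a reciprocal, which costs two FMAs per iteration and roughly doubles the number of correct bits. A small standalone sketch in plain C++ (illustrative only, not the DAG lowering itself):

    #include <cmath>

    // One Newton-Raphson refinement of y ~= 1/a:
    //   e  = 1 - a*y   (residual error of the current estimate)
    //   y' = y + y*e   (refined estimate)
    static float refineRecip(float a, float y) {
      float e = std::fma(-a, y, 1.0f); // e = 1 - a*y
      return std::fma(y, e, y);        // y' = y + y*e
    }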
4391
4392void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4393 const SDValue Op, KnownBits &Known,
4394 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4395
4396 Known.resetAll(); // Don't know anything.
4397
4398 unsigned Opc = Op.getOpcode();
4399
4400 switch (Opc) {
4401 default:
4402 break;
4403 case AMDGPUISD::CARRY:
4404 case AMDGPUISD::BORROW: {
4405 Known.Zero = APInt::getHighBitsSet(32, 31);
4406 break;
4407 }
4408
4409 case AMDGPUISD::BFE_I32:
4410 case AMDGPUISD::BFE_U32: {
4411 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4412 if (!CWidth)
4413 return;
4414
4415 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4416
4417 if (Opc == AMDGPUISD::BFE_U32)
4418 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4419
4420 break;
4421 }
4422 case AMDGPUISD::FP_TO_FP16:
4423 case AMDGPUISD::FP16_ZEXT: {
4424 unsigned BitWidth = Known.getBitWidth();
4425
4426 // High bits are zero.
4427 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4428 break;
4429 }
4430 case AMDGPUISD::MUL_U24:
4431 case AMDGPUISD::MUL_I24: {
4432 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4433 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4434 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4435 RHSKnown.countMinTrailingZeros();
4436 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4437 // Skip extra check if all bits are known zeros.
4438 if (TrailZ >= 32)
4439 break;
4440
4441 // Truncate to 24 bits.
4442 LHSKnown = LHSKnown.trunc(24);
4443 RHSKnown = RHSKnown.trunc(24);
4444
4445 bool Negative = false;
4446 if (Opc == AMDGPUISD::MUL_I24) {
4447 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
4448 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
4449 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4450 if (MaxValBits >= 32)
4451 break;
4452 bool LHSNegative = LHSKnown.isNegative();
4453 bool LHSPositive = LHSKnown.isNonNegative();
4454 bool RHSNegative = RHSKnown.isNegative();
4455 bool RHSPositive = RHSKnown.isNonNegative();
4456 if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
4457 break;
4458 Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
4459 if (Negative)
4460 Known.One.setHighBits(32 - MaxValBits);
4461 else
4462 Known.Zero.setHighBits(32 - MaxValBits);
4463 } else {
4464 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
4465 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
4466 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4467 if (MaxValBits >= 32)
4468 break;
4469 Known.Zero.setHighBits(32 - MaxValBits);
4470 }
4471 break;
4472 }
4473 case AMDGPUISD::PERM: {
4474 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4475 if (!CMask)
4476 return;
4477
4478 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4479 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4480 unsigned Sel = CMask->getZExtValue();
4481
4482 for (unsigned I = 0; I < 32; I += 8) {
4483 unsigned SelBits = Sel & 0xff;
4484 if (SelBits < 4) {
4485 SelBits *= 8;
4486 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4487 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4488 } else if (SelBits < 7) {
4489 SelBits = (SelBits & 3) * 8;
4490 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4491 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4492 } else if (SelBits == 0x0c) {
4493 Known.Zero |= 0xFFull << I;
4494 } else if (SelBits > 0x0c) {
4495 Known.One |= 0xFFull << I;
4496 }
4497 Sel >>= 8;
4498 }
4499 break;
4500 }
4501 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4502 Known.Zero.setHighBits(24);
4503 break;
4504 }
4505 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4506 Known.Zero.setHighBits(16);
4507 break;
4508 }
4509 case AMDGPUISD::LDS: {
4510 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4511 unsigned Align = GA->getGlobal()->getAlignment();
4512
4513 Known.Zero.setHighBits(16);
4514 if (Align)
4515 Known.Zero.setLowBits(Log2_32(Align));
4516 break;
4517 }
4518 case ISD::INTRINSIC_WO_CHAIN: {
4519 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4520 switch (IID) {
4521 case Intrinsic::amdgcn_mbcnt_lo:
4522 case Intrinsic::amdgcn_mbcnt_hi: {
4523 const GCNSubtarget &ST =
4524 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4525 // These return at most the wavefront size - 1.
4526 unsigned Size = Op.getValueType().getSizeInBits();
4527 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4528 break;
4529 }
4530 default:
4531 break;
4532 }
4533 }
4534 }
4535}
4536
4537unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4538 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4539 unsigned Depth) const {
4540 switch (Op.getOpcode()) {
4541 case AMDGPUISD::BFE_I32: {
4542 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4543 if (!Width)
4544 return 1;
4545
4546 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4547 if (!isNullConstant(Op.getOperand(1)))
4548 return SignBits;
4549
4550 // TODO: Could probably figure something out with non-0 offsets.
4551 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4552 return std::max(SignBits, Op0SignBits);
4553 }
4554
4555 case AMDGPUISD::BFE_U32: {
4556 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4557 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4558 }
4559
4560 case AMDGPUISD::CARRY:
4561 case AMDGPUISD::BORROW:
4562 return 31;
4563 case AMDGPUISD::BUFFER_LOAD_BYTE:
4564 return 25;
4565 case AMDGPUISD::BUFFER_LOAD_SHORT:
4566 return 17;
4567 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4568 return 24;
4569 case AMDGPUISD::BUFFER_LOAD_USHORT:
4570 return 16;
4571 case AMDGPUISD::FP_TO_FP16:
4572 case AMDGPUISD::FP16_ZEXT:
4573 return 16;
4574 default:
4575 return 1;
4576 }
4577}
4578
4579bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4580 const SelectionDAG &DAG,
4581 bool SNaN,
4582 unsigned Depth) const {
4583 unsigned Opcode = Op.getOpcode();
4584 switch (Opcode) {
4585 case AMDGPUISD::FMIN_LEGACY:
4586 case AMDGPUISD::FMAX_LEGACY: {
4587 if (SNaN)
4588 return true;
4589
4590 // TODO: Can check no nans on one of the operands for each one, but which
4591 // one?
4592 return false;
4593 }
4594 case AMDGPUISD::FMUL_LEGACY:
4595 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4596 if (SNaN)
4597 return true;
4598 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4599 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4600 }
4601 case AMDGPUISD::FMED3:
4602 case AMDGPUISD::FMIN3:
4603 case AMDGPUISD::FMAX3:
4604 case AMDGPUISD::FMAD_FTZ: {
4605 if (SNaN)
4606 return true;
4607 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4608 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4609 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4610 }
4611 case AMDGPUISD::CVT_F32_UBYTE0:
4612 case AMDGPUISD::CVT_F32_UBYTE1:
4613 case AMDGPUISD::CVT_F32_UBYTE2:
4614 case AMDGPUISD::CVT_F32_UBYTE3:
4615 return true;
4616
4617 case AMDGPUISD::RCP:
4618 case AMDGPUISD::RSQ:
4619 case AMDGPUISD::RCP_LEGACY:
4620 case AMDGPUISD::RSQ_LEGACY:
4621 case AMDGPUISD::RSQ_CLAMP: {
4622 if (SNaN)
4623 return true;
4624
4625 // TODO: Need is known positive check.
4626 return false;
4627 }
4628 case AMDGPUISD::LDEXP:
4629 case AMDGPUISD::FRACT: {
4630 if (SNaN)
4631 return true;
4632 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4633 }
4634 case AMDGPUISD::DIV_SCALE:
4635 case AMDGPUISD::DIV_FMAS:
4636 case AMDGPUISD::DIV_FIXUP:
4637 case AMDGPUISD::TRIG_PREOP:
4638 // TODO: Refine on operands.
4639 return SNaN;
4640 case AMDGPUISD::SIN_HW:
4641 case AMDGPUISD::COS_HW: {
4642 // TODO: Need check for infinity
4643 return SNaN;
4644 }
4645 case ISD::INTRINSIC_WO_CHAIN: {
4646 unsigned IntrinsicID
4647 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4648 // TODO: Handle more intrinsics
4649 switch (IntrinsicID) {
4650 case Intrinsic::amdgcn_cubeid:
4651 return true;
4652
4653 case Intrinsic::amdgcn_frexp_mant: {
4654 if (SNaN)
4655 return true;
4656 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4657 }
4658 case Intrinsic::amdgcn_cvt_pkrtz: {
4659 if (SNaN)
4660 return true;
4661 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4662 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4663 }
4664 case Intrinsic::amdgcn_fdot2:
4665 // TODO: Refine on operand
4666 return SNaN;
4667 default:
4668 return false;
4669 }
4670 }
4671 default:
4672 return false;
4673 }
4674}
4675
4676TargetLowering::AtomicExpansionKind
4677AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4678 switch (RMW->getOperation()) {
4679 case AtomicRMWInst::Nand:
4680 case AtomicRMWInst::FAdd:
4681 case AtomicRMWInst::FSub:
4682 return AtomicExpansionKind::CmpXChg;
4683 default:
4684 return AtomicExpansionKind::None;
4685 }
4686}

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
5. Assuming the condition is true
6. Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
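
The note above is the key assumption on this path: any mask other than ~0u counts as masked, including an all-zero mask. A hypothetical snippet showing how such a descriptor would feed a zero mask into loadInputValue (the register number and the zero mask are made-up values for illustration):

    #include "AMDGPUArgumentUsageInfo.h"
    #include "llvm/Support/MathExtras.h"
    using namespace llvm;

    void zeroMaskExample() {
      ArgDescriptor Base = ArgDescriptor::createRegister(Register(1));
      ArgDescriptor Arg = ArgDescriptor::createArg(Base, /*Mask=*/0);
      bool Masked = Arg.isMasked();                       // true: 0 != ~0u
      unsigned Shift = countTrailingZeros(Arg.getMask()); // 32 under ZB_Width
      (void)Masked;
      (void)Shift; // 'Arg.getMask() >> Shift' would now be the undefined shift
    }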
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181 assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include "llvm/Support/SwapByteOrder.h"
18#include <algorithm>
19#include <cassert>
20#include <climits>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__4) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
10.1. 'ZB' is not equal to ZB_Undefined
11. Assuming 'Val' is equal to 0
12. Taking true branch
117 return 32;
13. Returning the value 32
118
119#if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
10. Calling 'TrailingZerosCounter::count'
14. Returning from 'TrailingZerosCounter::count'
15. Returning the value 32
161}
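
A small check of the zero-input behavior the analyzer relies on above (illustrative only): under the default ZB_Width behavior a zero value yields the full bit width, so the result must not be used directly as a shift amount on a value of that same width.

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    void countTrailingZerosZeroInput() {
      unsigned Zero = 0;
      unsigned Shift = llvm::countTrailingZeros(Zero); // 32 with ZB_Width
      assert(Shift == 32);
      // 'Zero >> Shift' here would be undefined behavior: the shift count equals
      // the width of 'unsigned'. Callers must handle the zero case themselves
      // (or pass ZB_Undefined only when the input is known to be non-zero).
    }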
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__4) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz)1 || defined(__GNUC__4)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT8 * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315// NOTE: The following support functions use the _32/_64 extensions instead of
316// type overloading so that signed and unsigned integers can be used without
317// ambiguity.
318
319/// Return the high 32 bits of a 64 bit value.
320constexpr inline uint32_t Hi_32(uint64_t Value) {
321 return static_cast<uint32_t>(Value >> 32);
322}
323
324/// Return the low 32 bits of a 64 bit value.
325constexpr inline uint32_t Lo_32(uint64_t Value) {
326 return static_cast<uint32_t>(Value);
327}
328
329/// Make a 64-bit integer from a high / low pair of 32-bit integers.
330constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
331 return ((uint64_t)High << 32) | (uint64_t)Low;
332}
333
334/// Checks if an integer fits into the given bit width.
335template <unsigned N> constexpr inline bool isInt(int64_t x) {
336 return N >= 64 || (-(INT64_C(1)1L<<(N-1)) <= x && x < (INT64_C(1)1L<<(N-1)));
337}
338// Template specializations to get better code for common cases.
339template <> constexpr inline bool isInt<8>(int64_t x) {
340 return static_cast<int8_t>(x) == x;
341}
342template <> constexpr inline bool isInt<16>(int64_t x) {
343 return static_cast<int16_t>(x) == x;
344}
345template <> constexpr inline bool isInt<32>(int64_t x) {
346 return static_cast<int32_t>(x) == x;
347}
348
349/// Checks if a signed integer is an N bit number shifted left by S.
350template <unsigned N, unsigned S>
351constexpr inline bool isShiftedInt(int64_t x) {
352 static_assert(
353 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
354 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
355 return isInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
356}
357
358/// Checks if an unsigned integer fits into the given bit width.
359///
360/// This is written as two functions rather than as simply
361///
362/// return N >= 64 || X < (UINT64_C(1) << N);
363///
364/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
365/// left too many places.
366template <unsigned N>
367constexpr inline typename std::enable_if<(N < 64), bool>::type
368isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1)1UL << (N));
371}
372template <unsigned N>
373constexpr inline typename std::enable_if<N >= 64, bool>::type
374isUInt(uint64_t X) {
375 return true;
376}
377
378// Template specializations to get better code for common cases.
379template <> constexpr inline bool isUInt<8>(uint64_t x) {
380 return static_cast<uint8_t>(x) == x;
381}
382template <> constexpr inline bool isUInt<16>(uint64_t x) {
383 return static_cast<uint16_t>(x) == x;
384}
385template <> constexpr inline bool isUInt<32>(uint64_t x) {
386 return static_cast<uint32_t>(x) == x;
387}
388
389/// Checks if an unsigned integer is an N bit number shifted left by S.
390template <unsigned N, unsigned S>
391constexpr inline bool isShiftedUInt(uint64_t x) {
392 static_assert(
393 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
394 static_assert(N + S <= 64,
395 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
396 // Per the two static_asserts above, S must be strictly less than 64. So
397 // 1 << S is not undefined behavior.
398 return isUInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
399}
400
401/// Gets the maximum value for a N-bit unsigned integer.
402inline uint64_t maxUIntN(uint64_t N) {
403 assert(N > 0 && N <= 64 && "integer width out of range");
404
405 // uint64_t(1) << 64 is undefined behavior, so we can't do
406 // (uint64_t(1) << N) - 1
407 // without checking first that N != 64. But this works and doesn't have a
408 // branch.
409 return UINT64_MAX(18446744073709551615UL) >> (64 - N);
410}
411
412/// Gets the minimum value for a N-bit signed integer.
413inline int64_t minIntN(int64_t N) {
414 assert(N > 0 && N <= 64 && "integer width out of range");
415
416 return -(UINT64_C(1)1UL<<(N-1));
417}
418
419/// Gets the maximum value for a N-bit signed integer.
420inline int64_t maxIntN(int64_t N) {
421 assert(N > 0 && N <= 64 && "integer width out of range");
422
423 // This relies on two's complement wraparound when N == 64, so we convert to
424 // int64_t only at the very end to avoid UB.
425 return (UINT64_C(1)1UL << (N - 1)) - 1;
426}
427
428/// Checks if an unsigned integer fits into the given (dynamic) bit width.
429inline bool isUIntN(unsigned N, uint64_t x) {
430 return N >= 64 || x <= maxUIntN(N);
431}
432
433/// Checks if a signed integer fits into the given (dynamic) bit width.
434inline bool isIntN(unsigned N, int64_t x) {
435 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
436}
437
438/// Return true if the argument is a non-empty sequence of ones starting at the
439/// least significant bit with the remainder zero (32 bit version).
440/// Ex. isMask_32(0x0000FFFFU) == true.
441constexpr inline bool isMask_32(uint32_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument is a non-empty sequence of ones starting at the
446/// least significant bit with the remainder zero (64 bit version).
447constexpr inline bool isMask_64(uint64_t Value) {
448 return Value && ((Value + 1) & Value) == 0;
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
453constexpr inline bool isShiftedMask_32(uint32_t Value) {
454 return Value && isMask_32((Value - 1) | Value);
455}
456
457/// Return true if the argument contains a non-empty sequence of ones with the
458/// remainder zero (64 bit version.)
459constexpr inline bool isShiftedMask_64(uint64_t Value) {
460 return Value && isMask_64((Value - 1) | Value);
461}
462
463/// Return true if the argument is a power of two > 0.
464/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
465constexpr inline bool isPowerOf2_32(uint32_t Value) {
466 return Value && !(Value & (Value - 1));
467}
468
469/// Return true if the argument is a power of two > 0 (64 bit edition.)
470constexpr inline bool isPowerOf2_64(uint64_t Value) {
471 return Value && !(Value & (Value - 1));
472}
473
474/// Return a byte-swapped representation of the 16-bit argument.
475inline uint16_t ByteSwap_16(uint16_t Value) {
476 return sys::SwapByteOrder_16(Value);
477}
478
479/// Return a byte-swapped representation of the 32-bit argument.
480inline uint32_t ByteSwap_32(uint32_t Value) {
481 return sys::SwapByteOrder_32(Value);
482}
483
484/// Return a byte-swapped representation of the 64-bit argument.
485inline uint64_t ByteSwap_64(uint64_t Value) {
486 return sys::SwapByteOrder_64(Value);
487}
488
489/// Count the number of ones from the most significant bit to the first
490/// zero bit.
491///
492/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countLeadingZeros<T>(~Value, ZB);
503}
504
505/// Count the number of ones from the least significant bit to the first
506/// zero bit.
507///
508/// Ex. countTrailingOnes(0x00FF00FF) == 8.
509/// Only unsigned integral types are allowed.
510///
511/// \param ZB the behavior on an input of all ones. Only ZB_Width and
512/// ZB_Undefined are valid arguments.
513template <typename T>
514unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
515 static_assert(std::numeric_limits<T>::is_integer &&
516 !std::numeric_limits<T>::is_signed,
517 "Only unsigned integral types are allowed.");
518 return countTrailingZeros<T>(~Value, ZB);
519}
520
521namespace detail {
522template <typename T, std::size_t SizeOfT> struct PopulationCounter {
523 static unsigned count(T Value) {
524 // Generic version, forward to 32 bits.
525 static_assert(SizeOfT <= 4, "Not implemented!");
526#if defined(__GNUC__4)
527 return __builtin_popcount(Value);
528#else
529 uint32_t v = Value;
530 v = v - ((v >> 1) & 0x55555555);
531 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
532 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
533#endif
534 }
535};
536
537template <typename T> struct PopulationCounter<T, 8> {
538 static unsigned count(T Value) {
539#if defined(__GNUC__4)
540 return __builtin_popcountll(Value);
541#else
542 uint64_t v = Value;
543 v = v - ((v >> 1) & 0x5555555555555555ULL);
544 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
545 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
546 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
547#endif
548 }
549};
550} // namespace detail
551
552/// Count the number of set bits in a value.
553/// Ex. countPopulation(0xF000F000) = 8
554/// Returns 0 if the word is zero.
555template <typename T>
556inline unsigned countPopulation(T Value) {
557 static_assert(std::numeric_limits<T>::is_integer &&
558 !std::numeric_limits<T>::is_signed,
559 "Only unsigned integral types are allowed.");
560 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
561}
562
563/// Compile time Log2.
564/// Valid only for positive powers of two.
565template <size_t kValue> constexpr inline size_t CTLog2() {
566 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
567 "Value is not a valid power of 2");
568 return 1 + CTLog2<kValue / 2>();
569}
570
571template <> constexpr inline size_t CTLog2<1>() { return 0; }
572
573/// Return the log base 2 of the specified value.
574inline double Log2(double Value) {
575#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
576 return __builtin_log(Value) / __builtin_log(2.0);
577#else
578 return log2(Value);
579#endif
580}
581
582/// Return the floor log base 2 of the specified value, -1 if the value is zero.
583/// (32 bit edition.)
584/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
585inline unsigned Log2_32(uint32_t Value) {
586 return 31 - countLeadingZeros(Value);
587}
588
589/// Return the floor log base 2 of the specified value, -1 if the value is zero.
590/// (64 bit edition.)
591inline unsigned Log2_64(uint64_t Value) {
592 return 63 - countLeadingZeros(Value);
593}
594
595/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
596/// (32 bit edition).
597/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
598inline unsigned Log2_32_Ceil(uint32_t Value) {
599 return 32 - countLeadingZeros(Value - 1);
600}
601
602/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
603/// (64 bit edition.)
604inline unsigned Log2_64_Ceil(uint64_t Value) {
605 return 64 - countLeadingZeros(Value - 1);
606}
607
608/// Return the greatest common divisor of the values using Euclid's algorithm.
609template <typename T>
610inline T greatestCommonDivisor(T A, T B) {
611 while (B) {
612 T Tmp = B;
613 B = A % B;
614 A = Tmp;
615 }
616 return A;
617}
618
619inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
620 return greatestCommonDivisor<uint64_t>(A, B);
621}
622
623/// This function takes a 64-bit integer and returns the bit equivalent double.
624inline double BitsToDouble(uint64_t Bits) {
625 double D;
626 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
627 memcpy(&D, &Bits, sizeof(Bits));
628 return D;
629}
630
631/// This function takes a 32-bit integer and returns the bit equivalent float.
632inline float BitsToFloat(uint32_t Bits) {
633 float F;
634 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
635 memcpy(&F, &Bits, sizeof(Bits));
636 return F;
637}
638
639/// This function takes a double and returns the bit equivalent 64-bit integer.
640/// Note that copying doubles around changes the bits of NaNs on some hosts,
641/// notably x86, so this routine cannot be used if these bits are needed.
642inline uint64_t DoubleToBits(double Double) {
643 uint64_t Bits;
644 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
645 memcpy(&Bits, &Double, sizeof(Double));
646 return Bits;
647}
648
649/// This function takes a float and returns the bit equivalent 32-bit integer.
650/// Note that copying floats around changes the bits of NaNs on some hosts,
651/// notably x86, so this routine cannot be used if these bits are needed.
652inline uint32_t FloatToBits(float Float) {
653 uint32_t Bits;
654 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
655 memcpy(&Bits, &Float, sizeof(Float));
656 return Bits;
657}
658
659/// A and B are either alignments or offsets. Return the minimum alignment that
660/// may be assumed after adding the two together.
661constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
662 // The largest power of 2 that divides both A and B.
663 //
664 // Replace "-Value" by "1+~Value" in the following commented code to avoid
665 // MSVC warning C4146
666 // return (A | B) & -(A | B);
667 return (A | B) & (1 + ~(A | B));
668}
669
670/// Returns the next power of two (in 64-bits) that is strictly greater than A.
671/// Returns zero on overflow.
672inline uint64_t NextPowerOf2(uint64_t A) {
673 A |= (A >> 1);
674 A |= (A >> 2);
675 A |= (A >> 4);
676 A |= (A >> 8);
677 A |= (A >> 16);
678 A |= (A >> 32);
679 return A + 1;
680}
681
682/// Returns the power of two which is less than or equal to the given value.
683/// Essentially, it is a floor operation across the domain of powers of two.
684inline uint64_t PowerOf2Floor(uint64_t A) {
685 if (!A) return 0;
686 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
687}
688
689/// Returns the power of two which is greater than or equal to the given value.
690/// Essentially, it is a ceil operation across the domain of powers of two.
691inline uint64_t PowerOf2Ceil(uint64_t A) {
692 if (!A)
693 return 0;
694 return NextPowerOf2(A - 1);
695}
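// Illustrative sketch (not part of MathExtras.h): floor snaps down to a power
// of two, ceil snaps up, and exact powers of two map to themselves. The name
// checkPowerOf2Examples is hypothetical.
inline void checkPowerOf2Examples() {
  assert(PowerOf2Floor(12) == 8 && PowerOf2Floor(16) == 16);
  assert(PowerOf2Ceil(12) == 16 && PowerOf2Ceil(16) == 16);
}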
696
697/// Returns the next integer (mod 2**64) that is greater than or equal to
698/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
699///
700/// If non-zero \p Skew is specified, the return value will be a minimal
701/// integer that is greater than or equal to \p Value and equal to
702/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
703/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
704///
705/// Examples:
706/// \code
707/// alignTo(5, 8) = 8
708/// alignTo(17, 8) = 24
709/// alignTo(~0LL, 8) = 0
710/// alignTo(321, 255) = 510
711///
712/// alignTo(5, 8, 7) = 7
713/// alignTo(17, 8, 1) = 17
714/// alignTo(~0LL, 8, 3) = 3
715/// alignTo(321, 255, 42) = 552
716/// \endcode
717inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
718 assert(Align != 0u && "Align can't be 0.");
719 Skew %= Align;
720 return (Value + Align - 1 - Skew) / Align * Align + Skew;
721}
722
723/// Returns the next integer (mod 2**64) that is greater than or equal to
724/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
725template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
726 static_assert(Align != 0u, "Align must be non-zero");
727 return (Value + Align - 1) / Align * Align;
728}
729
730/// Returns the integer ceil(Numerator / Denominator).
731inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
732 return alignTo(Numerator, Denominator) / Denominator;
733}
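// Illustrative sketch (not part of MathExtras.h): alignTo rounds up to the
// next multiple of Align (optionally offset by Skew), and divideCeil is the
// same rounding expressed as a quotient. The name checkAlignToExamples is
// hypothetical.
inline void checkAlignToExamples() {
  assert(alignTo(17, 8) == 24 && alignTo<8>(17) == 24);
  assert(alignTo(13, 8, 3) == 19); // smallest value >= 13 that is 3 mod 8
  assert(divideCeil(17, 8) == 3);  // ceil(17 / 8)
}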
734
735/// Returns the largest uint64_t less than or equal to \p Value and is
736/// \p Skew mod \p Align. \p Align must be non-zero
737inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
738 assert(Align != 0u && "Align can't be 0.");
739 Skew %= Align;
740 return (Value - Skew) / Align * Align + Skew;
741}
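// Illustrative sketch (not part of MathExtras.h): alignDown is the mirror of
// alignTo, rounding down to a multiple of Align plus Skew. The name
// checkAlignDownExamples is hypothetical.
inline void checkAlignDownExamples() {
  assert(alignDown(17, 8) == 16);
  assert(alignDown(17, 8, 3) == 11); // largest value <= 17 that is 3 mod 8
}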
742
743/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
744/// Requires 0 < B <= 32.
745template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
746 static_assert(B > 0, "Bit width can't be 0.");
747 static_assert(B <= 32, "Bit width out of range.");
748 return int32_t(X << (32 - B)) >> (32 - B);
749}
750
751/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
752/// Requires 0 < B <= 32.
753inline int32_t SignExtend32(uint32_t X, unsigned B) {
754 assert(B > 0 && "Bit width can't be 0.");
755 assert(B <= 32 && "Bit width out of range.");
756 return int32_t(X << (32 - B)) >> (32 - B);
757}
758
759/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
760/// Requires 0 < B <= 64.
761template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 64, "Bit width out of range.");
764 return int64_t(x << (64 - B)) >> (64 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
768/// Requires 0 < B <= 64.
769inline int64_t SignExtend64(uint64_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 64 && "Bit width out of range.");
772 return int64_t(X << (64 - B)) >> (64 - B);
773}
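// Illustrative sketch (not part of MathExtras.h): shifting the low B bits up
// to the type's top bit and arithmetic-shifting back replicates bit B-1 into
// the upper bits. The name checkSignExtendExamples is hypothetical.
inline void checkSignExtendExamples() {
  assert(SignExtend32<4>(0xF) == -1);    // 0b1111 read as a 4-bit value
  assert(SignExtend32(0x7, 4) == 7);     // sign bit clear, value unchanged
  assert(SignExtend64<8>(0x80) == -128); // 0b1000'0000 read as an 8-bit value
}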
774
775/// Subtract two unsigned integers, X and Y, of type T and return the absolute
776/// value of the result.
777template <typename T>
778typename std::enable_if<std::is_unsigned<T>::value, T>::type
779AbsoluteDifference(T X, T Y) {
780 return std::max(X, Y) - std::min(X, Y);
781}
782
783/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
784/// maximum representable value of T on overflow. ResultOverflowed indicates if
785/// the result is larger than the maximum representable value of type T.
786template <typename T>
787typename std::enable_if<std::is_unsigned<T>::value, T>::type
788SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
789 bool Dummy;
790 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
791 // Hacker's Delight, p. 29
792 T Z = X + Y;
793 Overflowed = (Z < X || Z < Y);
794 if (Overflowed)
795 return std::numeric_limits<T>::max();
796 else
797 return Z;
798}
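// Illustrative sketch (not part of MathExtras.h): the wrapped sum is smaller
// than either operand exactly when the addition overflowed, in which case the
// result clamps to the type's maximum. The name checkSaturatingAddExamples is
// hypothetical.
inline void checkSaturatingAddExamples() {
  bool Ov = false;
  assert(SaturatingAdd<uint8_t>(200, 100, &Ov) == 255 && Ov);  // clamped
  assert(SaturatingAdd<uint8_t>(100, 100, &Ov) == 200 && !Ov); // exact
}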
799
800/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
801/// maximum representable value of T on overflow. ResultOverflowed indicates if
802/// the result is larger than the maximum representable value of type T.
803template <typename T>
804typename std::enable_if<std::is_unsigned<T>::value, T>::type
805SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
806 bool Dummy;
807 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
808
809 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
810 // because it fails for uint16_t (where multiplication can have undefined
811 // behavior due to promotion to int), and requires a division in addition
812 // to the multiplication.
813
814 Overflowed = false;
815
816 // Log2(Z) would be either Log2Z or Log2Z + 1.
817 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
818 // will necessarily be less than Log2Max as desired.
819 int Log2Z = Log2_64(X) + Log2_64(Y);
820 const T Max = std::numeric_limits<T>::max();
821 int Log2Max = Log2_64(Max);
822 if (Log2Z < Log2Max) {
823 return X * Y;
824 }
825 if (Log2Z > Log2Max) {
826 Overflowed = true;
827 return Max;
828 }
829
830 // We're going to use the top bit, and maybe overflow one
831 // bit past it. Multiply all but the bottom bit then add
832 // that on at the end.
833 T Z = (X >> 1) * Y;
834 if (Z & ~(Max >> 1)) {
835 Overflowed = true;
836 return Max;
837 }
838 Z <<= 1;
839 if (X & 1)
840 return SaturatingAdd(Z, Y, ResultOverflowed);
841
842 return Z;
843}
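// Illustrative sketch (not part of MathExtras.h): the Log2 pre-check above
// decides every case except Log2Z == Log2Max, where the product is computed
// on X/2 and the dropped low bit is added back via SaturatingAdd. The name
// checkSaturatingMultiplyExamples is hypothetical.
inline void checkSaturatingMultiplyExamples() {
  bool Ov = false;
  assert(SaturatingMultiply<uint8_t>(16, 16, &Ov) == 255 && Ov);  // 256 clamps
  assert(SaturatingMultiply<uint8_t>(15, 17, &Ov) == 255 && !Ov); // exactly 255
}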
844
845/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
846/// the product. Clamp the result to the maximum representable value of T on
847/// overflow. ResultOverflowed indicates if the result is larger than the
848/// maximum representable value of type T.
849template <typename T>
850typename std::enable_if<std::is_unsigned<T>::value, T>::type
851SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
852 bool Dummy;
853 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
854
855 T Product = SaturatingMultiply(X, Y, &Overflowed);
856 if (Overflowed)
857 return Product;
858
859 return SaturatingAdd(A, Product, &Overflowed);
860}
861
862/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
863extern const float huge_valf;
864
865
866/// Add two signed integers, computing the two's complement truncated result,
867/// returning true if overflow occurred.
868template <typename T>
869typename std::enable_if<std::is_signed<T>::value, T>::type
870AddOverflow(T X, T Y, T &Result) {
871#if __has_builtin(__builtin_add_overflow)
872 return __builtin_add_overflow(X, Y, &Result);
873#else
874 // Perform the unsigned addition.
875 using U = typename std::make_unsigned<T>::type;
876 const U UX = static_cast<U>(X);
877 const U UY = static_cast<U>(Y);
878 const U UResult = UX + UY;
879
880 // Convert to signed.
881 Result = static_cast<T>(UResult);
882
883 // Adding two positive numbers should result in a positive number.
884 if (X > 0 && Y > 0)
885 return Result <= 0;
886 // Adding two negatives should result in a negative number.
887 if (X < 0 && Y < 0)
888 return Result >= 0;
889 return false;
890#endif
891}
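// Illustrative sketch (not part of MathExtras.h): AddOverflow reports whether
// the two's complement sum wrapped while still storing the truncated result.
// The name checkAddOverflowExamples is hypothetical.
inline void checkAddOverflowExamples() {
  int8_t R;
  assert(AddOverflow<int8_t>(100, 50, R) && R == int8_t(-106)); // 150 wraps
  assert(!AddOverflow<int8_t>(100, -50, R) && R == 50);         // in range
}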
892
893/// Subtract two signed integers, computing the two's complement truncated
894/// result, returning true if an overflow occurred.
895template <typename T>
896typename std::enable_if<std::is_signed<T>::value, T>::type
897SubOverflow(T X, T Y, T &Result) {
898#if __has_builtin(__builtin_sub_overflow)
899 return __builtin_sub_overflow(X, Y, &Result);
900#else
901 // Perform the unsigned subtraction.
902 using U = typename std::make_unsigned<T>::type;
903 const U UX = static_cast<U>(X);
904 const U UY = static_cast<U>(Y);
905 const U UResult = UX - UY;
906
907 // Convert to signed.
908 Result = static_cast<T>(UResult);
909
910 // Subtracting a positive number from a negative results in a negative number.
911 if (X <= 0 && Y > 0)
912 return Result >= 0;
913 // Subtracting a negative number from a positive results in a positive number.
914 if (X >= 0 && Y < 0)
915 return Result <= 0;
916 return false;
917#endif
918}
919
920
921/// Multiply two signed integers, computing the two's complement truncated
922/// result, returning true if an overflow occurred.
923template <typename T>
924typename std::enable_if<std::is_signed<T>::value, T>::type
925MulOverflow(T X, T Y, T &Result) {
926 // Perform the unsigned multiplication on absolute values.
927 using U = typename std::make_unsigned<T>::type;
928 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
929 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
930 const U UResult = UX * UY;
931
932 // Convert to signed.
933 const bool IsNegative = (X < 0) ^ (Y < 0);
934 Result = IsNegative ? (0 - UResult) : UResult;
935
936 // If any of the args was 0, result is 0 and no overflow occurs.
937 if (UX == 0 || UY == 0)
938 return false;
939
940 // UX and UY are in [1, 2^n], where n is the number of digits.
941 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
942 // positive) divided by an argument compares to the other.
943 if (IsNegative)
944 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
945 else
946 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
947}
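// Illustrative sketch (not part of MathExtras.h): MulOverflow compares one
// absolute value against the largest allowed quotient rather than relying on
// hardware overflow flags. The name checkMulOverflowExamples is hypothetical.
inline void checkMulOverflowExamples() {
  int8_t R;
  assert(MulOverflow<int8_t>(16, 8, R));                // 128 > INT8_MAX
  assert(!MulOverflow<int8_t>(-16, 8, R) && R == -128); // -128 still fits
}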
948
949} // End llvm namespace
950
951#endif