Bug Summary

File: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4159, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
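
For context: in C++, shifting a 32-bit 'unsigned int' by a count greater than or equal to 32 is undefined behavior, which is the rule this warning reports. The sketch below is illustrative only, with hypothetical names that are not taken from line 4159; it shows the flagged pattern and one common way to guard it.

// Illustrative sketch only -- hypothetical names, not the code at line 4159.
unsigned shiftDown(unsigned Value, unsigned Amount) {
  return Value >> Amount;                 // undefined behavior when Amount >= 32
}

unsigned shiftDownGuarded(unsigned Value, unsigned Amount) {
  // Keep the shift count in [0, 31] and handle the full-width case explicitly.
  return Amount >= 32 ? 0u : Value >> Amount;
}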

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/include -I /build/llvm-toolchain-snapshot-10~svn374877/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~svn374877=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2019-10-15-233810-7101-1 -x c++ /build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUCallLowering.h"
18#include "AMDGPUFrameLowering.h"
19#include "AMDGPURegisterInfo.h"
20#include "AMDGPUSubtarget.h"
21#include "AMDGPUTargetMachine.h"
22#include "Utils/AMDGPUBaseInfo.h"
23#include "R600MachineFunctionInfo.h"
24#include "SIInstrInfo.h"
25#include "SIMachineFunctionInfo.h"
26#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
27#include "llvm/CodeGen/Analysis.h"
28#include "llvm/CodeGen/CallingConvLower.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineRegisterInfo.h"
31#include "llvm/CodeGen/SelectionDAG.h"
32#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
33#include "llvm/IR/DataLayout.h"
34#include "llvm/IR/DiagnosticInfo.h"
35#include "llvm/Support/KnownBits.h"
36#include "llvm/Support/MathExtras.h"
37using namespace llvm;
38
39#include "AMDGPUGenCallingConv.inc"
40
41// Find a larger type to do a load / store of a vector with.
42EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
43 unsigned StoreSize = VT.getStoreSizeInBits();
44 if (StoreSize <= 32)
45 return EVT::getIntegerVT(Ctx, StoreSize);
46
47 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
48 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
49}
50
51unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 EVT VT = Op.getValueType();
53 KnownBits Known = DAG.computeKnownBits(Op);
54 return VT.getSizeInBits() - Known.countMinLeadingZeros();
55}
56
57unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
58 EVT VT = Op.getValueType();
59
60 // In order for this to be a signed 24-bit value, bit 23 must
61 // be a sign bit.
62 return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
63}
64
65AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
66 const AMDGPUSubtarget &STI)
67 : TargetLowering(TM), Subtarget(&STI) {
68 // Lower floating point store/load to integer store/load to reduce the number
69 // of patterns in tablegen.
70 setOperationAction(ISD::LOAD, MVT::f32, Promote);
71 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
72
73 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
74 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
75
76 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
77 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
78
79 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
80 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
81
82 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
83 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
84
85 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
86 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
87
88 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
89 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
90
91 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
92 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
93
94 setOperationAction(ISD::LOAD, MVT::i64, Promote);
95 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
96
97 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
98 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
99
100 setOperationAction(ISD::LOAD, MVT::f64, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
102
103 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
105
106 // There are no 64-bit extloads. These should be done as a 32-bit extload and
107 // an extension to 64-bit.
108 for (MVT VT : MVT::integer_valuetypes()) {
109 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
110 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
111 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
112 }
113
114 for (MVT VT : MVT::integer_valuetypes()) {
115 if (VT == MVT::i64)
116 continue;
117
118 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
119 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
120 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
121 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
122
123 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
124 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
125 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
126 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
127
128 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
129 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
130 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
131 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
132 }
133
134 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
135 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
137 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
138 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
139 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
141 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
143 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
145 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
146 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
148 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
149 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
150 }
151
152 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
153 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
154 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
155 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
156 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
157 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
158 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
159
160 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
161 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
162 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
163 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
164
165 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
166 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
167 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
168 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
169
170 setOperationAction(ISD::STORE, MVT::f32, Promote);
171 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
172
173 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
174 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
175
176 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
177 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
178
179 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
180 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
181
182 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
183 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
184
185 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
186 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
187
188 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
189 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
190
191 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
192 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
193
194 setOperationAction(ISD::STORE, MVT::i64, Promote);
195 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
196
197 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
198 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
199
200 setOperationAction(ISD::STORE, MVT::f64, Promote);
201 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
202
203 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
204 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
205
206 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
207 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
208 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210
211 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
212 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
213 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
214 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
215
216 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
217 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
218 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
219 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
220 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
221 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
222 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
223
224 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
225 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
226
227 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
228 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
229
230 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
231 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
232
233 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
234 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
235
236
237 setOperationAction(ISD::Constant, MVT::i32, Legal);
238 setOperationAction(ISD::Constant, MVT::i64, Legal);
239 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
240 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
241
242 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
243 setOperationAction(ISD::BRIND, MVT::Other, Expand);
244
245 // This is totally unsupported, just custom lower to produce an error.
246 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
247
248 // Library functions. These default to Expand, but we have instructions
249 // for them.
250 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
251 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
252 setOperationAction(ISD::FPOW, MVT::f32, Legal);
253 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
254 setOperationAction(ISD::FABS, MVT::f32, Legal);
255 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
256 setOperationAction(ISD::FRINT, MVT::f32, Legal);
257 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
258 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
259 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
260
261 setOperationAction(ISD::FROUND, MVT::f32, Custom);
262 setOperationAction(ISD::FROUND, MVT::f64, Custom);
263
264 setOperationAction(ISD::FLOG, MVT::f32, Custom);
265 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
266 setOperationAction(ISD::FEXP, MVT::f32, Custom);
267
268
269 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
270 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
271
272 setOperationAction(ISD::FREM, MVT::f32, Custom);
273 setOperationAction(ISD::FREM, MVT::f64, Custom);
274
275 // Expand to fneg + fadd.
276 setOperationAction(ISD::FSUB, MVT::f64, Expand);
277
278 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
279 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
280 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
281 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
282 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
283 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
284 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
285 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
286 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
287 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
288 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
289 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
290 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
291 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
292 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
293 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
294 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
295 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
296 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
297 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
298 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
299 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
300
301 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
302 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
303 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
304
305 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
306 for (MVT VT : ScalarIntVTs) {
307 // These should use [SU]DIVREM, so set them to expand
308 setOperationAction(ISD::SDIV, VT, Expand);
309 setOperationAction(ISD::UDIV, VT, Expand);
310 setOperationAction(ISD::SREM, VT, Expand);
311 setOperationAction(ISD::UREM, VT, Expand);
312
313 // GPU does not have divrem function for signed or unsigned.
314 setOperationAction(ISD::SDIVREM, VT, Custom);
315 setOperationAction(ISD::UDIVREM, VT, Custom);
316
317 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
318 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
319 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
320
321 setOperationAction(ISD::BSWAP, VT, Expand);
322 setOperationAction(ISD::CTTZ, VT, Expand);
323 setOperationAction(ISD::CTLZ, VT, Expand);
324
325 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
326 setOperationAction(ISD::ADDC, VT, Legal);
327 setOperationAction(ISD::SUBC, VT, Legal);
328 setOperationAction(ISD::ADDE, VT, Legal);
329 setOperationAction(ISD::SUBE, VT, Legal);
330 }
331
332 // The hardware supports 32-bit ROTR, but not ROTL.
333 setOperationAction(ISD::ROTL, MVT::i32, Expand);
334 setOperationAction(ISD::ROTL, MVT::i64, Expand);
335 setOperationAction(ISD::ROTR, MVT::i64, Expand);
336
337 setOperationAction(ISD::MUL, MVT::i64, Expand);
338 setOperationAction(ISD::MULHU, MVT::i64, Expand);
339 setOperationAction(ISD::MULHS, MVT::i64, Expand);
340 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
341 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
342 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
343 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
344 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
345
346 setOperationAction(ISD::SMIN, MVT::i32, Legal);
347 setOperationAction(ISD::UMIN, MVT::i32, Legal);
348 setOperationAction(ISD::SMAX, MVT::i32, Legal);
349 setOperationAction(ISD::UMAX, MVT::i32, Legal);
350
351 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
352 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
353 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
354 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
355
356 static const MVT::SimpleValueType VectorIntTypes[] = {
357 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
358 };
359
360 for (MVT VT : VectorIntTypes) {
361 // Expand the following operations for the current type by default.
362 setOperationAction(ISD::ADD, VT, Expand);
363 setOperationAction(ISD::AND, VT, Expand);
364 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
365 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
366 setOperationAction(ISD::MUL, VT, Expand);
367 setOperationAction(ISD::MULHU, VT, Expand);
368 setOperationAction(ISD::MULHS, VT, Expand);
369 setOperationAction(ISD::OR, VT, Expand);
370 setOperationAction(ISD::SHL, VT, Expand);
371 setOperationAction(ISD::SRA, VT, Expand);
372 setOperationAction(ISD::SRL, VT, Expand);
373 setOperationAction(ISD::ROTL, VT, Expand);
374 setOperationAction(ISD::ROTR, VT, Expand);
375 setOperationAction(ISD::SUB, VT, Expand);
376 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
377 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
378 setOperationAction(ISD::SDIV, VT, Expand);
379 setOperationAction(ISD::UDIV, VT, Expand);
380 setOperationAction(ISD::SREM, VT, Expand);
381 setOperationAction(ISD::UREM, VT, Expand);
382 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
383 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
384 setOperationAction(ISD::SDIVREM, VT, Custom);
385 setOperationAction(ISD::UDIVREM, VT, Expand);
386 setOperationAction(ISD::SELECT, VT, Expand);
387 setOperationAction(ISD::VSELECT, VT, Expand);
388 setOperationAction(ISD::SELECT_CC, VT, Expand);
389 setOperationAction(ISD::XOR, VT, Expand);
390 setOperationAction(ISD::BSWAP, VT, Expand);
391 setOperationAction(ISD::CTPOP, VT, Expand);
392 setOperationAction(ISD::CTTZ, VT, Expand);
393 setOperationAction(ISD::CTLZ, VT, Expand);
394 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
395 setOperationAction(ISD::SETCC, VT, Expand);
396 }
397
398 static const MVT::SimpleValueType FloatVectorTypes[] = {
399 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
400 };
401
402 for (MVT VT : FloatVectorTypes) {
403 setOperationAction(ISD::FABS, VT, Expand);
404 setOperationAction(ISD::FMINNUM, VT, Expand);
405 setOperationAction(ISD::FMAXNUM, VT, Expand);
406 setOperationAction(ISD::FADD, VT, Expand);
407 setOperationAction(ISD::FCEIL, VT, Expand);
408 setOperationAction(ISD::FCOS, VT, Expand);
409 setOperationAction(ISD::FDIV, VT, Expand);
410 setOperationAction(ISD::FEXP2, VT, Expand);
411 setOperationAction(ISD::FEXP, VT, Expand);
412 setOperationAction(ISD::FLOG2, VT, Expand);
413 setOperationAction(ISD::FREM, VT, Expand);
414 setOperationAction(ISD::FLOG, VT, Expand);
415 setOperationAction(ISD::FLOG10, VT, Expand);
416 setOperationAction(ISD::FPOW, VT, Expand);
417 setOperationAction(ISD::FFLOOR, VT, Expand);
418 setOperationAction(ISD::FTRUNC, VT, Expand);
419 setOperationAction(ISD::FMUL, VT, Expand);
420 setOperationAction(ISD::FMA, VT, Expand);
421 setOperationAction(ISD::FRINT, VT, Expand);
422 setOperationAction(ISD::FNEARBYINT, VT, Expand);
423 setOperationAction(ISD::FSQRT, VT, Expand);
424 setOperationAction(ISD::FSIN, VT, Expand);
425 setOperationAction(ISD::FSUB, VT, Expand);
426 setOperationAction(ISD::FNEG, VT, Expand);
427 setOperationAction(ISD::VSELECT, VT, Expand);
428 setOperationAction(ISD::SELECT_CC, VT, Expand);
429 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
430 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
431 setOperationAction(ISD::SETCC, VT, Expand);
432 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
433 }
434
435 // This causes using an unrolled select operation rather than expansion with
436 // bit operations. This is in general better, but the alternative using BFI
437 // instructions may be better if the select sources are SGPRs.
438 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
439 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
440
441 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
442 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
443
444 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
445 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
446
447 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
448 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
449
450 // There are no libcalls of any kind.
451 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
452 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
453
454 setBooleanContents(ZeroOrNegativeOneBooleanContent);
455 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
456
457 setSchedulingPreference(Sched::RegPressure);
458 setJumpIsExpensive(true);
459
460 // FIXME: This is only partially true. If we have to do vector compares, any
461 // SGPR pair can be a condition register. If we have a uniform condition, we
462 // are better off doing SALU operations, where there is only one SCC. For now,
463 // we don't have a way of knowing during instruction selection if a condition
464 // will be uniform and we always use vector compares. Assume we are using
465 // vector compares until that is fixed.
466 setHasMultipleConditionRegisters(true);
467
468 setMinCmpXchgSizeInBits(32);
469 setSupportsUnalignedAtomics(false);
470
471 PredictableSelectIsExpensive = false;
472
473 // We want to find all load dependencies for long chains of stores to enable
474 // merging into very wide vectors. The problem is with vectors with > 4
475 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
476 // vectors are a legal type, even though we have to split the loads
477 // usually. When we can more precisely specify load legality per address
478 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
479 // smarter so that they can figure out what to do in 2 iterations without all
480 // N > 4 stores on the same chain.
481 GatherAllAliasesMaxDepth = 16;
482
483 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
484 // about these during lowering.
485 MaxStoresPerMemcpy = 0xffffffff;
486 MaxStoresPerMemmove = 0xffffffff;
487 MaxStoresPerMemset = 0xffffffff;
488
489 setTargetDAGCombine(ISD::BITCAST);
490 setTargetDAGCombine(ISD::SHL);
491 setTargetDAGCombine(ISD::SRA);
492 setTargetDAGCombine(ISD::SRL);
493 setTargetDAGCombine(ISD::TRUNCATE);
494 setTargetDAGCombine(ISD::MUL);
495 setTargetDAGCombine(ISD::MULHU);
496 setTargetDAGCombine(ISD::MULHS);
497 setTargetDAGCombine(ISD::SELECT);
498 setTargetDAGCombine(ISD::SELECT_CC);
499 setTargetDAGCombine(ISD::STORE);
500 setTargetDAGCombine(ISD::FADD);
501 setTargetDAGCombine(ISD::FSUB);
502 setTargetDAGCombine(ISD::FNEG);
503 setTargetDAGCombine(ISD::FABS);
504 setTargetDAGCombine(ISD::AssertZext);
505 setTargetDAGCombine(ISD::AssertSext);
506 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
507}
508
509//===----------------------------------------------------------------------===//
510// Target Information
511//===----------------------------------------------------------------------===//
512
513LLVM_READNONE
514static bool fnegFoldsIntoOp(unsigned Opc) {
515 switch (Opc) {
516 case ISD::FADD:
517 case ISD::FSUB:
518 case ISD::FMUL:
519 case ISD::FMA:
520 case ISD::FMAD:
521 case ISD::FMINNUM:
522 case ISD::FMAXNUM:
523 case ISD::FMINNUM_IEEE:
524 case ISD::FMAXNUM_IEEE:
525 case ISD::FSIN:
526 case ISD::FTRUNC:
527 case ISD::FRINT:
528 case ISD::FNEARBYINT:
529 case ISD::FCANONICALIZE:
530 case AMDGPUISD::RCP:
531 case AMDGPUISD::RCP_LEGACY:
532 case AMDGPUISD::RCP_IFLAG:
533 case AMDGPUISD::SIN_HW:
534 case AMDGPUISD::FMUL_LEGACY:
535 case AMDGPUISD::FMIN_LEGACY:
536 case AMDGPUISD::FMAX_LEGACY:
537 case AMDGPUISD::FMED3:
538 return true;
539 default:
540 return false;
541 }
542}
543
544/// \p returns true if the operation will definitely need to use a 64-bit
545/// encoding, and thus will use a VOP3 encoding regardless of the source
546/// modifiers.
547LLVM_READONLY
548static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
549 return N->getNumOperands() > 2 || VT == MVT::f64;
550}
551
552// Most FP instructions support source modifiers, but this could be refined
553// slightly.
554LLVM_READONLY
555static bool hasSourceMods(const SDNode *N) {
556 if (isa<MemSDNode>(N))
557 return false;
558
559 switch (N->getOpcode()) {
560 case ISD::CopyToReg:
561 case ISD::SELECT:
562 case ISD::FDIV:
563 case ISD::FREM:
564 case ISD::INLINEASM:
565 case ISD::INLINEASM_BR:
566 case AMDGPUISD::INTERP_P1:
567 case AMDGPUISD::INTERP_P2:
568 case AMDGPUISD::DIV_SCALE:
569
570 // TODO: Should really be looking at the users of the bitcast. These are
571 // problematic because bitcasts are used to legalize all stores to integer
572 // types.
573 case ISD::BITCAST:
574 return false;
575 default:
576 return true;
577 }
578}
579
580bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
581 unsigned CostThreshold) {
582 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
583 // it is truly free to use a source modifier in all cases. If there are
584 // multiple users but for each one will necessitate using VOP3, there will be
585 // a code size increase. Try to avoid increasing code size unless we know it
586 // will save on the instruction count.
587 unsigned NumMayIncreaseSize = 0;
588 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
589
590 // XXX - Should this limit number of uses to check?
591 for (const SDNode *U : N->uses()) {
592 if (!hasSourceMods(U))
593 return false;
594
595 if (!opMustUseVOP3Encoding(U, VT)) {
596 if (++NumMayIncreaseSize > CostThreshold)
597 return false;
598 }
599 }
600
601 return true;
602}
603
604MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
605 return MVT::i32;
606}
607
608bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
609 return true;
610}
611
612// The backend supports 32 and 64 bit floating point immediates.
613// FIXME: Why are we reporting vectors of FP immediates as legal?
614bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
615 bool ForCodeSize) const {
616 EVT ScalarVT = VT.getScalarType();
617 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
618 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
619}
620
621// We don't want to shrink f64 / f32 constants.
622bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
623 EVT ScalarVT = VT.getScalarType();
624 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
625}
626
627bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
628 ISD::LoadExtType ExtTy,
629 EVT NewVT) const {
630 // TODO: This may be worth removing. Check regression tests for diffs.
631 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
632 return false;
633
634 unsigned NewSize = NewVT.getStoreSizeInBits();
635
636 // If we are reducing to a 32-bit load, this is always better.
637 if (NewSize == 32)
638 return true;
639
640 EVT OldVT = N->getValueType(0);
641 unsigned OldSize = OldVT.getStoreSizeInBits();
642
643 MemSDNode *MN = cast<MemSDNode>(N);
644 unsigned AS = MN->getAddressSpace();
645 // Do not shrink an aligned scalar load to sub-dword.
646 // Scalar engine cannot do sub-dword loads.
647 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
648 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
649 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
650 (isa<LoadSDNode>(N) &&
651 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
652 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
653 return false;
654
655 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
656 // extloads, so doing one requires using a buffer_load. In cases where we
657 // still couldn't use a scalar load, using the wider load shouldn't really
658 // hurt anything.
659
660 // If the old size already had to be an extload, there's no harm in continuing
661 // to reduce the width.
662 return (OldSize < 32);
663}
664
665bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
666 const SelectionDAG &DAG,
667 const MachineMemOperand &MMO) const {
668
669 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
670
671 if (LoadTy.getScalarType() == MVT::i32)
672 return false;
673
674 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
675 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
676
677 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
678 return false;
679
680 bool Fast = false;
681 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
682 CastTy, MMO, &Fast) &&
683 Fast;
684}
685
686// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
687// profitable with the expansion for 64-bit since it's generally good to
688// speculate things.
689// FIXME: These should really have the size as a parameter.
690bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
691 return true;
692}
693
694bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
695 return true;
696}
697
698bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
699 switch (N->getOpcode()) {
700 default:
701 return false;
702 case ISD::EntryToken:
703 case ISD::TokenFactor:
704 return true;
705 case ISD::INTRINSIC_WO_CHAIN:
706 {
707 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
708 switch (IntrID) {
709 default:
710 return false;
711 case Intrinsic::amdgcn_readfirstlane:
712 case Intrinsic::amdgcn_readlane:
713 return true;
714 }
715 }
716 break;
717 case ISD::LOAD:
718 {
719 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
720 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
721 return true;
722 return false;
723 }
724 break;
725 }
726}
727
728//===---------------------------------------------------------------------===//
729// Target Properties
730//===---------------------------------------------------------------------===//
731
732bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
733 assert(VT.isFloatingPoint());
734
735 // Packed operations do not have a fabs modifier.
736 return VT == MVT::f32 || VT == MVT::f64 ||
737 (Subtarget->has16BitInsts() && VT == MVT::f16);
738}
739
740bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
741 assert(VT.isFloatingPoint());
742 return VT == MVT::f32 || VT == MVT::f64 ||
743 (Subtarget->has16BitInsts() && VT == MVT::f16) ||
744 (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
745}
746
747bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
748 unsigned NumElem,
749 unsigned AS) const {
750 return true;
751}
752
753bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
754 // There are few operations which truly have vector input operands. Any vector
755 // operation is going to involve operations on each component, and a
756 // build_vector will be a copy per element, so it always makes sense to use a
757 // build_vector input in place of the extracted element to avoid a copy into a
758 // super register.
759 //
760 // We should probably only do this if all users are extracts only, but this
761 // should be the common case.
762 return true;
763}
764
765bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
766 // Truncate is just accessing a subregister.
767
768 unsigned SrcSize = Source.getSizeInBits();
769 unsigned DestSize = Dest.getSizeInBits();
770
771 return DestSize < SrcSize && DestSize % 32 == 0;
772}
773
774bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
775 // Truncate is just accessing a subregister.
776
777 unsigned SrcSize = Source->getScalarSizeInBits();
778 unsigned DestSize = Dest->getScalarSizeInBits();
779
780 if (DestSize == 16 && Subtarget->has16BitInsts())
781 return SrcSize >= 32;
782
783 return DestSize < SrcSize && DestSize % 32 == 0;
784}
785
786bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
787 unsigned SrcSize = Src->getScalarSizeInBits();
788 unsigned DestSize = Dest->getScalarSizeInBits();
789
790 if (SrcSize == 16 && Subtarget->has16BitInsts())
791 return DestSize >= 32;
792
793 return SrcSize == 32 && DestSize == 64;
794}
795
796bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
797 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
798 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
799 // this will enable reducing 64-bit operations to 32-bit, which is always
800 // good.
801
802 if (Src == MVT::i16)
803 return Dest == MVT::i32 || Dest == MVT::i64;
804
805 return Src == MVT::i32 && Dest == MVT::i64;
806}
807
808bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
809 return isZExtFree(Val.getValueType(), VT2);
810}
811
812bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
813 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
814 // limited number of native 64-bit operations. Shrinking an operation to fit
815 // in a single 32-bit register should always be helpful. As currently used,
816 // this is much less general than the name suggests, and is only used in
817 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
818 // not profitable, and may actually be harmful.
819 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
820}
821
822//===---------------------------------------------------------------------===//
823// TargetLowering Callbacks
824//===---------------------------------------------------------------------===//
825
826CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
827 bool IsVarArg) {
828 switch (CC) {
829 case CallingConv::AMDGPU_VS:
830 case CallingConv::AMDGPU_GS:
831 case CallingConv::AMDGPU_PS:
832 case CallingConv::AMDGPU_CS:
833 case CallingConv::AMDGPU_HS:
834 case CallingConv::AMDGPU_ES:
835 case CallingConv::AMDGPU_LS:
836 return CC_AMDGPU;
837 case CallingConv::C:
838 case CallingConv::Fast:
839 case CallingConv::Cold:
840 return CC_AMDGPU_Func;
841 case CallingConv::AMDGPU_KERNEL:
842 case CallingConv::SPIR_KERNEL:
843 default:
844 report_fatal_error("Unsupported calling convention for call");
845 }
846}
847
848CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
849 bool IsVarArg) {
850 switch (CC) {
851 case CallingConv::AMDGPU_KERNEL:
852 case CallingConv::SPIR_KERNEL:
853 llvm_unreachable("kernels should not be handled here")::llvm::llvm_unreachable_internal("kernels should not be handled here"
, "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 853)
;
854 case CallingConv::AMDGPU_VS:
855 case CallingConv::AMDGPU_GS:
856 case CallingConv::AMDGPU_PS:
857 case CallingConv::AMDGPU_CS:
858 case CallingConv::AMDGPU_HS:
859 case CallingConv::AMDGPU_ES:
860 case CallingConv::AMDGPU_LS:
861 return RetCC_SI_Shader;
862 case CallingConv::C:
863 case CallingConv::Fast:
864 case CallingConv::Cold:
865 return RetCC_AMDGPU_Func;
866 default:
867 report_fatal_error("Unsupported calling convention.");
868 }
869}
870
871/// The SelectionDAGBuilder will automatically promote function arguments
872/// with illegal types. However, this does not work for the AMDGPU targets
873/// since the function arguments are stored in memory as these illegal types.
874/// In order to handle this properly we need to get the original types sizes
875/// from the LLVM IR Function and fixup the ISD:InputArg values before
876/// passing them to AnalyzeFormalArguments()
877
878/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
879/// input values across multiple registers. Each item in the Ins array
880/// represents a single value that will be stored in registers. Ins[x].VT is
881/// the value type of the value that will be stored in the register, so
882/// whatever SDNode we lower the argument to needs to be this type.
883///
884/// In order to correctly lower the arguments we need to know the size of each
885/// argument. Since Ins[x].VT gives us the size of the register that will
886/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
887/// for the original function argument so that we can deduce the correct memory
888/// type to use for Ins[x]. In most cases the correct memory type will be
889/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
890/// we have a kernel argument of type v8i8, this argument will be split into
891/// 8 parts and each part will be represented by its own item in the Ins array.
892/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
893/// the argument before it was split. From this, we deduce that the memory type
894/// for each individual part is i8. We pass the memory type as LocVT to the
895/// calling convention analysis function and the register type (Ins[x].VT) as
896/// the ValVT.
897void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
898 CCState &State,
899 const SmallVectorImpl<ISD::InputArg> &Ins) const {
900 const MachineFunction &MF = State.getMachineFunction();
901 const Function &Fn = MF.getFunction();
902 LLVMContext &Ctx = Fn.getParent()->getContext();
903 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
904 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
905 CallingConv::ID CC = Fn.getCallingConv();
906
907 unsigned MaxAlign = 1;
908 uint64_t ExplicitArgOffset = 0;
909 const DataLayout &DL = Fn.getParent()->getDataLayout();
910
911 unsigned InIndex = 0;
912
913 for (const Argument &Arg : Fn.args()) {
914 Type *BaseArgTy = Arg.getType();
915 unsigned Align = DL.getABITypeAlignment(BaseArgTy);
916 MaxAlign = std::max(Align, MaxAlign);
917 unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
918
919 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
920 ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
921
922 // We're basically throwing away everything passed into us and starting over
923 // to get accurate in-memory offsets. The "PartOffset" is completely useless
924 // to us as computed in Ins.
925 //
926 // We also need to figure out what type legalization is trying to do to get
927 // the correct memory offsets.
928
929 SmallVector<EVT, 16> ValueVTs;
930 SmallVector<uint64_t, 16> Offsets;
931 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
932
933 for (unsigned Value = 0, NumValues = ValueVTs.size();
934 Value != NumValues; ++Value) {
935 uint64_t BasePartOffset = Offsets[Value];
936
937 EVT ArgVT = ValueVTs[Value];
938 EVT MemVT = ArgVT;
939 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
940 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
941
942 if (NumRegs == 1) {
943 // This argument is not split, so the IR type is the memory type.
944 if (ArgVT.isExtended()) {
945 // We have an extended type, like i24, so we should just use the
946 // register type.
947 MemVT = RegisterVT;
948 } else {
949 MemVT = ArgVT;
950 }
951 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
952 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
953 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
954 // We have a vector value which has been split into a vector with
955 // the same scalar type, but fewer elements. This should handle
956 // all the floating-point vector types.
957 MemVT = RegisterVT;
958 } else if (ArgVT.isVector() &&
959 ArgVT.getVectorNumElements() == NumRegs) {
960 // This arg has been split so that each element is stored in a separate
961 // register.
962 MemVT = ArgVT.getScalarType();
963 } else if (ArgVT.isExtended()) {
964 // We have an extended type, like i65.
965 MemVT = RegisterVT;
966 } else {
967 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
968 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
969 if (RegisterVT.isInteger()) {
970 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
971 } else if (RegisterVT.isVector()) {
972 assert(!RegisterVT.getScalarType().isFloatingPoint());
973 unsigned NumElements = RegisterVT.getVectorNumElements();
974 assert(MemoryBits % NumElements == 0);
975 // This vector type has been split into another vector type with
976 // a different elements size.
977 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
978 MemoryBits / NumElements);
979 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
980 } else {
981 llvm_unreachable("cannot deduce memory type.")::llvm::llvm_unreachable_internal("cannot deduce memory type."
, "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 981)
;
982 }
983 }
984
985 // Convert one element vectors to scalar.
986 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
987 MemVT = MemVT.getScalarType();
988
989 // Round up vec3/vec5 argument.
990 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
991 assert(MemVT.getVectorNumElements() == 3 ||
992 MemVT.getVectorNumElements() == 5);
993 MemVT = MemVT.getPow2VectorType(State.getContext());
994 }
995
996 unsigned PartOffset = 0;
997 for (unsigned i = 0; i != NumRegs; ++i) {
998 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
999 BasePartOffset + PartOffset,
1000 MemVT.getSimpleVT(),
1001 CCValAssign::Full));
1002 PartOffset += MemVT.getStoreSize();
1003 }
1004 }
1005 }
1006}
1007
1008SDValue AMDGPUTargetLowering::LowerReturn(
1009 SDValue Chain, CallingConv::ID CallConv,
1010 bool isVarArg,
1011 const SmallVectorImpl<ISD::OutputArg> &Outs,
1012 const SmallVectorImpl<SDValue> &OutVals,
1013 const SDLoc &DL, SelectionDAG &DAG) const {
1014 // FIXME: Fails for r600 tests
1015 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1016 // "wave terminate should not have return values");
1017 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1018}
1019
1020//===---------------------------------------------------------------------===//
1021// Target specific lowering
1022//===---------------------------------------------------------------------===//
1023
1024/// Selects the correct CCAssignFn for a given CallingConvention value.
1025CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1026 bool IsVarArg) {
1027 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1028}
1029
1030CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1031 bool IsVarArg) {
1032 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1033}
1034
1035SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1036 SelectionDAG &DAG,
1037 MachineFrameInfo &MFI,
1038 int ClobberedFI) const {
1039 SmallVector<SDValue, 8> ArgChains;
1040 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1041 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1042
1043 // Include the original chain at the beginning of the list. When this is
1044 // used by target LowerCall hooks, this helps legalize find the
1045 // CALLSEQ_BEGIN node.
1046 ArgChains.push_back(Chain);
1047
1048 // Add a chain value for each stack argument corresponding
1049 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1050 UE = DAG.getEntryNode().getNode()->use_end();
1051 U != UE; ++U) {
1052 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1053 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1054 if (FI->getIndex() < 0) {
1055 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1056 int64_t InLastByte = InFirstByte;
1057 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1058
1059 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1060 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1061 ArgChains.push_back(SDValue(L, 1));
1062 }
1063 }
1064 }
1065 }
1066
1067 // Build a tokenfactor for all the chains.
1068 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1069}
1070
1071SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1072 SmallVectorImpl<SDValue> &InVals,
1073 StringRef Reason) const {
1074 SDValue Callee = CLI.Callee;
1075 SelectionDAG &DAG = CLI.DAG;
1076
1077 const Function &Fn = DAG.getMachineFunction().getFunction();
1078
1079 StringRef FuncName("<unknown>");
1080
1081 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1082 FuncName = G->getSymbol();
1083 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1084 FuncName = G->getGlobal()->getName();
1085
1086 DiagnosticInfoUnsupported NoCalls(
1087 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1088 DAG.getContext()->diagnose(NoCalls);
1089
1090 if (!CLI.IsTailCall) {
1091 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1092 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1093 }
1094
1095 return DAG.getEntryNode();
1096}
1097
1098SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1099 SmallVectorImpl<SDValue> &InVals) const {
1100 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1101}
1102
1103SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1104 SelectionDAG &DAG) const {
1105 const Function &Fn = DAG.getMachineFunction().getFunction();
1106
1107 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1108 SDLoc(Op).getDebugLoc());
1109 DAG.getContext()->diagnose(NoDynamicAlloca);
1110 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1111 return DAG.getMergeValues(Ops, SDLoc());
1112}
1113
1114SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1115 SelectionDAG &DAG) const {
1116 switch (Op.getOpcode()) {
1117 default:
1118 Op->print(errs(), &DAG);
1119 llvm_unreachable("Custom lowering code for this"::llvm::llvm_unreachable_internal("Custom lowering code for this"
"instruction is not implemented yet!", "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 1120)
1120 "instruction is not implemented yet!")::llvm::llvm_unreachable_internal("Custom lowering code for this"
"instruction is not implemented yet!", "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 1120)
;
1121 break;
1122 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1123 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1124 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1125 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1126 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1127 case ISD::FREM: return LowerFREM(Op, DAG);
1128 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1129 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1130 case ISD::FRINT: return LowerFRINT(Op, DAG);
1131 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1132 case ISD::FROUND: return LowerFROUND(Op, DAG);
1133 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1134 case ISD::FLOG:
1135 return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
1136 case ISD::FLOG10:
1137 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1138 case ISD::FEXP:
1139 return lowerFEXP(Op, DAG);
1140 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1141 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1142 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1143 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1144 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1145 case ISD::CTTZ:
1146 case ISD::CTTZ_ZERO_UNDEF:
1147 case ISD::CTLZ:
1148 case ISD::CTLZ_ZERO_UNDEF:
1149 return LowerCTLZ_CTTZ(Op, DAG);
1150 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1151 }
1152 return Op;
1153}
1154
1155void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1156 SmallVectorImpl<SDValue> &Results,
1157 SelectionDAG &DAG) const {
1158 switch (N->getOpcode()) {
1159 case ISD::SIGN_EXTEND_INREG:
1160 // Different parts of legalization seem to interpret which type of
1161 // sign_extend_inreg is the one to check for custom lowering. The extended
1162 // from type is what really matters, but some places check for custom
1163 // lowering of the result type. This results in trying to use
1164 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1165 // nothing here and let the illegal result integer be handled normally.
1166 return;
1167 default:
1168 return;
1169 }
1170}
1171
1172bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1173 const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1174 if (!GVar || !GVar->hasInitializer())
1175 return false;
1176
1177 return !isa<UndefValue>(GVar->getInitializer());
1178}
1179
1180SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1181 SDValue Op,
1182 SelectionDAG &DAG) const {
1183
1184 const DataLayout &DL = DAG.getDataLayout();
1185 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1186 const GlobalValue *GV = G->getGlobal();
1187
1188 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1189 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1190 if (!MFI->isEntryFunction()) {
1191 const Function &Fn = DAG.getMachineFunction().getFunction();
1192 DiagnosticInfoUnsupported BadLDSDecl(
1193 Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
1194 DAG.getContext()->diagnose(BadLDSDecl);
1195 }
1196
1197 // XXX: What does the value of G->getOffset() mean?
1198 assert(G->getOffset() == 0 &&
1199 "Do not know what to do with an non-zero offset");
1200
1201 // TODO: We could emit code to handle the initialization somewhere.
1202 if (!hasDefinedInitializer(GV)) {
1203 unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1204 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1205 }
1206 }
1207
1208 const Function &Fn = DAG.getMachineFunction().getFunction();
1209 DiagnosticInfoUnsupported BadInit(
1210 Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1211 DAG.getContext()->diagnose(BadInit);
1212 return SDValue();
1213}
1214
1215SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1216 SelectionDAG &DAG) const {
1217 SmallVector<SDValue, 8> Args;
1218
1219 EVT VT = Op.getValueType();
1220 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1221 SDLoc SL(Op);
1222 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1223 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1224
1225 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1226 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1227 }
1228
1229 for (const SDUse &U : Op->ops())
1230 DAG.ExtractVectorElements(U.get(), Args);
1231
1232 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1233}
1234
1235SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1236 SelectionDAG &DAG) const {
1237
1238 SmallVector<SDValue, 8> Args;
1239 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1240 EVT VT = Op.getValueType();
1241 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1242 VT.getVectorNumElements());
1243
1244 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1245}
1246
1247/// Generate Min/Max node
1248SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1249 SDValue LHS, SDValue RHS,
1250 SDValue True, SDValue False,
1251 SDValue CC,
1252 DAGCombinerInfo &DCI) const {
1253 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1254 return SDValue();
1255
1256 SelectionDAG &DAG = DCI.DAG;
1257 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1258 switch (CCOpcode) {
1259 case ISD::SETOEQ:
1260 case ISD::SETONE:
1261 case ISD::SETUNE:
1262 case ISD::SETNE:
1263 case ISD::SETUEQ:
1264 case ISD::SETEQ:
1265 case ISD::SETFALSE:
1266 case ISD::SETFALSE2:
1267 case ISD::SETTRUE:
1268 case ISD::SETTRUE2:
1269 case ISD::SETUO:
1270 case ISD::SETO:
1271 break;
1272 case ISD::SETULE:
1273 case ISD::SETULT: {
1274 if (LHS == True)
1275 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1276 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1277 }
1278 case ISD::SETOLE:
1279 case ISD::SETOLT:
1280 case ISD::SETLE:
1281 case ISD::SETLT: {
1282 // Ordered. Assume ordered for undefined.
1283
1284 // Only do this after legalization to avoid interfering with other combines
1285 // which might occur.
1286 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1287 !DCI.isCalledByLegalizer())
1288 return SDValue();
1289
1290 // We need to permute the operands to get the correct NaN behavior. The
1291 // selected operand is the second one based on the failing compare with NaN,
1292 // so permute it based on the compare type the hardware uses.
1293 if (LHS == True)
1294 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1295 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1296 }
1297 case ISD::SETUGE:
1298 case ISD::SETUGT: {
1299 if (LHS == True)
1300 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1301 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1302 }
1303 case ISD::SETGT:
1304 case ISD::SETGE:
1305 case ISD::SETOGE:
1306 case ISD::SETOGT: {
1307 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1308 !DCI.isCalledByLegalizer())
1309 return SDValue();
1310
1311 if (LHS == True)
1312 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1313 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1314 }
1315 case ISD::SETCC_INVALID:
1316 llvm_unreachable("Invalid setcc condcode!")::llvm::llvm_unreachable_internal("Invalid setcc condcode!", "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 1316)
;
1317 }
1318 return SDValue();
1319}
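// Illustrative sketch only (not part of the lowering): the scalar semantics
// assumed above for the legacy min/max nodes. The helper names fminLegacy and
// fmaxLegacy are hypothetical; the key point is that the second operand is
// returned when the compare fails (e.g. on NaN), which is why the combine
// above swaps the operands depending on which input the select picks.
static float fminLegacy(float A, float B) { return A < B ? A : B; }
static float fmaxLegacy(float A, float B) { return A > B ? A : B; }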
1320
1321std::pair<SDValue, SDValue>
1322AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1323 SDLoc SL(Op);
1324
1325 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1326
1327 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1328 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1329
1330 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1331 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1332
1333 return std::make_pair(Lo, Hi);
1334}
1335
1336SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1337 SDLoc SL(Op);
1338
1339 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1340 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1341 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1342}
1343
1344SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1345 SDLoc SL(Op);
1346
1347 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1348 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1349 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1350}
1351
1352// Split a vector type into two parts. The first part is a power of two vector.
1353// The second part is whatever is left over, and is a scalar if it would
1354// otherwise be a 1-vector.
1355std::pair<EVT, EVT>
1356AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1357 EVT LoVT, HiVT;
1358 EVT EltVT = VT.getVectorElementType();
1359 unsigned NumElts = VT.getVectorNumElements();
1360 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1361 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1362 HiVT = NumElts - LoNumElts == 1
1363 ? EltVT
1364 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1365 return std::make_pair(LoVT, HiVT);
1366}
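// Illustrative sketch only: the element-count arithmetic performed above,
// assuming std::pair and llvm::PowerOf2Ceil are available in this file. For
// NumElts = 3 this yields {2, 1}, and the 1-element remainder is what the
// code above turns into a scalar type. Helper name is hypothetical.
static std::pair<unsigned, unsigned> splitElementCounts(unsigned NumElts) {
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); // power-of-2 low half
  return {LoNumElts, NumElts - LoNumElts};              // leftover high part
}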
1367
1368// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1369// scalar.
1370std::pair<SDValue, SDValue>
1371AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1372 const EVT &LoVT, const EVT &HiVT,
1373 SelectionDAG &DAG) const {
1374 assert(LoVT.getVectorNumElements() +
1375 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1376 N.getValueType().getVectorNumElements() &&
1377 "More vector elements requested than available!");
1378 auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1379 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1380 DAG.getConstant(0, DL, IdxTy));
1381 SDValue Hi = DAG.getNode(
1382 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1383 HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
1384 return std::make_pair(Lo, Hi);
1385}
1386
1387SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1388 SelectionDAG &DAG) const {
1389 LoadSDNode *Load = cast<LoadSDNode>(Op);
1390 EVT VT = Op.getValueType();
1391
1392
1393 // If this is a 2 element vector, we really want to scalarize and not create
1394 // weird 1 element vectors.
1395 if (VT.getVectorNumElements() == 2)
1396 return scalarizeVectorLoad(Load, DAG);
1397
1398 SDValue BasePtr = Load->getBasePtr();
1399 EVT MemVT = Load->getMemoryVT();
1400 SDLoc SL(Op);
1401
1402 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1403
1404 EVT LoVT, HiVT;
1405 EVT LoMemVT, HiMemVT;
1406 SDValue Lo, Hi;
1407
1408 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1409 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1410 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1411
1412 unsigned Size = LoMemVT.getStoreSize();
1413 unsigned BaseAlign = Load->getAlignment();
1414 unsigned HiAlign = MinAlign(BaseAlign, Size);
1415
1416 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1417 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1418 BaseAlign, Load->getMemOperand()->getFlags());
1419 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
1420 SDValue HiLoad =
1421 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1422 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1423 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1424
1425 auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1426 SDValue Join;
1427 if (LoVT == HiVT) {
1428 // This is the case that the vector is power of two so was evenly split.
1429 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1430 } else {
1431 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1432 DAG.getConstant(0, SL, IdxTy));
1433 Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
1434 : ISD::INSERT_VECTOR_ELT,
1435 SL, VT, Join, HiLoad,
1436 DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
1437 }
1438
1439 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1440 LoLoad.getValue(1), HiLoad.getValue(1))};
1441
1442 return DAG.getMergeValues(Ops, SL);
1443}
1444
1445// Widen a vector load from vec3 to vec4.
1446SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
1447 SelectionDAG &DAG) const {
1448 LoadSDNode *Load = cast<LoadSDNode>(Op);
1449 EVT VT = Op.getValueType();
1450 assert(VT.getVectorNumElements() == 3);
1451 SDValue BasePtr = Load->getBasePtr();
1452 EVT MemVT = Load->getMemoryVT();
1453 SDLoc SL(Op);
1454 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1455 unsigned BaseAlign = Load->getAlignment();
1456
1457 EVT WideVT =
1458 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1459 EVT WideMemVT =
1460 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1461 SDValue WideLoad = DAG.getExtLoad(
1462 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1463 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1464 return DAG.getMergeValues(
1465 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1466 DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
1467 WideLoad.getValue(1)},
1468 SL);
1469}
1470
1471SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1472 SelectionDAG &DAG) const {
1473 StoreSDNode *Store = cast<StoreSDNode>(Op);
1474 SDValue Val = Store->getValue();
1475 EVT VT = Val.getValueType();
1476
1477 // If this is a 2 element vector, we really want to scalarize and not create
1478 // weird 1 element vectors.
1479 if (VT.getVectorNumElements() == 2)
1480 return scalarizeVectorStore(Store, DAG);
1481
1482 EVT MemVT = Store->getMemoryVT();
1483 SDValue Chain = Store->getChain();
1484 SDValue BasePtr = Store->getBasePtr();
1485 SDLoc SL(Op);
1486
1487 EVT LoVT, HiVT;
1488 EVT LoMemVT, HiMemVT;
1489 SDValue Lo, Hi;
1490
1491 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1492 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1493 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1494
1495 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1496
1497 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1498 unsigned BaseAlign = Store->getAlignment();
1499 unsigned Size = LoMemVT.getStoreSize();
1500 unsigned HiAlign = MinAlign(BaseAlign, Size);
1501
1502 SDValue LoStore =
1503 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1504 Store->getMemOperand()->getFlags());
1505 SDValue HiStore =
1506 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1507 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1508
1509 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1510}
1511
1512// This is a shortcut for integer division because we have fast i32<->f32
1513// conversions, and fast f32 reciprocal instructions. The fractional part of a
1514// float is enough to accurately represent up to a 24-bit signed integer.
1515SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1516 bool Sign) const {
1517 SDLoc DL(Op);
1518 EVT VT = Op.getValueType();
1519 SDValue LHS = Op.getOperand(0);
1520 SDValue RHS = Op.getOperand(1);
1521 MVT IntVT = MVT::i32;
1522 MVT FltVT = MVT::f32;
1523
1524 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1525 if (LHSSignBits < 9)
1526 return SDValue();
1527
1528 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1529 if (RHSSignBits < 9)
1530 return SDValue();
1531
1532 unsigned BitSize = VT.getSizeInBits();
1533 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1534 unsigned DivBits = BitSize - SignBits;
1535 if (Sign)
1536 ++DivBits;
1537
1538 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1539 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1540
1541 SDValue jq = DAG.getConstant(1, DL, IntVT);
1542
1543 if (Sign) {
1544 // char|short jq = ia ^ ib;
1545 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1546
1547 // jq = jq >> (bitsize - 2)
1548 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1549 DAG.getConstant(BitSize - 2, DL, VT));
1550
1551 // jq = jq | 0x1
1552 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1553 }
1554
1555 // int ia = (int)LHS;
1556 SDValue ia = LHS;
1557
1558 // int ib = (int)RHS;
1559 SDValue ib = RHS;
1560
1561 // float fa = (float)ia;
1562 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1563
1564 // float fb = (float)ib;
1565 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1566
1567 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1568 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1569
1570 // fq = trunc(fq);
1571 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1572
1573 // float fqneg = -fq;
1574 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1575
1576 // float fr = mad(fqneg, fb, fa);
1577 unsigned OpCode = Subtarget->hasFP32Denormals() ?
1578 (unsigned)AMDGPUISD::FMAD_FTZ :
1579 (unsigned)ISD::FMAD;
1580 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1581
1582 // int iq = (int)fq;
1583 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1584
1585 // fr = fabs(fr);
1586 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1587
1588 // fb = fabs(fb);
1589 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1590
1591 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1592
1593 // int cv = fr >= fb;
1594 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1595
1596 // jq = (cv ? jq : 0);
1597 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1598
1599 // dst = iq + jq;
1600 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1601
1602 // Rem needs compensation; it's easier to recompute it.
1603 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1604 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1605
1606 // Truncate to number of bits this divide really is.
1607 if (Sign) {
1608 SDValue InRegSize
1609 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1610 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1611 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1612 } else {
1613 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1614 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1615 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1616 }
1617
1618 return DAG.getMergeValues({ Div, Rem }, DL);
1619}
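// Illustrative sketch only: the scalar arithmetic the DAG nodes above build,
// written for the signed case with 32-bit operands whose values fit in 24
// bits. It assumes <cmath>, an arithmetic right shift for negative ints, and
// that 1.0f / fb behaves like the hardware RCP; helper name is hypothetical.
static void divRem24Sketch(int ia, int ib, int &Div, int &Rem) {
  int jq = ((ia ^ ib) >> 30) | 1;           // +1 or -1, the quotient's sign
  float fa = (float)ia;
  float fb = (float)ib;
  float fq = truncf(fa * (1.0f / fb));      // truncated approximate quotient
  float fr = fabsf(-fq * fb + fa);          // |remainder| of the approximation
  int iq = (int)fq;
  Div = iq + (fr >= fabsf(fb) ? jq : 0);    // correct by one step if needed
  Rem = ia - Div * ib;                      // recompute the remainder
}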
1620
1621void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1622 SelectionDAG &DAG,
1623 SmallVectorImpl<SDValue> &Results) const {
1624 SDLoc DL(Op);
1625 EVT VT = Op.getValueType();
1626
1627 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1628
1629 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1630
1631 SDValue One = DAG.getConstant(1, DL, HalfVT);
1632 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1633
1634 //HiLo split
1635 SDValue LHS = Op.getOperand(0);
1636 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1637 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1638
1639 SDValue RHS = Op.getOperand(1);
1640 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1641 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1642
1643 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1644 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1645
1646 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1647 LHS_Lo, RHS_Lo);
1648
1649 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1650 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1651
1652 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1653 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1654 return;
1655 }
1656
1657 if (isTypeLegal(MVT::i64)) {
1658 // Compute denominator reciprocal.
1659 unsigned FMAD = Subtarget->hasFP32Denormals() ?
1660 (unsigned)AMDGPUISD::FMAD_FTZ :
1661 (unsigned)ISD::FMAD;
1662
1663 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1664 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1665 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1666 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1667 Cvt_Lo);
1668 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1669 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1670 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1671 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1672 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1673 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1674 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1675 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1676 Mul1);
1677 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1678 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1679 SDValue Rcp64 = DAG.getBitcast(VT,
1680 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1681
1682 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1683 SDValue One64 = DAG.getConstant(1, DL, VT);
1684 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1685 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1686
1687 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1688 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1689 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1690 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1691 Zero);
1692 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1693 One);
1694
1695 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1696 Mulhi1_Lo, Zero1);
1697 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1698 Mulhi1_Hi, Add1_Lo.getValue(1));
1699 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1700 SDValue Add1 = DAG.getBitcast(VT,
1701 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1702
1703 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1704 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1705 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1706 Zero);
1707 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1708 One);
1709
1710 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1711 Mulhi2_Lo, Zero1);
1712 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1713 Mulhi2_Hi, Add1_Lo.getValue(1));
1714 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1715 Zero, Add2_Lo.getValue(1));
1716 SDValue Add2 = DAG.getBitcast(VT,
1717 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1718 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1719
1720 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1721
1722 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1723 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1724 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1725 Mul3_Lo, Zero1);
1726 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1727 Mul3_Hi, Sub1_Lo.getValue(1));
1728 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1729 SDValue Sub1 = DAG.getBitcast(VT,
1730 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1731
1732 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1733 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1734 ISD::SETUGE);
1735 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1736 ISD::SETUGE);
1737 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1738
1739 // TODO: Here and below portions of the code can be enclosed into if/endif.
1740 // Currently control flow is unconditional and we have 4 selects after
1741 // potential endif to substitute PHIs.
1742
1743 // if C3 != 0 ...
1744 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1745 RHS_Lo, Zero1);
1746 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1747 RHS_Hi, Sub1_Lo.getValue(1));
1748 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1749 Zero, Sub2_Lo.getValue(1));
1750 SDValue Sub2 = DAG.getBitcast(VT,
1751 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1752
1753 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1754
1755 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1756 ISD::SETUGE);
1757 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1758 ISD::SETUGE);
1759 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1760
1761 // if (C6 != 0)
1762 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1763
1764 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1765 RHS_Lo, Zero1);
1766 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1767 RHS_Hi, Sub2_Lo.getValue(1));
1768 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1769 Zero, Sub3_Lo.getValue(1));
1770 SDValue Sub3 = DAG.getBitcast(VT,
1771 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1772
1773 // endif C6
1774 // endif C3
1775
1776 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1777 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1778
1779 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1780 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1781
1782 Results.push_back(Div);
1783 Results.push_back(Rem);
1784
1785 return;
1786 }
1787
1788 // r600 expansion.
1789 // Get Speculative values
1790 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1791 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1792
1793 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1794 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1795 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1796
1797 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1798 SDValue DIV_Lo = Zero;
1799
1800 const unsigned halfBitWidth = HalfVT.getSizeInBits();
1801
1802 for (unsigned i = 0; i < halfBitWidth; ++i) {
1803 const unsigned bitPos = halfBitWidth - i - 1;
1804 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1805 // Get value of high bit
1806 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1807 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1808 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1809
1810 // Shift
1811 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1812 // Add LHS high bit
1813 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1814
1815 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1816 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1817
1818 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1819
1820 // Update REM
1821 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1822 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1823 }
1824
1825 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1826 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1827 Results.push_back(DIV);
1828 Results.push_back(REM);
1829}
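// Illustrative sketch only: the restoring long division that the r600
// expansion above unrolls, one bit of the low quotient half per iteration.
// Plain C equivalent assuming <cstdint>; the caller is expected to have set
// up REM from the speculative high-half handling and DIV_Lo to 0, as the code
// above does. Helper name is hypothetical.
static void udivRem64BitSerialSketch(uint64_t &REM, uint32_t LHS_Lo,
                                     uint64_t RHS, uint32_t &DIV_Lo) {
  for (int bitPos = 31; bitPos >= 0; --bitPos) {
    uint64_t HBit = (LHS_Lo >> bitPos) & 1;   // next numerator bit
    REM = (REM << 1) | HBit;                  // shift it into the remainder
    if (REM >= RHS) {                         // divisor fits?
      DIV_Lo |= 1u << bitPos;                 // yes: set this quotient bit
      REM -= RHS;                             // and reduce the remainder
    }
  }
}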
1830
1831SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1832 SelectionDAG &DAG) const {
1833 SDLoc DL(Op);
1834 EVT VT = Op.getValueType();
1835
1836 if (VT == MVT::i64) {
1837 SmallVector<SDValue, 2> Results;
1838 LowerUDIVREM64(Op, DAG, Results);
1839 return DAG.getMergeValues(Results, DL);
1840 }
1841
1842 if (VT == MVT::i32) {
1843 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1844 return Res;
1845 }
1846
1847 SDValue Num = Op.getOperand(0);
1848 SDValue Den = Op.getOperand(1);
1849
1850 // RCP = URECIP(Den) = 2^32 / Den + e
1851 // e is rounding error.
1852 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1853
1854 // RCP_LO = mul(RCP, Den)
1855 SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1856
1857 // RCP_HI = mulhu(RCP, Den)
1858 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1859
1860 // NEG_RCP_LO = -RCP_LO
1861 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1862 RCP_LO);
1863
1864 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1865 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1866 NEG_RCP_LO, RCP_LO,
1867 ISD::SETEQ);
1868 // Calculate the rounding error from the URECIP instruction
1869 // E = mulhu(ABS_RCP_LO, RCP)
1870 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1871
1872 // RCP_A_E = RCP + E
1873 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1874
1875 // RCP_S_E = RCP - E
1876 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1877
1878 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1879 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1880 RCP_A_E, RCP_S_E,
1881 ISD::SETEQ);
1882 // Quotient = mulhu(Tmp0, Num)
1883 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1884
1885 // Num_S_Remainder = Quotient * Den
1886 SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1887
1888 // Remainder = Num - Num_S_Remainder
1889 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1890
1891 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1892 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1893 DAG.getConstant(-1, DL, VT),
1894 DAG.getConstant(0, DL, VT),
1895 ISD::SETUGE);
1896 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1897 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1898 Num_S_Remainder,
1899 DAG.getConstant(-1, DL, VT),
1900 DAG.getConstant(0, DL, VT),
1901 ISD::SETUGE);
1902 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1903 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1904 Remainder_GE_Zero);
1905
1906 // Calculate Division result:
1907
1908 // Quotient_A_One = Quotient + 1
1909 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1910 DAG.getConstant(1, DL, VT));
1911
1912 // Quotient_S_One = Quotient - 1
1913 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1914 DAG.getConstant(1, DL, VT));
1915
1916 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1917 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1918 Quotient, Quotient_A_One, ISD::SETEQ);
1919
1920 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1921 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1922 Quotient_S_One, Div, ISD::SETEQ);
1923
1924 // Calculate Rem result:
1925
1926 // Remainder_S_Den = Remainder - Den
1927 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1928
1929 // Remainder_A_Den = Remainder + Den
1930 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1931
1932 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1933 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1934 Remainder, Remainder_S_Den, ISD::SETEQ);
1935
1936 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1937 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1938 Remainder_A_Den, Rem, ISD::SETEQ);
1939 SDValue Ops[2] = {
1940 Div,
1941 Rem
1942 };
1943 return DAG.getMergeValues(Ops, DL);
1944}
1945
1946SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1947 SelectionDAG &DAG) const {
1948 SDLoc DL(Op);
1949 EVT VT = Op.getValueType();
1950
1951 SDValue LHS = Op.getOperand(0);
1952 SDValue RHS = Op.getOperand(1);
1953
1954 SDValue Zero = DAG.getConstant(0, DL, VT);
1955 SDValue NegOne = DAG.getConstant(-1, DL, VT);
1956
1957 if (VT == MVT::i32) {
1958 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1959 return Res;
1960 }
1961
1962 if (VT == MVT::i64 &&
1963 DAG.ComputeNumSignBits(LHS) > 32 &&
1964 DAG.ComputeNumSignBits(RHS) > 32) {
1965 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1966
1967 //HiLo split
1968 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1969 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1970 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1971 LHS_Lo, RHS_Lo);
1972 SDValue Res[2] = {
1973 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1974 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1975 };
1976 return DAG.getMergeValues(Res, DL);
1977 }
1978
1979 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1980 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1981 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1982 SDValue RSign = LHSign; // Remainder sign is the same as LHS
1983
1984 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1985 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1986
1987 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1988 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1989
1990 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1991 SDValue Rem = Div.getValue(1);
1992
1993 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1994 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1995
1996 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1997 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1998
1999 SDValue Res[2] = {
2000 Div,
2001 Rem
2002 };
2003 return DAG.getMergeValues(Res, DL);
2004}
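// Illustrative sketch only: the sign-fixup sequence used above, in scalar
// form. (x + s) ^ s negates x when s is all ones, so the unsigned divide sees
// absolute values and the signs are reapplied afterwards. Assumes <cstdint>
// and ignores the INT32_MIN / -1 overflow case; helper name is hypothetical.
static void sdivRemSketch(int32_t LHS, int32_t RHS, int32_t &Div, int32_t &Rem) {
  int32_t LHSign = LHS < 0 ? -1 : 0;
  int32_t RHSign = RHS < 0 ? -1 : 0;
  int32_t DSign = LHSign ^ RHSign;                              // quotient sign
  uint32_t UL = ((uint32_t)LHS + (uint32_t)LHSign) ^ (uint32_t)LHSign; // |LHS|
  uint32_t UR = ((uint32_t)RHS + (uint32_t)RHSign) ^ (uint32_t)RHSign; // |RHS|
  uint32_t Q = (UL / UR) ^ (uint32_t)DSign;
  uint32_t R = (UL % UR) ^ (uint32_t)LHSign;    // remainder takes LHS's sign
  Div = (int32_t)(Q - (uint32_t)DSign);
  Rem = (int32_t)(R - (uint32_t)LHSign);
}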
2005
2006// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
2007SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2008 SDLoc SL(Op);
2009 EVT VT = Op.getValueType();
2010 SDValue X = Op.getOperand(0);
2011 SDValue Y = Op.getOperand(1);
2012
2013 // TODO: Should this propagate fast-math-flags?
2014
2015 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
2016 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
2017 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
2018
2019 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
2020}
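// Illustrative sketch only: the scalar identity the expansion above encodes,
// assuming <cmath>. Note that the SDValue named Floor above actually holds an
// FTRUNC (round toward zero), matching frem semantics rather than floor.
// Helper name is hypothetical.
static double fremSketch(double X, double Y) {
  return X - trunc(X / Y) * Y;
}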
2021
2022SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2023 SDLoc SL(Op);
2024 SDValue Src = Op.getOperand(0);
2025
2026 // result = trunc(src)
2027 // if (src > 0.0 && src != result)
2028 // result += 1.0
2029
2030 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2031
2032 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2033 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2034
2035 EVT SetCCVT =
2036 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2037
2038 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2039 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2040 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2041
2042 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2043 // TODO: Should this propagate fast-math-flags?
2044 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2045}
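// Illustrative sketch only: the branch-free ceil the selects above implement,
// assuming <cmath>; helper name is hypothetical. A NaN input fails both
// ordered compares, so the truncated NaN is returned unchanged.
static double fceilSketch(double Src) {
  double Trunc = trunc(Src);
  double Add = (Src > 0.0 && Src != Trunc) ? 1.0 : 0.0;  // SETOGT / SETONE
  return Trunc + Add;
}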
2046
2047static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2048 SelectionDAG &DAG) {
2049 const unsigned FractBits = 52;
2050 const unsigned ExpBits = 11;
2051
2052 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2053 Hi,
2054 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2055 DAG.getConstant(ExpBits, SL, MVT::i32));
2056 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2057 DAG.getConstant(1023, SL, MVT::i32));
2058
2059 return Exp;
2060}
2061
2062SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2063 SDLoc SL(Op);
2064 SDValue Src = Op.getOperand(0);
2065
2066 assert(Op.getValueType() == MVT::f64);
2067
2068 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2069 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2070
2071 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2072
2073 // Extract the upper half, since this is where we will find the sign and
2074 // exponent.
2075 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2076
2077 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2078
2079 const unsigned FractBits = 52;
2080
2081 // Extract the sign bit.
2082 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2083 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2084
2085 // Extend back to 64-bits.
2086 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2087 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2088
2089 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2090 const SDValue FractMask
2091 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2092
2093 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2094 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2095 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2096
2097 EVT SetCCVT =
2098 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2099
2100 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2101
2102 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2103 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2104
2105 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2106 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2107
2108 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2109}
2110
2111SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2112 SDLoc SL(Op);
2113 SDValue Src = Op.getOperand(0);
2114
2115 assert(Op.getValueType() == MVT::f64);
2116
2117 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2118 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2119 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2120
2121 // TODO: Should this propagate fast-math-flags?
2122
2123 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2124 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2125
2126 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2127
2128 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2129 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2130
2131 EVT SetCCVT =
2132 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2133 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2134
2135 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2136}
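// Illustrative sketch only: the classic 2^52 rounding trick used above,
// assuming <cmath>. Adding and then subtracting copysign(2^52, x) forces the
// FPU to round x to an integer in round-to-nearest-even mode; magnitudes
// above 0x1.fffffffffffffp+51 are already integral and pass through. Helper
// name and the decimal spellings of the constants are editorial.
static double frintSketch(double Src) {
  const double C1 = 4503599627370496.0;               // 2^52
  const double C2 = 4503599627370495.5;               // 2^52 - 0.5
  double CopySign = copysign(C1, Src);
  double Rounded = (Src + CopySign) - CopySign;
  return fabs(Src) > C2 ? Src : Rounded;
}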
2137
2138SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2139 // FNEARBYINT and FRINT are the same, except in their handling of FP
2140 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2141 // rint, so just treat them as equivalent.
2142 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2143}
2144
2145// XXX - May require not supporting f32 denormals?
2146
2147// Don't handle v2f16. The extra instructions to scalarize and repack around the
2148// compare and vselect end up producing worse code than scalarizing the whole
2149// operation.
2150SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
2151 SDLoc SL(Op);
2152 SDValue X = Op.getOperand(0);
2153 EVT VT = Op.getValueType();
2154
2155 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2156
2157 // TODO: Should this propagate fast-math-flags?
2158
2159 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2160
2161 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2162
2163 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2164 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2165 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2166
2167 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2168
2169 EVT SetCCVT =
2170 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2171
2172 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2173
2174 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2175
2176 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2177}
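// Illustrative sketch only: the round-half-away-from-zero the selects above
// implement for the f32 case, assuming <cmath>; helper name is hypothetical.
static float froundSketch(float X) {
  float T = truncf(X);
  float SignOne = copysignf(1.0f, X);
  return T + (fabsf(X - T) >= 0.5f ? SignOne : 0.0f);  // SETOGE on |X - T|
}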
2178
2179SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
2180 SDLoc SL(Op);
2181 SDValue X = Op.getOperand(0);
2182
2183 SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
2184
2185 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2186 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2187 const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
2188 const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
2189 EVT SetCCVT =
2190 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2191
2192 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2193
2194 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
2195
2196 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2197
2198 const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
2199 MVT::i64);
2200
2201 SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
2202 SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
2203 DAG.getConstant(INT64_C(0x0008000000000000), SL,
2204 MVT::i64),
2205 Exp);
2206
2207 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
2208 SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
2209 DAG.getConstant(0, SL, MVT::i64), Tmp0,
2210 ISD::SETNE);
2211
2212 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
2213 D, DAG.getConstant(0, SL, MVT::i64));
2214 SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
2215
2216 K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
2217 K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
2218
2219 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2220 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2221 SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
2222
2223 SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
2224 ExpEqNegOne,
2225 DAG.getConstantFP(1.0, SL, MVT::f64),
2226 DAG.getConstantFP(0.0, SL, MVT::f64));
2227
2228 SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
2229
2230 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
2231 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
2232
2233 return K;
2234}
2235
2236SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2237 EVT VT = Op.getValueType();
2238
2239 if (VT == MVT::f32 || VT == MVT::f16)
2240 return LowerFROUND32_16(Op, DAG);
2241
2242 if (VT == MVT::f64)
2243 return LowerFROUND64(Op, DAG);
2244
2245 llvm_unreachable("unhandled type")::llvm::llvm_unreachable_internal("unhandled type", "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2245)
;
2246}
2247
2248SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2249 SDLoc SL(Op);
2250 SDValue Src = Op.getOperand(0);
2251
2252 // result = trunc(src);
2253 // if (src < 0.0 && src != result)
2254 // result += -1.0.
2255
2256 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2257
2258 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2259 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2260
2261 EVT SetCCVT =
2262 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2263
2264 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2265 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2266 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2267
2268 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2269 // TODO: Should this propagate fast-math-flags?
2270 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2271}
2272
2273SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2274 double Log2BaseInverted) const {
2275 EVT VT = Op.getValueType();
2276
2277 SDLoc SL(Op);
2278 SDValue Operand = Op.getOperand(0);
2279 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2280 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2281
2282 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2283}
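// Illustrative sketch only: the change-of-base identity used above, assuming
// <cmath>. Log2BaseInverted is 1/log2(b), i.e. ln(2) ~= 0.6931472 for the
// natural log and log10(2) ~= 0.30103 for log10; the exact call sites are
// outside this excerpt and the helper name is hypothetical.
static double logBaseSketch(double X, double Log2BaseInverted) {
  return log2(X) * Log2BaseInverted;   // log_b(x) = log2(x) / log2(b)
}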
2284
2285// exp2(M_LOG2E_F * f);
2286SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2287 EVT VT = Op.getValueType();
2288 SDLoc SL(Op);
2289 SDValue Src = Op.getOperand(0);
2290
2291 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2292 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2293 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2294}
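// Illustrative sketch only: exp(x) = exp2(x * log2(e)), the identity the two
// nodes above implement, assuming <cmath>; helper name is hypothetical and
// the constant mirrors numbers::log2e.
static float fexpSketch(float X) {
  const float Log2E = 1.4426950408889634f;
  return exp2f(X * Log2E);
}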
2295
2296static bool isCtlzOpc(unsigned Opc) {
2297 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2298}
2299
2300static bool isCttzOpc(unsigned Opc) {
2301 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2302}
2303
2304SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2305 SDLoc SL(Op);
2306 SDValue Src = Op.getOperand(0);
2307 bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2308 Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2309
2310 unsigned ISDOpc, NewOpc;
2311 if (isCtlzOpc(Op.getOpcode())) {
2312 ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2313 NewOpc = AMDGPUISD::FFBH_U32;
2314 } else if (isCttzOpc(Op.getOpcode())) {
2315 ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2316 NewOpc = AMDGPUISD::FFBL_B32;
2317 } else
2318 llvm_unreachable("Unexpected OPCode!!!")::llvm::llvm_unreachable_internal("Unexpected OPCode!!!", "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2318)
;
2319
2320
2321 if (ZeroUndef && Src.getValueType() == MVT::i32)
2322 return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2323
2324 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2325
2326 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2327 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2328
2329 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2330 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2331
2332 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2333 *DAG.getContext(), MVT::i32);
2334
2335 SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2336 SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2337
2338 SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2339 SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2340
2341 const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2342 SDValue Add, NewOpr;
2343 if (isCtlzOpc(Op.getOpcode())) {
2344 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2345 // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2346 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2347 } else {
2348 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2349 // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2350 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2351 }
2352
2353 if (!ZeroUndef) {
2354 // Test if the full 64-bit input is zero.
2355
2356 // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2357 // which we probably don't want.
2358 SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2359 SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2360 SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2361
2362 // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2363 // with the same cycles, otherwise it is slower.
2364 // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2365 // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2366
2367 const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2368
2369 // The instruction returns -1 for 0 input, but the defined intrinsic
2370 // behavior is to return the number of bits.
2371 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2372 SrcIsZero, Bits32, NewOpr);
2373 }
2374
2375 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2376}
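// Illustrative sketch only: the 64-bit counts built from two 32-bit counts as
// in the selects above, assuming <cstdint> and using __builtin_clz/__builtin_ctz
// as stand-ins for the zero-undef 32-bit ops; helper names are hypothetical.
static uint32_t ctlz64Sketch(uint64_t X) {
  uint32_t Lo = (uint32_t)X, Hi = (uint32_t)(X >> 32);
  if (X == 0)
    return 64;                                  // defined result for zero input
  return Hi == 0 ? __builtin_clz(Lo) + 32 : __builtin_clz(Hi);
}
static uint32_t cttz64Sketch(uint64_t X) {
  uint32_t Lo = (uint32_t)X, Hi = (uint32_t)(X >> 32);
  if (X == 0)
    return 64;
  return Lo == 0 ? __builtin_ctz(Hi) + 32 : __builtin_ctz(Lo);
}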
2377
2378SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2379 bool Signed) const {
2380 // Unsigned
2381 // cul2f(ulong u)
2382 //{
2383 // uint lz = clz(u);
2384 // uint e = (u != 0) ? 127U + 63U - lz : 0;
2385 // u = (u << lz) & 0x7fffffffffffffffUL;
2386 // ulong t = u & 0xffffffffffUL;
2387 // uint v = (e << 23) | (uint)(u >> 40);
2388 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2389 // return as_float(v + r);
2390 //}
2391 // Signed
2392 // cl2f(long l)
2393 //{
2394 // long s = l >> 63;
2395 // float r = cul2f((l + s) ^ s);
2396 // return s ? -r : r;
2397 //}
2398
2399 SDLoc SL(Op);
2400 SDValue Src = Op.getOperand(0);
2401 SDValue L = Src;
2402
2403 SDValue S;
2404 if (Signed) {
2405 const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2406 S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2407
2408 SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2409 L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2410 }
2411
2412 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2413 *DAG.getContext(), MVT::f32);
2414
2415
2416 SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2417 SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2418 SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2419 LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2420
2421 SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2422 SDValue E = DAG.getSelect(SL, MVT::i32,
2423 DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2424 DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2425 ZeroI32);
2426
2427 SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2428 DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2429 DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2430
2431 SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2432 DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2433
2434 SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2435 U, DAG.getConstant(40, SL, MVT::i64));
2436
2437 SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2438 DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2439 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2440
2441 SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2442 SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2443 SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2444
2445 SDValue One = DAG.getConstant(1, SL, MVT::i32);
2446
2447 SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2448
2449 SDValue R = DAG.getSelect(SL, MVT::i32,
2450 RCmp,
2451 One,
2452 DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2453 R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2454 R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2455
2456 if (!Signed)
2457 return R;
2458
2459 SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2460 return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2461}
2462
2463SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2464 bool Signed) const {
2465 SDLoc SL(Op);
2466 SDValue Src = Op.getOperand(0);
2467
2468 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2469
2470 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2471 DAG.getConstant(0, SL, MVT::i32));
2472 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2473 DAG.getConstant(1, SL, MVT::i32));
2474
2475 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2476 SL, MVT::f64, Hi);
2477
2478 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2479
2480 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2481 DAG.getConstant(32, SL, MVT::i32));
2482 // TODO: Should this propagate fast-math-flags?
2483 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2484}
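// Illustrative sketch only: the hi/lo recombination performed above, assuming
// <cmath> and <cstdint>, with ldexp standing in for AMDGPUISD::LDEXP. For the
// signed variant the high half is converted as a signed value instead. Helper
// name is hypothetical.
static double uint64ToF64Sketch(uint64_t Src) {
  uint32_t Lo = (uint32_t)Src;
  uint32_t Hi = (uint32_t)(Src >> 32);
  return ldexp((double)Hi, 32) + (double)Lo;   // both halves convert exactly
}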
2485
2486SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2487 SelectionDAG &DAG) const {
2488 assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2489 "operation should be legal");
2490
2491 // TODO: Factor out code common with LowerSINT_TO_FP.
2492
2493 EVT DestVT = Op.getValueType();
2494 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2495 SDLoc DL(Op);
2496 SDValue Src = Op.getOperand(0);
2497
2498 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2499 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2500 SDValue FPRound =
2501 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2502
2503 return FPRound;
2504 }
2505
2506 if (DestVT == MVT::f32)
2507 return LowerINT_TO_FP32(Op, DAG, false);
2508
2509 assert(DestVT == MVT::f64);
2510 return LowerINT_TO_FP64(Op, DAG, false);
2511}
2512
2513SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2514 SelectionDAG &DAG) const {
2515 assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2516 "operation should be legal");
2517
2518 // TODO: Factor out code common with LowerUINT_TO_FP.
2519
2520 EVT DestVT = Op.getValueType();
2521 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2522 SDLoc DL(Op);
2523 SDValue Src = Op.getOperand(0);
2524
2525 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2526 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2527 SDValue FPRound =
2528 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2529
2530 return FPRound;
2531 }
2532
2533 if (DestVT == MVT::f32)
2534 return LowerINT_TO_FP32(Op, DAG, true);
2535
2536 assert(DestVT == MVT::f64);
2537 return LowerINT_TO_FP64(Op, DAG, true);
2538}
2539
2540SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2541 bool Signed) const {
2542 SDLoc SL(Op);
2543
2544 SDValue Src = Op.getOperand(0);
2545
2546 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2547
2548 SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2549 MVT::f64);
2550 SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2551 MVT::f64);
2552 // TODO: Should this propagate fast-math-flags?
2553 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2554
2555 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2556
2557
2558 SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2559
2560 SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2561 MVT::i32, FloorMul);
2562 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2563
2564 SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2565
2566 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2567}
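// Illustrative sketch only: the split that the constants above encode,
// assuming <cmath> and <cstdint>. K0 is 2^-32 (0x3df0000000000000) and K1 is
// -(2^32) (0xc1f0000000000000); the fma removes the high 32 bits exactly.
// Helper name is hypothetical, the input is assumed in range, and the signed
// variant converts the high half with FP_TO_SINT instead.
static uint64_t fp64ToU64Sketch(double Src) {
  double Trunc = trunc(Src);
  double HiF = floor(ldexp(Trunc, -32));        // upper 32 bits as a double
  double LoF = fma(HiF, -4294967296.0, Trunc);  // Trunc - HiF * 2^32, exact
  uint32_t Hi = (uint32_t)HiF;
  uint32_t Lo = (uint32_t)LoF;
  return ((uint64_t)Hi << 32) | Lo;
}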
2568
2569SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2570 SDLoc DL(Op);
2571 SDValue N0 = Op.getOperand(0);
2572
2573 // Convert to target node to get known bits
2574 if (N0.getValueType() == MVT::f32)
2575 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2576
2577 if (getTargetMachine().Options.UnsafeFPMath) {
2578 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2579 return SDValue();
2580 }
2581
2582 assert(N0.getSimpleValueType() == MVT::f64);
2583
2584 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2585 const unsigned ExpMask = 0x7ff;
2586 const unsigned ExpBiasf64 = 1023;
2587 const unsigned ExpBiasf16 = 15;
2588 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2589 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2590 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2591 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2592 DAG.getConstant(32, DL, MVT::i64));
2593 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2594 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2595 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2596 DAG.getConstant(20, DL, MVT::i64));
2597 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2598 DAG.getConstant(ExpMask, DL, MVT::i32));
2599 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2600 // add the f16 bias (15) to get the biased exponent for the f16 format.
2601 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2602 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2603
2604 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2605 DAG.getConstant(8, DL, MVT::i32));
2606 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2607 DAG.getConstant(0xffe, DL, MVT::i32));
2608
2609 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2610 DAG.getConstant(0x1ff, DL, MVT::i32));
2611 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2612
2613 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2614 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2615
2616 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2617 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2618 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2619 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2620
2621 // N = M | (E << 12);
2622 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2623 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2624 DAG.getConstant(12, DL, MVT::i32)));
2625
2626 // B = clamp(1-E, 0, 13);
2627 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2628 One, E);
2629 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2630 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2631 DAG.getConstant(13, DL, MVT::i32));
2632
2633 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2634 DAG.getConstant(0x1000, DL, MVT::i32));
2635
2636 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2637 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2638 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2639 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2640
2641 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2642 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2643 DAG.getConstant(0x7, DL, MVT::i32));
2644 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2645 DAG.getConstant(2, DL, MVT::i32));
2646 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2647 One, Zero, ISD::SETEQ);
2648 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2649 One, Zero, ISD::SETGT);
2650 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2651 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2652
2653 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2654 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2655 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2656 I, V, ISD::SETEQ);
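// Reading of the two thresholds above: after the -1008 rebias, E > 30 means
// the value no longer fits a finite f16 exponent, so it saturates to infinity
// (0x7c00); E == 1039 corresponds to a raw f64 exponent field of 0x7ff
// (Inf/NaN), where the precomputed I (0x7c00, or 0x7e00 for a NaN) is used
// instead.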
2657
2658 // Extract the sign bit.
2659 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2660 DAG.getConstant(16, DL, MVT::i32));
2661 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2662 DAG.getConstant(0x8000, DL, MVT::i32));
2663
2664 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2665 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2666}
2667
2668SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2669 SelectionDAG &DAG) const {
2670 SDValue Src = Op.getOperand(0);
2671
2672 // TODO: Factor out code common with LowerFP_TO_UINT.
2673
2674 EVT SrcVT = Src.getValueType();
2675 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2676 SDLoc DL(Op);
2677
2678 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2679 SDValue FpToInt32 =
2680 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2681
2682 return FpToInt32;
2683 }
2684
2685 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2686 return LowerFP64_TO_INT(Op, DAG, true);
2687
2688 return SDValue();
2689}
2690
2691SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2692 SelectionDAG &DAG) const {
2693 SDValue Src = Op.getOperand(0);
2694
2695 // TODO: Factor out code common with LowerFP_TO_SINT.
2696
2697 EVT SrcVT = Src.getValueType();
2698 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2699 SDLoc DL(Op);
2700
2701 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2702 SDValue FpToInt32 =
2703 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2704
2705 return FpToInt32;
2706 }
2707
2708 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2709 return LowerFP64_TO_INT(Op, DAG, false);
2710
2711 return SDValue();
2712}
2713
2714SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2715 SelectionDAG &DAG) const {
2716 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2717 MVT VT = Op.getSimpleValueType();
2718 MVT ScalarVT = VT.getScalarType();
2719
2720  assert(VT.isVector());
2721
2722 SDValue Src = Op.getOperand(0);
2723 SDLoc DL(Op);
2724
2725 // TODO: Don't scalarize on Evergreen?
2726 unsigned NElts = VT.getVectorNumElements();
2727 SmallVector<SDValue, 8> Args;
2728 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2729
2730 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2731 for (unsigned I = 0; I < NElts; ++I)
2732 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2733
2734 return DAG.getBuildVector(VT, DL, Args);
2735}
2736
2737//===----------------------------------------------------------------------===//
2738// Custom DAG optimizations
2739//===----------------------------------------------------------------------===//
2740
2741static bool isU24(SDValue Op, SelectionDAG &DAG) {
2742 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2743}
2744
2745static bool isI24(SDValue Op, SelectionDAG &DAG) {
2746 EVT VT = Op.getValueType();
2747 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2748 // as unsigned 24-bit values.
2749 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2750}
2751
2752static SDValue simplifyI24(SDNode *Node24,
2753 TargetLowering::DAGCombinerInfo &DCI) {
2754 SelectionDAG &DAG = DCI.DAG;
2755 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2756
2757 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2758 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2759 unsigned NewOpcode = Node24->getOpcode();
2760 if (IsIntrin) {
2761 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2762 NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2763 AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2764 }
2765
2766 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2767
2768 // First try to simplify using GetDemandedBits which allows the operands to
2769 // have other uses, but will only perform simplifications that involve
2770 // bypassing some nodes for this user.
2771 SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
2772 SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
2773 if (DemandedLHS || DemandedRHS)
2774 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2775 DemandedLHS ? DemandedLHS : LHS,
2776 DemandedRHS ? DemandedRHS : RHS);
2777
2778 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2779 // operands if this node is the only user.
2780 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2781 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2782 return SDValue(Node24, 0);
2783 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2784 return SDValue(Node24, 0);
2785
2786 return SDValue();
2787}
2788
2789template <typename IntTy>
2790static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2791 uint32_t Width, const SDLoc &DL) {
2792 if (Width + Offset < 32) {
2793 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2794 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2795 return DAG.getConstant(Result, DL, MVT::i32);
2796 }
2797
2798 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2799}
2800
2801static bool hasVolatileUser(SDNode *Val) {
2802 for (SDNode *U : Val->uses()) {
2803 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2804 if (M->isVolatile())
2805 return true;
2806 }
2807 }
2808
2809 return false;
2810}
2811
2812bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2813 // i32 vectors are the canonical memory type.
2814 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2815 return false;
2816
2817 if (!VT.isByteSized())
2818 return false;
2819
2820 unsigned Size = VT.getStoreSize();
2821
2822 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2823 return false;
2824
2825 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2826 return false;
2827
2828 return true;
2829}
2830
2831 // Replace a load of an illegal type with a load of a bitcast to a friendlier
2832 // type.
2833SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2834 DAGCombinerInfo &DCI) const {
2835 if (!DCI.isBeforeLegalize())
2836 return SDValue();
2837
2838 LoadSDNode *LN = cast<LoadSDNode>(N);
2839 if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2840 return SDValue();
2841
2842 SDLoc SL(N);
2843 SelectionDAG &DAG = DCI.DAG;
2844 EVT VT = LN->getMemoryVT();
2845
2846 unsigned Size = VT.getStoreSize();
2847 unsigned Align = LN->getAlignment();
2848 if (Align < Size && isTypeLegal(VT)) {
2849 bool IsFast;
2850 unsigned AS = LN->getAddressSpace();
2851
2852 // Expand unaligned loads earlier than legalization. Due to visitation order
2853 // problems during legalization, the emitted instructions to pack and unpack
2854 // the bytes again are not eliminated in the case of an unaligned copy.
2855 if (!allowsMisalignedMemoryAccesses(
2856 VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
2857 if (VT.isVector())
2858 return scalarizeVectorLoad(LN, DAG);
2859
2860 SDValue Ops[2];
2861 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2862 return DAG.getMergeValues(Ops, SDLoc(N));
2863 }
2864
2865 if (!IsFast)
2866 return SDValue();
2867 }
2868
2869 if (!shouldCombineMemoryType(VT))
2870 return SDValue();
2871
2872 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2873
2874 SDValue NewLoad
2875 = DAG.getLoad(NewVT, SL, LN->getChain(),
2876 LN->getBasePtr(), LN->getMemOperand());
2877
2878 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2879 DCI.CombineTo(N, BC, NewLoad.getValue(1));
2880 return SDValue(N, 0);
2881}
2882
2883// Replace store of an illegal type with a store of a bitcast to a friendlier
2884// type.
2885SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2886 DAGCombinerInfo &DCI) const {
2887 if (!DCI.isBeforeLegalize())
2888 return SDValue();
2889
2890 StoreSDNode *SN = cast<StoreSDNode>(N);
2891 if (SN->isVolatile() || !ISD::isNormalStore(SN))
2892 return SDValue();
2893
2894 EVT VT = SN->getMemoryVT();
2895 unsigned Size = VT.getStoreSize();
2896
2897 SDLoc SL(N);
2898 SelectionDAG &DAG = DCI.DAG;
2899 unsigned Align = SN->getAlignment();
2900 if (Align < Size && isTypeLegal(VT)) {
2901 bool IsFast;
2902 unsigned AS = SN->getAddressSpace();
2903
2904 // Expand unaligned stores earlier than legalization. Due to visitation
2905 // order problems during legalization, the emitted instructions to pack and
2906 // unpack the bytes again are not eliminated in the case of an unaligned
2907 // copy.
2908 if (!allowsMisalignedMemoryAccesses(
2909 VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
2910 if (VT.isVector())
2911 return scalarizeVectorStore(SN, DAG);
2912
2913 return expandUnalignedStore(SN, DAG);
2914 }
2915
2916 if (!IsFast)
2917 return SDValue();
2918 }
2919
2920 if (!shouldCombineMemoryType(VT))
2921 return SDValue();
2922
2923 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2924 SDValue Val = SN->getValue();
2925
2926 //DCI.AddToWorklist(Val.getNode());
2927
2928 bool OtherUses = !Val.hasOneUse();
2929 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2930 if (OtherUses) {
2931 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2932 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2933 }
2934
2935 return DAG.getStore(SN->getChain(), SL, CastVal,
2936 SN->getBasePtr(), SN->getMemOperand());
2937}
2938
2939// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2940// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2941// issues.
2942SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2943 DAGCombinerInfo &DCI) const {
2944 SelectionDAG &DAG = DCI.DAG;
2945 SDValue N0 = N->getOperand(0);
2946
2947 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2948 // (vt2 (truncate (assertzext vt0:x, vt1)))
2949 if (N0.getOpcode() == ISD::TRUNCATE) {
2950 SDValue N1 = N->getOperand(1);
2951 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2952 SDLoc SL(N);
2953
2954 SDValue Src = N0.getOperand(0);
2955 EVT SrcVT = Src.getValueType();
2956 if (SrcVT.bitsGE(ExtVT)) {
2957 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2958 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2959 }
2960 }
2961
2962 return SDValue();
2963}
2964
2965SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
2966 SDNode *N, DAGCombinerInfo &DCI) const {
2967 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2968 switch (IID) {
2969 case Intrinsic::amdgcn_mul_i24:
2970 case Intrinsic::amdgcn_mul_u24:
2971 return simplifyI24(N, DCI);
2972 default:
2973 return SDValue();
2974 }
2975}
2976
2977/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2978/// binary operation \p Opc to it with the corresponding constant operands.
2979SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2980 DAGCombinerInfo &DCI, const SDLoc &SL,
2981 unsigned Opc, SDValue LHS,
2982 uint32_t ValLo, uint32_t ValHi) const {
2983 SelectionDAG &DAG = DCI.DAG;
2984 SDValue Lo, Hi;
2985 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2986
2987 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2988 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2989
2990 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2991 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2992
2993 // Re-visit the ands. It's possible we eliminated one of them and it could
2994 // simplify the vector.
2995 DCI.AddToWorklist(Lo.getNode());
2996 DCI.AddToWorklist(Hi.getNode());
2997
2998 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2999 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3000}
3001
3002SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3003 DAGCombinerInfo &DCI) const {
3004 EVT VT = N->getValueType(0);
3005
3006 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3007 if (!RHS)
3008 return SDValue();
3009
3010 SDValue LHS = N->getOperand(0);
3011 unsigned RHSVal = RHS->getZExtValue();
3012 if (!RHSVal)
3013 return LHS;
3014
3015 SDLoc SL(N);
3016 SelectionDAG &DAG = DCI.DAG;
3017
3018 switch (LHS->getOpcode()) {
3019 default:
3020 break;
3021 case ISD::ZERO_EXTEND:
3022 case ISD::SIGN_EXTEND:
3023 case ISD::ANY_EXTEND: {
3024 SDValue X = LHS->getOperand(0);
3025
3026 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3027 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3028 // Prefer build_vector as the canonical form if packed types are legal.
3029      // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3030 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3031 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3032 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3033 }
3034
3035 // shl (ext x) => zext (shl x), if shift does not overflow int
3036 if (VT != MVT::i64)
3037 break;
3038 KnownBits Known = DAG.computeKnownBits(X);
3039 unsigned LZ = Known.countMinLeadingZeros();
3040 if (LZ < RHSVal)
3041 break;
3042 EVT XVT = X.getValueType();
3043 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3044 return DAG.getZExtOrTrunc(Shl, SL, VT);
3045 }
3046 }
3047
3048 if (VT != MVT::i64)
3049 return SDValue();
3050
3051 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3052
3053 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3054 // common case, splitting this into a move and a 32-bit shift is faster and
3055 // the same code size.
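// Illustrative example: (shl i64:x, 40) becomes lo = 0 and
// hi = (shl (trunc x to i32), 8), built back into an i64 below.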
3056 if (RHSVal < 32)
3057 return SDValue();
3058
3059 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3060
3061 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3062 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3063
3064 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3065
3066 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3067 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3068}
3069
3070SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3071 DAGCombinerInfo &DCI) const {
3072 if (N->getValueType(0) != MVT::i64)
3073 return SDValue();
3074
3075 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3076 if (!RHS)
3077 return SDValue();
3078
3079 SelectionDAG &DAG = DCI.DAG;
3080 SDLoc SL(N);
3081 unsigned RHSVal = RHS->getZExtValue();
3082
3083 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
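// In other words, the low half of the result is the original high half, and
// the new high half is just that value's sign, (sra hi_32(x), 31).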
3084 if (RHSVal == 32) {
3085 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3086 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3087 DAG.getConstant(31, SL, MVT::i32));
3088
3089 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3090 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3091 }
3092
3093 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3094 if (RHSVal == 63) {
3095 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3096 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3097 DAG.getConstant(31, SL, MVT::i32));
3098 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3099 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3100 }
3101
3102 return SDValue();
3103}
3104
3105SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3106 DAGCombinerInfo &DCI) const {
3107 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3108 if (!RHS)
3109 return SDValue();
3110
3111 EVT VT = N->getValueType(0);
3112 SDValue LHS = N->getOperand(0);
3113 unsigned ShiftAmt = RHS->getZExtValue();
3114 SelectionDAG &DAG = DCI.DAG;
3115 SDLoc SL(N);
3116
3117   // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3118 // this improves the ability to match BFE patterns in isel.
3119 if (LHS.getOpcode() == ISD::AND) {
3120 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3121 if (Mask->getAPIntValue().isShiftedMask() &&
3122 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3123 return DAG.getNode(
3124 ISD::AND, SL, VT,
3125 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3126 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3127 }
3128 }
3129 }
3130
3131 if (VT != MVT::i64)
3132 return SDValue();
3133
3134 if (ShiftAmt < 32)
3135 return SDValue();
3136
3137 // srl i64:x, C for C >= 32
3138 // =>
3139 // build_pair (srl hi_32(x), C - 32), 0
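// Illustrative example: (srl i64:x, 36) becomes lo = (srl hi_32(x), 4) and
// hi = 0 in the pair built below.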
3140 SDValue One = DAG.getConstant(1, SL, MVT::i32);
3141 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3142
3143 SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3144 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3145
3146 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3147 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3148
3149 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3150
3151 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3152}
3153
3154SDValue AMDGPUTargetLowering::performTruncateCombine(
3155 SDNode *N, DAGCombinerInfo &DCI) const {
3156 SDLoc SL(N);
3157 SelectionDAG &DAG = DCI.DAG;
3158 EVT VT = N->getValueType(0);
3159 SDValue Src = N->getOperand(0);
3160
3161 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3162 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3163 SDValue Vec = Src.getOperand(0);
3164 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3165 SDValue Elt0 = Vec.getOperand(0);
3166 EVT EltVT = Elt0.getValueType();
3167 if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
3168 if (EltVT.isFloatingPoint()) {
3169 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3170 EltVT.changeTypeToInteger(), Elt0);
3171 }
3172
3173 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3174 }
3175 }
3176 }
3177
3178 // Equivalent of above for accessing the high element of a vector as an
3179 // integer operation.
3180 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3181 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3182 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3183 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3184 SDValue BV = stripBitcast(Src.getOperand(0));
3185 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3186 BV.getValueType().getVectorNumElements() == 2) {
3187 SDValue SrcElt = BV.getOperand(1);
3188 EVT SrcEltVT = SrcElt.getValueType();
3189 if (SrcEltVT.isFloatingPoint()) {
3190 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3191 SrcEltVT.changeTypeToInteger(), SrcElt);
3192 }
3193
3194 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3195 }
3196 }
3197 }
3198 }
3199
3200 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3201 //
3202 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3203 // i16 (trunc (srl (i32 (trunc x), K)))
3204 if (VT.getScalarSizeInBits() < 32) {
3205 EVT SrcVT = Src.getValueType();
3206 if (SrcVT.getScalarSizeInBits() > 32 &&
3207 (Src.getOpcode() == ISD::SRL ||
3208 Src.getOpcode() == ISD::SRA ||
3209 Src.getOpcode() == ISD::SHL)) {
3210 SDValue Amt = Src.getOperand(1);
3211 KnownBits Known = DAG.computeKnownBits(Amt);
3212 unsigned Size = VT.getScalarSizeInBits();
3213 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3214 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3215 EVT MidVT = VT.isVector() ?
3216 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3217 VT.getVectorNumElements()) : MVT::i32;
3218
3219 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3220 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3221 Src.getOperand(0));
3222 DCI.AddToWorklist(Trunc.getNode());
3223
3224 if (Amt.getValueType() != NewShiftVT) {
3225 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3226 DCI.AddToWorklist(Amt.getNode());
3227 }
3228
3229 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3230 Trunc, Amt);
3231 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3232 }
3233 }
3234 }
3235
3236 return SDValue();
3237}
3238
3239// We need to specifically handle i64 mul here to avoid unnecessary conversion
3240// instructions. If we only match on the legalized i64 mul expansion,
3241// SimplifyDemandedBits will be unable to remove them because there will be
3242// multiple uses due to the separate mul + mulh[su].
3243static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3244 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3245 if (Size <= 32) {
3246 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3247 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3248 }
3249
3250 // Because we want to eliminate extension instructions before the
3251 // operation, we need to create a single user here (i.e. not the separate
3252 // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3253
3254 unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3255
3256 SDValue Mul = DAG.getNode(MulOpc, SL,
3257 DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3258
3259 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3260 Mul.getValue(0), Mul.getValue(1));
3261}
3262
3263SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3264 DAGCombinerInfo &DCI) const {
3265 EVT VT = N->getValueType(0);
3266
3267 unsigned Size = VT.getSizeInBits();
3268 if (VT.isVector() || Size > 64)
3269 return SDValue();
3270
3271 // There are i16 integer mul/mad.
3272 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3273 return SDValue();
3274
3275 SelectionDAG &DAG = DCI.DAG;
3276 SDLoc DL(N);
3277
3278 SDValue N0 = N->getOperand(0);
3279 SDValue N1 = N->getOperand(1);
3280
3281 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3282 // in the source into any_extends if the result of the mul is truncated. Since
3283 // we can assume the high bits are whatever we want, use the underlying value
3284 // to avoid the unknown high bits from interfering.
3285 if (N0.getOpcode() == ISD::ANY_EXTEND)
3286 N0 = N0.getOperand(0);
3287
3288 if (N1.getOpcode() == ISD::ANY_EXTEND)
3289 N1 = N1.getOperand(0);
3290
3291 SDValue Mul;
3292
3293 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3294 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3295 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3296 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3297 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3298 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3299 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3300 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3301 } else {
3302 return SDValue();
3303 }
3304
3305 // We need to use sext even for MUL_U24, because MUL_U24 is used
3306 // for signed multiply of 8 and 16-bit types.
3307 return DAG.getSExtOrTrunc(Mul, DL, VT);
3308}
3309
3310SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3311 DAGCombinerInfo &DCI) const {
3312 EVT VT = N->getValueType(0);
3313
3314 if (!Subtarget->hasMulI24() || VT.isVector())
3315 return SDValue();
3316
3317 SelectionDAG &DAG = DCI.DAG;
3318 SDLoc DL(N);
3319
3320 SDValue N0 = N->getOperand(0);
3321 SDValue N1 = N->getOperand(1);
3322
3323 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3324 return SDValue();
3325
3326 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3327 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3328
3329 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3330 DCI.AddToWorklist(Mulhi.getNode());
3331 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3332}
3333
3334SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3335 DAGCombinerInfo &DCI) const {
3336 EVT VT = N->getValueType(0);
3337
3338 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3339 return SDValue();
3340
3341 SelectionDAG &DAG = DCI.DAG;
3342 SDLoc DL(N);
3343
3344 SDValue N0 = N->getOperand(0);
3345 SDValue N1 = N->getOperand(1);
3346
3347 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3348 return SDValue();
3349
3350 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3351 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3352
3353 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3354 DCI.AddToWorklist(Mulhi.getNode());
3355 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3356}
3357
3358SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3359 SDNode *N, DAGCombinerInfo &DCI) const {
3360 SelectionDAG &DAG = DCI.DAG;
3361
3362 // Simplify demanded bits before splitting into multiple users.
3363 if (SDValue V = simplifyI24(N, DCI))
3364 return V;
3365
3366 SDValue N0 = N->getOperand(0);
3367 SDValue N1 = N->getOperand(1);
3368
3369 bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3370
3371 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3372 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3373
3374 SDLoc SL(N);
3375
3376 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3377 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3378 return DAG.getMergeValues({ MulLo, MulHi }, SL);
3379}
3380
3381static bool isNegativeOne(SDValue Val) {
3382 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3383 return C->isAllOnesValue();
3384 return false;
3385}
3386
3387SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3388 SDValue Op,
3389 const SDLoc &DL,
3390 unsigned Opc) const {
3391 EVT VT = Op.getValueType();
3392 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3393 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3394 LegalVT != MVT::i16))
3395 return SDValue();
3396
3397 if (VT != MVT::i32)
3398 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3399
3400 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3401 if (VT != MVT::i32)
3402 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3403
3404 return FFBX;
3405}
3406
3407// The native instructions return -1 on 0 input. Optimize out a select that
3408// produces -1 on 0.
3409//
3410// TODO: If zero is not undef, we could also do this if the output is compared
3411// against the bitwidth.
3412//
3413// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3414SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3415 SDValue LHS, SDValue RHS,
3416 DAGCombinerInfo &DCI) const {
3417 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3418 if (!CmpRhs || !CmpRhs->isNullValue())
3419 return SDValue();
3420
3421 SelectionDAG &DAG = DCI.DAG;
3422 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3423 SDValue CmpLHS = Cond.getOperand(0);
3424
3425 unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3426 AMDGPUISD::FFBH_U32;
3427
3428 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3429 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3430 if (CCOpcode == ISD::SETEQ &&
3431 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3432 RHS.getOperand(0) == CmpLHS &&
3433 isNegativeOne(LHS)) {
3434 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3435 }
3436
3437 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3438 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3439 if (CCOpcode == ISD::SETNE &&
3440 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3441 LHS.getOperand(0) == CmpLHS &&
3442 isNegativeOne(RHS)) {
3443 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3444 }
3445
3446 return SDValue();
3447}
3448
3449static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3450 unsigned Op,
3451 const SDLoc &SL,
3452 SDValue Cond,
3453 SDValue N1,
3454 SDValue N2) {
3455 SelectionDAG &DAG = DCI.DAG;
3456 EVT VT = N1.getValueType();
3457
3458 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3459 N1.getOperand(0), N2.getOperand(0));
3460 DCI.AddToWorklist(NewSelect.getNode());
3461 return DAG.getNode(Op, SL, VT, NewSelect);
3462}
3463
3464// Pull a free FP operation out of a select so it may fold into uses.
3465//
3466// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3467// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3468//
3469// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3470// select c, (fabs x), +k -> fabs (select c, x, k)
3471static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3472 SDValue N) {
3473 SelectionDAG &DAG = DCI.DAG;
3474 SDValue Cond = N.getOperand(0);
3475 SDValue LHS = N.getOperand(1);
3476 SDValue RHS = N.getOperand(2);
3477
3478 EVT VT = N.getValueType();
3479 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3480 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3481 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3482 SDLoc(N), Cond, LHS, RHS);
3483 }
3484
3485 bool Inv = false;
3486 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3487 std::swap(LHS, RHS);
3488 Inv = true;
3489 }
3490
3491 // TODO: Support vector constants.
3492 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3493 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3494 SDLoc SL(N);
3495 // If one side is an fneg/fabs and the other is a constant, we can push the
3496 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3497 SDValue NewLHS = LHS.getOperand(0);
3498 SDValue NewRHS = RHS;
3499
3500 // Careful: if the neg can be folded up, don't try to pull it back down.
3501 bool ShouldFoldNeg = true;
3502
3503 if (NewLHS.hasOneUse()) {
3504 unsigned Opc = NewLHS.getOpcode();
3505 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3506 ShouldFoldNeg = false;
3507 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3508 ShouldFoldNeg = false;
3509 }
3510
3511 if (ShouldFoldNeg) {
3512 if (LHS.getOpcode() == ISD::FNEG)
3513 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3514 else if (CRHS->isNegative())
3515 return SDValue();
3516
3517 if (Inv)
3518 std::swap(NewLHS, NewRHS);
3519
3520 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3521 Cond, NewLHS, NewRHS);
3522 DCI.AddToWorklist(NewSelect.getNode());
3523 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3524 }
3525 }
3526
3527 return SDValue();
3528}
3529
3530
3531SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3532 DAGCombinerInfo &DCI) const {
3533 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3534 return Folded;
3535
3536 SDValue Cond = N->getOperand(0);
3537 if (Cond.getOpcode() != ISD::SETCC)
3538 return SDValue();
3539
3540 EVT VT = N->getValueType(0);
3541 SDValue LHS = Cond.getOperand(0);
3542 SDValue RHS = Cond.getOperand(1);
3543 SDValue CC = Cond.getOperand(2);
3544
3545 SDValue True = N->getOperand(1);
3546 SDValue False = N->getOperand(2);
3547
3548 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3549 SelectionDAG &DAG = DCI.DAG;
3550 if (DAG.isConstantValueOfAnyType(True) &&
3551 !DAG.isConstantValueOfAnyType(False)) {
3552 // Swap cmp + select pair to move constant to false input.
3553 // This will allow using VOPC cndmasks more often.
3554 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3555
3556 SDLoc SL(N);
3557 ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3558 LHS.getValueType().isInteger());
3559
3560 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3561 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3562 }
3563
3564 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3565 SDValue MinMax
3566 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3567 // Revisit this node so we can catch min3/max3/med3 patterns.
3568 //DCI.AddToWorklist(MinMax.getNode());
3569 return MinMax;
3570 }
3571 }
3572
3573 // There's no reason to not do this if the condition has other uses.
3574 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3575}
3576
3577static bool isInv2Pi(const APFloat &APF) {
3578 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3579 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3580 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
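// For reference, these bit patterns all encode 1/(2*pi) ~= 0.15915494 in
// half, single and double precision respectively.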
3581
3582 return APF.bitwiseIsEqual(KF16) ||
3583 APF.bitwiseIsEqual(KF32) ||
3584 APF.bitwiseIsEqual(KF64);
3585}
3586
3587 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3588 // additional cost to negate them.
3589bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3590 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3591 if (C->isZero() && !C->isNegative())
3592 return true;
3593
3594 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3595 return true;
3596 }
3597
3598 return false;
3599}
3600
3601static unsigned inverseMinMax(unsigned Opc) {
3602 switch (Opc) {
3603 case ISD::FMAXNUM:
3604 return ISD::FMINNUM;
3605 case ISD::FMINNUM:
3606 return ISD::FMAXNUM;
3607 case ISD::FMAXNUM_IEEE:
3608 return ISD::FMINNUM_IEEE;
3609 case ISD::FMINNUM_IEEE:
3610 return ISD::FMAXNUM_IEEE;
3611 case AMDGPUISD::FMAX_LEGACY:
3612 return AMDGPUISD::FMIN_LEGACY;
3613 case AMDGPUISD::FMIN_LEGACY:
3614 return AMDGPUISD::FMAX_LEGACY;
3615 default:
3616 llvm_unreachable("invalid min/max opcode")::llvm::llvm_unreachable_internal("invalid min/max opcode", "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 3616)
;
3617 }
3618}
3619
3620SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3621 DAGCombinerInfo &DCI) const {
3622 SelectionDAG &DAG = DCI.DAG;
3623 SDValue N0 = N->getOperand(0);
3624 EVT VT = N->getValueType(0);
3625
3626 unsigned Opc = N0.getOpcode();
3627
3628 // If the input has multiple uses and we can either fold the negate down, or
3629 // the other uses cannot, give up. This both prevents unprofitable
3630 // transformations and infinite loops: we won't repeatedly try to fold around
3631 // a negate that has no 'good' form.
3632 if (N0.hasOneUse()) {
3633 // This may be able to fold into the source, but at a code size cost. Don't
3634 // fold if the fold into the user is free.
3635 if (allUsesHaveSourceMods(N, 0))
3636 return SDValue();
3637 } else {
3638 if (fnegFoldsIntoOp(Opc) &&
3639 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3640 return SDValue();
3641 }
3642
3643 SDLoc SL(N);
3644 switch (Opc) {
3645 case ISD::FADD: {
3646 if (!mayIgnoreSignedZero(N0))
3647 return SDValue();
3648
3649 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3650 SDValue LHS = N0.getOperand(0);
3651 SDValue RHS = N0.getOperand(1);
3652
3653 if (LHS.getOpcode() != ISD::FNEG)
3654 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3655 else
3656 LHS = LHS.getOperand(0);
3657
3658 if (RHS.getOpcode() != ISD::FNEG)
3659 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3660 else
3661 RHS = RHS.getOperand(0);
3662
3663 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3664 if (Res.getOpcode() != ISD::FADD)
3665 return SDValue(); // Op got folded away.
3666 if (!N0.hasOneUse())
3667 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3668 return Res;
3669 }
3670 case ISD::FMUL:
3671 case AMDGPUISD::FMUL_LEGACY: {
3672 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3673 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3674 SDValue LHS = N0.getOperand(0);
3675 SDValue RHS = N0.getOperand(1);
3676
3677 if (LHS.getOpcode() == ISD::FNEG)
3678 LHS = LHS.getOperand(0);
3679 else if (RHS.getOpcode() == ISD::FNEG)
3680 RHS = RHS.getOperand(0);
3681 else
3682 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3683
3684 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3685 if (Res.getOpcode() != Opc)
3686 return SDValue(); // Op got folded away.
3687 if (!N0.hasOneUse())
3688 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3689 return Res;
3690 }
3691 case ISD::FMA:
3692 case ISD::FMAD: {
3693 if (!mayIgnoreSignedZero(N0))
3694 return SDValue();
3695
3696 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3697 SDValue LHS = N0.getOperand(0);
3698 SDValue MHS = N0.getOperand(1);
3699 SDValue RHS = N0.getOperand(2);
3700
3701 if (LHS.getOpcode() == ISD::FNEG)
3702 LHS = LHS.getOperand(0);
3703 else if (MHS.getOpcode() == ISD::FNEG)
3704 MHS = MHS.getOperand(0);
3705 else
3706 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3707
3708 if (RHS.getOpcode() != ISD::FNEG)
3709 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3710 else
3711 RHS = RHS.getOperand(0);
3712
3713 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3714 if (Res.getOpcode() != Opc)
3715 return SDValue(); // Op got folded away.
3716 if (!N0.hasOneUse())
3717 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3718 return Res;
3719 }
3720 case ISD::FMAXNUM:
3721 case ISD::FMINNUM:
3722 case ISD::FMAXNUM_IEEE:
3723 case ISD::FMINNUM_IEEE:
3724 case AMDGPUISD::FMAX_LEGACY:
3725 case AMDGPUISD::FMIN_LEGACY: {
3726 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3727 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3728 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3729 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3730
3731 SDValue LHS = N0.getOperand(0);
3732 SDValue RHS = N0.getOperand(1);
3733
3734 // 0 doesn't have a negated inline immediate.
3735 // TODO: This constant check should be generalized to other operations.
3736 if (isConstantCostlierToNegate(RHS))
3737 return SDValue();
3738
3739 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3740 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3741 unsigned Opposite = inverseMinMax(Opc);
3742
3743 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3744 if (Res.getOpcode() != Opposite)
3745 return SDValue(); // Op got folded away.
3746 if (!N0.hasOneUse())
3747 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3748 return Res;
3749 }
3750 case AMDGPUISD::FMED3: {
3751 SDValue Ops[3];
3752 for (unsigned I = 0; I < 3; ++I)
3753 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3754
3755 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3756 if (Res.getOpcode() != AMDGPUISD::FMED3)
3757 return SDValue(); // Op got folded away.
3758 if (!N0.hasOneUse())
3759 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3760 return Res;
3761 }
3762 case ISD::FP_EXTEND:
3763 case ISD::FTRUNC:
3764 case ISD::FRINT:
3765 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3766 case ISD::FSIN:
3767 case ISD::FCANONICALIZE:
3768 case AMDGPUISD::RCP:
3769 case AMDGPUISD::RCP_LEGACY:
3770 case AMDGPUISD::RCP_IFLAG:
3771 case AMDGPUISD::SIN_HW: {
3772 SDValue CvtSrc = N0.getOperand(0);
3773 if (CvtSrc.getOpcode() == ISD::FNEG) {
3774 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3775 // (fneg (rcp (fneg x))) -> (rcp x)
3776 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3777 }
3778
3779 if (!N0.hasOneUse())
3780 return SDValue();
3781
3782 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3783 // (fneg (rcp x)) -> (rcp (fneg x))
3784 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3785 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3786 }
3787 case ISD::FP_ROUND: {
3788 SDValue CvtSrc = N0.getOperand(0);
3789
3790 if (CvtSrc.getOpcode() == ISD::FNEG) {
3791 // (fneg (fp_round (fneg x))) -> (fp_round x)
3792 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3793 CvtSrc.getOperand(0), N0.getOperand(1));
3794 }
3795
3796 if (!N0.hasOneUse())
3797 return SDValue();
3798
3799 // (fneg (fp_round x)) -> (fp_round (fneg x))
3800 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3801 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3802 }
3803 case ISD::FP16_TO_FP: {
3804 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3805 // f16, but legalization of f16 fneg ends up pulling it out of the source.
3806 // Put the fneg back as a legal source operation that can be matched later.
3807 SDLoc SL(N);
3808
3809 SDValue Src = N0.getOperand(0);
3810 EVT SrcVT = Src.getValueType();
3811
3812 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3813 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3814 DAG.getConstant(0x8000, SL, SrcVT));
3815 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3816 }
3817 default:
3818 return SDValue();
3819 }
3820}
3821
3822SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3823 DAGCombinerInfo &DCI) const {
3824 SelectionDAG &DAG = DCI.DAG;
3825 SDValue N0 = N->getOperand(0);
3826
3827 if (!N0.hasOneUse())
3828 return SDValue();
3829
3830 switch (N0.getOpcode()) {
3831 case ISD::FP16_TO_FP: {
3832    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3833 SDLoc SL(N);
3834 SDValue Src = N0.getOperand(0);
3835 EVT SrcVT = Src.getValueType();
3836
3837 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3838 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3839 DAG.getConstant(0x7fff, SL, SrcVT));
3840 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3841 }
3842 default:
3843 return SDValue();
3844 }
3845}
3846
3847SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3848 DAGCombinerInfo &DCI) const {
3849 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3850 if (!CFP)
3851 return SDValue();
3852
3853 // XXX - Should this flush denormals?
3854 const APFloat &Val = CFP->getValueAPF();
3855 APFloat One(Val.getSemantics(), "1.0");
3856 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3857}
3858
3859SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3860 DAGCombinerInfo &DCI) const {
3861 SelectionDAG &DAG = DCI.DAG;
3862 SDLoc DL(N);
3863
3864 switch(N->getOpcode()) {
3865 default:
3866 break;
3867 case ISD::BITCAST: {
3868 EVT DestVT = N->getValueType(0);
3869
3870 // Push casts through vector builds. This helps avoid emitting a large
3871 // number of copies when materializing floating point vector constants.
3872 //
3873 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3874 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3875 if (DestVT.isVector()) {
3876 SDValue Src = N->getOperand(0);
3877 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3878 EVT SrcVT = Src.getValueType();
3879 unsigned NElts = DestVT.getVectorNumElements();
3880
3881 if (SrcVT.getVectorNumElements() == NElts) {
3882 EVT DestEltVT = DestVT.getVectorElementType();
3883
3884 SmallVector<SDValue, 8> CastedElts;
3885 SDLoc SL(N);
3886 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3887 SDValue Elt = Src.getOperand(I);
3888 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3889 }
3890
3891 return DAG.getBuildVector(DestVT, SL, CastedElts);
3892 }
3893 }
3894 }
3895
3896 if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3897 break;
3898
3899 // Fold bitcasts of constants.
3900 //
3901 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3902 // TODO: Generalize and move to DAGCombiner
3903 SDValue Src = N->getOperand(0);
3904 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3905 if (Src.getValueType() == MVT::i64) {
3906 SDLoc SL(N);
3907 uint64_t CVal = C->getZExtValue();
3908 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3909 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3910 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3911 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
3912 }
3913 }
3914
3915 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3916 const APInt &Val = C->getValueAPF().bitcastToAPInt();
3917 SDLoc SL(N);
3918 uint64_t CVal = Val.getZExtValue();
3919 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3920 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3921 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3922
3923 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3924 }
3925
3926 break;
3927 }
3928 case ISD::SHL: {
3929 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3930 break;
3931
3932 return performShlCombine(N, DCI);
3933 }
3934 case ISD::SRL: {
3935 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3936 break;
3937
3938 return performSrlCombine(N, DCI);
3939 }
3940 case ISD::SRA: {
3941 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3942 break;
3943
3944 return performSraCombine(N, DCI);
3945 }
3946 case ISD::TRUNCATE:
3947 return performTruncateCombine(N, DCI);
3948 case ISD::MUL:
3949 return performMulCombine(N, DCI);
3950 case ISD::MULHS:
3951 return performMulhsCombine(N, DCI);
3952 case ISD::MULHU:
3953 return performMulhuCombine(N, DCI);
3954 case AMDGPUISD::MUL_I24:
3955 case AMDGPUISD::MUL_U24:
3956 case AMDGPUISD::MULHI_I24:
3957 case AMDGPUISD::MULHI_U24: {
3958 if (SDValue V = simplifyI24(N, DCI))
3959 return V;
3960 return SDValue();
3961 }
3962 case AMDGPUISD::MUL_LOHI_I24:
3963 case AMDGPUISD::MUL_LOHI_U24:
3964 return performMulLoHi24Combine(N, DCI);
3965 case ISD::SELECT:
3966 return performSelectCombine(N, DCI);
3967 case ISD::FNEG:
3968 return performFNegCombine(N, DCI);
3969 case ISD::FABS:
3970 return performFAbsCombine(N, DCI);
3971 case AMDGPUISD::BFE_I32:
3972 case AMDGPUISD::BFE_U32: {
3973    assert(!N->getValueType(0).isVector() &&
3974           "Vector handling of BFE not implemented");
3975 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3976 if (!Width)
3977 break;
3978
3979 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3980 if (WidthVal == 0)
3981 return DAG.getConstant(0, DL, MVT::i32);
3982
3983 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3984 if (!Offset)
3985 break;
3986
3987 SDValue BitsFrom = N->getOperand(0);
3988 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3989
3990 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3991
3992 if (OffsetVal == 0) {
3993 // This is already sign / zero extended, so try to fold away extra BFEs.
3994 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3995
3996 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3997 if (OpSignBits >= SignBits)
3998 return BitsFrom;
3999
4000 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4001 if (Signed) {
4002 // This is a sign_extend_inreg. Replace it to take advantage of existing
4003 // DAG Combines. If not eliminated, we will match back to BFE during
4004 // selection.
4005
4006      // TODO: The sext_inreg of extended types ends up here, although we could
4007      // handle them in a single BFE.
4008 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4009 DAG.getValueType(SmallVT));
4010 }
4011
4012 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4013 }
4014
4015 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4016 if (Signed) {
4017 return constantFoldBFE<int32_t>(DAG,
4018 CVal->getSExtValue(),
4019 OffsetVal,
4020 WidthVal,
4021 DL);
4022 }
4023
4024 return constantFoldBFE<uint32_t>(DAG,
4025 CVal->getZExtValue(),
4026 OffsetVal,
4027 WidthVal,
4028 DL);
4029 }
4030
4031 if ((OffsetVal + WidthVal) >= 32 &&
4032 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4033 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4034 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4035 BitsFrom, ShiftVal);
4036 }
4037
4038 if (BitsFrom.hasOneUse()) {
4039 APInt Demanded = APInt::getBitsSet(32,
4040 OffsetVal,
4041 OffsetVal + WidthVal);
4042
4043 KnownBits Known;
4044 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4045 !DCI.isBeforeLegalizeOps());
4046 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4047 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4048 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4049 DCI.CommitTargetLoweringOpt(TLO);
4050 }
4051 }
4052
4053 break;
4054 }
4055 case ISD::LOAD:
4056 return performLoadCombine(N, DCI);
4057 case ISD::STORE:
4058 return performStoreCombine(N, DCI);
4059 case AMDGPUISD::RCP:
4060 case AMDGPUISD::RCP_IFLAG:
4061 return performRcpCombine(N, DCI);
4062 case ISD::AssertZext:
4063 case ISD::AssertSext:
4064 return performAssertSZExtCombine(N, DCI);
4065 case ISD::INTRINSIC_WO_CHAIN:
4066 return performIntrinsicWOChainCombine(N, DCI);
4067 }
4068 return SDValue();
4069}
4070
4071//===----------------------------------------------------------------------===//
4072// Helper functions
4073//===----------------------------------------------------------------------===//
4074
4075SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4076 const TargetRegisterClass *RC,
4077 unsigned Reg, EVT VT,
4078 const SDLoc &SL,
4079 bool RawReg) const {
4080 MachineFunction &MF = DAG.getMachineFunction();
4081 MachineRegisterInfo &MRI = MF.getRegInfo();
4082 unsigned VReg;
4083
4084 if (!MRI.isLiveIn(Reg)) {
4085 VReg = MRI.createVirtualRegister(RC);
4086 MRI.addLiveIn(Reg, VReg);
4087 } else {
4088 VReg = MRI.getLiveInVirtReg(Reg);
4089 }
4090
4091 if (RawReg)
4092 return DAG.getRegister(VReg, VT);
4093
4094 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4095}
4096
4097// This may be called multiple times, and nothing prevents creating multiple
4098// objects at the same offset. See if we already defined this object.
4099static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4100 int64_t Offset) {
4101 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4102 if (MFI.getObjectOffset(I) == Offset) {
4103      assert(MFI.getObjectSize(I) == Size);
4104 return I;
4105 }
4106 }
4107
4108 return MFI.CreateFixedObject(Size, Offset, true);
4109}
4110
4111SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4112 EVT VT,
4113 const SDLoc &SL,
4114 int64_t Offset) const {
4115 MachineFunction &MF = DAG.getMachineFunction();
4116 MachineFrameInfo &MFI = MF.getFrameInfo();
4117 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4118
4119 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4120 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4121
4122 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
4123 MachineMemOperand::MODereferenceable |
4124 MachineMemOperand::MOInvariant);
4125}
4126
4127SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4128 const SDLoc &SL,
4129 SDValue Chain,
4130 SDValue ArgVal,
4131 int64_t Offset) const {
4132 MachineFunction &MF = DAG.getMachineFunction();
4133 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4134
4135 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4136 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
4137 MachineMemOperand::MODereferenceable);
4138 return Store;
4139}
4140
4141SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4142 const TargetRegisterClass *RC,
4143 EVT VT, const SDLoc &SL,
4144 const ArgDescriptor &Arg) const {
4145 assert(Arg && "Attempting to load missing argument")((Arg && "Attempting to load missing argument") ? static_cast
<void> (0) : __assert_fail ("Arg && \"Attempting to load missing argument\""
, "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 4145, __PRETTY_FUNCTION__))
;
1. Assuming the condition is true
2. '?' condition is true
4146
4147 SDValue V = Arg.isRegister() ?
3. '?' condition is true
4148 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4149 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4150
4151 if (!Arg.isMasked())
4. Calling 'ArgDescriptor::isMasked'
7. Returning from 'ArgDescriptor::isMasked'
8. Taking false branch
4152 return V;
4153
4154 unsigned Mask = Arg.getMask();
4155 unsigned Shift = countTrailingZeros<unsigned>(Mask);
9. Calling 'countTrailingZeros<unsigned int>'
16. Returning from 'countTrailingZeros<unsigned int>'
17. 'Shift' initialized to 32
4156 V = DAG.getNode(ISD::SRL, SL, VT, V,
4157 DAG.getShiftAmountConstant(Shift, VT, SL));
4158 return DAG.getNode(ISD::AND, SL, VT, V,
4159 DAG.getConstant(Mask >> Shift, SL, VT));
18. The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4160}
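// The analyzer path above reaches the flagged line with Arg.getMask() == 0:
// countTrailingZeros<unsigned>(0) returns the bit width (32), so
// `Mask >> Shift` shifts a 32-bit value by 32, which is undefined behaviour.
// A minimal sketch of a guard (illustrative only, not the upstream fix, and
// assuming a zero mask is otherwise acceptable here):
//
//   unsigned Shift = Mask ? countTrailingZeros<unsigned>(Mask) : 0;
//   V = DAG.getNode(ISD::SRL, SL, VT, V,
//                   DAG.getShiftAmountConstant(Shift, VT, SL));
//   return DAG.getNode(ISD::AND, SL, VT, V,
//                      DAG.getConstant(Mask >> Shift, SL, VT));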
4161
4162uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4163 const MachineFunction &MF, const ImplicitParameter Param) const {
4164 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4165 const AMDGPUSubtarget &ST =
4166 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4167 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4168 unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
4169 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4170 ExplicitArgOffset;
4171 switch (Param) {
4172 case GRID_DIM:
4173 return ArgOffset;
4174 case GRID_OFFSET:
4175 return ArgOffset + 4;
4176 }
4177 llvm_unreachable("unexpected implicit parameter type")::llvm::llvm_unreachable_internal("unexpected implicit parameter type"
, "/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 4177)
;
4178}
4179
4180 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4181
4182const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4183 switch ((AMDGPUISD::NodeType)Opcode) {
4184 case AMDGPUISD::FIRST_NUMBER: break;
4185 // AMDIL DAG nodes
4186  NODE_NAME_CASE(UMUL);
4187  NODE_NAME_CASE(BRANCH_COND);
4188
4189  // AMDGPU DAG nodes
4190  NODE_NAME_CASE(IF)
4191  NODE_NAME_CASE(ELSE)
4192  NODE_NAME_CASE(LOOP)
4193  NODE_NAME_CASE(CALL)
4194  NODE_NAME_CASE(TC_RETURN)
4195  NODE_NAME_CASE(TRAP)
4196  NODE_NAME_CASE(RET_FLAG)
4197  NODE_NAME_CASE(RETURN_TO_EPILOG)
4198  NODE_NAME_CASE(ENDPGM)
4199  NODE_NAME_CASE(DWORDADDR)
4200  NODE_NAME_CASE(FRACT)
4201  NODE_NAME_CASE(SETCC)
4202  NODE_NAME_CASE(SETREG)
4203  NODE_NAME_CASE(DENORM_MODE)
4204  NODE_NAME_CASE(FMA_W_CHAIN)
4205  NODE_NAME_CASE(FMUL_W_CHAIN)
4206  NODE_NAME_CASE(CLAMP)
4207  NODE_NAME_CASE(COS_HW)
4208  NODE_NAME_CASE(SIN_HW)
4209  NODE_NAME_CASE(FMAX_LEGACY)
4210  NODE_NAME_CASE(FMIN_LEGACY)
4211  NODE_NAME_CASE(FMAX3)
4212  NODE_NAME_CASE(SMAX3)
4213  NODE_NAME_CASE(UMAX3)
4214  NODE_NAME_CASE(FMIN3)
4215  NODE_NAME_CASE(SMIN3)
4216  NODE_NAME_CASE(UMIN3)
4217  NODE_NAME_CASE(FMED3)
4218  NODE_NAME_CASE(SMED3)
4219  NODE_NAME_CASE(UMED3)
4220  NODE_NAME_CASE(FDOT2)
4221  NODE_NAME_CASE(URECIP)
4222  NODE_NAME_CASE(DIV_SCALE)
4223  NODE_NAME_CASE(DIV_FMAS)
4224  NODE_NAME_CASE(DIV_FIXUP)
4225  NODE_NAME_CASE(FMAD_FTZ)
4226  NODE_NAME_CASE(TRIG_PREOP)
4227  NODE_NAME_CASE(RCP)
4228  NODE_NAME_CASE(RSQ)
4229  NODE_NAME_CASE(RCP_LEGACY)
4230  NODE_NAME_CASE(RSQ_LEGACY)
4231  NODE_NAME_CASE(RCP_IFLAG)
4232  NODE_NAME_CASE(FMUL_LEGACY)
4233  NODE_NAME_CASE(RSQ_CLAMP)
4234  NODE_NAME_CASE(LDEXP)
4235  NODE_NAME_CASE(FP_CLASS)
4236  NODE_NAME_CASE(DOT4)
4237  NODE_NAME_CASE(CARRY)
4238  NODE_NAME_CASE(BORROW)
4239  NODE_NAME_CASE(BFE_U32)
4240  NODE_NAME_CASE(BFE_I32)
4241  NODE_NAME_CASE(BFI)
4242  NODE_NAME_CASE(BFM)
4243  NODE_NAME_CASE(FFBH_U32)
4244  NODE_NAME_CASE(FFBH_I32)
4245  NODE_NAME_CASE(FFBL_B32)
4246  NODE_NAME_CASE(MUL_U24)
4247  NODE_NAME_CASE(MUL_I24)
4248  NODE_NAME_CASE(MULHI_U24)
4249  NODE_NAME_CASE(MULHI_I24)
4250  NODE_NAME_CASE(MUL_LOHI_U24)
4251  NODE_NAME_CASE(MUL_LOHI_I24)
4252  NODE_NAME_CASE(MAD_U24)
4253  NODE_NAME_CASE(MAD_I24)
4254  NODE_NAME_CASE(MAD_I64_I32)
4255  NODE_NAME_CASE(MAD_U64_U32)
4256  NODE_NAME_CASE(PERM)
4257  NODE_NAME_CASE(TEXTURE_FETCH)
4258  NODE_NAME_CASE(EXPORT)
4259  NODE_NAME_CASE(EXPORT_DONE)
4260  NODE_NAME_CASE(R600_EXPORT)
4261  NODE_NAME_CASE(CONST_ADDRESS)
4262  NODE_NAME_CASE(REGISTER_LOAD)
4263  NODE_NAME_CASE(REGISTER_STORE)
4264  NODE_NAME_CASE(SAMPLE)
4265  NODE_NAME_CASE(SAMPLEB)
4266  NODE_NAME_CASE(SAMPLED)
4267  NODE_NAME_CASE(SAMPLEL)
4268  NODE_NAME_CASE(CVT_F32_UBYTE0)
4269  NODE_NAME_CASE(CVT_F32_UBYTE1)
4270  NODE_NAME_CASE(CVT_F32_UBYTE2)
4271  NODE_NAME_CASE(CVT_F32_UBYTE3)
4272  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4273  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4274  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4275  NODE_NAME_CASE(CVT_PK_I16_I32)
4276  NODE_NAME_CASE(CVT_PK_U16_U32)
4277  NODE_NAME_CASE(FP_TO_FP16)
4278  NODE_NAME_CASE(FP16_ZEXT)
4279  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4280  NODE_NAME_CASE(CONST_DATA_PTR)
4281  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4282  NODE_NAME_CASE(LDS)
4283  NODE_NAME_CASE(KILL)
4284  NODE_NAME_CASE(DUMMY_CHAIN)
4285  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4286  NODE_NAME_CASE(INTERP_MOV)
4287  NODE_NAME_CASE(INTERP_P1)
4288  NODE_NAME_CASE(INTERP_P2)
4289  NODE_NAME_CASE(INTERP_P1LL_F16)
4290  NODE_NAME_CASE(INTERP_P1LV_F16)
4291  NODE_NAME_CASE(INTERP_P2_F16)
4292  NODE_NAME_CASE(LOAD_D16_HI)
4293  NODE_NAME_CASE(LOAD_D16_LO)
4294  NODE_NAME_CASE(LOAD_D16_HI_I8)
4295  NODE_NAME_CASE(LOAD_D16_HI_U8)
4296  NODE_NAME_CASE(LOAD_D16_LO_I8)
4297  NODE_NAME_CASE(LOAD_D16_LO_U8)
4298  NODE_NAME_CASE(STORE_MSKOR)
4299  NODE_NAME_CASE(LOAD_CONSTANT)
4300  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4301  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4302  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4303  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4304  NODE_NAME_CASE(DS_ORDERED_COUNT)
4305  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4306  NODE_NAME_CASE(ATOMIC_INC)
4307  NODE_NAME_CASE(ATOMIC_DEC)
4308  NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4309  NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4310  NODE_NAME_CASE(BUFFER_LOAD)
4311  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4312  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4313  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4314  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4315  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4316  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4317  NODE_NAME_CASE(SBUFFER_LOAD)
4318  NODE_NAME_CASE(BUFFER_STORE)
4319  NODE_NAME_CASE(BUFFER_STORE_BYTE)
4320  NODE_NAME_CASE(BUFFER_STORE_SHORT)
4321  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4322  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4323  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4324  NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4325  NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4326  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4327  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4328  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4329  NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4330  NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4331  NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4332  NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4333  NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4334  NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4335  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4336  NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4337  NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
4338  NODE_NAME_CASE(ATOMIC_FADD)
4339  NODE_NAME_CASE(ATOMIC_PK_FADD)
4340
4341 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4342 }
4343 return nullptr;
4344}
4345
4346SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4347 SelectionDAG &DAG, int Enabled,
4348 int &RefinementSteps,
4349 bool &UseOneConstNR,
4350 bool Reciprocal) const {
4351 EVT VT = Operand.getValueType();
4352
4353 if (VT == MVT::f32) {
4354 RefinementSteps = 0;
4355 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4356 }
4357
4358  // TODO: There is also an f64 rsq instruction, but the documentation is less
4359 // clear on its precision.
4360
4361 return SDValue();
4362}
4363
4364SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4365 SelectionDAG &DAG, int Enabled,
4366 int &RefinementSteps) const {
4367 EVT VT = Operand.getValueType();
4368
4369 if (VT == MVT::f32) {
4370 // Reciprocal, < 1 ulp error.
4371 //
4372 // This reciprocal approximation converges to < 0.5 ulp error with one
4373    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4374
4375 RefinementSteps = 0;
4376 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4377 }
4378
4379  // TODO: There is also an f64 rcp instruction, but the documentation is less
4380 // clear on its precision.
4381
4382 return SDValue();
4383}
4384
4385void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4386 const SDValue Op, KnownBits &Known,
4387 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4388
4389 Known.resetAll(); // Don't know anything.
4390
4391 unsigned Opc = Op.getOpcode();
4392
4393 switch (Opc) {
4394 default:
4395 break;
4396 case AMDGPUISD::CARRY:
4397 case AMDGPUISD::BORROW: {
4398 Known.Zero = APInt::getHighBitsSet(32, 31);
4399 break;
4400 }
4401
4402 case AMDGPUISD::BFE_I32:
4403 case AMDGPUISD::BFE_U32: {
4404 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4405 if (!CWidth)
4406 return;
4407
4408 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4409
4410 if (Opc == AMDGPUISD::BFE_U32)
4411 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4412
4413 break;
4414 }
4415 case AMDGPUISD::FP_TO_FP16:
4416 case AMDGPUISD::FP16_ZEXT: {
4417 unsigned BitWidth = Known.getBitWidth();
4418
4419 // High bits are zero.
4420 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4421 break;
4422 }
4423 case AMDGPUISD::MUL_U24:
4424 case AMDGPUISD::MUL_I24: {
4425 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4426 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4427 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4428 RHSKnown.countMinTrailingZeros();
4429 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4430
4431 // Truncate to 24 bits.
4432 LHSKnown = LHSKnown.trunc(24);
4433 RHSKnown = RHSKnown.trunc(24);
4434
4435 bool Negative = false;
4436 if (Opc == AMDGPUISD::MUL_I24) {
4437 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
4438 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
4439 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4440 if (MaxValBits >= 32)
4441 break;
4442 bool LHSNegative = LHSKnown.isNegative();
4443 bool LHSPositive = LHSKnown.isNonNegative();
4444 bool RHSNegative = RHSKnown.isNegative();
4445 bool RHSPositive = RHSKnown.isNonNegative();
4446 if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
4447 break;
4448 Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
4449 if (Negative)
4450 Known.One.setHighBits(32 - MaxValBits);
4451 else
4452 Known.Zero.setHighBits(32 - MaxValBits);
4453 } else {
4454 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
4455 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
4456 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4457 if (MaxValBits >= 32)
4458 break;
4459 Known.Zero.setHighBits(32 - MaxValBits);
4460 }
4461 break;
4462 }
4463 case AMDGPUISD::PERM: {
4464 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4465 if (!CMask)
4466 return;
4467
4468 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4469 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4470 unsigned Sel = CMask->getZExtValue();
4471
4472 for (unsigned I = 0; I < 32; I += 8) {
4473 unsigned SelBits = Sel & 0xff;
4474 if (SelBits < 4) {
4475 SelBits *= 8;
4476 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4477 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4478 } else if (SelBits < 7) {
4479 SelBits = (SelBits & 3) * 8;
4480 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4481 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4482 } else if (SelBits == 0x0c) {
4483 Known.Zero |= 0xFFull << I;
4484 } else if (SelBits > 0x0c) {
4485 Known.One |= 0xFFull << I;
4486 }
4487 Sel >>= 8;
4488 }
4489 break;
4490 }
4491 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4492 Known.Zero.setHighBits(24);
4493 break;
4494 }
4495 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4496 Known.Zero.setHighBits(16);
4497 break;
4498 }
4499 case AMDGPUISD::LDS: {
4500 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4501 unsigned Align = GA->getGlobal()->getAlignment();
4502
4503 Known.Zero.setHighBits(16);
4504 if (Align)
4505 Known.Zero.setLowBits(Log2_32(Align));
4506 break;
4507 }
4508 case ISD::INTRINSIC_WO_CHAIN: {
4509 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4510 switch (IID) {
4511 case Intrinsic::amdgcn_mbcnt_lo:
4512 case Intrinsic::amdgcn_mbcnt_hi: {
4513 const GCNSubtarget &ST =
4514 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4515 // These return at most the wavefront size - 1.
4516 unsigned Size = Op.getValueType().getSizeInBits();
4517 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4518 break;
4519 }
4520 default:
4521 break;
4522 }
4523 }
4524 }
4525}
4526
4527unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4528 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4529 unsigned Depth) const {
4530 switch (Op.getOpcode()) {
4531 case AMDGPUISD::BFE_I32: {
4532 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4533 if (!Width)
4534 return 1;
4535
4536 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4537 if (!isNullConstant(Op.getOperand(1)))
4538 return SignBits;
4539
4540 // TODO: Could probably figure something out with non-0 offsets.
4541 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4542 return std::max(SignBits, Op0SignBits);
4543 }
4544
4545 case AMDGPUISD::BFE_U32: {
4546 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4547 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4548 }
4549
4550 case AMDGPUISD::CARRY:
4551 case AMDGPUISD::BORROW:
4552 return 31;
4553 case AMDGPUISD::BUFFER_LOAD_BYTE:
4554 return 25;
4555 case AMDGPUISD::BUFFER_LOAD_SHORT:
4556 return 17;
4557 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4558 return 24;
4559 case AMDGPUISD::BUFFER_LOAD_USHORT:
4560 return 16;
4561 case AMDGPUISD::FP_TO_FP16:
4562 case AMDGPUISD::FP16_ZEXT:
4563 return 16;
4564 default:
4565 return 1;
4566 }
4567}
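
A worked example for the BFE_I32 case above (illustrative, not from the source): a bitfield extract with a constant width of 8 and a zero offset produces a sign-extended 8-bit field, so the node has at least 32 - 8 + 1 = 25 sign bits; taking the max() against the sign bits already known for operand 0 can only raise that bound. The fixed returns further down follow the same arithmetic, e.g. BUFFER_LOAD_BYTE (a sign-extending 8-bit load) reports 25 and BUFFER_LOAD_SHORT reports 17.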
4568
4569bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4570 const SelectionDAG &DAG,
4571 bool SNaN,
4572 unsigned Depth) const {
4573 unsigned Opcode = Op.getOpcode();
4574 switch (Opcode) {
4575 case AMDGPUISD::FMIN_LEGACY:
4576 case AMDGPUISD::FMAX_LEGACY: {
4577 if (SNaN)
4578 return true;
4579
4580 // TODO: Can check no nans on one of the operands for each one, but which
4581 // one?
4582 return false;
4583 }
4584 case AMDGPUISD::FMUL_LEGACY:
4585 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4586 if (SNaN)
4587 return true;
4588 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4589 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4590 }
4591 case AMDGPUISD::FMED3:
4592 case AMDGPUISD::FMIN3:
4593 case AMDGPUISD::FMAX3:
4594 case AMDGPUISD::FMAD_FTZ: {
4595 if (SNaN)
4596 return true;
4597 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4598 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4599 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4600 }
4601 case AMDGPUISD::CVT_F32_UBYTE0:
4602 case AMDGPUISD::CVT_F32_UBYTE1:
4603 case AMDGPUISD::CVT_F32_UBYTE2:
4604 case AMDGPUISD::CVT_F32_UBYTE3:
4605 return true;
4606
4607 case AMDGPUISD::RCP:
4608 case AMDGPUISD::RSQ:
4609 case AMDGPUISD::RCP_LEGACY:
4610 case AMDGPUISD::RSQ_LEGACY:
4611 case AMDGPUISD::RSQ_CLAMP: {
4612 if (SNaN)
4613 return true;
4614
4615 // TODO: Need is known positive check.
4616 return false;
4617 }
4618 case AMDGPUISD::LDEXP:
4619 case AMDGPUISD::FRACT: {
4620 if (SNaN)
4621 return true;
4622 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4623 }
4624 case AMDGPUISD::DIV_SCALE:
4625 case AMDGPUISD::DIV_FMAS:
4626 case AMDGPUISD::DIV_FIXUP:
4627 case AMDGPUISD::TRIG_PREOP:
4628 // TODO: Refine on operands.
4629 return SNaN;
4630 case AMDGPUISD::SIN_HW:
4631 case AMDGPUISD::COS_HW: {
4632 // TODO: Need check for infinity
4633 return SNaN;
4634 }
4635 case ISD::INTRINSIC_WO_CHAIN: {
4636 unsigned IntrinsicID
4637 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4638 // TODO: Handle more intrinsics
4639 switch (IntrinsicID) {
4640 case Intrinsic::amdgcn_cubeid:
4641 return true;
4642
4643 case Intrinsic::amdgcn_frexp_mant: {
4644 if (SNaN)
4645 return true;
4646 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4647 }
4648 case Intrinsic::amdgcn_cvt_pkrtz: {
4649 if (SNaN)
4650 return true;
4651 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4652 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4653 }
4654 case Intrinsic::amdgcn_fdot2:
4655 // TODO: Refine on operand
4656 return SNaN;
4657 default:
4658 return false;
4659 }
4660 }
4661 default:
4662 return false;
4663 }
4664}
4665
4666TargetLowering::AtomicExpansionKind
4667AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4668 switch (RMW->getOperation()) {
4669 case AtomicRMWInst::Nand:
4670 case AtomicRMWInst::FAdd:
4671 case AtomicRMWInst::FSub:
4672 return AtomicExpansionKind::CmpXChg;
4673 default:
4674 return AtomicExpansionKind::None;
4675 }
4676}

/build/llvm-toolchain-snapshot-10~svn374877/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72    assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77    assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
5
Assuming the condition is true
6
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
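
Note that nothing in the struct above constrains Mask: the constructor and createArg() accept any value, and isMasked() treats everything other than ~0u as masked, including 0. Consumers such as loadInputValue() in the report above implicitly assume a non-empty, contiguous (shifted) mask. A minimal sketch of a validation one could apply at construction time, using the isShiftedMask_32() helper from MathExtras.h listed later in this report (the wrapper function here is hypothetical, not part of the header):

#include "llvm/Support/MathExtras.h"

// Hypothetical guard: a mask of 0 fails isShiftedMask_32(), which is exactly
// the value that leads to the shift-by-32 reported at line 4159.
static ArgDescriptor createMaskedRegister(Register Reg, unsigned Mask) {
  assert(llvm::isShiftedMask_32(Mask) &&
         "mask must be a non-empty contiguous run of bits");
  return ArgDescriptor::createRegister(Reg, Mask);
}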
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181      assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-10~svn374877/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include "llvm/Support/SwapByteOrder.h"
18#include <algorithm>
19#include <cassert>
20#include <climits>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116     if (ZB != ZB_Undefined && Val == 0)
10.1
'ZB' is not equal to ZB_Undefined
11
Assuming 'Val' is equal to 0
12
Taking true branch
117 return 32;
13
Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
10
Calling 'TrailingZerosCounter::count'
14
Returning from 'TrailingZerosCounter::count'
15
Returning the value 32
161}
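
Usage note (illustrative, not part of MathExtras.h): with the default ZB_Width behavior a zero input returns the full bit width, which is the value 32 seen in steps 13-17 of the report above. Callers that feed the result straight into a shift must therefore exclude zero first, or pass ZB_Undefined only when zero is already impossible.

#include <cassert>

void countTrailingZerosExamples() {
  assert(llvm::countTrailingZeros(0x00000400u) == 10u);
  assert(llvm::countTrailingZeros(0u) == 32u);  // ZB_Width: full width for zero
  // countTrailingZeros(0u, llvm::ZB_Undefined) has an undefined result; only
  // use ZB_Undefined when the argument is known to be non-zero.
}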
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250   const unsigned Bits = CHAR_BIT * sizeof(T);
251   assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258   return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264   return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270   return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315// NOTE: The following support functions use the _32/_64 extensions instead of
316// type overloading so that signed and unsigned integers can be used without
317// ambiguity.
318
319/// Return the high 32 bits of a 64 bit value.
320constexpr inline uint32_t Hi_32(uint64_t Value) {
321 return static_cast<uint32_t>(Value >> 32);
322}
323
324/// Return the low 32 bits of a 64 bit value.
325constexpr inline uint32_t Lo_32(uint64_t Value) {
326 return static_cast<uint32_t>(Value);
327}
328
329/// Make a 64-bit integer from a high / low pair of 32-bit integers.
330constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
331 return ((uint64_t)High << 32) | (uint64_t)Low;
332}
333
334/// Checks if an integer fits into the given bit width.
335template <unsigned N> constexpr inline bool isInt(int64_t x) {
336   return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
337}
338// Template specializations to get better code for common cases.
339template <> constexpr inline bool isInt<8>(int64_t x) {
340 return static_cast<int8_t>(x) == x;
341}
342template <> constexpr inline bool isInt<16>(int64_t x) {
343 return static_cast<int16_t>(x) == x;
344}
345template <> constexpr inline bool isInt<32>(int64_t x) {
346 return static_cast<int32_t>(x) == x;
347}
348
349/// Checks if a signed integer is an N bit number shifted left by S.
350template <unsigned N, unsigned S>
351constexpr inline bool isShiftedInt(int64_t x) {
352 static_assert(
353 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
354 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
355   return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
356}
357
358/// Checks if an unsigned integer fits into the given bit width.
359///
360/// This is written as two functions rather than as simply
361///
362/// return N >= 64 || X < (UINT64_C(1) << N);
363///
364/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
365/// left too many places.
366template <unsigned N>
367constexpr inline typename std::enable_if<(N < 64), bool>::type
368isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370   return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline typename std::enable_if<N >= 64, bool>::type
374isUInt(uint64_t X) {
375 return true;
376}
377
378// Template specializations to get better code for common cases.
379template <> constexpr inline bool isUInt<8>(uint64_t x) {
380 return static_cast<uint8_t>(x) == x;
381}
382template <> constexpr inline bool isUInt<16>(uint64_t x) {
383 return static_cast<uint16_t>(x) == x;
384}
385template <> constexpr inline bool isUInt<32>(uint64_t x) {
386 return static_cast<uint32_t>(x) == x;
387}
388
389/// Checks if a unsigned integer is an N bit number shifted left by S.
390template <unsigned N, unsigned S>
391constexpr inline bool isShiftedUInt(uint64_t x) {
392 static_assert(
393 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
394 static_assert(N + S <= 64,
395 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
396 // Per the two static_asserts above, S must be strictly less than 64. So
397 // 1 << S is not undefined behavior.
398   return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
399}
400
401/// Gets the maximum value for a N-bit unsigned integer.
402inline uint64_t maxUIntN(uint64_t N) {
403   assert(N > 0 && N <= 64 && "integer width out of range");
404
405 // uint64_t(1) << 64 is undefined behavior, so we can't do
406 // (uint64_t(1) << N) - 1
407 // without checking first that N != 64. But this works and doesn't have a
408 // branch.
409   return UINT64_MAX >> (64 - N);
410}
411
412/// Gets the minimum value for a N-bit signed integer.
413inline int64_t minIntN(int64_t N) {
414   assert(N > 0 && N <= 64 && "integer width out of range");
415
416   return -(UINT64_C(1)<<(N-1));
417}
418
419/// Gets the maximum value for a N-bit signed integer.
420inline int64_t maxIntN(int64_t N) {
421   assert(N > 0 && N <= 64 && "integer width out of range");
422
423 // This relies on two's complement wraparound when N == 64, so we convert to
424 // int64_t only at the very end to avoid UB.
425   return (UINT64_C(1) << (N - 1)) - 1;
426}
427
428/// Checks if an unsigned integer fits into the given (dynamic) bit width.
429inline bool isUIntN(unsigned N, uint64_t x) {
430 return N >= 64 || x <= maxUIntN(N);
431}
432
433/// Checks if an signed integer fits into the given (dynamic) bit width.
434inline bool isIntN(unsigned N, int64_t x) {
435 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
436}
437
438/// Return true if the argument is a non-empty sequence of ones starting at the
439/// least significant bit with the remainder zero (32 bit version).
440/// Ex. isMask_32(0x0000FFFFU) == true.
441constexpr inline bool isMask_32(uint32_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument is a non-empty sequence of ones starting at the
446/// least significant bit with the remainder zero (64 bit version).
447constexpr inline bool isMask_64(uint64_t Value) {
448 return Value && ((Value + 1) & Value) == 0;
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
453constexpr inline bool isShiftedMask_32(uint32_t Value) {
454 return Value && isMask_32((Value - 1) | Value);
455}
456
457/// Return true if the argument contains a non-empty sequence of ones with the
458/// remainder zero (64 bit version.)
459constexpr inline bool isShiftedMask_64(uint64_t Value) {
460 return Value && isMask_64((Value - 1) | Value);
461}
462
463/// Return true if the argument is a power of two > 0.
464/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
465constexpr inline bool isPowerOf2_32(uint32_t Value) {
466 return Value && !(Value & (Value - 1));
467}
468
469/// Return true if the argument is a power of two > 0 (64 bit edition.)
470constexpr inline bool isPowerOf2_64(uint64_t Value) {
471 return Value && !(Value & (Value - 1));
472}
473
474/// Return a byte-swapped representation of the 16-bit argument.
475inline uint16_t ByteSwap_16(uint16_t Value) {
476 return sys::SwapByteOrder_16(Value);
477}
478
479/// Return a byte-swapped representation of the 32-bit argument.
480inline uint32_t ByteSwap_32(uint32_t Value) {
481 return sys::SwapByteOrder_32(Value);
482}
483
484/// Return a byte-swapped representation of the 64-bit argument.
485inline uint64_t ByteSwap_64(uint64_t Value) {
486 return sys::SwapByteOrder_64(Value);
487}
488
489/// Count the number of ones from the most significant bit to the first
490/// zero bit.
491///
492/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countLeadingZeros<T>(~Value, ZB);
503}
504
505/// Count the number of ones from the least significant bit to the first
506/// zero bit.
507///
508/// Ex. countTrailingOnes(0x00FF00FF) == 8.
509/// Only unsigned integral types are allowed.
510///
511/// \param ZB the behavior on an input of all ones. Only ZB_Width and
512/// ZB_Undefined are valid arguments.
513template <typename T>
514unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
515 static_assert(std::numeric_limits<T>::is_integer &&
516 !std::numeric_limits<T>::is_signed,
517 "Only unsigned integral types are allowed.");
518 return countTrailingZeros<T>(~Value, ZB);
519}
520
521namespace detail {
522template <typename T, std::size_t SizeOfT> struct PopulationCounter {
523 static unsigned count(T Value) {
524 // Generic version, forward to 32 bits.
525 static_assert(SizeOfT <= 4, "Not implemented!");
526#if defined(__GNUC__)
527 return __builtin_popcount(Value);
528#else
529 uint32_t v = Value;
530 v = v - ((v >> 1) & 0x55555555);
531 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
532 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
533#endif
534 }
535};
536
537template <typename T> struct PopulationCounter<T, 8> {
538 static unsigned count(T Value) {
539#if defined(__GNUC__)
540 return __builtin_popcountll(Value);
541#else
542 uint64_t v = Value;
543 v = v - ((v >> 1) & 0x5555555555555555ULL);
544 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
545 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
546 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
547#endif
548 }
549};
550} // namespace detail
551
552/// Count the number of set bits in a value.
553/// Ex. countPopulation(0xF000F000) = 8
554/// Returns 0 if the word is zero.
555template <typename T>
556inline unsigned countPopulation(T Value) {
557 static_assert(std::numeric_limits<T>::is_integer &&
558 !std::numeric_limits<T>::is_signed,
559 "Only unsigned integral types are allowed.");
560 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
561}
562
563/// Compile time Log2.
564/// Valid only for positive powers of two.
565template <size_t kValue> constexpr inline size_t CTLog2() {
566 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
567 "Value is not a valid power of 2");
568 return 1 + CTLog2<kValue / 2>();
569}
570
571template <> constexpr inline size_t CTLog2<1>() { return 0; }
572
573/// Return the log base 2 of the specified value.
574inline double Log2(double Value) {
575#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
576 return __builtin_log(Value) / __builtin_log(2.0);
577#else
578 return log2(Value);
579#endif
580}
581
582/// Return the floor log base 2 of the specified value, -1 if the value is zero.
583/// (32 bit edition.)
584/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
585inline unsigned Log2_32(uint32_t Value) {
586 return 31 - countLeadingZeros(Value);
587}
588
589/// Return the floor log base 2 of the specified value, -1 if the value is zero.
590/// (64 bit edition.)
591inline unsigned Log2_64(uint64_t Value) {
592 return 63 - countLeadingZeros(Value);
593}
594
595/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
596/// (32 bit edition).
597/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
598inline unsigned Log2_32_Ceil(uint32_t Value) {
599 return 32 - countLeadingZeros(Value - 1);
600}
601
602/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
603/// (64 bit edition.)
604inline unsigned Log2_64_Ceil(uint64_t Value) {
605 return 64 - countLeadingZeros(Value - 1);
606}
607
608/// Return the greatest common divisor of the values using Euclid's algorithm.
609template <typename T>
610inline T greatestCommonDivisor(T A, T B) {
611 while (B) {
612 T Tmp = B;
613 B = A % B;
614 A = Tmp;
615 }
616 return A;
617}
618
619inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
620 return greatestCommonDivisor<uint64_t>(A, B);
621}
622
623/// This function takes a 64-bit integer and returns the bit equivalent double.
624inline double BitsToDouble(uint64_t Bits) {
625 double D;
626 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
627 memcpy(&D, &Bits, sizeof(Bits));
628 return D;
629}
630
631/// This function takes a 32-bit integer and returns the bit equivalent float.
632inline float BitsToFloat(uint32_t Bits) {
633 float F;
634 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
635 memcpy(&F, &Bits, sizeof(Bits));
636 return F;
637}
638
639/// This function takes a double and returns the bit equivalent 64-bit integer.
640/// Note that copying doubles around changes the bits of NaNs on some hosts,
641/// notably x86, so this routine cannot be used if these bits are needed.
642inline uint64_t DoubleToBits(double Double) {
643 uint64_t Bits;
644 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
645 memcpy(&Bits, &Double, sizeof(Double));
646 return Bits;
647}
648
649/// This function takes a float and returns the bit equivalent 32-bit integer.
650/// Note that copying floats around changes the bits of NaNs on some hosts,
651/// notably x86, so this routine cannot be used if these bits are needed.
652inline uint32_t FloatToBits(float Float) {
653 uint32_t Bits;
654 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
655 memcpy(&Bits, &Float, sizeof(Float));
656 return Bits;
657}
658
659/// A and B are either alignments or offsets. Return the minimum alignment that
660/// may be assumed after adding the two together.
661constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
662 // The largest power of 2 that divides both A and B.
663 //
664 // Replace "-Value" by "1+~Value" in the following commented code to avoid
665 // MSVC warning C4146
666 // return (A | B) & -(A | B);
667 return (A | B) & (1 + ~(A | B));
668}
669
670/// Returns the next power of two (in 64-bits) that is strictly greater than A.
671/// Returns zero on overflow.
672inline uint64_t NextPowerOf2(uint64_t A) {
673 A |= (A >> 1);
674 A |= (A >> 2);
675 A |= (A >> 4);
676 A |= (A >> 8);
677 A |= (A >> 16);
678 A |= (A >> 32);
679 return A + 1;
680}
681
682/// Returns the power of two which is less than or equal to the given value.
683/// Essentially, it is a floor operation across the domain of powers of two.
684inline uint64_t PowerOf2Floor(uint64_t A) {
685 if (!A) return 0;
686 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
687}
688
689/// Returns the power of two which is greater than or equal to the given value.
690/// Essentially, it is a ceil operation across the domain of powers of two.
691inline uint64_t PowerOf2Ceil(uint64_t A) {
692 if (!A)
693 return 0;
694 return NextPowerOf2(A - 1);
695}
696
697/// Returns the next integer (mod 2**64) that is greater than or equal to
698/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
699///
700/// If non-zero \p Skew is specified, the return value will be a minimal
701/// integer that is greater than or equal to \p Value and equal to
702/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
703/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
704///
705/// Examples:
706/// \code
707/// alignTo(5, 8) = 8
708/// alignTo(17, 8) = 24
709/// alignTo(~0LL, 8) = 0
710/// alignTo(321, 255) = 510
711///
712/// alignTo(5, 8, 7) = 7
713/// alignTo(17, 8, 1) = 17
714/// alignTo(~0LL, 8, 3) = 3
715/// alignTo(321, 255, 42) = 552
716/// \endcode
717inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
718   assert(Align != 0u && "Align can't be 0.");
719 Skew %= Align;
720 return (Value + Align - 1 - Skew) / Align * Align + Skew;
721}
722
723/// Returns the next integer (mod 2**64) that is greater than or equal to
724/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
725template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
726 static_assert(Align != 0u, "Align must be non-zero");
727 return (Value + Align - 1) / Align * Align;
728}
729
730/// Returns the integer ceil(Numerator / Denominator).
731inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
732 return alignTo(Numerator, Denominator) / Denominator;
733}
734
735/// Returns the largest uint64_t less than or equal to \p Value that is
736/// congruent to \p Skew modulo \p Align. \p Align must be non-zero.
737inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
738 assert(Align != 0u && "Align can't be 0.");
739 Skew %= Align;
740 return (Value - Skew) / Align * Align + Skew;
741}
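
alignDown mirrors alignTo: it rounds Value down to the largest Align * N + Skew that does not exceed it. A few whole-number cases (with Value at least Skew), as an editorial sketch with an assumed include path:

// Editorial sketch: rounding down to a multiple of Align, optionally skewed.
#include <cassert>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::alignDown(17, 8) == 16);
  assert(llvm::alignDown(321, 255) == 255);
  assert(llvm::alignDown(17, 8, 1) == 17);       // 17 = 8 * 2 + 1 is already aligned
  assert(llvm::alignDown(321, 255, 42) == 297);  // 297 = 255 * 1 + 42
  return 0;
}
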
742
743/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
744/// Requires 0 < B <= 32.
745template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
746 static_assert(B > 0, "Bit width can't be 0.");
747 static_assert(B <= 32, "Bit width out of range.");
748 return int32_t(X << (32 - B)) >> (32 - B);
749}
750
751/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
752/// Requires 0 < B <= 32.
753inline int32_t SignExtend32(uint32_t X, unsigned B) {
754 assert(B > 0 && "Bit width can't be 0.");
755 assert(B <= 32 && "Bit width out of range.");
756 return int32_t(X << (32 - B)) >> (32 - B);
757}
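
Both SignExtend32 overloads use the shift-up/arithmetic-shift-down idiom: the B-bit field is moved into the top bits of the word and then shifted back as a signed value, so the field's top bit is replicated. The checks below are an editorial sketch with an assumed include path; only the templated overload is constexpr.

// Editorial sketch: sign-extending narrow bitfields to int32_t.
#include <cassert>
#include "llvm/Support/MathExtras.h"

static_assert(llvm::SignExtend32<4>(0xA) == -6, "0b1010 as a 4-bit value is -6");

int main() {
  assert(llvm::SignExtend32(0xFFu, 8) == -1);   // all eight bits set -> -1
  assert(llvm::SignExtend32(0x7Fu, 8) == 127);  // top bit clear, value unchanged
  return 0;
}
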
758
759/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
760/// Requires 0 < B <= 64.
761template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 64, "Bit width out of range.");
764 return int64_t(x << (64 - B)) >> (64 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
768/// Requires 0 < B <= 64.
769inline int64_t SignExtend64(uint64_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 64 && "Bit width out of range.");
772 return int64_t(X << (64 - B)) >> (64 - B);
773}
774
775/// Subtract two unsigned integers, X and Y, of type T and return the absolute
776/// value of the result.
777template <typename T>
778typename std::enable_if<std::is_unsigned<T>::value, T>::type
779AbsoluteDifference(T X, T Y) {
780 return std::max(X, Y) - std::min(X, Y);
781}
782
783/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
784/// maximum representable value of T on overflow. ResultOverflowed indicates if
785/// the result is larger than the maximum representable value of type T.
786template <typename T>
787typename std::enable_if<std::is_unsigned<T>::value, T>::type
788SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
789 bool Dummy;
790 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
791 // Hacker's Delight, p. 29
792 T Z = X + Y;
793 Overflowed = (Z < X || Z < Y);
794 if (Overflowed)
795 return std::numeric_limits<T>::max();
796 else
797 return Z;
798}
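
The overflow test Z < X || Z < Y works because unsigned addition wraps modulo 2^N: the truncated sum is smaller than an operand exactly when a carry out of the top bit occurred. In the sketch below (editorial, include path assumed), T is spelled out explicitly because deducing it from plain integer literals would pick a signed type and fail the enable_if constraint.

// Editorial sketch: clamping unsigned addition at the type's maximum.
#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  bool Overflowed = false;
  assert(llvm::SaturatingAdd<uint8_t>(100, 100, &Overflowed) == 200 && !Overflowed);
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 && Overflowed);
  return 0;
}
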
799
800/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
801/// maximum representable value of T on overflow. ResultOverflowed indicates if
802/// the result is larger than the maximum representable value of type T.
803template <typename T>
804typename std::enable_if<std::is_unsigned<T>::value, T>::type
805SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
806 bool Dummy;
807 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
808
809 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
810 // because it fails for uint16_t (where multiplication can have undefined
811 // behavior due to promotion to int), and requires a division in addition
812 // to the multiplication.
813
814 Overflowed = false;
815
816 // Log2(Z) would be either Log2Z or Log2Z + 1.
817 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
818 // will necessarily be less than Log2Max as desired.
819 int Log2Z = Log2_64(X) + Log2_64(Y);
820 const T Max = std::numeric_limits<T>::max();
821 int Log2Max = Log2_64(Max);
822 if (Log2Z < Log2Max) {
823 return X * Y;
824 }
825 if (Log2Z > Log2Max) {
826 Overflowed = true;
827 return Max;
828 }
829
830 // We're going to use the top bit, and maybe overflow one
831 // bit past it. Multiply all but the bottom bit then add
832 // that on at the end.
833 T Z = (X >> 1) * Y;
834 if (Z & ~(Max >> 1)) {
835 Overflowed = true;
836 return Max;
837 }
838 Z <<= 1;
839 if (X & 1)
840 return SaturatingAdd(Z, Y, ResultOverflowed);
841
842 return Z;
843}
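
The Log2-based pre-check settles most inputs without a full-width multiply: when Log2(X) + Log2(Y) is strictly less than Log2(Max) the product certainly fits, when it is strictly greater the product certainly overflows, and only the borderline case falls through to the halve-multiply-and-add path. For example (editorial sketch, include path assumed):

// Editorial sketch: clamping unsigned multiplication at the type's maximum.
#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  bool Overflowed = false;
  assert(llvm::SaturatingMultiply<uint8_t>(15, 15, &Overflowed) == 225 && !Overflowed);
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 && Overflowed);
  return 0;
}
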
844
845/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
846/// the product. Clamp the result to the maximum representable value of T on
847/// overflow. ResultOverflowed indicates if the result is larger than the
848/// maximum representable value of type T.
849template <typename T>
850typename std::enable_if<std::is_unsigned<T>::value, T>::type
851SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
852 bool Dummy;
853 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
854
855 T Product = SaturatingMultiply(X, Y, &Overflowed);
856 if (Overflowed)
857 return Product;
858
859 return SaturatingAdd(A, Product, &Overflowed);
860}
861
862/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
863extern const float huge_valf;
864
865
866/// Add two signed integers, computing the two's complement truncated result,
867/// returning true if overflow occurred.
868template <typename T>
869typename std::enable_if<std::is_signed<T>::value, T>::type
870AddOverflow(T X, T Y, T &Result) {
871#if __has_builtin(__builtin_add_overflow)
872 return __builtin_add_overflow(X, Y, &Result);
873#else
874 // Perform the unsigned addition.
875 using U = typename std::make_unsigned<T>::type;
876 const U UX = static_cast<U>(X);
877 const U UY = static_cast<U>(Y);
878 const U UResult = UX + UY;
879
880 // Convert to signed.
881 Result = static_cast<T>(UResult);
882
883 // Adding two positive numbers should result in a positive number.
884 if (X > 0 && Y > 0)
885 return Result <= 0;
886 // Adding two negatives should result in a negative number.
887 if (X < 0 && Y < 0)
888 return Result >= 0;
889 return false;
890#endif
891}
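
When __builtin_add_overflow is unavailable, the fallback adds in the unsigned domain, where wraparound is well defined, and then infers signed overflow from the signs: two positive operands must not yield a non-positive result, and two negative operands must not yield a non-negative one. Note that in this revision the declared return type is T rather than bool, but the returned value is nonzero exactly when overflow occurred. A small check (editorial sketch, include path assumed):

// Editorial sketch: detecting signed addition overflow alongside the truncated result.
#include <cassert>
#include <cstdint>
#include <limits>
#include "llvm/Support/MathExtras.h"

int main() {
  int32_t Result = 0;
  const int32_t Max = std::numeric_limits<int32_t>::max();
  assert(!llvm::AddOverflow<int32_t>(Max - 1, 1, Result) && Result == Max);
  assert(llvm::AddOverflow<int32_t>(Max, 1, Result));  // wraps around; overflow reported
  return 0;
}
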
892
893/// Subtract two signed integers, computing the two's complement truncated
894/// result, returning true if an overflow occurred.
895template <typename T>
896typename std::enable_if<std::is_signed<T>::value, T>::type
897SubOverflow(T X, T Y, T &Result) {
898#if __has_builtin(__builtin_sub_overflow)
899 return __builtin_sub_overflow(X, Y, &Result);
900#else
901 // Perform the unsigned subtraction.
902 using U = typename std::make_unsigned<T>::type;
903 const U UX = static_cast<U>(X);
904 const U UY = static_cast<U>(Y);
905 const U UResult = UX - UY;
906
907 // Convert to signed.
908 Result = static_cast<T>(UResult);
909
910 // Subtracting a positive number from a non-positive one must give a negative result.
911 if (X <= 0 && Y > 0)
912 return Result >= 0;
913 // Subtracting a negative number from a non-negative one must give a positive result.
914 if (X >= 0 && Y < 0)
915 return Result <= 0;
916 return false;
917#endif
918}
919
920
921/// Multiply two signed integers, computing the two's complement truncated
922/// result, returning true if an overflow occurred.
923template <typename T>
924typename std::enable_if<std::is_signed<T>::value, T>::type
925MulOverflow(T X, T Y, T &Result) {
926 // Perform the unsigned multiplication on absolute values.
927 using U = typename std::make_unsigned<T>::type;
928 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
929 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
930 const U UResult = UX * UY;
931
932 // Convert to signed.
933 const bool IsNegative = (X < 0) ^ (Y < 0);
934 Result = IsNegative ? (0 - UResult) : UResult;
935
936 // If any of the args was 0, result is 0 and no overflow occurs.
937 if (UX == 0 || UY == 0)
938 return false;
939
940 // UX and UY are in [1, 2^n], where n is the number of digits.
941 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
942 // positive) divided by an argument compares to the other.
943 if (IsNegative)
944 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
945 else
946 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
947}
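
MulOverflow needs no compiler builtin: it multiplies the magnitudes as unsigned values and then checks one magnitude against the largest allowed magnitude divided by the other, using 2^(n-1) for a negative result and 2^(n-1) - 1 for a non-negative one. On 32 bits, 46340^2 still fits in int32_t while 46341^2 does not, and INT32_MIN itself is representable even though its positive counterpart is not (editorial sketch, include path assumed):

// Editorial sketch: division-based check for signed multiplication overflow.
#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  int32_t Result = 0;
  assert(!llvm::MulOverflow<int32_t>(46340, 46340, Result) && Result == 2147395600);
  assert(llvm::MulOverflow<int32_t>(46341, 46341, Result));   // exceeds INT32_MAX
  assert(llvm::MulOverflow<int32_t>(65536, 32768, Result));   // 2^31 exceeds INT32_MAX
  assert(!llvm::MulOverflow<int32_t>(-65536, 32768, Result) && Result == INT32_MIN);
  return 0;
}
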
948
949} // End llvm namespace
950
951#endif