Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4202, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
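
This diagnostic refers to C++ undefined behavior: shifting a value by an amount greater than or equal to the bit width of its (promoted) type is undefined, so right-shifting an unsigned int by 32 has no defined result. The code at line 4202 is not shown in this excerpt; the following is only a minimal, hypothetical sketch of the flagged pattern and one common guard, with illustrative names (highBits, Width) that do not come from the report:

    #include <cstdint>

    // Hypothetical pattern: when Width is 0, the shift amount becomes 32,
    // which equals the bit width of 'unsigned int' and is undefined behavior.
    uint32_t highBits(uint32_t Value, unsigned Width) {
      return Value >> (32u - Width); // UB when Width == 0 (shift by 32)
    }

    // Guarded variant: keep the shift amount strictly below 32.
    uint32_t highBitsGuarded(uint32_t Value, unsigned Width) {
      if (Width == 0)
        return 0;
      return Value >> (32u - Width);
    }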

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/include -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-01-13-084841-49055-1 -x c++ /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUCallLowering.h"
18#include "AMDGPUFrameLowering.h"
19#include "AMDGPURegisterInfo.h"
20#include "AMDGPUSubtarget.h"
21#include "AMDGPUTargetMachine.h"
22#include "Utils/AMDGPUBaseInfo.h"
23#include "R600MachineFunctionInfo.h"
24#include "SIInstrInfo.h"
25#include "SIMachineFunctionInfo.h"
26#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
27#include "llvm/CodeGen/Analysis.h"
28#include "llvm/CodeGen/CallingConvLower.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineRegisterInfo.h"
31#include "llvm/CodeGen/SelectionDAG.h"
32#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
33#include "llvm/IR/DataLayout.h"
34#include "llvm/IR/DiagnosticInfo.h"
35#include "llvm/Support/KnownBits.h"
36#include "llvm/Support/MathExtras.h"
37using namespace llvm;
38
39#include "AMDGPUGenCallingConv.inc"
40
41// Find a larger type to do a load / store of a vector with.
42EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
43 unsigned StoreSize = VT.getStoreSizeInBits();
44 if (StoreSize <= 32)
45 return EVT::getIntegerVT(Ctx, StoreSize);
46
47 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
48 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
49}
50
51unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 EVT VT = Op.getValueType();
53 KnownBits Known = DAG.computeKnownBits(Op);
54 return VT.getSizeInBits() - Known.countMinLeadingZeros();
55}
56
57unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
58 EVT VT = Op.getValueType();
59
60 // In order for this to be a signed 24-bit value, bit 23, must
61 // be a sign bit.
62 return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
63}
64
65AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
66 const AMDGPUSubtarget &STI)
67 : TargetLowering(TM), Subtarget(&STI) {
68 // Lower floating point store/load to integer store/load to reduce the number
69 // of patterns in tablegen.
70 setOperationAction(ISD::LOAD, MVT::f32, Promote);
71 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
72
73 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
74 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
75
76 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
77 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
78
79 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
80 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
81
82 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
83 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
84
85 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
86 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
87
88 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
89 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
90
91 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
92 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
93
94 setOperationAction(ISD::LOAD, MVT::i64, Promote);
95 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
96
97 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
98 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
99
100 setOperationAction(ISD::LOAD, MVT::f64, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
102
103 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
105
106 // There are no 64-bit extloads. These should be done as a 32-bit extload and
107 // an extension to 64-bit.
108 for (MVT VT : MVT::integer_valuetypes()) {
109 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
110 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
111 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
112 }
113
114 for (MVT VT : MVT::integer_valuetypes()) {
115 if (VT == MVT::i64)
116 continue;
117
118 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
119 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
120 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
121 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
122
123 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
124 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
125 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
126 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
127
128 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
129 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
130 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
131 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
132 }
133
134 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
135 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
137 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
138 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
139 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
141 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
143 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
145 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
146 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
148 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
149 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
150 }
151
152 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
153 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
154 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
155 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
156 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
157 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
158 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
159
160 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
161 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
162 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
163 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
164
165 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
166 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
167 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
168 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
169
170 setOperationAction(ISD::STORE, MVT::f32, Promote);
171 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
172
173 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
174 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
175
176 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
177 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
178
179 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
180 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
181
182 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
183 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
184
185 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
186 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
187
188 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
189 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
190
191 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
192 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
193
194 setOperationAction(ISD::STORE, MVT::i64, Promote);
195 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
196
197 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
198 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
199
200 setOperationAction(ISD::STORE, MVT::f64, Promote);
201 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
202
203 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
204 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
205
206 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
207 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
208 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210
211 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
212 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
213 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
214 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
215
216 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
217 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
218 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
219 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
220 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
221 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
222 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
223
224 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
225 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
226
227 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
228 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
229
230 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
231 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
232
233 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
234 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
235
236
237 setOperationAction(ISD::Constant, MVT::i32, Legal);
238 setOperationAction(ISD::Constant, MVT::i64, Legal);
239 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
240 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
241
242 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
243 setOperationAction(ISD::BRIND, MVT::Other, Expand);
244
245 // This is totally unsupported, just custom lower to produce an error.
246 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
247
248 // Library functions. These default to Expand, but we have instructions
249 // for them.
250 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
251 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
252 setOperationAction(ISD::FPOW, MVT::f32, Legal);
253 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
254 setOperationAction(ISD::FABS, MVT::f32, Legal);
255 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
256 setOperationAction(ISD::FRINT, MVT::f32, Legal);
257 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
258 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
259 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
260
261 setOperationAction(ISD::FROUND, MVT::f32, Custom);
262 setOperationAction(ISD::FROUND, MVT::f64, Custom);
263
264 setOperationAction(ISD::FLOG, MVT::f32, Custom);
265 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
266 setOperationAction(ISD::FEXP, MVT::f32, Custom);
267
268
269 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
270 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
271
272 setOperationAction(ISD::FREM, MVT::f32, Custom);
273 setOperationAction(ISD::FREM, MVT::f64, Custom);
274
275 // Expand to fneg + fadd.
276 setOperationAction(ISD::FSUB, MVT::f64, Expand);
277
278 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
279 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
280 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
281 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
282 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
283 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
284 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
285 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
286 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
287 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
288 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
289 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
290 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
291 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
292 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
293 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
294 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
295 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
296 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
297 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
298 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
299 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
300
301 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
302 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
303 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
304
305 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
306 for (MVT VT : ScalarIntVTs) {
307 // These should use [SU]DIVREM, so set them to expand
308 setOperationAction(ISD::SDIV, VT, Expand);
309 setOperationAction(ISD::UDIV, VT, Expand);
310 setOperationAction(ISD::SREM, VT, Expand);
311 setOperationAction(ISD::UREM, VT, Expand);
312
313 // GPU does not have divrem function for signed or unsigned.
314 setOperationAction(ISD::SDIVREM, VT, Custom);
315 setOperationAction(ISD::UDIVREM, VT, Custom);
316
317 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
318 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
319 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
320
321 setOperationAction(ISD::BSWAP, VT, Expand);
322 setOperationAction(ISD::CTTZ, VT, Expand);
323 setOperationAction(ISD::CTLZ, VT, Expand);
324
325 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
326 setOperationAction(ISD::ADDC, VT, Legal);
327 setOperationAction(ISD::SUBC, VT, Legal);
328 setOperationAction(ISD::ADDE, VT, Legal);
329 setOperationAction(ISD::SUBE, VT, Legal);
330 }
331
332 // The hardware supports 32-bit ROTR, but not ROTL.
333 setOperationAction(ISD::ROTL, MVT::i32, Expand);
334 setOperationAction(ISD::ROTL, MVT::i64, Expand);
335 setOperationAction(ISD::ROTR, MVT::i64, Expand);
336
337 setOperationAction(ISD::MUL, MVT::i64, Expand);
338 setOperationAction(ISD::MULHU, MVT::i64, Expand);
339 setOperationAction(ISD::MULHS, MVT::i64, Expand);
340 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
341 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
342 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
343 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
344 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
345
346 setOperationAction(ISD::SMIN, MVT::i32, Legal);
347 setOperationAction(ISD::UMIN, MVT::i32, Legal);
348 setOperationAction(ISD::SMAX, MVT::i32, Legal);
349 setOperationAction(ISD::UMAX, MVT::i32, Legal);
350
351 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
352 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
353 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
354 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
355
356 static const MVT::SimpleValueType VectorIntTypes[] = {
357 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
358 };
359
360 for (MVT VT : VectorIntTypes) {
361 // Expand the following operations for the current type by default.
362 setOperationAction(ISD::ADD, VT, Expand);
363 setOperationAction(ISD::AND, VT, Expand);
364 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
365 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
366 setOperationAction(ISD::MUL, VT, Expand);
367 setOperationAction(ISD::MULHU, VT, Expand);
368 setOperationAction(ISD::MULHS, VT, Expand);
369 setOperationAction(ISD::OR, VT, Expand);
370 setOperationAction(ISD::SHL, VT, Expand);
371 setOperationAction(ISD::SRA, VT, Expand);
372 setOperationAction(ISD::SRL, VT, Expand);
373 setOperationAction(ISD::ROTL, VT, Expand);
374 setOperationAction(ISD::ROTR, VT, Expand);
375 setOperationAction(ISD::SUB, VT, Expand);
376 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
377 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
378 setOperationAction(ISD::SDIV, VT, Expand);
379 setOperationAction(ISD::UDIV, VT, Expand);
380 setOperationAction(ISD::SREM, VT, Expand);
381 setOperationAction(ISD::UREM, VT, Expand);
382 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
383 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
384 setOperationAction(ISD::SDIVREM, VT, Custom);
385 setOperationAction(ISD::UDIVREM, VT, Expand);
386 setOperationAction(ISD::SELECT, VT, Expand);
387 setOperationAction(ISD::VSELECT, VT, Expand);
388 setOperationAction(ISD::SELECT_CC, VT, Expand);
389 setOperationAction(ISD::XOR, VT, Expand);
390 setOperationAction(ISD::BSWAP, VT, Expand);
391 setOperationAction(ISD::CTPOP, VT, Expand);
392 setOperationAction(ISD::CTTZ, VT, Expand);
393 setOperationAction(ISD::CTLZ, VT, Expand);
394 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
395 setOperationAction(ISD::SETCC, VT, Expand);
396 }
397
398 static const MVT::SimpleValueType FloatVectorTypes[] = {
399 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
400 };
401
402 for (MVT VT : FloatVectorTypes) {
403 setOperationAction(ISD::FABS, VT, Expand);
404 setOperationAction(ISD::FMINNUM, VT, Expand);
405 setOperationAction(ISD::FMAXNUM, VT, Expand);
406 setOperationAction(ISD::FADD, VT, Expand);
407 setOperationAction(ISD::FCEIL, VT, Expand);
408 setOperationAction(ISD::FCOS, VT, Expand);
409 setOperationAction(ISD::FDIV, VT, Expand);
410 setOperationAction(ISD::FEXP2, VT, Expand);
411 setOperationAction(ISD::FEXP, VT, Expand);
412 setOperationAction(ISD::FLOG2, VT, Expand);
413 setOperationAction(ISD::FREM, VT, Expand);
414 setOperationAction(ISD::FLOG, VT, Expand);
415 setOperationAction(ISD::FLOG10, VT, Expand);
416 setOperationAction(ISD::FPOW, VT, Expand);
417 setOperationAction(ISD::FFLOOR, VT, Expand);
418 setOperationAction(ISD::FTRUNC, VT, Expand);
419 setOperationAction(ISD::FMUL, VT, Expand);
420 setOperationAction(ISD::FMA, VT, Expand);
421 setOperationAction(ISD::FRINT, VT, Expand);
422 setOperationAction(ISD::FNEARBYINT, VT, Expand);
423 setOperationAction(ISD::FSQRT, VT, Expand);
424 setOperationAction(ISD::FSIN, VT, Expand);
425 setOperationAction(ISD::FSUB, VT, Expand);
426 setOperationAction(ISD::FNEG, VT, Expand);
427 setOperationAction(ISD::VSELECT, VT, Expand);
428 setOperationAction(ISD::SELECT_CC, VT, Expand);
429 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
430 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
431 setOperationAction(ISD::SETCC, VT, Expand);
432 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
433 }
434
435 // This causes using an unrolled select operation rather than expansion with
436 // bit operations. This is in general better, but the alternative using BFI
437 // instructions may be better if the select sources are SGPRs.
438 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
439 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
440
441 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
442 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
443
444 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
445 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
446
447 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
448 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
449
450 // There are no libcalls of any kind.
451 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
452 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
453
454 setSchedulingPreference(Sched::RegPressure);
455 setJumpIsExpensive(true);
456
457 // FIXME: This is only partially true. If we have to do vector compares, any
458 // SGPR pair can be a condition register. If we have a uniform condition, we
459 // are better off doing SALU operations, where there is only one SCC. For now,
460 // we don't have a way of knowing during instruction selection if a condition
461 // will be uniform and we always use vector compares. Assume we are using
462 // vector compares until that is fixed.
463 setHasMultipleConditionRegisters(true);
464
465 setMinCmpXchgSizeInBits(32);
466 setSupportsUnalignedAtomics(false);
467
468 PredictableSelectIsExpensive = false;
469
470 // We want to find all load dependencies for long chains of stores to enable
471 // merging into very wide vectors. The problem is with vectors with > 4
472 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
473 // vectors are a legal type, even though we have to split the loads
474 // usually. When we can more precisely specify load legality per address
475 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
476 // smarter so that they can figure out what to do in 2 iterations without all
477 // N > 4 stores on the same chain.
478 GatherAllAliasesMaxDepth = 16;
479
480 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
481 // about these during lowering.
482 MaxStoresPerMemcpy = 0xffffffff;
483 MaxStoresPerMemmove = 0xffffffff;
484 MaxStoresPerMemset = 0xffffffff;
485
486 setTargetDAGCombine(ISD::BITCAST);
487 setTargetDAGCombine(ISD::SHL);
488 setTargetDAGCombine(ISD::SRA);
489 setTargetDAGCombine(ISD::SRL);
490 setTargetDAGCombine(ISD::TRUNCATE);
491 setTargetDAGCombine(ISD::MUL);
492 setTargetDAGCombine(ISD::MULHU);
493 setTargetDAGCombine(ISD::MULHS);
494 setTargetDAGCombine(ISD::SELECT);
495 setTargetDAGCombine(ISD::SELECT_CC);
496 setTargetDAGCombine(ISD::STORE);
497 setTargetDAGCombine(ISD::FADD);
498 setTargetDAGCombine(ISD::FSUB);
499 setTargetDAGCombine(ISD::FNEG);
500 setTargetDAGCombine(ISD::FABS);
501 setTargetDAGCombine(ISD::AssertZext);
502 setTargetDAGCombine(ISD::AssertSext);
503 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
504}
505
506//===----------------------------------------------------------------------===//
507// Target Information
508//===----------------------------------------------------------------------===//
509
510 LLVM_READNONE
511static bool fnegFoldsIntoOp(unsigned Opc) {
512 switch (Opc) {
513 case ISD::FADD:
514 case ISD::FSUB:
515 case ISD::FMUL:
516 case ISD::FMA:
517 case ISD::FMAD:
518 case ISD::FMINNUM:
519 case ISD::FMAXNUM:
520 case ISD::FMINNUM_IEEE:
521 case ISD::FMAXNUM_IEEE:
522 case ISD::FSIN:
523 case ISD::FTRUNC:
524 case ISD::FRINT:
525 case ISD::FNEARBYINT:
526 case ISD::FCANONICALIZE:
527 case AMDGPUISD::RCP:
528 case AMDGPUISD::RCP_LEGACY:
529 case AMDGPUISD::RCP_IFLAG:
530 case AMDGPUISD::SIN_HW:
531 case AMDGPUISD::FMUL_LEGACY:
532 case AMDGPUISD::FMIN_LEGACY:
533 case AMDGPUISD::FMAX_LEGACY:
534 case AMDGPUISD::FMED3:
535 return true;
536 default:
537 return false;
538 }
539}
540
541/// \p returns true if the operation will definitely need to use a 64-bit
542/// encoding, and thus will use a VOP3 encoding regardless of the source
543/// modifiers.
544 LLVM_READONLY
545static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
546 return N->getNumOperands() > 2 || VT == MVT::f64;
547}
548
549// Most FP instructions support source modifiers, but this could be refined
550// slightly.
551 LLVM_READONLY
552static bool hasSourceMods(const SDNode *N) {
553 if (isa<MemSDNode>(N))
554 return false;
555
556 switch (N->getOpcode()) {
557 case ISD::CopyToReg:
558 case ISD::SELECT:
559 case ISD::FDIV:
560 case ISD::FREM:
561 case ISD::INLINEASM:
562 case ISD::INLINEASM_BR:
563 case AMDGPUISD::DIV_SCALE:
564 case ISD::INTRINSIC_W_CHAIN:
565
566 // TODO: Should really be looking at the users of the bitcast. These are
567 // problematic because bitcasts are used to legalize all stores to integer
568 // types.
569 case ISD::BITCAST:
570 return false;
571 case ISD::INTRINSIC_WO_CHAIN: {
572 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
573 case Intrinsic::amdgcn_interp_p1:
574 case Intrinsic::amdgcn_interp_p2:
575 case Intrinsic::amdgcn_interp_mov:
576 case Intrinsic::amdgcn_interp_p1_f16:
577 case Intrinsic::amdgcn_interp_p2_f16:
578 return false;
579 default:
580 return true;
581 }
582 }
583 default:
584 return true;
585 }
586}
587
588bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
589 unsigned CostThreshold) {
590 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
591 // it is truly free to use a source modifier in all cases. If there are
592 // multiple users but for each one will necessitate using VOP3, there will be
593 // a code size increase. Try to avoid increasing code size unless we know it
594 // will save on the instruction count.
595 unsigned NumMayIncreaseSize = 0;
596 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
597
598 // XXX - Should this limit number of uses to check?
599 for (const SDNode *U : N->uses()) {
600 if (!hasSourceMods(U))
601 return false;
602
603 if (!opMustUseVOP3Encoding(U, VT)) {
604 if (++NumMayIncreaseSize > CostThreshold)
605 return false;
606 }
607 }
608
609 return true;
610}
611
612MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
613 return MVT::i32;
614}
615
616bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
617 return true;
618}
619
620// The backend supports 32 and 64 bit floating point immediates.
621// FIXME: Why are we reporting vectors of FP immediates as legal?
622bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
623 bool ForCodeSize) const {
624 EVT ScalarVT = VT.getScalarType();
625 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
626 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
627}
628
629// We don't want to shrink f64 / f32 constants.
630bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
631 EVT ScalarVT = VT.getScalarType();
632 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
633}
634
635bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
636 ISD::LoadExtType ExtTy,
637 EVT NewVT) const {
638 // TODO: This may be worth removing. Check regression tests for diffs.
639 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
640 return false;
641
642 unsigned NewSize = NewVT.getStoreSizeInBits();
643
644 // If we are reducing to a 32-bit load, this is always better.
645 if (NewSize == 32)
646 return true;
647
648 EVT OldVT = N->getValueType(0);
649 unsigned OldSize = OldVT.getStoreSizeInBits();
650
651 MemSDNode *MN = cast<MemSDNode>(N);
652 unsigned AS = MN->getAddressSpace();
653 // Do not shrink an aligned scalar load to sub-dword.
654 // Scalar engine cannot do sub-dword loads.
655 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
656 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
657 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
658 (isa<LoadSDNode>(N) &&
659 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
660 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
661 return false;
662
663 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
664 // extloads, so doing one requires using a buffer_load. In cases where we
665 // still couldn't use a scalar load, using the wider load shouldn't really
666 // hurt anything.
667
668 // If the old size already had to be an extload, there's no harm in continuing
669 // to reduce the width.
670 return (OldSize < 32);
671}
672
673bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
674 const SelectionDAG &DAG,
675 const MachineMemOperand &MMO) const {
676
677 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
678
679 if (LoadTy.getScalarType() == MVT::i32)
680 return false;
681
682 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
683 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
684
685 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
686 return false;
687
688 bool Fast = false;
689 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
690 CastTy, MMO, &Fast) &&
691 Fast;
692}
693
694// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
695// profitable with the expansion for 64-bit since it's generally good to
696// speculate things.
697// FIXME: These should really have the size as a parameter.
698bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
699 return true;
700}
701
702bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
703 return true;
704}
705
706bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
707 switch (N->getOpcode()) {
708 default:
709 return false;
710 case ISD::EntryToken:
711 case ISD::TokenFactor:
712 return true;
713 case ISD::INTRINSIC_WO_CHAIN:
714 {
715 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
716 switch (IntrID) {
717 default:
718 return false;
719 case Intrinsic::amdgcn_readfirstlane:
720 case Intrinsic::amdgcn_readlane:
721 return true;
722 }
723 }
724 break;
725 case ISD::LOAD:
726 {
727 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
728 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
729 return true;
730 return false;
731 }
732 break;
733 }
734}
735
736//===---------------------------------------------------------------------===//
737// Target Properties
738//===---------------------------------------------------------------------===//
739
740bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
741 assert(VT.isFloatingPoint());
742
743 // Packed operations do not have a fabs modifier.
744 return VT == MVT::f32 || VT == MVT::f64 ||
745 (Subtarget->has16BitInsts() && VT == MVT::f16);
746}
747
748bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
749 assert(VT.isFloatingPoint());
750 return VT == MVT::f32 || VT == MVT::f64 ||
751 (Subtarget->has16BitInsts() && VT == MVT::f16) ||
752 (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
753}
754
755bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
756 unsigned NumElem,
757 unsigned AS) const {
758 return true;
759}
760
761bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
762 // There are few operations which truly have vector input operands. Any vector
763 // operation is going to involve operations on each component, and a
764 // build_vector will be a copy per element, so it always makes sense to use a
765 // build_vector input in place of the extracted element to avoid a copy into a
766 // super register.
767 //
768 // We should probably only do this if all users are extracts only, but this
769 // should be the common case.
770 return true;
771}
772
773bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
774 // Truncate is just accessing a subregister.
775
776 unsigned SrcSize = Source.getSizeInBits();
777 unsigned DestSize = Dest.getSizeInBits();
778
779 return DestSize < SrcSize && DestSize % 32 == 0 ;
780}
781
782bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
783 // Truncate is just accessing a subregister.
784
785 unsigned SrcSize = Source->getScalarSizeInBits();
786 unsigned DestSize = Dest->getScalarSizeInBits();
787
788 if (DestSize== 16 && Subtarget->has16BitInsts())
789 return SrcSize >= 32;
790
791 return DestSize < SrcSize && DestSize % 32 == 0;
792}
793
794bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
795 unsigned SrcSize = Src->getScalarSizeInBits();
796 unsigned DestSize = Dest->getScalarSizeInBits();
797
798 if (SrcSize == 16 && Subtarget->has16BitInsts())
799 return DestSize >= 32;
800
801 return SrcSize == 32 && DestSize == 64;
802}
803
804bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
805 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
806 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
807 // this will enable reducing 64-bit operations to 32-bit, which is always
808 // good.
809
810 if (Src == MVT::i16)
811 return Dest == MVT::i32 ||Dest == MVT::i64 ;
812
813 return Src == MVT::i32 && Dest == MVT::i64;
814}
815
816bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
817 return isZExtFree(Val.getValueType(), VT2);
818}
819
820bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
821 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
822 // limited number of native 64-bit operations. Shrinking an operation to fit
823 // in a single 32-bit register should always be helpful. As currently used,
824 // this is much less general than the name suggests, and is only used in
825 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
826 // not profitable, and may actually be harmful.
827 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
828}
829
830//===---------------------------------------------------------------------===//
831// TargetLowering Callbacks
832//===---------------------------------------------------------------------===//
833
834CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
835 bool IsVarArg) {
836 switch (CC) {
837 case CallingConv::AMDGPU_VS:
838 case CallingConv::AMDGPU_GS:
839 case CallingConv::AMDGPU_PS:
840 case CallingConv::AMDGPU_CS:
841 case CallingConv::AMDGPU_HS:
842 case CallingConv::AMDGPU_ES:
843 case CallingConv::AMDGPU_LS:
844 return CC_AMDGPU;
845 case CallingConv::C:
846 case CallingConv::Fast:
847 case CallingConv::Cold:
848 return CC_AMDGPU_Func;
849 case CallingConv::AMDGPU_KERNEL:
850 case CallingConv::SPIR_KERNEL:
851 default:
852 report_fatal_error("Unsupported calling convention for call");
853 }
854}
855
856CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
857 bool IsVarArg) {
858 switch (CC) {
859 case CallingConv::AMDGPU_KERNEL:
860 case CallingConv::SPIR_KERNEL:
861 llvm_unreachable("kernels should not be handled here");
862 case CallingConv::AMDGPU_VS:
863 case CallingConv::AMDGPU_GS:
864 case CallingConv::AMDGPU_PS:
865 case CallingConv::AMDGPU_CS:
866 case CallingConv::AMDGPU_HS:
867 case CallingConv::AMDGPU_ES:
868 case CallingConv::AMDGPU_LS:
869 return RetCC_SI_Shader;
870 case CallingConv::C:
871 case CallingConv::Fast:
872 case CallingConv::Cold:
873 return RetCC_AMDGPU_Func;
874 default:
875 report_fatal_error("Unsupported calling convention.");
876 }
877}
878
879/// The SelectionDAGBuilder will automatically promote function arguments
880/// with illegal types. However, this does not work for the AMDGPU targets
881/// since the function arguments are stored in memory as these illegal types.
882/// In order to handle this properly we need to get the original types sizes
883/// from the LLVM IR Function and fixup the ISD:InputArg values before
884/// passing them to AnalyzeFormalArguments()
885
886/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
887/// input values across multiple registers. Each item in the Ins array
888/// represents a single value that will be stored in registers. Ins[x].VT is
889/// the value type of the value that will be stored in the register, so
890/// whatever SDNode we lower the argument to needs to be this type.
891///
892/// In order to correctly lower the arguments we need to know the size of each
893/// argument. Since Ins[x].VT gives us the size of the register that will
894/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
895/// for the original function argument so that we can deduce the correct memory
896/// type to use for Ins[x]. In most cases the correct memory type will be
897/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
898/// we have a kernel argument of type v8i8, this argument will be split into
899/// 8 parts and each part will be represented by its own item in the Ins array.
900/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
901/// the argument before it was split. From this, we deduce that the memory type
902/// for each individual part is i8. We pass the memory type as LocVT to the
903/// calling convention analysis function and the register type (Ins[x].VT) as
904/// the ValVT.
905void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
906 CCState &State,
907 const SmallVectorImpl<ISD::InputArg> &Ins) const {
908 const MachineFunction &MF = State.getMachineFunction();
909 const Function &Fn = MF.getFunction();
910 LLVMContext &Ctx = Fn.getParent()->getContext();
911 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
912 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
913 CallingConv::ID CC = Fn.getCallingConv();
914
915 unsigned MaxAlign = 1;
916 uint64_t ExplicitArgOffset = 0;
917 const DataLayout &DL = Fn.getParent()->getDataLayout();
918
919 unsigned InIndex = 0;
920
921 for (const Argument &Arg : Fn.args()) {
922 Type *BaseArgTy = Arg.getType();
923 unsigned Align = DL.getABITypeAlignment(BaseArgTy);
924 MaxAlign = std::max(Align, MaxAlign);
925 unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
926
927 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
928 ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
929
930 // We're basically throwing away everything passed into us and starting over
931 // to get accurate in-memory offsets. The "PartOffset" is completely useless
932 // to us as computed in Ins.
933 //
934 // We also need to figure out what type legalization is trying to do to get
935 // the correct memory offsets.
936
937 SmallVector<EVT, 16> ValueVTs;
938 SmallVector<uint64_t, 16> Offsets;
939 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
940
941 for (unsigned Value = 0, NumValues = ValueVTs.size();
942 Value != NumValues; ++Value) {
943 uint64_t BasePartOffset = Offsets[Value];
944
945 EVT ArgVT = ValueVTs[Value];
946 EVT MemVT = ArgVT;
947 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
948 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
949
950 if (NumRegs == 1) {
951 // This argument is not split, so the IR type is the memory type.
952 if (ArgVT.isExtended()) {
953 // We have an extended type, like i24, so we should just use the
954 // register type.
955 MemVT = RegisterVT;
956 } else {
957 MemVT = ArgVT;
958 }
959 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
960 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
961 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
962 // We have a vector value which has been split into a vector with
963 // the same scalar type, but fewer elements. This should handle
964 // all the floating-point vector types.
965 MemVT = RegisterVT;
966 } else if (ArgVT.isVector() &&
967 ArgVT.getVectorNumElements() == NumRegs) {
968 // This arg has been split so that each element is stored in a separate
969 // register.
970 MemVT = ArgVT.getScalarType();
971 } else if (ArgVT.isExtended()) {
972 // We have an extended type, like i65.
973 MemVT = RegisterVT;
974 } else {
975 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
976 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
977 if (RegisterVT.isInteger()) {
978 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
979 } else if (RegisterVT.isVector()) {
980 assert(!RegisterVT.getScalarType().isFloatingPoint());
981 unsigned NumElements = RegisterVT.getVectorNumElements();
982 assert(MemoryBits % NumElements == 0);
983 // This vector type has been split into another vector type with
984 // a different elements size.
985 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
986 MemoryBits / NumElements);
987 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
988 } else {
989 llvm_unreachable("cannot deduce memory type.");
990 }
991 }
992
993 // Convert one element vectors to scalar.
994 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
995 MemVT = MemVT.getScalarType();
996
997 // Round up vec3/vec5 argument.
998 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
999 assert(MemVT.getVectorNumElements() == 3 ||
1000 MemVT.getVectorNumElements() == 5);
1001 MemVT = MemVT.getPow2VectorType(State.getContext());
1002 }
1003
1004 unsigned PartOffset = 0;
1005 for (unsigned i = 0; i != NumRegs; ++i) {
1006 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1007 BasePartOffset + PartOffset,
1008 MemVT.getSimpleVT(),
1009 CCValAssign::Full));
1010 PartOffset += MemVT.getStoreSize();
1011 }
1012 }
1013 }
1014}
1015
1016SDValue AMDGPUTargetLowering::LowerReturn(
1017 SDValue Chain, CallingConv::ID CallConv,
1018 bool isVarArg,
1019 const SmallVectorImpl<ISD::OutputArg> &Outs,
1020 const SmallVectorImpl<SDValue> &OutVals,
1021 const SDLoc &DL, SelectionDAG &DAG) const {
1022 // FIXME: Fails for r600 tests
1023 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1024 // "wave terminate should not have return values");
1025 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1026}
1027
1028//===---------------------------------------------------------------------===//
1029// Target specific lowering
1030//===---------------------------------------------------------------------===//
1031
1032/// Selects the correct CCAssignFn for a given CallingConvention value.
1033CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1034 bool IsVarArg) {
1035 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1036}
1037
1038CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1039 bool IsVarArg) {
1040 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1041}
1042
1043SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1044 SelectionDAG &DAG,
1045 MachineFrameInfo &MFI,
1046 int ClobberedFI) const {
1047 SmallVector<SDValue, 8> ArgChains;
1048 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1049 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1050
1051 // Include the original chain at the beginning of the list. When this is
1052 // used by target LowerCall hooks, this helps legalize find the
1053 // CALLSEQ_BEGIN node.
1054 ArgChains.push_back(Chain);
1055
1056 // Add a chain value for each stack argument corresponding
1057 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1058 UE = DAG.getEntryNode().getNode()->use_end();
1059 U != UE; ++U) {
1060 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1061 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1062 if (FI->getIndex() < 0) {
1063 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1064 int64_t InLastByte = InFirstByte;
1065 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1066
1067 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1068 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1069 ArgChains.push_back(SDValue(L, 1));
1070 }
1071 }
1072 }
1073 }
1074
1075 // Build a tokenfactor for all the chains.
1076 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1077}
1078
1079SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1080 SmallVectorImpl<SDValue> &InVals,
1081 StringRef Reason) const {
1082 SDValue Callee = CLI.Callee;
1083 SelectionDAG &DAG = CLI.DAG;
1084
1085 const Function &Fn = DAG.getMachineFunction().getFunction();
1086
1087 StringRef FuncName("<unknown>");
1088
1089 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1090 FuncName = G->getSymbol();
1091 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1092 FuncName = G->getGlobal()->getName();
1093
1094 DiagnosticInfoUnsupported NoCalls(
1095 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1096 DAG.getContext()->diagnose(NoCalls);
1097
1098 if (!CLI.IsTailCall) {
1099 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1100 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1101 }
1102
1103 return DAG.getEntryNode();
1104}
1105
1106SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1107 SmallVectorImpl<SDValue> &InVals) const {
1108 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1109}
1110
1111SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1112 SelectionDAG &DAG) const {
1113 const Function &Fn = DAG.getMachineFunction().getFunction();
1114
1115 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1116 SDLoc(Op).getDebugLoc());
1117 DAG.getContext()->diagnose(NoDynamicAlloca);
1118 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1119 return DAG.getMergeValues(Ops, SDLoc());
1120}
1121
1122SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1123 SelectionDAG &DAG) const {
1124 switch (Op.getOpcode()) {
1125 default:
1126 Op->print(errs(), &DAG);
1127 llvm_unreachable("Custom lowering code for this"
1128 "instruction is not implemented yet!");
1129 break;
1130 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1131 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1132 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1133 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1134 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1135 case ISD::FREM: return LowerFREM(Op, DAG);
1136 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1137 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1138 case ISD::FRINT: return LowerFRINT(Op, DAG);
1139 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1140 case ISD::FROUND: return LowerFROUND(Op, DAG);
1141 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1142 case ISD::FLOG:
1143 return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
1144 case ISD::FLOG10:
1145 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1146 case ISD::FEXP:
1147 return lowerFEXP(Op, DAG);
1148 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1149 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1150 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1151 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1152 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1153 case ISD::CTTZ:
1154 case ISD::CTTZ_ZERO_UNDEF:
1155 case ISD::CTLZ:
1156 case ISD::CTLZ_ZERO_UNDEF:
1157 return LowerCTLZ_CTTZ(Op, DAG);
1158 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1159 }
1160 return Op;
1161}
1162
1163void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1164 SmallVectorImpl<SDValue> &Results,
1165 SelectionDAG &DAG) const {
1166 switch (N->getOpcode()) {
1167 case ISD::SIGN_EXTEND_INREG:
1168 // Different parts of legalization seem to interpret which type of
1169 // sign_extend_inreg is the one to check for custom lowering. The extended
1170 // from type is what really matters, but some places check for custom
1171 // lowering of the result type. This results in trying to use
1172 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1173 // nothing here and let the illegal result integer be handled normally.
1174 return;
1175 default:
1176 return;
1177 }
1178}
1179
1180bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1181 const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1182 if (!GVar || !GVar->hasInitializer())
1183 return false;
1184
1185 return !isa<UndefValue>(GVar->getInitializer());
1186}
1187
1188SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1189 SDValue Op,
1190 SelectionDAG &DAG) const {
1191
1192 const DataLayout &DL = DAG.getDataLayout();
1193 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1194 const GlobalValue *GV = G->getGlobal();
1195
1196 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1197 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1198 if (!MFI->isEntryFunction()) {
1199 const Function &Fn = DAG.getMachineFunction().getFunction();
1200 DiagnosticInfoUnsupported BadLDSDecl(
1201 Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
1202 DAG.getContext()->diagnose(BadLDSDecl);
1203 }
1204
1205 // XXX: What does the value of G->getOffset() mean?
1206 assert(G->getOffset() == 0 &&
1207 "Do not know what to do with an non-zero offset");
1208
1209 // TODO: We could emit code to handle the initialization somewhere.
1210 if (!hasDefinedInitializer(GV)) {
1211 unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1212 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1213 }
1214 }
1215
1216 const Function &Fn = DAG.getMachineFunction().getFunction();
1217 DiagnosticInfoUnsupported BadInit(
1218 Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1219 DAG.getContext()->diagnose(BadInit);
1220 return SDValue();
1221}
1222
1223SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1224 SelectionDAG &DAG) const {
1225 SmallVector<SDValue, 8> Args;
1226
1227 EVT VT = Op.getValueType();
1228 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1229 SDLoc SL(Op);
1230 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1231 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1232
1233 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1234 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1235 }
1236
1237 for (const SDUse &U : Op->ops())
1238 DAG.ExtractVectorElements(U.get(), Args);
1239
1240 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1241}
1242
1243SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1244 SelectionDAG &DAG) const {
1245
1246 SmallVector<SDValue, 8> Args;
1247 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1248 EVT VT = Op.getValueType();
1249 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1250 VT.getVectorNumElements());
1251
1252 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1253}
1254
1255/// Generate Min/Max node
1256SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1257 SDValue LHS, SDValue RHS,
1258 SDValue True, SDValue False,
1259 SDValue CC,
1260 DAGCombinerInfo &DCI) const {
1261 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1262 return SDValue();
1263
1264 SelectionDAG &DAG = DCI.DAG;
1265 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1266 switch (CCOpcode) {
1267 case ISD::SETOEQ:
1268 case ISD::SETONE:
1269 case ISD::SETUNE:
1270 case ISD::SETNE:
1271 case ISD::SETUEQ:
1272 case ISD::SETEQ:
1273 case ISD::SETFALSE:
1274 case ISD::SETFALSE2:
1275 case ISD::SETTRUE:
1276 case ISD::SETTRUE2:
1277 case ISD::SETUO:
1278 case ISD::SETO:
1279 break;
1280 case ISD::SETULE:
1281 case ISD::SETULT: {
1282 if (LHS == True)
1283 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1284 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1285 }
1286 case ISD::SETOLE:
1287 case ISD::SETOLT:
1288 case ISD::SETLE:
1289 case ISD::SETLT: {
1290 // Ordered. Assume ordered for undefined.
1291
1292 // Only do this after legalization to avoid interfering with other combines
1293 // which might occur.
1294 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1295 !DCI.isCalledByLegalizer())
1296 return SDValue();
1297
1298 // We need to permute the operands to get the correct NaN behavior. The
1299 // selected operand is the second one based on the failing compare with NaN,
1300 // so permute it based on the compare type the hardware uses.
1301 if (LHS == True)
1302 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1303 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1304 }
1305 case ISD::SETUGE:
1306 case ISD::SETUGT: {
1307 if (LHS == True)
1308 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1309 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1310 }
1311 case ISD::SETGT:
1312 case ISD::SETGE:
1313 case ISD::SETOGE:
1314 case ISD::SETOGT: {
1315 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1316 !DCI.isCalledByLegalizer())
1317 return SDValue();
1318
1319 if (LHS == True)
1320 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1321 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1322 }
1323 case ISD::SETCC_INVALID:
1324 llvm_unreachable("Invalid setcc condcode!")::llvm::llvm_unreachable_internal("Invalid setcc condcode!", "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 1324)
;
1325 }
1326 return SDValue();
1327}
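
// A minimal scalar sketch of the "legacy" min/max semantics the operand
// permutation above is written around (illustrative only, not part of this
// file; the helper names are hypothetical). The compare-and-pick form fails
// its compare whenever an input is NaN, so the second operand is what
// survives, and swapping the operands chooses which input a NaN falls
// through to.
static float fminLegacySketch(float A, float B) { return A < B ? A : B; }
static float fmaxLegacySketch(float A, float B) { return A > B ? A : B; }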
1328
1329std::pair<SDValue, SDValue>
1330AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1331 SDLoc SL(Op);
1332
1333 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1334
1335 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1336 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1337
1338 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1339 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1340
1341 return std::make_pair(Lo, Hi);
1342}
1343
1344SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1345 SDLoc SL(Op);
1346
1347 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1348 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1349 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1350}
1351
1352SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1353 SDLoc SL(Op);
1354
1355 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1356 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1357 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1358}
1359
1360// Split a vector type into two parts. The first part is a power of two vector.
1361// The second part is whatever is left over, and is a scalar if it would
1362// otherwise be a 1-vector.
1363std::pair<EVT, EVT>
1364AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1365 EVT LoVT, HiVT;
1366 EVT EltVT = VT.getVectorElementType();
1367 unsigned NumElts = VT.getVectorNumElements();
1368 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1369 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1370 HiVT = NumElts - LoNumElts == 1
1371 ? EltVT
1372 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1373 return std::make_pair(LoVT, HiVT);
1374}
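
// An illustrative sketch of the split rule above on plain element counts
// (the helper name is hypothetical, not part of this file). Half of the
// count is rounded up to the next power of two, so a 3-element vector splits
// into 2 + scalar, a 6-element vector into 4 + 2, and a 7-element vector
// into 4 + 3.
static unsigned splitLoNumEltsSketch(unsigned NumElts) {
  unsigned Half = (NumElts + 1) / 2;
  unsigned Lo = 1;
  while (Lo < Half) // round up to the next power of two
    Lo <<= 1;
  return Lo;        // the Hi part gets the remaining NumElts - Lo elements
}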
1375
1376// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1377// scalar.
1378std::pair<SDValue, SDValue>
1379AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1380 const EVT &LoVT, const EVT &HiVT,
1381 SelectionDAG &DAG) const {
1382  assert(LoVT.getVectorNumElements() +
1383             (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1384             N.getValueType().getVectorNumElements() &&
1385         "More vector elements requested than available!");
1386 auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1387 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1388 DAG.getConstant(0, DL, IdxTy));
1389 SDValue Hi = DAG.getNode(
1390 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1391 HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
1392 return std::make_pair(Lo, Hi);
1393}
1394
1395SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1396 SelectionDAG &DAG) const {
1397 LoadSDNode *Load = cast<LoadSDNode>(Op);
1398 EVT VT = Op.getValueType();
1399 SDLoc SL(Op);
1400
1401
1402 // If this is a 2 element vector, we really want to scalarize and not create
1403 // weird 1 element vectors.
1404 if (VT.getVectorNumElements() == 2) {
1405 SDValue Ops[2];
1406 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1407 return DAG.getMergeValues(Ops, SL);
1408 }
1409
1410 SDValue BasePtr = Load->getBasePtr();
1411 EVT MemVT = Load->getMemoryVT();
1412
1413 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1414
1415 EVT LoVT, HiVT;
1416 EVT LoMemVT, HiMemVT;
1417 SDValue Lo, Hi;
1418
1419 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1420 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1421 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1422
1423 unsigned Size = LoMemVT.getStoreSize();
1424 unsigned BaseAlign = Load->getAlignment();
1425 unsigned HiAlign = MinAlign(BaseAlign, Size);
1426
1427 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1428 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1429 BaseAlign, Load->getMemOperand()->getFlags());
1430 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
1431 SDValue HiLoad =
1432 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1433 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1434 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1435
1436 auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
1437 SDValue Join;
1438 if (LoVT == HiVT) {
1439 // This is the case that the vector is power of two so was evenly split.
1440 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1441 } else {
1442 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1443 DAG.getConstant(0, SL, IdxTy));
1444 Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
1445 : ISD::INSERT_VECTOR_ELT,
1446 SL, VT, Join, HiLoad,
1447 DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
1448 }
1449
1450 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1451 LoLoad.getValue(1), HiLoad.getValue(1))};
1452
1453 return DAG.getMergeValues(Ops, SL);
1454}
1455
1456// Widen a vector load from vec3 to vec4.
1457SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
1458 SelectionDAG &DAG) const {
1459 LoadSDNode *Load = cast<LoadSDNode>(Op);
1460 EVT VT = Op.getValueType();
1461  assert(VT.getVectorNumElements() == 3);
1462 SDValue BasePtr = Load->getBasePtr();
1463 EVT MemVT = Load->getMemoryVT();
1464 SDLoc SL(Op);
1465 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1466 unsigned BaseAlign = Load->getAlignment();
1467
1468 EVT WideVT =
1469 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1470 EVT WideMemVT =
1471 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1472 SDValue WideLoad = DAG.getExtLoad(
1473 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1474 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1475 return DAG.getMergeValues(
1476 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1477 DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
1478 WideLoad.getValue(1)},
1479 SL);
1480}
1481
1482SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1483 SelectionDAG &DAG) const {
1484 StoreSDNode *Store = cast<StoreSDNode>(Op);
1485 SDValue Val = Store->getValue();
1486 EVT VT = Val.getValueType();
1487
1488 // If this is a 2 element vector, we really want to scalarize and not create
1489 // weird 1 element vectors.
1490 if (VT.getVectorNumElements() == 2)
1491 return scalarizeVectorStore(Store, DAG);
1492
1493 EVT MemVT = Store->getMemoryVT();
1494 SDValue Chain = Store->getChain();
1495 SDValue BasePtr = Store->getBasePtr();
1496 SDLoc SL(Op);
1497
1498 EVT LoVT, HiVT;
1499 EVT LoMemVT, HiMemVT;
1500 SDValue Lo, Hi;
1501
1502 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1503 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1504 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1505
1506 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1507
1508 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1509 unsigned BaseAlign = Store->getAlignment();
1510 unsigned Size = LoMemVT.getStoreSize();
1511 unsigned HiAlign = MinAlign(BaseAlign, Size);
1512
1513 SDValue LoStore =
1514 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1515 Store->getMemOperand()->getFlags());
1516 SDValue HiStore =
1517 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1518 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1519
1520 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1521}
1522
1523// This is a shortcut for integer division because we have fast i32<->f32
1524// conversions, and fast f32 reciprocal instructions. The fractional part of a
1525// float is enough to accurately represent up to a 24-bit signed integer.
1526SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1527 bool Sign) const {
1528 SDLoc DL(Op);
1529 EVT VT = Op.getValueType();
1530 SDValue LHS = Op.getOperand(0);
1531 SDValue RHS = Op.getOperand(1);
1532 MVT IntVT = MVT::i32;
1533 MVT FltVT = MVT::f32;
1534
1535 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1536 if (LHSSignBits < 9)
1537 return SDValue();
1538
1539 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1540 if (RHSSignBits < 9)
1541 return SDValue();
1542
1543 unsigned BitSize = VT.getSizeInBits();
1544 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1545 unsigned DivBits = BitSize - SignBits;
1546 if (Sign)
1547 ++DivBits;
1548
1549 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1550 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1551
1552 SDValue jq = DAG.getConstant(1, DL, IntVT);
1553
1554 if (Sign) {
1555 // char|short jq = ia ^ ib;
1556 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1557
1558 // jq = jq >> (bitsize - 2)
1559 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1560 DAG.getConstant(BitSize - 2, DL, VT));
1561
1562 // jq = jq | 0x1
1563 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1564 }
1565
1566 // int ia = (int)LHS;
1567 SDValue ia = LHS;
1568
1570  // int ib = (int)RHS;
1570 SDValue ib = RHS;
1571
1572 // float fa = (float)ia;
1573 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1574
1575 // float fb = (float)ib;
1576 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1577
1578 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1579 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1580
1581 // fq = trunc(fq);
1582 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1583
1584 // float fqneg = -fq;
1585 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1586
1587 MachineFunction &MF = DAG.getMachineFunction();
1588 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1589
1590 // float fr = mad(fqneg, fb, fa);
1591 unsigned OpCode = MFI->getMode().FP32Denormals ?
1592 (unsigned)AMDGPUISD::FMAD_FTZ :
1593 (unsigned)ISD::FMAD;
1594 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1595
1596 // int iq = (int)fq;
1597 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1598
1599 // fr = fabs(fr);
1600 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1601
1602 // fb = fabs(fb);
1603 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1604
1605 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1606
1607 // int cv = fr >= fb;
1608 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1609
1610 // jq = (cv ? jq : 0);
1611 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1612
1613 // dst = iq + jq;
1614 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1615
1616 // Rem needs compensation, it's easier to recompute it
1617 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1618 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1619
1620 // Truncate to number of bits this divide really is.
1621 if (Sign) {
1622 SDValue InRegSize
1623 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1624 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1625 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1626 } else {
1627    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1628 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1629 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1630 }
1631
1632 return DAG.getMergeValues({ Div, Rem }, DL);
1633}
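
// A scalar sketch of the same correction scheme for the unsigned case
// (illustrative only, not part of this file; assumes <cmath> and values that
// fit in 24 bits so the f32 mantissa holds them exactly). 1.0f/B stands in
// for the hardware RCP instruction; the fix-up adds one when the truncated
// guess came out a unit short, and the remainder is recomputed from the
// corrected quotient, as in the lowering above.
static void udivrem24Sketch(unsigned A, unsigned B,
                            unsigned &Div, unsigned &Rem) {
  float FA = static_cast<float>(A);
  float FB = static_cast<float>(B);
  float FQ = truncf(FA * (1.0f / FB)); // first quotient guess
  float FR = fabsf(FA - FQ * FB);      // residue of the guess
  unsigned IQ = static_cast<unsigned>(FQ);
  if (FR >= fabsf(FB))                 // guess was one too small
    ++IQ;
  Div = IQ;
  Rem = A - Div * B;                   // recompute the remainder
}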
1634
1635void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1636 SelectionDAG &DAG,
1637 SmallVectorImpl<SDValue> &Results) const {
1638 SDLoc DL(Op);
1639 EVT VT = Op.getValueType();
1640
1641  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1642
1643 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1644
1645 SDValue One = DAG.getConstant(1, DL, HalfVT);
1646 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1647
1648 //HiLo split
1649 SDValue LHS = Op.getOperand(0);
1650 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1651 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1652
1653 SDValue RHS = Op.getOperand(1);
1654 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1655 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1656
1657 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1658 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1659
1660 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1661 LHS_Lo, RHS_Lo);
1662
1663 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1664 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1665
1666 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1667 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1668 return;
1669 }
1670
1671 if (isTypeLegal(MVT::i64)) {
1672 MachineFunction &MF = DAG.getMachineFunction();
1673 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1674
1675 // Compute denominator reciprocal.
1676 unsigned FMAD = MFI->getMode().FP32Denormals ?
1677 (unsigned)AMDGPUISD::FMAD_FTZ :
1678 (unsigned)ISD::FMAD;
1679
1680 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1681 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1682 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1683 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1684 Cvt_Lo);
1685 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1686 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1687 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1688 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1689 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1690 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1691 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1692 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1693 Mul1);
1694 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1695 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1696 SDValue Rcp64 = DAG.getBitcast(VT,
1697 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1698
1699 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1700 SDValue One64 = DAG.getConstant(1, DL, VT);
1701 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1702 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1703
1704 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1705 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1706 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1707 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1708 Zero);
1709 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1710 One);
1711
1712 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1713 Mulhi1_Lo, Zero1);
1714 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1715 Mulhi1_Hi, Add1_Lo.getValue(1));
1716 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1717 SDValue Add1 = DAG.getBitcast(VT,
1718 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1719
1720 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1721 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1722 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1723 Zero);
1724 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1725 One);
1726
1727 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1728 Mulhi2_Lo, Zero1);
1729 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1730 Mulhi2_Hi, Add1_Lo.getValue(1));
1731 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1732 Zero, Add2_Lo.getValue(1));
1733 SDValue Add2 = DAG.getBitcast(VT,
1734 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1735 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1736
1737 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1738
1739 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1740 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1741 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1742 Mul3_Lo, Zero1);
1743 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1744 Mul3_Hi, Sub1_Lo.getValue(1));
1745 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1746 SDValue Sub1 = DAG.getBitcast(VT,
1747 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1748
1749 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1750 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1751 ISD::SETUGE);
1752 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1753 ISD::SETUGE);
1754 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1755
1756 // TODO: Here and below portions of the code can be enclosed into if/endif.
1757 // Currently control flow is unconditional and we have 4 selects after
1758 // potential endif to substitute PHIs.
1759
1760 // if C3 != 0 ...
1761 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1762 RHS_Lo, Zero1);
1763 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1764 RHS_Hi, Sub1_Lo.getValue(1));
1765 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1766 Zero, Sub2_Lo.getValue(1));
1767 SDValue Sub2 = DAG.getBitcast(VT,
1768 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1769
1770 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1771
1772 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1773 ISD::SETUGE);
1774 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1775 ISD::SETUGE);
1776 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1777
1778 // if (C6 != 0)
1779 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1780
1781 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1782 RHS_Lo, Zero1);
1783 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1784 RHS_Hi, Sub2_Lo.getValue(1));
1785 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1786 Zero, Sub3_Lo.getValue(1));
1787 SDValue Sub3 = DAG.getBitcast(VT,
1788 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1789
1790 // endif C6
1791 // endif C3
1792
1793 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1794 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1795
1796 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1797 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1798
1799 Results.push_back(Div);
1800 Results.push_back(Rem);
1801
1802 return;
1803 }
1804
1805  // r600 expansion.
1806 // Get Speculative values
1807 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1808 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1809
1810 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1811 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1812 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1813
1814 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1815 SDValue DIV_Lo = Zero;
1816
1817 const unsigned halfBitWidth = HalfVT.getSizeInBits();
1818
1819 for (unsigned i = 0; i < halfBitWidth; ++i) {
1820 const unsigned bitPos = halfBitWidth - i - 1;
1821 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1822 // Get value of high bit
1823 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1824 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1825 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1826
1827 // Shift
1828 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1829 // Add LHS high bit
1830 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1831
1832 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1833 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1834
1835 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1836
1837 // Update REM
1838 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1839 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1840 }
1841
1842 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1843 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1844 Results.push_back(DIV);
1845 Results.push_back(REM);
1846}
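
// The r600 fallback loop above is classic restoring long division, producing
// one quotient bit per iteration. A plain 64/64 scalar version of the same
// idea (illustrative only, not part of this file; assumes <cstdint> and a
// non-zero divisor):
static void udivrem64BitwiseSketch(uint64_t N, uint64_t D,
                                   uint64_t &Q, uint64_t &R) {
  Q = 0;
  R = 0;
  for (int I = 63; I >= 0; --I) {
    R = (R << 1) | ((N >> I) & 1); // shift in the next dividend bit
    if (R >= D) {                  // divisor fits: emit a 1 quotient bit
      R -= D;
      Q |= uint64_t(1) << I;
    }
  }
}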
1847
1848SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1849 SelectionDAG &DAG) const {
1850 SDLoc DL(Op);
1851 EVT VT = Op.getValueType();
1852
1853 if (VT == MVT::i64) {
1854 SmallVector<SDValue, 2> Results;
1855 LowerUDIVREM64(Op, DAG, Results);
1856 return DAG.getMergeValues(Results, DL);
1857 }
1858
1859 if (VT == MVT::i32) {
1860 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1861 return Res;
1862 }
1863
1864 SDValue Num = Op.getOperand(0);
1865 SDValue Den = Op.getOperand(1);
1866
1867 // RCP = URECIP(Den) = 2^32 / Den + e
1868 // e is rounding error.
1869 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1870
1871  // RCP_LO = mul(RCP, Den)
1872 SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1873
1874  // RCP_HI = mulhu(RCP, Den)
1875 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1876
1877 // NEG_RCP_LO = -RCP_LO
1878 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1879 RCP_LO);
1880
1881 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1882 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1883 NEG_RCP_LO, RCP_LO,
1884 ISD::SETEQ);
1885 // Calculate the rounding error from the URECIP instruction
1886 // E = mulhu(ABS_RCP_LO, RCP)
1887 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1888
1889 // RCP_A_E = RCP + E
1890 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1891
1892 // RCP_S_E = RCP - E
1893 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1894
1895 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1896 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1897 RCP_A_E, RCP_S_E,
1898 ISD::SETEQ);
1899 // Quotient = mulhu(Tmp0, Num)
1900 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1901
1902 // Num_S_Remainder = Quotient * Den
1903 SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1904
1905 // Remainder = Num - Num_S_Remainder
1906 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1907
1908 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1909 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1910 DAG.getConstant(-1, DL, VT),
1911 DAG.getConstant(0, DL, VT),
1912 ISD::SETUGE);
1913 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1914 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1915 Num_S_Remainder,
1916 DAG.getConstant(-1, DL, VT),
1917 DAG.getConstant(0, DL, VT),
1918 ISD::SETUGE);
1919 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1920 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1921 Remainder_GE_Zero);
1922
1923 // Calculate Division result:
1924
1925 // Quotient_A_One = Quotient + 1
1926 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1927 DAG.getConstant(1, DL, VT));
1928
1929 // Quotient_S_One = Quotient - 1
1930 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1931 DAG.getConstant(1, DL, VT));
1932
1933 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1934 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1935 Quotient, Quotient_A_One, ISD::SETEQ);
1936
1937 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1938 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1939 Quotient_S_One, Div, ISD::SETEQ);
1940
1941 // Calculate Rem result:
1942
1943 // Remainder_S_Den = Remainder - Den
1944 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1945
1946 // Remainder_A_Den = Remainder + Den
1947 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1948
1949 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1950 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1951 Remainder, Remainder_S_Den, ISD::SETEQ);
1952
1953 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1954 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1955 Remainder_A_Den, Rem, ISD::SETEQ);
1956 SDValue Ops[2] = {
1957 Div,
1958 Rem
1959 };
1960 return DAG.getMergeValues(Ops, DL);
1961}
1962
1963SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1964 SelectionDAG &DAG) const {
1965 SDLoc DL(Op);
1966 EVT VT = Op.getValueType();
1967
1968 SDValue LHS = Op.getOperand(0);
1969 SDValue RHS = Op.getOperand(1);
1970
1971 SDValue Zero = DAG.getConstant(0, DL, VT);
1972 SDValue NegOne = DAG.getConstant(-1, DL, VT);
1973
1974 if (VT == MVT::i32) {
1975 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1976 return Res;
1977 }
1978
1979 if (VT == MVT::i64 &&
1980 DAG.ComputeNumSignBits(LHS) > 32 &&
1981 DAG.ComputeNumSignBits(RHS) > 32) {
1982 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1983
1984 //HiLo split
1985 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1986 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1987 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1988 LHS_Lo, RHS_Lo);
1989 SDValue Res[2] = {
1990 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1991 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1992 };
1993 return DAG.getMergeValues(Res, DL);
1994 }
1995
1996 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1997 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1998 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1999 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2000
2001 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2002 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2003
2004 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2005 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2006
2007 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2008 SDValue Rem = Div.getValue(1);
2009
2010 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2011 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2012
2013 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2014 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2015
2016 SDValue Res[2] = {
2017 Div,
2018 Rem
2019 };
2020 return DAG.getMergeValues(Res, DL);
2021}
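
// A scalar sketch of the sign handling used above (illustrative only, not
// part of this file; assumes <cstdint>, an arithmetic right shift on signed
// values, and a non-zero divisor). S = X >> 31 is all-ones for negative X, so
// (X + S) ^ S is the branchless two's-complement absolute value, and the same
// xor/subtract pair re-applies the sign afterwards: the quotient takes the
// xor of both signs, the remainder the sign of the LHS.
static void sdivremViaUdivSketch(int32_t LHS, int32_t RHS,
                                 int32_t &Div, int32_t &Rem) {
  uint32_t LSign = static_cast<uint32_t>(LHS >> 31); // 0 or 0xffffffff
  uint32_t RSign = static_cast<uint32_t>(RHS >> 31);
  uint32_t DSign = LSign ^ RSign;
  uint32_t UL = (static_cast<uint32_t>(LHS) + LSign) ^ LSign; // |LHS|
  uint32_t UR = (static_cast<uint32_t>(RHS) + RSign) ^ RSign; // |RHS|
  Div = static_cast<int32_t>(((UL / UR) ^ DSign) - DSign);
  Rem = static_cast<int32_t>(((UL % UR) ^ LSign) - LSign);
}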
2022
2023// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
2024SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2025 SDLoc SL(Op);
2026 EVT VT = Op.getValueType();
2027 SDValue X = Op.getOperand(0);
2028 SDValue Y = Op.getOperand(1);
2029
2030 // TODO: Should this propagate fast-math-flags?
2031
2032 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
2033 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
2034 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
2035
2036 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
2037}
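
// A scalar reading of the expansion above (illustrative only, not part of
// this file; assumes <cmath>). Because the quotient is truncated toward zero
// the result keeps the sign of x, i.e. fmod-style semantics:
//   fremSketch( 5.5, 2.0) ->  5.5 - trunc( 2.75) * 2.0 =  1.5
//   fremSketch(-5.5, 2.0) -> -5.5 - trunc(-2.75) * 2.0 = -1.5
static double fremSketch(double X, double Y) {
  return X - std::trunc(X / Y) * Y;
}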
2038
2039SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2040 SDLoc SL(Op);
2041 SDValue Src = Op.getOperand(0);
2042
2043 // result = trunc(src)
2044 // if (src > 0.0 && src != result)
2045 // result += 1.0
2046
2047 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2048
2049 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2050 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2051
2052 EVT SetCCVT =
2053 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2054
2055 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2056 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2057 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2058
2059 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2060 // TODO: Should this propagate fast-math-flags?
2061 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2062}
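
// A scalar sketch of the ceil expansion above (illustrative only, not part of
// this file; assumes <cmath>): bump the truncated value by one only when the
// input was positive and not already an integer, e.g. 2.3 -> 3.0 while -2.3
// stays at -2.0.
static double ceilViaTruncSketch(double X) {
  double T = std::trunc(X);
  return (X > 0.0 && X != T) ? T + 1.0 : T;
}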
2063
2064static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2065 SelectionDAG &DAG) {
2066 const unsigned FractBits = 52;
2067 const unsigned ExpBits = 11;
2068
2069 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2070 Hi,
2071 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2072 DAG.getConstant(ExpBits, SL, MVT::i32));
2073 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2074 DAG.getConstant(1023, SL, MVT::i32));
2075
2076 return Exp;
2077}
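
// The same unbiased-exponent extraction on a raw uint64_t bit pattern
// (illustrative only, not part of this file; assumes <cstdint>). The exponent
// field occupies bits [62:52], which is why BFE_U32 above reads ExpBits = 11
// bits from the high half at offset FractBits - 32 = 20; for 1.0
// (0x3ff0000000000000) this gives 1023 - 1023 = 0.
static int f64ExponentSketch(uint64_t Bits) {
  return static_cast<int>((Bits >> 52) & 0x7ff) - 1023;
}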
2078
2079SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2080 SDLoc SL(Op);
2081 SDValue Src = Op.getOperand(0);
2082
2083  assert(Op.getValueType() == MVT::f64);
2084
2085 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2086 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2087
2088 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2089
2090 // Extract the upper half, since this is where we will find the sign and
2091 // exponent.
2092 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2093
2094 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2095
2096 const unsigned FractBits = 52;
2097
2098 // Extract the sign bit.
2099  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2100 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2101
2102 // Extend back to 64-bits.
2103 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2104 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2105
2106 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2107 const SDValue FractMask
2108      = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2109
2110 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2111 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2112 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2113
2114 EVT SetCCVT =
2115 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2116
2117 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2118
2119 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2120 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2121
2122 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2123 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2124
2125 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2126}
2127
2128SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2129 SDLoc SL(Op);
2130 SDValue Src = Op.getOperand(0);
2131
2132  assert(Op.getValueType() == MVT::f64);
2133
2134 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2135 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2136 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2137
2138 // TODO: Should this propagate fast-math-flags?
2139
2140 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2141 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2142
2143 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2144
2145 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2146 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2147
2148 EVT SetCCVT =
2149 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2150 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2151
2152 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2153}
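
// A scalar sketch of the 0x1.0p+52 trick used above (illustrative only, not
// part of this file; assumes <cmath> and round-to-nearest-even). At magnitude
// 2^52 consecutive doubles are exactly 1.0 apart, so adding and subtracting
// copysign(2^52, x) forces the value to be rounded to an integer; inputs at or
// beyond 0x1.fffffffffffffp+51 (written out in decimal below) are already
// integral and are returned unchanged.
static double rintVia2p52Sketch(double X) {
  const double TwoP52 = 4503599627370496.0;          // 0x1.0p+52
  const double C = std::copysign(TwoP52, X);
  double R = (X + C) - C;
  return std::fabs(X) > 4503599627370495.5 ? X : R;  // 0x1.fffffffffffffp+51
}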
2154
2155SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2156 // FNEARBYINT and FRINT are the same, except in their handling of FP
2157 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2158 // rint, so just treat them as equivalent.
2159 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2160}
2161
2162// XXX - May require not supporting f32 denormals?
2163
2164// Don't handle v2f16. The extra instructions to scalarize and repack around the
2165// compare and vselect end up producing worse code than scalarizing the whole
2166// operation.
2167SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
2168 SelectionDAG &DAG) const {
2169 SDLoc SL(Op);
2170 SDValue X = Op.getOperand(0);
2171 EVT VT = Op.getValueType();
2172
2173 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2174
2175 // TODO: Should this propagate fast-math-flags?
2176
2177 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2178
2179 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2180
2181 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2182 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2183 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2184
2185 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2186
2187 EVT SetCCVT =
2188 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2189
2190 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2191
2192 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2193
2194 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2195}
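
// A scalar sketch of the round expansion above (illustrative only, not part
// of this file; assumes <cmath>). Halfway cases move away from zero via
// copysign(1.0, x), so 2.5 -> 3.0 and -2.5 -> -3.0, matching round() rather
// than rint().
static double roundViaTruncSketch(double X) {
  double T = std::trunc(X);
  double Step = std::fabs(X - T) >= 0.5 ? std::copysign(1.0, X) : 0.0;
  return T + Step;
}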
2196
2197SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
2198 SDLoc SL(Op);
2199 SDValue X = Op.getOperand(0);
2200
2201 SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
2202
2203 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2204 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2205 const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
2206 const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
2207 EVT SetCCVT =
2208 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2209
2210 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2211
2212 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
2213
2214 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2215
2216  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
2217 MVT::i64);
2218
2219 SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
2220 SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
2221                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
2222 MVT::i64),
2223 Exp);
2224
2225 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
2226 SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
2227 DAG.getConstant(0, SL, MVT::i64), Tmp0,
2228 ISD::SETNE);
2229
2230 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
2231 D, DAG.getConstant(0, SL, MVT::i64));
2232 SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
2233
2234 K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
2235 K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
2236
2237 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2238 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2239 SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
2240
2241 SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
2242 ExpEqNegOne,
2243 DAG.getConstantFP(1.0, SL, MVT::f64),
2244 DAG.getConstantFP(0.0, SL, MVT::f64));
2245
2246 SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
2247
2248 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
2249 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
2250
2251 return K;
2252}
2253
2254SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2255 EVT VT = Op.getValueType();
2256
2257 if (isOperationLegal(ISD::FTRUNC, VT))
2258 return LowerFROUND_LegalFTRUNC(Op, DAG);
2259
2260 if (VT == MVT::f64)
2261 return LowerFROUND64(Op, DAG);
2262
2263 llvm_unreachable("unhandled type")::llvm::llvm_unreachable_internal("unhandled type", "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2263)
;
2264}
2265
2266SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2267 SDLoc SL(Op);
2268 SDValue Src = Op.getOperand(0);
2269
2270 // result = trunc(src);
2271 // if (src < 0.0 && src != result)
2272 // result += -1.0.
2273
2274 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2275
2276 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2277 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2278
2279 EVT SetCCVT =
2280 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2281
2282 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2283 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2284 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2285
2286 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2287 // TODO: Should this propagate fast-math-flags?
2288 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2289}
2290
2291SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2292 double Log2BaseInverted) const {
2293 EVT VT = Op.getValueType();
2294
2295 SDLoc SL(Op);
2296 SDValue Operand = Op.getOperand(0);
2297 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2298 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2299
2300 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2301}
2302
2303// exp2(M_LOG2E_F * f);
2304SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2305 EVT VT = Op.getValueType();
2306 SDLoc SL(Op);
2307 SDValue Src = Op.getOperand(0);
2308
2309 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2310 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2311 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2312}
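
// A scalar sketch of the identity used above, exp(x) = 2^(x * log2(e))
// (illustrative only, not part of this file; assumes <cmath>).
static float expViaExp2Sketch(float X) {
  const float Log2E = 1.4426950408889634f; // numbers::log2e
  return std::exp2(X * Log2E);
}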
2313
2314static bool isCtlzOpc(unsigned Opc) {
2315 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2316}
2317
2318static bool isCttzOpc(unsigned Opc) {
2319 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2320}
2321
2322SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2323 SDLoc SL(Op);
2324 SDValue Src = Op.getOperand(0);
2325 bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2326 Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2327
2328 unsigned ISDOpc, NewOpc;
2329 if (isCtlzOpc(Op.getOpcode())) {
2330 ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2331 NewOpc = AMDGPUISD::FFBH_U32;
2332 } else if (isCttzOpc(Op.getOpcode())) {
2333 ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2334 NewOpc = AMDGPUISD::FFBL_B32;
2335 } else
2336 llvm_unreachable("Unexpected OPCode!!!")::llvm::llvm_unreachable_internal("Unexpected OPCode!!!", "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2336)
;
2337
2338
2339 if (ZeroUndef && Src.getValueType() == MVT::i32)
2340 return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2341
2342 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2343
2344 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2345 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2346
2347 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2348 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2349
2350 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2351 *DAG.getContext(), MVT::i32);
2352
2353 SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2354 SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2355
2356 SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2357 SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2358
2359 const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2360 SDValue Add, NewOpr;
2361 if (isCtlzOpc(Op.getOpcode())) {
2362 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2363 // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2364 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2365 } else {
2366 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2367 // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2368 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2369 }
2370
2371 if (!ZeroUndef) {
2372 // Test if the full 64-bit input is zero.
2373
2374 // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2375 // which we probably don't want.
2376 SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2377 SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2378 SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2379
2380 // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2381 // with the same cycles, otherwise it is slower.
2382 // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2383 // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2384
2385 const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2386
2387 // The instruction returns -1 for 0 input, but the defined intrinsic
2388 // behavior is to return the number of bits.
2389 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2390 SrcIsZero, Bits32, NewOpr);
2391 }
2392
2393 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2394}
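
// A scalar sketch of the 64-bit count assembled from two 32-bit counts, as in
// the ctlz select above (illustrative only, not part of this file; assumes
// <cstdint> and the GCC/Clang __builtin_clz, which is undefined for a zero
// argument, hence the guards).
static unsigned ctlz64Sketch(uint64_t X) {
  if (X == 0)
    return 64;                         // defined result for a zero input
  uint32_t Hi = static_cast<uint32_t>(X >> 32);
  uint32_t Lo = static_cast<uint32_t>(X);
  if (Hi == 0)
    return 32 + __builtin_clz(Lo);     // all of Hi's bits are leading zeros
  return __builtin_clz(Hi);
}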
2395
2396SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2397 bool Signed) const {
2398 // Unsigned
2399 // cul2f(ulong u)
2400 //{
2401 // uint lz = clz(u);
2402 // uint e = (u != 0) ? 127U + 63U - lz : 0;
2403 // u = (u << lz) & 0x7fffffffffffffffUL;
2404 // ulong t = u & 0xffffffffffUL;
2405 // uint v = (e << 23) | (uint)(u >> 40);
2406 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2407 // return as_float(v + r);
2408 //}
2409 // Signed
2410 // cl2f(long l)
2411 //{
2412 // long s = l >> 63;
2413 // float r = cul2f((l + s) ^ s);
2414 // return s ? -r : r;
2415 //}
2416
2417 SDLoc SL(Op);
2418 SDValue Src = Op.getOperand(0);
2419 SDValue L = Src;
2420
2421 SDValue S;
2422 if (Signed) {
2423 const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2424 S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2425
2426 SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2427 L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2428 }
2429
2430 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2431 *DAG.getContext(), MVT::f32);
2432
2433
2434 SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2435 SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2436 SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2437 LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2438
2439 SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2440 SDValue E = DAG.getSelect(SL, MVT::i32,
2441 DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2442 DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2443 ZeroI32);
2444
2445 SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2446 DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2447 DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2448
2449 SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2450 DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2451
2452 SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2453 U, DAG.getConstant(40, SL, MVT::i64));
2454
2455 SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2456 DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2457 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2458
2459 SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2460 SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2461 SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2462
2463 SDValue One = DAG.getConstant(1, SL, MVT::i32);
2464
2465 SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2466
2467 SDValue R = DAG.getSelect(SL, MVT::i32,
2468 RCmp,
2469 One,
2470 DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2471 R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2472 R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2473
2474 if (!Signed)
2475 return R;
2476
2477 SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2478 return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2479}
2480
2481SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2482 bool Signed) const {
2483 SDLoc SL(Op);
2484 SDValue Src = Op.getOperand(0);
2485
2486 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2487
2488 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2489 DAG.getConstant(0, SL, MVT::i32));
2490 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2491 DAG.getConstant(1, SL, MVT::i32));
2492
2493 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2494 SL, MVT::f64, Hi);
2495
2496 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2497
2498 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2499 DAG.getConstant(32, SL, MVT::i32));
2500 // TODO: Should this propagate fast-math-flags?
2501 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2502}
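
// A scalar sketch of the unsigned case of the conversion above (illustrative
// only, not part of this file; assumes <cstdint> and <cmath>): the high half
// is converted, scaled by 2^32 with ldexp, and the low half is added, with
// the single rounding happening at the final addition just like the FADD
// above.
static double u64ToF64Sketch(uint64_t X) {
  double Hi = static_cast<double>(static_cast<uint32_t>(X >> 32));
  double Lo = static_cast<double>(static_cast<uint32_t>(X));
  return std::ldexp(Hi, 32) + Lo; // Hi * 2^32 + Lo
}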
2503
2504SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2505 SelectionDAG &DAG) const {
2506 // TODO: Factor out code common with LowerSINT_TO_FP.
2507 EVT DestVT = Op.getValueType();
2508 SDValue Src = Op.getOperand(0);
2509 EVT SrcVT = Src.getValueType();
2510
2511 if (SrcVT == MVT::i16) {
2512 if (DestVT == MVT::f16)
2513 return Op;
2514 SDLoc DL(Op);
2515
2516 // Promote src to i32
2517 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2518 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2519 }
2520
2521  assert(SrcVT == MVT::i64 && "operation should be legal");
2522
2523 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2524 SDLoc DL(Op);
2525
2526 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2527 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2528 SDValue FPRound =
2529 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2530
2531 return FPRound;
2532 }
2533
2534 if (DestVT == MVT::f32)
2535 return LowerINT_TO_FP32(Op, DAG, false);
2536
2537  assert(DestVT == MVT::f64);
2538 return LowerINT_TO_FP64(Op, DAG, false);
2539}
2540
2541SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2542 SelectionDAG &DAG) const {
2543 EVT DestVT = Op.getValueType();
2544
2545 SDValue Src = Op.getOperand(0);
2546 EVT SrcVT = Src.getValueType();
2547
2548 if (SrcVT == MVT::i16) {
2549 if (DestVT == MVT::f16)
2550 return Op;
2551
2552 SDLoc DL(Op);
2553 // Promote src to i32
2554 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2555 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2556 }
2557
2558  assert(SrcVT == MVT::i64 && "operation should be legal");
2559
2560 // TODO: Factor out code common with LowerUINT_TO_FP.
2561
2562 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2563 SDLoc DL(Op);
2564 SDValue Src = Op.getOperand(0);
2565
2566 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2567 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2568 SDValue FPRound =
2569 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2570
2571 return FPRound;
2572 }
2573
2574 if (DestVT == MVT::f32)
2575 return LowerINT_TO_FP32(Op, DAG, true);
2576
2577  assert(DestVT == MVT::f64);
2578 return LowerINT_TO_FP64(Op, DAG, true);
2579}
2580
2581SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2582 bool Signed) const {
2583 SDLoc SL(Op);
2584
2585 SDValue Src = Op.getOperand(0);
2586
2587 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2588
2589  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2590                                 MVT::f64);
2591  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2592                                 MVT::f64);
2593 // TODO: Should this propagate fast-math-flags?
2594 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2595
2596 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2597
2598
2599 SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2600
2601 SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2602 MVT::i32, FloorMul);
2603 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2604
2605 SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2606
2607 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2608}
2609
2610SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2611 SDLoc DL(Op);
2612 SDValue N0 = Op.getOperand(0);
2613
2614 // Convert to target node to get known bits
2615 if (N0.getValueType() == MVT::f32)
2616 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2617
2618 if (getTargetMachine().Options.UnsafeFPMath) {
2619 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2620 return SDValue();
2621 }
2622
2623 assert(N0.getSimpleValueType() == MVT::f64);
2624
2625 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2626 const unsigned ExpMask = 0x7ff;
2627 const unsigned ExpBiasf64 = 1023;
2628 const unsigned ExpBiasf16 = 15;
2629 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2630 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2631 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2632 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2633 DAG.getConstant(32, DL, MVT::i64));
2634 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2635 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
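Note: after these conversions UH holds bits [63:32] of the f64 pattern (sign, 11-bit exponent and the top 20 mantissa bits) and U holds the low 32 mantissa bits.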
2636 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2637 DAG.getConstant(20, DL, MVT::i64));
2638 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2639 DAG.getConstant(ExpMask, DL, MVT::i32));
2640 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2641 // add the f16 bias (15) to get the biased exponent for the f16 format.
2642 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2643 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2644
2645 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2646 DAG.getConstant(8, DL, MVT::i32));
2647 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2648 DAG.getConstant(0xffe, DL, MVT::i32));
2649
2650 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2651 DAG.getConstant(0x1ff, DL, MVT::i32));
2652 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2653
2654 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2655 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2656
2657 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2658 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2659 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2660 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2661
2662 // N = M | (E << 12);
2663 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2664 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2665 DAG.getConstant(12, DL, MVT::i32)));
2666
2667 // B = clamp(1-E, 0, 13);
2668 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2669 One, E);
2670 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2671 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2672 DAG.getConstant(13, DL, MVT::i32));
2673
2674 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2675 DAG.getConstant(0x1000, DL, MVT::i32));
2676
2677 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2678 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2679 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2680 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
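Note: this block handles subnormal results. SigSetHigh is the significand with the implicit bit set, shifted right by B; D1 acts as a sticky bit, set whenever that shift dropped nonzero bits, so the rounding step below still sees them.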
2681
2682 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2683 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2684 DAG.getConstant(0x7, DL, MVT::i32));
2685 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2686 DAG.getConstant(2, DL, MVT::i32));
2687 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2688 One, Zero, ISD::SETEQ);
2689 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2690 One, Zero, ISD::SETGT);
2691 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2692 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2693
2694 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2695 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2696 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2697 I, V, ISD::SETEQ);
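Note: E > 30 means the rounded value overflows the f16 exponent range, so infinity (0x7c00) is returned; E == 1039 corresponds to a raw f64 exponent of 0x7ff (2047 - 1023 + 15), i.e. the input was Inf or NaN, which is handled by the I value computed above.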
2698
2699 // Extract the sign bit.
2700 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2701 DAG.getConstant(16, DL, MVT::i32));
2702 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2703 DAG.getConstant(0x8000, DL, MVT::i32));
2704
2705 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2706 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2707}
2708
2709SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2710 SelectionDAG &DAG) const {
2711 SDValue Src = Op.getOperand(0);
2712
2713 // TODO: Factor out code common with LowerFP_TO_UINT.
2714
2715 EVT SrcVT = Src.getValueType();
2716 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2717 SDLoc DL(Op);
2718
2719 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2720 SDValue FpToInt32 =
2721 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2722
2723 return FpToInt32;
2724 }
2725
2726 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2727 return LowerFP64_TO_INT(Op, DAG, true);
2728
2729 return SDValue();
2730}
2731
2732SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2733 SelectionDAG &DAG) const {
2734 SDValue Src = Op.getOperand(0);
2735
2736 // TODO: Factor out code common with LowerFP_TO_SINT.
2737
2738 EVT SrcVT = Src.getValueType();
2739 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2740 SDLoc DL(Op);
2741
2742 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2743 SDValue FpToInt32 =
2744 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2745
2746 return FpToInt32;
2747 }
2748
2749 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2750 return LowerFP64_TO_INT(Op, DAG, false);
2751
2752 return SDValue();
2753}
2754
2755SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2756 SelectionDAG &DAG) const {
2757 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2758 MVT VT = Op.getSimpleValueType();
2759 MVT ScalarVT = VT.getScalarType();
2760
2761 assert(VT.isVector());
2762
2763 SDValue Src = Op.getOperand(0);
2764 SDLoc DL(Op);
2765
2766 // TODO: Don't scalarize on Evergreen?
2767 unsigned NElts = VT.getVectorNumElements();
2768 SmallVector<SDValue, 8> Args;
2769 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2770
2771 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2772 for (unsigned I = 0; I < NElts; ++I)
2773 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2774
2775 return DAG.getBuildVector(VT, DL, Args);
2776}
2777
2778//===----------------------------------------------------------------------===//
2779// Custom DAG optimizations
2780//===----------------------------------------------------------------------===//
2781
2782static bool isU24(SDValue Op, SelectionDAG &DAG) {
2783 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2784}
2785
2786static bool isI24(SDValue Op, SelectionDAG &DAG) {
2787 EVT VT = Op.getValueType();
2788 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2789 // as unsigned 24-bit values.
2790 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2791}
2792
2793static SDValue simplifyI24(SDNode *Node24,
2794 TargetLowering::DAGCombinerInfo &DCI) {
2795 SelectionDAG &DAG = DCI.DAG;
2796 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2797
2798 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2799 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2800 unsigned NewOpcode = Node24->getOpcode();
2801 if (IsIntrin) {
2802 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2803 NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2804 AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2805 }
2806
2807 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2808
2809 // First try to simplify using GetDemandedBits which allows the operands to
2810 // have other uses, but will only perform simplifications that involve
2811 // bypassing some nodes for this user.
2812 SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
2813 SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
2814 if (DemandedLHS || DemandedRHS)
2815 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2816 DemandedLHS ? DemandedLHS : LHS,
2817 DemandedRHS ? DemandedRHS : RHS);
2818
2819 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2820 // operands if this node is the only user.
2821 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2822 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2823 return SDValue(Node24, 0);
2824 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2825 return SDValue(Node24, 0);
2826
2827 return SDValue();
2828}
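For example, if LHS is (and x, 0xffffff), the 24 demanded bits are already covered by the mask, so GetDemandedBits can bypass the AND and feed x directly into the 24-bit multiply node.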
2829
2830template <typename IntTy>
2831static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2832 uint32_t Width, const SDLoc &DL) {
2833 if (Width + Offset < 32) {
2834 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2835 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2836 return DAG.getConstant(Result, DL, MVT::i32);
2837 }
2838
2839 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2840}
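As a worked instance of the fold above: a signed BFE with Offset = 8 and Width = 8 shifts Src0 left by 32 - 8 - 8 = 16 and then arithmetically right by 32 - 8 = 24, which extracts and sign-extends the selected byte.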
2841
2842static bool hasVolatileUser(SDNode *Val) {
2843 for (SDNode *U : Val->uses()) {
2844 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2845 if (M->isVolatile())
2846 return true;
2847 }
2848 }
2849
2850 return false;
2851}
2852
2853bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2854 // i32 vectors are the canonical memory type.
2855 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2856 return false;
2857
2858 if (!VT.isByteSized())
2859 return false;
2860
2861 unsigned Size = VT.getStoreSize();
2862
2863 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2864 return false;
2865
2866 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2867 return false;
2868
2869 return true;
2870}
2871
2872// Replace load of an illegal type with a store of a bitcast to a friendlier
2873// type.
2874SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2875 DAGCombinerInfo &DCI) const {
2876 if (!DCI.isBeforeLegalize())
2877 return SDValue();
2878
2879 LoadSDNode *LN = cast<LoadSDNode>(N);
2880 if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2881 return SDValue();
2882
2883 SDLoc SL(N);
2884 SelectionDAG &DAG = DCI.DAG;
2885 EVT VT = LN->getMemoryVT();
2886
2887 unsigned Size = VT.getStoreSize();
2888 unsigned Align = LN->getAlignment();
2889 if (Align < Size && isTypeLegal(VT)) {
2890 bool IsFast;
2891 unsigned AS = LN->getAddressSpace();
2892
2893 // Expand unaligned loads earlier than legalization. Due to visitation order
2894 // problems during legalization, the emitted instructions to pack and unpack
2895 // the bytes again are not eliminated in the case of an unaligned copy.
2896 if (!allowsMisalignedMemoryAccesses(
2897 VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
2898 SDValue Ops[2];
2899
2900 if (VT.isVector())
2901 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
2902 else
2903 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2904
2905 return DAG.getMergeValues(Ops, SDLoc(N));
2906 }
2907
2908 if (!IsFast)
2909 return SDValue();
2910 }
2911
2912 if (!shouldCombineMemoryType(VT))
2913 return SDValue();
2914
2915 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2916
2917 SDValue NewLoad
2918 = DAG.getLoad(NewVT, SL, LN->getChain(),
2919 LN->getBasePtr(), LN->getMemOperand());
2920
2921 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2922 DCI.CombineTo(N, BC, NewLoad.getValue(1));
2923 return SDValue(N, 0);
2924}
2925
2926// Replace store of an illegal type with a store of a bitcast to a friendlier
2927// type.
2928SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2929 DAGCombinerInfo &DCI) const {
2930 if (!DCI.isBeforeLegalize())
2931 return SDValue();
2932
2933 StoreSDNode *SN = cast<StoreSDNode>(N);
2934 if (SN->isVolatile() || !ISD::isNormalStore(SN))
2935 return SDValue();
2936
2937 EVT VT = SN->getMemoryVT();
2938 unsigned Size = VT.getStoreSize();
2939
2940 SDLoc SL(N);
2941 SelectionDAG &DAG = DCI.DAG;
2942 unsigned Align = SN->getAlignment();
2943 if (Align < Size && isTypeLegal(VT)) {
2944 bool IsFast;
2945 unsigned AS = SN->getAddressSpace();
2946
2947 // Expand unaligned stores earlier than legalization. Due to visitation
2948 // order problems during legalization, the emitted instructions to pack and
2949 // unpack the bytes again are not eliminated in the case of an unaligned
2950 // copy.
2951 if (!allowsMisalignedMemoryAccesses(
2952 VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
2953 if (VT.isVector())
2954 return scalarizeVectorStore(SN, DAG);
2955
2956 return expandUnalignedStore(SN, DAG);
2957 }
2958
2959 if (!IsFast)
2960 return SDValue();
2961 }
2962
2963 if (!shouldCombineMemoryType(VT))
2964 return SDValue();
2965
2966 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2967 SDValue Val = SN->getValue();
2968
2969 //DCI.AddToWorklist(Val.getNode());
2970
2971 bool OtherUses = !Val.hasOneUse();
2972 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2973 if (OtherUses) {
2974 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2975 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2976 }
2977
2978 return DAG.getStore(SN->getChain(), SL, CastVal,
2979 SN->getBasePtr(), SN->getMemOperand());
2980}
2981
2982// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2983// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2984// issues.
2985SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2986 DAGCombinerInfo &DCI) const {
2987 SelectionDAG &DAG = DCI.DAG;
2988 SDValue N0 = N->getOperand(0);
2989
2990 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2991 // (vt2 (truncate (assertzext vt0:x, vt1)))
2992 if (N0.getOpcode() == ISD::TRUNCATE) {
2993 SDValue N1 = N->getOperand(1);
2994 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2995 SDLoc SL(N);
2996
2997 SDValue Src = N0.getOperand(0);
2998 EVT SrcVT = Src.getValueType();
2999 if (SrcVT.bitsGE(ExtVT)) {
3000 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3001 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3002 }
3003 }
3004
3005 return SDValue();
3006}
3007
3008SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3009 SDNode *N, DAGCombinerInfo &DCI) const {
3010 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3011 switch (IID) {
3012 case Intrinsic::amdgcn_mul_i24:
3013 case Intrinsic::amdgcn_mul_u24:
3014 return simplifyI24(N, DCI);
3015 default:
3016 return SDValue();
3017 }
3018}
3019
3020/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3021/// binary operation \p Opc to it with the corresponding constant operands.
3022SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3023 DAGCombinerInfo &DCI, const SDLoc &SL,
3024 unsigned Opc, SDValue LHS,
3025 uint32_t ValLo, uint32_t ValHi) const {
3026 SelectionDAG &DAG = DCI.DAG;
3027 SDValue Lo, Hi;
3028 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3029
3030 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3031 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3032
3033 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3034 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3035
3036 // Re-visit the ands. It's possible we eliminated one of them and it could
3037 // simplify the vector.
3038 DCI.AddToWorklist(Lo.getNode());
3039 DCI.AddToWorklist(Hi.getNode());
3040
3041 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3042 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3043}
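For instance, an i64 AND against 0x0000ffff0000ffff can be lowered through this helper as two i32 ANDs, (and lo_32(x), 0xffff) and (and hi_32(x), 0xffff), whose results are rebuilt as a v2i32 build_vector and bitcast back to i64.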
3044
3045SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3046 DAGCombinerInfo &DCI) const {
3047 EVT VT = N->getValueType(0);
3048
3049 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3050 if (!RHS)
3051 return SDValue();
3052
3053 SDValue LHS = N->getOperand(0);
3054 unsigned RHSVal = RHS->getZExtValue();
3055 if (!RHSVal)
3056 return LHS;
3057
3058 SDLoc SL(N);
3059 SelectionDAG &DAG = DCI.DAG;
3060
3061 switch (LHS->getOpcode()) {
3062 default:
3063 break;
3064 case ISD::ZERO_EXTEND:
3065 case ISD::SIGN_EXTEND:
3066 case ISD::ANY_EXTEND: {
3067 SDValue X = LHS->getOperand(0);
3068
3069 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3070 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3071 // Prefer build_vector as the canonical form if packed types are legal.
3072 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3073 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3074 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3075 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3076 }
3077
3078 // shl (ext x) => zext (shl x), if shift does not overflow int
3079 if (VT != MVT::i64)
3080 break;
3081 KnownBits Known = DAG.computeKnownBits(X);
3082 unsigned LZ = Known.countMinLeadingZeros();
3083 if (LZ < RHSVal)
3084 break;
3085 EVT XVT = X.getValueType();
3086 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3087 return DAG.getZExtOrTrunc(Shl, SL, VT);
3088 }
3089 }
3090
3091 if (VT != MVT::i64)
3092 return SDValue();
3093
3094 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
3095
3096 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3097 // common case, splitting this into a move and a 32-bit shift is faster and
3098 // the same code size.
3099 if (RHSVal < 32)
3100 return SDValue();
3101
3102 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3103
3104 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3105 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3106
3107 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3108
3109 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3110 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3111}
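As a concrete case, (shl i64:x, 40) becomes the v2i32 build_vector {0, (shl (trunc x to i32), 8)} bitcast back to i64, trading the quarter-rate 64-bit shift for a 32-bit one.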
3112
3113SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3114 DAGCombinerInfo &DCI) const {
3115 if (N->getValueType(0) != MVT::i64)
3116 return SDValue();
3117
3118 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3119 if (!RHS)
3120 return SDValue();
3121
3122 SelectionDAG &DAG = DCI.DAG;
3123 SDLoc SL(N);
3124 unsigned RHSVal = RHS->getZExtValue();
3125
3126 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3127 if (RHSVal == 32) {
3128 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3129 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3130 DAG.getConstant(31, SL, MVT::i32));
3131
3132 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3133 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3134 }
3135
3136 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3137 if (RHSVal == 63) {
3138 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3139 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3140 DAG.getConstant(31, SL, MVT::i32));
3141 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3142 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3143 }
3144
3145 return SDValue();
3146}
3147
3148SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3149 DAGCombinerInfo &DCI) const {
3150 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3151 if (!RHS)
3152 return SDValue();
3153
3154 EVT VT = N->getValueType(0);
3155 SDValue LHS = N->getOperand(0);
3156 unsigned ShiftAmt = RHS->getZExtValue();
3157 SelectionDAG &DAG = DCI.DAG;
3158 SDLoc SL(N);
3159
3160 // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
3161 // this improves the ability to match BFE patterns in isel.
3162 if (LHS.getOpcode() == ISD::AND) {
3163 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3164 if (Mask->getAPIntValue().isShiftedMask() &&
3165 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3166 return DAG.getNode(
3167 ISD::AND, SL, VT,
3168 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3169 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3170 }
3171 }
3172 }
3173
3174 if (VT != MVT::i64)
3175 return SDValue();
3176
3177 if (ShiftAmt < 32)
3178 return SDValue();
3179
3180 // srl i64:x, C for C >= 32
3181 // =>
3182 // build_pair (srl hi_32(x), C - 32), 0
3183 SDValue One = DAG.getConstant(1, SL, MVT::i32);
3184 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3185
3186 SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3187 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3188
3189 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3190 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3191
3192 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3193
3194 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3195}
3196
3197SDValue AMDGPUTargetLowering::performTruncateCombine(
3198 SDNode *N, DAGCombinerInfo &DCI) const {
3199 SDLoc SL(N);
3200 SelectionDAG &DAG = DCI.DAG;
3201 EVT VT = N->getValueType(0);
3202 SDValue Src = N->getOperand(0);
3203
3204 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3205 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3206 SDValue Vec = Src.getOperand(0);
3207 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3208 SDValue Elt0 = Vec.getOperand(0);
3209 EVT EltVT = Elt0.getValueType();
3210 if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
3211 if (EltVT.isFloatingPoint()) {
3212 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3213 EltVT.changeTypeToInteger(), Elt0);
3214 }
3215
3216 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3217 }
3218 }
3219 }
3220
3221 // Equivalent of above for accessing the high element of a vector as an
3222 // integer operation.
3223 // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
3224 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3225 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3226 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3227 SDValue BV = stripBitcast(Src.getOperand(0));
3228 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3229 BV.getValueType().getVectorNumElements() == 2) {
3230 SDValue SrcElt = BV.getOperand(1);
3231 EVT SrcEltVT = SrcElt.getValueType();
3232 if (SrcEltVT.isFloatingPoint()) {
3233 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3234 SrcEltVT.changeTypeToInteger(), SrcElt);
3235 }
3236
3237 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3238 }
3239 }
3240 }
3241 }
3242
3243 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3244 //
3245 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3246 // i16 (trunc (srl (i32 (trunc x), K)))
3247 if (VT.getScalarSizeInBits() < 32) {
3248 EVT SrcVT = Src.getValueType();
3249 if (SrcVT.getScalarSizeInBits() > 32 &&
3250 (Src.getOpcode() == ISD::SRL ||
3251 Src.getOpcode() == ISD::SRA ||
3252 Src.getOpcode() == ISD::SHL)) {
3253 SDValue Amt = Src.getOperand(1);
3254 KnownBits Known = DAG.computeKnownBits(Amt);
3255 unsigned Size = VT.getScalarSizeInBits();
3256 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3257 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3258 EVT MidVT = VT.isVector() ?
3259 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3260 VT.getVectorNumElements()) : MVT::i32;
3261
3262 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3263 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3264 Src.getOperand(0));
3265 DCI.AddToWorklist(Trunc.getNode());
3266
3267 if (Amt.getValueType() != NewShiftVT) {
3268 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3269 DCI.AddToWorklist(Amt.getNode());
3270 }
3271
3272 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3273 Trunc, Amt);
3274 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3275 }
3276 }
3277 }
3278
3279 return SDValue();
3280}
3281
3282// We need to specifically handle i64 mul here to avoid unnecessary conversion
3283// instructions. If we only match on the legalized i64 mul expansion,
3284// SimplifyDemandedBits will be unable to remove them because there will be
3285// multiple uses due to the separate mul + mulh[su].
3286static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3287 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3288 if (Size <= 32) {
3289 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3290 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3291 }
3292
3293 // Because we want to eliminate extension instructions before the
3294 // operation, we need to create a single user here (i.e. not the separate
3295 // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3296
3297 unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3298
3299 SDValue Mul = DAG.getNode(MulOpc, SL,
3300 DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3301
3302 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3303 Mul.getValue(0), Mul.getValue(1));
3304}
3305
3306SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3307 DAGCombinerInfo &DCI) const {
3308 EVT VT = N->getValueType(0);
3309
3310 unsigned Size = VT.getSizeInBits();
3311 if (VT.isVector() || Size > 64)
3312 return SDValue();
3313
3314 // There are i16 integer mul/mad.
3315 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3316 return SDValue();
3317
3318 SelectionDAG &DAG = DCI.DAG;
3319 SDLoc DL(N);
3320
3321 SDValue N0 = N->getOperand(0);
3322 SDValue N1 = N->getOperand(1);
3323
3324 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3325 // in the source into any_extends if the result of the mul is truncated. Since
3326 // we can assume the high bits are whatever we want, use the underlying value
3327 // to keep the unknown high bits from interfering.
3328 if (N0.getOpcode() == ISD::ANY_EXTEND)
3329 N0 = N0.getOperand(0);
3330
3331 if (N1.getOpcode() == ISD::ANY_EXTEND)
3332 N1 = N1.getOperand(0);
3333
3334 SDValue Mul;
3335
3336 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3337 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3338 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3339 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3340 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3341 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3342 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3343 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3344 } else {
3345 return SDValue();
3346 }
3347
3348 // We need to use sext even for MUL_U24, because MUL_U24 is used
3349 // for signed multiply of 8 and 16-bit types.
3350 return DAG.getSExtOrTrunc(Mul, DL, VT);
3351}
3352
3353SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3354 DAGCombinerInfo &DCI) const {
3355 EVT VT = N->getValueType(0);
3356
3357 if (!Subtarget->hasMulI24() || VT.isVector())
3358 return SDValue();
3359
3360 SelectionDAG &DAG = DCI.DAG;
3361 SDLoc DL(N);
3362
3363 SDValue N0 = N->getOperand(0);
3364 SDValue N1 = N->getOperand(1);
3365
3366 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3367 return SDValue();
3368
3369 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3370 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3371
3372 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3373 DCI.AddToWorklist(Mulhi.getNode());
3374 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3375}
3376
3377SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3378 DAGCombinerInfo &DCI) const {
3379 EVT VT = N->getValueType(0);
3380
3381 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3382 return SDValue();
3383
3384 SelectionDAG &DAG = DCI.DAG;
3385 SDLoc DL(N);
3386
3387 SDValue N0 = N->getOperand(0);
3388 SDValue N1 = N->getOperand(1);
3389
3390 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3391 return SDValue();
3392
3393 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3394 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3395
3396 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3397 DCI.AddToWorklist(Mulhi.getNode());
3398 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3399}
3400
3401SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3402 SDNode *N, DAGCombinerInfo &DCI) const {
3403 SelectionDAG &DAG = DCI.DAG;
3404
3405 // Simplify demanded bits before splitting into multiple users.
3406 if (SDValue V = simplifyI24(N, DCI))
3407 return V;
3408
3409 SDValue N0 = N->getOperand(0);
3410 SDValue N1 = N->getOperand(1);
3411
3412 bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3413
3414 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3415 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3416
3417 SDLoc SL(N);
3418
3419 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3420 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3421 return DAG.getMergeValues({ MulLo, MulHi }, SL);
3422}
3423
3424static bool isNegativeOne(SDValue Val) {
3425 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3426 return C->isAllOnesValue();
3427 return false;
3428}
3429
3430SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3431 SDValue Op,
3432 const SDLoc &DL,
3433 unsigned Opc) const {
3434 EVT VT = Op.getValueType();
3435 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3436 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3437 LegalVT != MVT::i16))
3438 return SDValue();
3439
3440 if (VT != MVT::i32)
3441 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3442
3443 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3444 if (VT != MVT::i32)
3445 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3446
3447 return FFBX;
3448}
3449
3450// The native instructions return -1 on 0 input. Optimize out a select that
3451// produces -1 on 0.
3452//
3453// TODO: If zero is not undef, we could also do this if the output is compared
3454// against the bitwidth.
3455//
3456// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3457SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3458 SDValue LHS, SDValue RHS,
3459 DAGCombinerInfo &DCI) const {
3460 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3461 if (!CmpRhs || !CmpRhs->isNullValue())
3462 return SDValue();
3463
3464 SelectionDAG &DAG = DCI.DAG;
3465 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3466 SDValue CmpLHS = Cond.getOperand(0);
3467
3468 unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3469 AMDGPUISD::FFBH_U32;
3470
3471 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3472 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3473 if (CCOpcode == ISD::SETEQ &&
3474 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3475 RHS.getOperand(0) == CmpLHS &&
3476 isNegativeOne(LHS)) {
3477 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3478 }
3479
3480 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3481 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3482 if (CCOpcode == ISD::SETNE &&
3483 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3484 LHS.getOperand(0) == CmpLHS &&
3485 isNegativeOne(RHS)) {
3486 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3487 }
3488
3489 return SDValue();
3490}
3491
3492static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3493 unsigned Op,
3494 const SDLoc &SL,
3495 SDValue Cond,
3496 SDValue N1,
3497 SDValue N2) {
3498 SelectionDAG &DAG = DCI.DAG;
3499 EVT VT = N1.getValueType();
3500
3501 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3502 N1.getOperand(0), N2.getOperand(0));
3503 DCI.AddToWorklist(NewSelect.getNode());
3504 return DAG.getNode(Op, SL, VT, NewSelect);
3505}
3506
3507// Pull a free FP operation out of a select so it may fold into uses.
3508//
3509// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3510// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3511//
3512// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3513// select c, (fabs x), +k -> fabs (select c, x, k)
3514static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3515 SDValue N) {
3516 SelectionDAG &DAG = DCI.DAG;
3517 SDValue Cond = N.getOperand(0);
3518 SDValue LHS = N.getOperand(1);
3519 SDValue RHS = N.getOperand(2);
3520
3521 EVT VT = N.getValueType();
3522 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3523 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3524 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3525 SDLoc(N), Cond, LHS, RHS);
3526 }
3527
3528 bool Inv = false;
3529 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3530 std::swap(LHS, RHS);
3531 Inv = true;
3532 }
3533
3534 // TODO: Support vector constants.
3535 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3536 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3537 SDLoc SL(N);
3538 // If one side is an fneg/fabs and the other is a constant, we can push the
3539 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3540 SDValue NewLHS = LHS.getOperand(0);
3541 SDValue NewRHS = RHS;
3542
3543 // Careful: if the neg can be folded up, don't try to pull it back down.
3544 bool ShouldFoldNeg = true;
3545
3546 if (NewLHS.hasOneUse()) {
3547 unsigned Opc = NewLHS.getOpcode();
3548 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3549 ShouldFoldNeg = false;
3550 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3551 ShouldFoldNeg = false;
3552 }
3553
3554 if (ShouldFoldNeg) {
3555 if (LHS.getOpcode() == ISD::FNEG)
3556 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3557 else if (CRHS->isNegative())
3558 return SDValue();
3559
3560 if (Inv)
3561 std::swap(NewLHS, NewRHS);
3562
3563 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3564 Cond, NewLHS, NewRHS);
3565 DCI.AddToWorklist(NewSelect.getNode());
3566 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3567 }
3568 }
3569
3570 return SDValue();
3571}
3572
3573
3574SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3575 DAGCombinerInfo &DCI) const {
3576 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3577 return Folded;
3578
3579 SDValue Cond = N->getOperand(0);
3580 if (Cond.getOpcode() != ISD::SETCC)
3581 return SDValue();
3582
3583 EVT VT = N->getValueType(0);
3584 SDValue LHS = Cond.getOperand(0);
3585 SDValue RHS = Cond.getOperand(1);
3586 SDValue CC = Cond.getOperand(2);
3587
3588 SDValue True = N->getOperand(1);
3589 SDValue False = N->getOperand(2);
3590
3591 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3592 SelectionDAG &DAG = DCI.DAG;
3593 if (DAG.isConstantValueOfAnyType(True) &&
3594 !DAG.isConstantValueOfAnyType(False)) {
3595 // Swap cmp + select pair to move constant to false input.
3596 // This will allow using VOPC cndmasks more often.
3597 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3598
3599 SDLoc SL(N);
3600 ISD::CondCode NewCC =
3601 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3602
3603 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3604 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3605 }
3606
3607 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3608 SDValue MinMax
3609 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3610 // Revisit this node so we can catch min3/max3/med3 patterns.
3611 //DCI.AddToWorklist(MinMax.getNode());
3612 return MinMax;
3613 }
3614 }
3615
3616 // There's no reason to not do this if the condition has other uses.
3617 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3618}
3619
3620static bool isInv2Pi(const APFloat &APF) {
3621 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3622 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3623 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3624
3625 return APF.bitwiseIsEqual(KF16) ||
3626 APF.bitwiseIsEqual(KF32) ||
3627 APF.bitwiseIsEqual(KF64);
3628}
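Note: the three constants above are the bit patterns of 1/(2*pi) (~0.15915494) in half, single and double precision respectively.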
3629
3630// 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
3631// additional cost to negate them.
3632bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3633 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3634 if (C->isZero() && !C->isNegative())
3635 return true;
3636
3637 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3638 return true;
3639 }
3640
3641 return false;
3642}
3643
3644static unsigned inverseMinMax(unsigned Opc) {
3645 switch (Opc) {
3646 case ISD::FMAXNUM:
3647 return ISD::FMINNUM;
3648 case ISD::FMINNUM:
3649 return ISD::FMAXNUM;
3650 case ISD::FMAXNUM_IEEE:
3651 return ISD::FMINNUM_IEEE;
3652 case ISD::FMINNUM_IEEE:
3653 return ISD::FMAXNUM_IEEE;
3654 case AMDGPUISD::FMAX_LEGACY:
3655 return AMDGPUISD::FMIN_LEGACY;
3656 case AMDGPUISD::FMIN_LEGACY:
3657 return AMDGPUISD::FMAX_LEGACY;
3658 default:
3659 llvm_unreachable("invalid min/max opcode");
3660 }
3661}
3662
3663SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3664 DAGCombinerInfo &DCI) const {
3665 SelectionDAG &DAG = DCI.DAG;
3666 SDValue N0 = N->getOperand(0);
3667 EVT VT = N->getValueType(0);
3668
3669 unsigned Opc = N0.getOpcode();
3670
3671 // If the input has multiple uses and we can either fold the negate down, or
3672 // the other uses cannot, give up. This both prevents unprofitable
3673 // transformations and infinite loops: we won't repeatedly try to fold around
3674 // a negate that has no 'good' form.
3675 if (N0.hasOneUse()) {
3676 // This may be able to fold into the source, but at a code size cost. Don't
3677 // fold if the fold into the user is free.
3678 if (allUsesHaveSourceMods(N, 0))
3679 return SDValue();
3680 } else {
3681 if (fnegFoldsIntoOp(Opc) &&
3682 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3683 return SDValue();
3684 }
3685
3686 SDLoc SL(N);
3687 switch (Opc) {
3688 case ISD::FADD: {
3689 if (!mayIgnoreSignedZero(N0))
3690 return SDValue();
3691
3692 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3693 SDValue LHS = N0.getOperand(0);
3694 SDValue RHS = N0.getOperand(1);
3695
3696 if (LHS.getOpcode() != ISD::FNEG)
3697 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3698 else
3699 LHS = LHS.getOperand(0);
3700
3701 if (RHS.getOpcode() != ISD::FNEG)
3702 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3703 else
3704 RHS = RHS.getOperand(0);
3705
3706 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3707 if (Res.getOpcode() != ISD::FADD)
3708 return SDValue(); // Op got folded away.
3709 if (!N0.hasOneUse())
3710 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3711 return Res;
3712 }
3713 case ISD::FMUL:
3714 case AMDGPUISD::FMUL_LEGACY: {
3715 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3716 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3717 SDValue LHS = N0.getOperand(0);
3718 SDValue RHS = N0.getOperand(1);
3719
3720 if (LHS.getOpcode() == ISD::FNEG)
3721 LHS = LHS.getOperand(0);
3722 else if (RHS.getOpcode() == ISD::FNEG)
3723 RHS = RHS.getOperand(0);
3724 else
3725 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3726
3727 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3728 if (Res.getOpcode() != Opc)
3729 return SDValue(); // Op got folded away.
3730 if (!N0.hasOneUse())
3731 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3732 return Res;
3733 }
3734 case ISD::FMA:
3735 case ISD::FMAD: {
3736 if (!mayIgnoreSignedZero(N0))
3737 return SDValue();
3738
3739 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3740 SDValue LHS = N0.getOperand(0);
3741 SDValue MHS = N0.getOperand(1);
3742 SDValue RHS = N0.getOperand(2);
3743
3744 if (LHS.getOpcode() == ISD::FNEG)
3745 LHS = LHS.getOperand(0);
3746 else if (MHS.getOpcode() == ISD::FNEG)
3747 MHS = MHS.getOperand(0);
3748 else
3749 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3750
3751 if (RHS.getOpcode() != ISD::FNEG)
3752 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3753 else
3754 RHS = RHS.getOperand(0);
3755
3756 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3757 if (Res.getOpcode() != Opc)
3758 return SDValue(); // Op got folded away.
3759 if (!N0.hasOneUse())
3760 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3761 return Res;
3762 }
3763 case ISD::FMAXNUM:
3764 case ISD::FMINNUM:
3765 case ISD::FMAXNUM_IEEE:
3766 case ISD::FMINNUM_IEEE:
3767 case AMDGPUISD::FMAX_LEGACY:
3768 case AMDGPUISD::FMIN_LEGACY: {
3769 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3770 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3771 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3772 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3773
3774 SDValue LHS = N0.getOperand(0);
3775 SDValue RHS = N0.getOperand(1);
3776
3777 // 0 doesn't have a negated inline immediate.
3778 // TODO: This constant check should be generalized to other operations.
3779 if (isConstantCostlierToNegate(RHS))
3780 return SDValue();
3781
3782 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3783 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3784 unsigned Opposite = inverseMinMax(Opc);
3785
3786 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3787 if (Res.getOpcode() != Opposite)
3788 return SDValue(); // Op got folded away.
3789 if (!N0.hasOneUse())
3790 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3791 return Res;
3792 }
3793 case AMDGPUISD::FMED3: {
3794 SDValue Ops[3];
3795 for (unsigned I = 0; I < 3; ++I)
3796 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3797
3798 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3799 if (Res.getOpcode() != AMDGPUISD::FMED3)
3800 return SDValue(); // Op got folded away.
3801 if (!N0.hasOneUse())
3802 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3803 return Res;
3804 }
3805 case ISD::FP_EXTEND:
3806 case ISD::FTRUNC:
3807 case ISD::FRINT:
3808 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3809 case ISD::FSIN:
3810 case ISD::FCANONICALIZE:
3811 case AMDGPUISD::RCP:
3812 case AMDGPUISD::RCP_LEGACY:
3813 case AMDGPUISD::RCP_IFLAG:
3814 case AMDGPUISD::SIN_HW: {
3815 SDValue CvtSrc = N0.getOperand(0);
3816 if (CvtSrc.getOpcode() == ISD::FNEG) {
3817 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3818 // (fneg (rcp (fneg x))) -> (rcp x)
3819 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3820 }
3821
3822 if (!N0.hasOneUse())
3823 return SDValue();
3824
3825 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3826 // (fneg (rcp x)) -> (rcp (fneg x))
3827 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3828 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3829 }
3830 case ISD::FP_ROUND: {
3831 SDValue CvtSrc = N0.getOperand(0);
3832
3833 if (CvtSrc.getOpcode() == ISD::FNEG) {
3834 // (fneg (fp_round (fneg x))) -> (fp_round x)
3835 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3836 CvtSrc.getOperand(0), N0.getOperand(1));
3837 }
3838
3839 if (!N0.hasOneUse())
3840 return SDValue();
3841
3842 // (fneg (fp_round x)) -> (fp_round (fneg x))
3843 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3844 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3845 }
3846 case ISD::FP16_TO_FP: {
3847 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3848 // f16, but legalization of f16 fneg ends up pulling it out of the source.
3849 // Put the fneg back as a legal source operation that can be matched later.
3850 SDLoc SL(N);
3851
3852 SDValue Src = N0.getOperand(0);
3853 EVT SrcVT = Src.getValueType();
3854
3855 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3856 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3857 DAG.getConstant(0x8000, SL, SrcVT));
3858 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3859 }
3860 default:
3861 return SDValue();
3862 }
3863}
3864
3865SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3866 DAGCombinerInfo &DCI) const {
3867 SelectionDAG &DAG = DCI.DAG;
3868 SDValue N0 = N->getOperand(0);
3869
3870 if (!N0.hasOneUse())
3871 return SDValue();
3872
3873 switch (N0.getOpcode()) {
3874 case ISD::FP16_TO_FP: {
3875 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3876 SDLoc SL(N);
3877 SDValue Src = N0.getOperand(0);
3878 EVT SrcVT = Src.getValueType();
3879
3880 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3881 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3882 DAG.getConstant(0x7fff, SL, SrcVT));
3883 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3884 }
3885 default:
3886 return SDValue();
3887 }
3888}
3889
3890SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3891 DAGCombinerInfo &DCI) const {
3892 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3893 if (!CFP)
3894 return SDValue();
3895
3896 // XXX - Should this flush denormals?
3897 const APFloat &Val = CFP->getValueAPF();
3898 APFloat One(Val.getSemantics(), "1.0");
3899 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3900}
3901
3902SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3903 DAGCombinerInfo &DCI) const {
3904 SelectionDAG &DAG = DCI.DAG;
3905 SDLoc DL(N);
3906
3907 switch(N->getOpcode()) {
3908 default:
3909 break;
3910 case ISD::BITCAST: {
3911 EVT DestVT = N->getValueType(0);
3912
3913 // Push casts through vector builds. This helps avoid emitting a large
3914 // number of copies when materializing floating point vector constants.
3915 //
3916 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3917 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3918 if (DestVT.isVector()) {
3919 SDValue Src = N->getOperand(0);
3920 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3921 EVT SrcVT = Src.getValueType();
3922 unsigned NElts = DestVT.getVectorNumElements();
3923
3924 if (SrcVT.getVectorNumElements() == NElts) {
3925 EVT DestEltVT = DestVT.getVectorElementType();
3926
3927 SmallVector<SDValue, 8> CastedElts;
3928 SDLoc SL(N);
3929 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3930 SDValue Elt = Src.getOperand(I);
3931 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3932 }
3933
3934 return DAG.getBuildVector(DestVT, SL, CastedElts);
3935 }
3936 }
3937 }
3938
3939 if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3940 break;
3941
3942 // Fold bitcasts of constants.
3943 //
3944 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3945 // TODO: Generalize and move to DAGCombiner
3946 SDValue Src = N->getOperand(0);
3947 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3948 if (Src.getValueType() == MVT::i64) {
3949 SDLoc SL(N);
3950 uint64_t CVal = C->getZExtValue();
3951 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3952 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3953 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3954 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
3955 }
3956 }
3957
3958 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3959 const APInt &Val = C->getValueAPF().bitcastToAPInt();
3960 SDLoc SL(N);
3961 uint64_t CVal = Val.getZExtValue();
3962 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3963 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3964 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3965
3966 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3967 }
3968
3969 break;
3970 }
3971 case ISD::SHL: {
3972 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3973 break;
3974
3975 return performShlCombine(N, DCI);
3976 }
3977 case ISD::SRL: {
3978 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3979 break;
3980
3981 return performSrlCombine(N, DCI);
3982 }
3983 case ISD::SRA: {
3984 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3985 break;
3986
3987 return performSraCombine(N, DCI);
3988 }
3989 case ISD::TRUNCATE:
3990 return performTruncateCombine(N, DCI);
3991 case ISD::MUL:
3992 return performMulCombine(N, DCI);
3993 case ISD::MULHS:
3994 return performMulhsCombine(N, DCI);
3995 case ISD::MULHU:
3996 return performMulhuCombine(N, DCI);
3997 case AMDGPUISD::MUL_I24:
3998 case AMDGPUISD::MUL_U24:
3999 case AMDGPUISD::MULHI_I24:
4000 case AMDGPUISD::MULHI_U24: {
4001 if (SDValue V = simplifyI24(N, DCI))
4002 return V;
4003 return SDValue();
4004 }
4005 case AMDGPUISD::MUL_LOHI_I24:
4006 case AMDGPUISD::MUL_LOHI_U24:
4007 return performMulLoHi24Combine(N, DCI);
4008 case ISD::SELECT:
4009 return performSelectCombine(N, DCI);
4010 case ISD::FNEG:
4011 return performFNegCombine(N, DCI);
4012 case ISD::FABS:
4013 return performFAbsCombine(N, DCI);
4014 case AMDGPUISD::BFE_I32:
4015 case AMDGPUISD::BFE_U32: {
4016 assert(!N->getValueType(0).isVector() &&
4017 "Vector handling of BFE not implemented");
4018 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4019 if (!Width)
4020 break;
4021
4022 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4023 if (WidthVal == 0)
4024 return DAG.getConstant(0, DL, MVT::i32);
4025
4026 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4027 if (!Offset)
4028 break;
4029
4030 SDValue BitsFrom = N->getOperand(0);
4031 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4032
4033 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4034
4035 if (OffsetVal == 0) {
4036 // This is already sign / zero extended, so try to fold away extra BFEs.
4037 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4038
4039 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4040 if (OpSignBits >= SignBits)
4041 return BitsFrom;
4042
4043 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4044 if (Signed) {
4045 // This is a sign_extend_inreg. Replace it to take advantage of existing
4046 // DAG Combines. If not eliminated, we will match back to BFE during
4047 // selection.
4048
4049 // TODO: The sext_inreg of extended types ends, although we could
4050 // handle them in a single BFE.
4051 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4052 DAG.getValueType(SmallVT));
4053 }
4054
4055 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4056 }
4057
4058 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4059 if (Signed) {
4060 return constantFoldBFE<int32_t>(DAG,
4061 CVal->getSExtValue(),
4062 OffsetVal,
4063 WidthVal,
4064 DL);
4065 }
4066
4067 return constantFoldBFE<uint32_t>(DAG,
4068 CVal->getZExtValue(),
4069 OffsetVal,
4070 WidthVal,
4071 DL);
4072 }
4073
4074 if ((OffsetVal + WidthVal) >= 32 &&
4075 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4076 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4077 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4078 BitsFrom, ShiftVal);
4079 }
4080
4081 if (BitsFrom.hasOneUse()) {
4082 APInt Demanded = APInt::getBitsSet(32,
4083 OffsetVal,
4084 OffsetVal + WidthVal);
4085
4086 KnownBits Known;
4087 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4088 !DCI.isBeforeLegalizeOps());
4089 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4090 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4091 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4092 DCI.CommitTargetLoweringOpt(TLO);
4093 }
4094 }
4095
4096 break;
4097 }
4098 case ISD::LOAD:
4099 return performLoadCombine(N, DCI);
4100 case ISD::STORE:
4101 return performStoreCombine(N, DCI);
4102 case AMDGPUISD::RCP:
4103 case AMDGPUISD::RCP_IFLAG:
4104 return performRcpCombine(N, DCI);
4105 case ISD::AssertZext:
4106 case ISD::AssertSext:
4107 return performAssertSZExtCombine(N, DCI);
4108 case ISD::INTRINSIC_WO_CHAIN:
4109 return performIntrinsicWOChainCombine(N, DCI);
4110 }
4111 return SDValue();
4112}
4113
4114//===----------------------------------------------------------------------===//
4115// Helper functions
4116//===----------------------------------------------------------------------===//
4117
4118SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4119 const TargetRegisterClass *RC,
4120 unsigned Reg, EVT VT,
4121 const SDLoc &SL,
4122 bool RawReg) const {
4123 MachineFunction &MF = DAG.getMachineFunction();
4124 MachineRegisterInfo &MRI = MF.getRegInfo();
4125 unsigned VReg;
4126
4127 if (!MRI.isLiveIn(Reg)) {
4128 VReg = MRI.createVirtualRegister(RC);
4129 MRI.addLiveIn(Reg, VReg);
4130 } else {
4131 VReg = MRI.getLiveInVirtReg(Reg);
4132 }
4133
4134 if (RawReg)
4135 return DAG.getRegister(VReg, VT);
4136
4137 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4138}
4139
4140// This may be called multiple times, and nothing prevents creating multiple
4141// objects at the same offset. See if we already defined this object.
4142static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4143 int64_t Offset) {
4144 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4145 if (MFI.getObjectOffset(I) == Offset) {
4146 assert(MFI.getObjectSize(I) == Size);
4147 return I;
4148 }
4149 }
4150
4151 return MFI.CreateFixedObject(Size, Offset, true);
4152}
4153
4154SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4155 EVT VT,
4156 const SDLoc &SL,
4157 int64_t Offset) const {
4158 MachineFunction &MF = DAG.getMachineFunction();
4159 MachineFrameInfo &MFI = MF.getFrameInfo();
4160 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4161
4162 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4163 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4164
4165 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
4166 MachineMemOperand::MODereferenceable |
4167 MachineMemOperand::MOInvariant);
4168}
4169
4170SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4171 const SDLoc &SL,
4172 SDValue Chain,
4173 SDValue ArgVal,
4174 int64_t Offset) const {
4175 MachineFunction &MF = DAG.getMachineFunction();
4176 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4177
4178 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4179 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
4180 MachineMemOperand::MODereferenceable);
4181 return Store;
4182}
4183
4184SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4185 const TargetRegisterClass *RC,
4186 EVT VT, const SDLoc &SL,
4187 const ArgDescriptor &Arg) const {
4188 assert(Arg && "Attempting to load missing argument");
Step 1: Assuming the condition is true
Step 2: '?' condition is true
4189
4190 SDValue V = Arg.isRegister() ?
Step 3: '?' condition is true
4191 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4192 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4193
4194 if (!Arg.isMasked())
Step 4: Calling 'ArgDescriptor::isMasked'
Step 7: Returning from 'ArgDescriptor::isMasked'
Step 8: Taking false branch
4195 return V;
4196
4197 unsigned Mask = Arg.getMask();
4198 unsigned Shift = countTrailingZeros<unsigned>(Mask);
9
Calling 'countTrailingZeros<unsigned int>'
16
Returning from 'countTrailingZeros<unsigned int>'
17
'Shift' initialized to 32
4199 V = DAG.getNode(ISD::SRL, SL, VT, V,
4200 DAG.getShiftAmountConstant(Shift, VT, SL));
4201 return DAG.getNode(ISD::AND, SL, VT, V,
4202 DAG.getConstant(Mask >> Shift, SL, VT));
18
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4203}
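
Note on the warning above: ArgDescriptor::isMasked() (see AMDGPUArgumentUsageInfo.h below) returns true for any mask other than ~0u, including an all-zero mask, and countTrailingZeros<unsigned>(0) returns 32 under its default ZB_Width behavior (see MathExtras.h below). Right-shifting a 32-bit unsigned value by 32 is therefore undefined on this path. What follows is a minimal standalone sketch of the same arithmetic with a guard that keeps the shift amount in range; the helper name and the assert are illustrative assumptions, not the upstream fix:

    #include <cassert>
    #include <cstdint>

    // Hypothetical helper mirroring the Mask/Shift arithmetic in loadInputValue.
    // Requiring a non-empty mask keeps Shift strictly below 32, so both
    // 'Value >> Shift' and 'Mask >> Shift' stay well defined.
    static uint32_t applyArgMask(uint32_t Value, uint32_t Mask) {
      assert(Mask != 0 && "masked argument with an empty mask");
      unsigned Shift = __builtin_ctz(Mask); // Mask has a set bit, so Shift < 32
      return (Value >> Shift) & (Mask >> Shift);
    }
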
4204
4205uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4206 const MachineFunction &MF, const ImplicitParameter Param) const {
4207 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4208 const AMDGPUSubtarget &ST =
4209 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4210 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4211 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4212 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4213 ExplicitArgOffset;
4214 switch (Param) {
4215 case GRID_DIM:
4216 return ArgOffset;
4217 case GRID_OFFSET:
4218 return ArgOffset + 4;
4219 }
4220 llvm_unreachable("unexpected implicit parameter type");
4221}
4222
4223#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4224
4225const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4226 switch ((AMDGPUISD::NodeType)Opcode) {
4227 case AMDGPUISD::FIRST_NUMBER: break;
4228 // AMDIL DAG nodes
4229 NODE_NAME_CASE(UMUL);
4230 NODE_NAME_CASE(BRANCH_COND);
4231
4232 // AMDGPU DAG nodes
4233 NODE_NAME_CASE(IF)
4234 NODE_NAME_CASE(ELSE)
4235 NODE_NAME_CASE(LOOP)
4236 NODE_NAME_CASE(CALL)
4237 NODE_NAME_CASE(TC_RETURN)
4238 NODE_NAME_CASE(TRAP)
4239 NODE_NAME_CASE(RET_FLAG)
4240 NODE_NAME_CASE(RETURN_TO_EPILOG)
4241 NODE_NAME_CASE(ENDPGM)
4242 NODE_NAME_CASE(DWORDADDR)
4243 NODE_NAME_CASE(FRACT)
4244 NODE_NAME_CASE(SETCC)
4245 NODE_NAME_CASE(SETREG)
4246 NODE_NAME_CASE(DENORM_MODE)
4247 NODE_NAME_CASE(FMA_W_CHAIN)
4248 NODE_NAME_CASE(FMUL_W_CHAIN)
4249 NODE_NAME_CASE(CLAMP)
4250 NODE_NAME_CASE(COS_HW)
4251 NODE_NAME_CASE(SIN_HW)
4252 NODE_NAME_CASE(FMAX_LEGACY)
4253 NODE_NAME_CASE(FMIN_LEGACY)
4254 NODE_NAME_CASE(FMAX3)
4255 NODE_NAME_CASE(SMAX3)
4256 NODE_NAME_CASE(UMAX3)
4257 NODE_NAME_CASE(FMIN3)
4258 NODE_NAME_CASE(SMIN3)
4259 NODE_NAME_CASE(UMIN3)
4260 NODE_NAME_CASE(FMED3)
4261 NODE_NAME_CASE(SMED3)
4262 NODE_NAME_CASE(UMED3)
4263 NODE_NAME_CASE(FDOT2)
4264 NODE_NAME_CASE(URECIP)
4265 NODE_NAME_CASE(DIV_SCALE)
4266 NODE_NAME_CASE(DIV_FMAS)
4267 NODE_NAME_CASE(DIV_FIXUP)
4268 NODE_NAME_CASE(FMAD_FTZ)
4269 NODE_NAME_CASE(TRIG_PREOP)
4270 NODE_NAME_CASE(RCP)
4271 NODE_NAME_CASE(RSQ)
4272 NODE_NAME_CASE(RCP_LEGACY)
4273 NODE_NAME_CASE(RSQ_LEGACY)
4274 NODE_NAME_CASE(RCP_IFLAG)
4275 NODE_NAME_CASE(FMUL_LEGACY)
4276 NODE_NAME_CASE(RSQ_CLAMP)
4277 NODE_NAME_CASE(LDEXP)
4278 NODE_NAME_CASE(FP_CLASS)
4279 NODE_NAME_CASE(DOT4)
4280 NODE_NAME_CASE(CARRY)
4281 NODE_NAME_CASE(BORROW)
4282 NODE_NAME_CASE(BFE_U32)
4283 NODE_NAME_CASE(BFE_I32)
4284 NODE_NAME_CASE(BFI)
4285 NODE_NAME_CASE(BFM)
4286 NODE_NAME_CASE(FFBH_U32)
4287 NODE_NAME_CASE(FFBH_I32)
4288 NODE_NAME_CASE(FFBL_B32)
4289 NODE_NAME_CASE(MUL_U24)
4290 NODE_NAME_CASE(MUL_I24)
4291 NODE_NAME_CASE(MULHI_U24)
4292 NODE_NAME_CASE(MULHI_I24)
4293 NODE_NAME_CASE(MUL_LOHI_U24)
4294 NODE_NAME_CASE(MUL_LOHI_I24)
4295 NODE_NAME_CASE(MAD_U24)
4296 NODE_NAME_CASE(MAD_I24)
4297 NODE_NAME_CASE(MAD_I64_I32)
4298 NODE_NAME_CASE(MAD_U64_U32)
4299 NODE_NAME_CASE(PERM)
4300 NODE_NAME_CASE(TEXTURE_FETCH)
4301 NODE_NAME_CASE(EXPORT)
4302 NODE_NAME_CASE(EXPORT_DONE)
4303 NODE_NAME_CASE(R600_EXPORT)
4304 NODE_NAME_CASE(CONST_ADDRESS)
4305 NODE_NAME_CASE(REGISTER_LOAD)
4306 NODE_NAME_CASE(REGISTER_STORE)
4307 NODE_NAME_CASE(SAMPLE)
4308 NODE_NAME_CASE(SAMPLEB)
4309 NODE_NAME_CASE(SAMPLED)
4310 NODE_NAME_CASE(SAMPLEL)
4311 NODE_NAME_CASE(CVT_F32_UBYTE0)
4312 NODE_NAME_CASE(CVT_F32_UBYTE1)
4313 NODE_NAME_CASE(CVT_F32_UBYTE2)
4314 NODE_NAME_CASE(CVT_F32_UBYTE3)
4315 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4316 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4317 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4318 NODE_NAME_CASE(CVT_PK_I16_I32)
4319 NODE_NAME_CASE(CVT_PK_U16_U32)
4320 NODE_NAME_CASE(FP_TO_FP16)
4321 NODE_NAME_CASE(FP16_ZEXT)
4322 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4323 NODE_NAME_CASE(CONST_DATA_PTR)
4324 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4325 NODE_NAME_CASE(LDS)
4326 NODE_NAME_CASE(KILL)
4327 NODE_NAME_CASE(DUMMY_CHAIN)
4328 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4329 NODE_NAME_CASE(INTERP_P1LL_F16)
4330 NODE_NAME_CASE(INTERP_P1LV_F16)
4331 NODE_NAME_CASE(INTERP_P2_F16)
4332 NODE_NAME_CASE(LOAD_D16_HI)
4333 NODE_NAME_CASE(LOAD_D16_LO)
4334 NODE_NAME_CASE(LOAD_D16_HI_I8)
4335 NODE_NAME_CASE(LOAD_D16_HI_U8)
4336 NODE_NAME_CASE(LOAD_D16_LO_I8)
4337 NODE_NAME_CASE(LOAD_D16_LO_U8)
4338 NODE_NAME_CASE(STORE_MSKOR)
4339 NODE_NAME_CASE(LOAD_CONSTANT)
4340 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4341 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4342 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4343 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4344 NODE_NAME_CASE(DS_ORDERED_COUNT)
4345 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4346 NODE_NAME_CASE(ATOMIC_INC)
4347 NODE_NAME_CASE(ATOMIC_DEC)
4348 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4349 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4350 NODE_NAME_CASE(BUFFER_LOAD)
4351 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4352 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4353 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4354 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4355 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4356 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4357 NODE_NAME_CASE(SBUFFER_LOAD)
4358 NODE_NAME_CASE(BUFFER_STORE)
4359 NODE_NAME_CASE(BUFFER_STORE_BYTE)
4360 NODE_NAME_CASE(BUFFER_STORE_SHORT)
4361 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4362 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4363 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4364 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4365 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4366 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4367 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4368 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4369 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4370 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4371 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4372 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4373 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4374 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4375 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4376 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4377 NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
4378 NODE_NAME_CASE(ATOMIC_PK_FADD)
4379
4380 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4381 }
4382 return nullptr;
4383}
4384
4385SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4386 SelectionDAG &DAG, int Enabled,
4387 int &RefinementSteps,
4388 bool &UseOneConstNR,
4389 bool Reciprocal) const {
4390 EVT VT = Operand.getValueType();
4391
4392 if (VT == MVT::f32) {
4393 RefinementSteps = 0;
4394 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4395 }
4396
4397 // TODO: There is also an f64 rsq instruction, but the documentation is less
4398 // clear on its precision.
4399
4400 return SDValue();
4401}
4402
4403SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4404 SelectionDAG &DAG, int Enabled,
4405 int &RefinementSteps) const {
4406 EVT VT = Operand.getValueType();
4407
4408 if (VT == MVT::f32) {
4409 // Reciprocal, < 1 ulp error.
4410 //
4411 // This reciprocal approximation converges to < 0.5 ulp error with one
4412 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4413
4414 RefinementSteps = 0;
4415 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4416 }
4417
4418 // TODO: There is also an f64 rcp instruction, but the documentation is less
4419 // clear on its precision.
4420
4421 return SDValue();
4422}
4423
4424void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4425 const SDValue Op, KnownBits &Known,
4426 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4427
4428 Known.resetAll(); // Don't know anything.
4429
4430 unsigned Opc = Op.getOpcode();
4431
4432 switch (Opc) {
4433 default:
4434 break;
4435 case AMDGPUISD::CARRY:
4436 case AMDGPUISD::BORROW: {
4437 Known.Zero = APInt::getHighBitsSet(32, 31);
4438 break;
4439 }
4440
4441 case AMDGPUISD::BFE_I32:
4442 case AMDGPUISD::BFE_U32: {
4443 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4444 if (!CWidth)
4445 return;
4446
4447 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4448
4449 if (Opc == AMDGPUISD::BFE_U32)
4450 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4451
4452 break;
4453 }
4454 case AMDGPUISD::FP_TO_FP16:
4455 case AMDGPUISD::FP16_ZEXT: {
4456 unsigned BitWidth = Known.getBitWidth();
4457
4458 // High bits are zero.
4459 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4460 break;
4461 }
4462 case AMDGPUISD::MUL_U24:
4463 case AMDGPUISD::MUL_I24: {
4464 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4465 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4466 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4467 RHSKnown.countMinTrailingZeros();
4468 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4469 // Skip extra check if all bits are known zeros.
4470 if (TrailZ >= 32)
4471 break;
4472
4473 // Truncate to 24 bits.
4474 LHSKnown = LHSKnown.trunc(24);
4475 RHSKnown = RHSKnown.trunc(24);
4476
4477 if (Opc == AMDGPUISD::MUL_I24) {
4478 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
4479 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
4480 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4481 if (MaxValBits >= 32)
4482 break;
4483 bool LHSNegative = LHSKnown.isNegative();
4484 bool LHSNonNegative = LHSKnown.isNonNegative();
4485 bool LHSPositive = LHSKnown.isStrictlyPositive();
4486 bool RHSNegative = RHSKnown.isNegative();
4487 bool RHSNonNegative = RHSKnown.isNonNegative();
4488 bool RHSPositive = RHSKnown.isStrictlyPositive();
4489
4490 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4491 Known.Zero.setHighBits(32 - MaxValBits);
4492 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4493 Known.One.setHighBits(32 - MaxValBits);
4494 } else {
4495 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
4496 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
4497 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4498 if (MaxValBits >= 32)
4499 break;
4500 Known.Zero.setHighBits(32 - MaxValBits);
4501 }
4502 break;
4503 }
4504 case AMDGPUISD::PERM: {
4505 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4506 if (!CMask)
4507 return;
4508
4509 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4510 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4511 unsigned Sel = CMask->getZExtValue();
4512
4513 for (unsigned I = 0; I < 32; I += 8) {
4514 unsigned SelBits = Sel & 0xff;
4515 if (SelBits < 4) {
4516 SelBits *= 8;
4517 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4518 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4519 } else if (SelBits < 7) {
4520 SelBits = (SelBits & 3) * 8;
4521 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4522 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4523 } else if (SelBits == 0x0c) {
4524 Known.Zero |= 0xFFull << I;
4525 } else if (SelBits > 0x0c) {
4526 Known.One |= 0xFFull << I;
4527 }
4528 Sel >>= 8;
4529 }
4530 break;
4531 }
4532 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4533 Known.Zero.setHighBits(24);
4534 break;
4535 }
4536 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4537 Known.Zero.setHighBits(16);
4538 break;
4539 }
4540 case AMDGPUISD::LDS: {
4541 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4542 unsigned Align = GA->getGlobal()->getAlignment();
4543
4544 Known.Zero.setHighBits(16);
4545 if (Align)
4546 Known.Zero.setLowBits(Log2_32(Align));
4547 break;
4548 }
4549 case ISD::INTRINSIC_WO_CHAIN: {
4550 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4551 switch (IID) {
4552 case Intrinsic::amdgcn_mbcnt_lo:
4553 case Intrinsic::amdgcn_mbcnt_hi: {
4554 const GCNSubtarget &ST =
4555 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4556 // These return at most the wavefront size - 1.
4557 unsigned Size = Op.getValueType().getSizeInBits();
4558 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4559 break;
4560 }
4561 default:
4562 break;
4563 }
4564 }
4565 }
4566}
4567
4568unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4569 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4570 unsigned Depth) const {
4571 switch (Op.getOpcode()) {
4572 case AMDGPUISD::BFE_I32: {
4573 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4574 if (!Width)
4575 return 1;
4576
4577 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4578 if (!isNullConstant(Op.getOperand(1)))
4579 return SignBits;
4580
4581 // TODO: Could probably figure something out with non-0 offsets.
4582 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4583 return std::max(SignBits, Op0SignBits);
4584 }
4585
4586 case AMDGPUISD::BFE_U32: {
4587 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4588 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4589 }
4590
4591 case AMDGPUISD::CARRY:
4592 case AMDGPUISD::BORROW:
4593 return 31;
4594 case AMDGPUISD::BUFFER_LOAD_BYTE:
4595 return 25;
4596 case AMDGPUISD::BUFFER_LOAD_SHORT:
4597 return 17;
4598 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4599 return 24;
4600 case AMDGPUISD::BUFFER_LOAD_USHORT:
4601 return 16;
4602 case AMDGPUISD::FP_TO_FP16:
4603 case AMDGPUISD::FP16_ZEXT:
4604 return 16;
4605 default:
4606 return 1;
4607 }
4608}
4609
4610bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4611 const SelectionDAG &DAG,
4612 bool SNaN,
4613 unsigned Depth) const {
4614 unsigned Opcode = Op.getOpcode();
4615 switch (Opcode) {
4616 case AMDGPUISD::FMIN_LEGACY:
4617 case AMDGPUISD::FMAX_LEGACY: {
4618 if (SNaN)
4619 return true;
4620
4621 // TODO: Can check no nans on one of the operands for each one, but which
4622 // one?
4623 return false;
4624 }
4625 case AMDGPUISD::FMUL_LEGACY:
4626 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4627 if (SNaN)
4628 return true;
4629 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4630 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4631 }
4632 case AMDGPUISD::FMED3:
4633 case AMDGPUISD::FMIN3:
4634 case AMDGPUISD::FMAX3:
4635 case AMDGPUISD::FMAD_FTZ: {
4636 if (SNaN)
4637 return true;
4638 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4639 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4640 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4641 }
4642 case AMDGPUISD::CVT_F32_UBYTE0:
4643 case AMDGPUISD::CVT_F32_UBYTE1:
4644 case AMDGPUISD::CVT_F32_UBYTE2:
4645 case AMDGPUISD::CVT_F32_UBYTE3:
4646 return true;
4647
4648 case AMDGPUISD::RCP:
4649 case AMDGPUISD::RSQ:
4650 case AMDGPUISD::RCP_LEGACY:
4651 case AMDGPUISD::RSQ_LEGACY:
4652 case AMDGPUISD::RSQ_CLAMP: {
4653 if (SNaN)
4654 return true;
4655
4656 // TODO: Need an is-known-positive check.
4657 return false;
4658 }
4659 case AMDGPUISD::LDEXP:
4660 case AMDGPUISD::FRACT: {
4661 if (SNaN)
4662 return true;
4663 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4664 }
4665 case AMDGPUISD::DIV_SCALE:
4666 case AMDGPUISD::DIV_FMAS:
4667 case AMDGPUISD::DIV_FIXUP:
4668 case AMDGPUISD::TRIG_PREOP:
4669 // TODO: Refine on operands.
4670 return SNaN;
4671 case AMDGPUISD::SIN_HW:
4672 case AMDGPUISD::COS_HW: {
4673 // TODO: Need check for infinity
4674 return SNaN;
4675 }
4676 case ISD::INTRINSIC_WO_CHAIN: {
4677 unsigned IntrinsicID
4678 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4679 // TODO: Handle more intrinsics
4680 switch (IntrinsicID) {
4681 case Intrinsic::amdgcn_cubeid:
4682 return true;
4683
4684 case Intrinsic::amdgcn_frexp_mant: {
4685 if (SNaN)
4686 return true;
4687 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4688 }
4689 case Intrinsic::amdgcn_cvt_pkrtz: {
4690 if (SNaN)
4691 return true;
4692 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4693 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4694 }
4695 case Intrinsic::amdgcn_fdot2:
4696 // TODO: Refine on operand
4697 return SNaN;
4698 default:
4699 return false;
4700 }
4701 }
4702 default:
4703 return false;
4704 }
4705}
4706
4707TargetLowering::AtomicExpansionKind
4708AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4709 switch (RMW->getOperation()) {
4710 case AtomicRMWInst::Nand:
4711 case AtomicRMWInst::FAdd:
4712 case AtomicRMWInst::FSub:
4713 return AtomicExpansionKind::CmpXChg;
4714 default:
4715 return AtomicExpansionKind::None;
4716 }
4717}

/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
5
Assuming the condition is true
6
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
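
The assumption recorded at steps 5 and 6 above hinges on this comparison: isMasked() only tests Mask against ~0u, so a mask of 0 still reports as masked. Below is a tiny standalone check of that property, using a stripped-down stand-in for the struct; the values are illustrative only:

    #include <cassert>

    // Minimal stand-in for the masking part of ArgDescriptor: anything other
    // than ~0u, including an empty mask of 0, counts as masked.
    struct MaskOnly {
      unsigned Mask;
      bool isMasked() const { return Mask != ~0u; }
    };

    int main() {
      assert(!MaskOnly{~0u}.isMasked()); // the default ~0u mask: not masked
      assert(MaskOnly{0u}.isMasked());   // an empty mask: still masked
    }
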
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181 assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include "llvm/Support/SwapByteOrder.h"
18#include <algorithm>
19#include <cassert>
20#include <climits>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
10.1
'ZB' is not equal to ZB_Undefined
11
Assuming 'Val' is equal to 0
12
Taking true branch
117 return 32;
13
Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
10
Calling 'TrailingZerosCounter::count'
14
Returning from 'TrailingZerosCounter::count'
15
Returning the value 32
161}
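
The value 32 recorded at steps 13, 15, and 17 of the path comes from the default argument above: with ZB_Width, a zero input yields the type's full bit width rather than a shift-safe index. A short standalone illustration, assuming a translation unit built against this header:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    int main() {
      // ZB_Width (the default): a zero input returns the bit width, which is
      // exactly the value that later feeds the 32-bit right shift at line 4202.
      assert(llvm::countTrailingZeros<unsigned>(0u) == 32);
      // A non-zero input returns the index of the lowest set bit.
      assert(llvm::countTrailingZeros<unsigned>(0x8u) == 3);
    }
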
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
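
In contrast to the shift flagged at line 4202, this helper shows the header's own guarded pattern for a potentially full-width shift: the N == 0 case is special-cased, so the shift that does execute always uses an amount strictly below the bit width. A brief usage check, assuming a translation unit built against this header:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    #include <cstdint>

    int main() {
      // Both boundary cases are well defined: the shift is skipped for N == 0
      // and uses an in-range amount (Bits - N == 0) for N == 32.
      assert(llvm::maskTrailingOnes<uint32_t>(0) == 0u);
      assert(llvm::maskTrailingOnes<uint32_t>(32) == ~uint32_t(0));
      assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu);
    }
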
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315// NOTE: The following support functions use the _32/_64 extensions instead of
316// type overloading so that signed and unsigned integers can be used without
317// ambiguity.
318
319/// Return the high 32 bits of a 64 bit value.
320constexpr inline uint32_t Hi_32(uint64_t Value) {
321 return static_cast<uint32_t>(Value >> 32);
322}
323
324/// Return the low 32 bits of a 64 bit value.
325constexpr inline uint32_t Lo_32(uint64_t Value) {
326 return static_cast<uint32_t>(Value);
327}
328
329/// Make a 64-bit integer from a high / low pair of 32-bit integers.
330constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
331 return ((uint64_t)High << 32) | (uint64_t)Low;
332}
333
334/// Checks if an integer fits into the given bit width.
335template <unsigned N> constexpr inline bool isInt(int64_t x) {
336 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
337}
338// Template specializations to get better code for common cases.
339template <> constexpr inline bool isInt<8>(int64_t x) {
340 return static_cast<int8_t>(x) == x;
341}
342template <> constexpr inline bool isInt<16>(int64_t x) {
343 return static_cast<int16_t>(x) == x;
344}
345template <> constexpr inline bool isInt<32>(int64_t x) {
346 return static_cast<int32_t>(x) == x;
347}
348
349/// Checks if a signed integer is an N bit number shifted left by S.
350template <unsigned N, unsigned S>
351constexpr inline bool isShiftedInt(int64_t x) {
352 static_assert(
353 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
354 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
355 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
356}
357
358/// Checks if an unsigned integer fits into the given bit width.
359///
360/// This is written as two functions rather than as simply
361///
362/// return N >= 64 || X < (UINT64_C(1) << N);
363///
364/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
365/// left too many places.
366template <unsigned N>
367constexpr inline typename std::enable_if<(N < 64), bool>::type
368isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline typename std::enable_if<N >= 64, bool>::type
374isUInt(uint64_t X) {
375 return true;
376}
377
378// Template specializations to get better code for common cases.
379template <> constexpr inline bool isUInt<8>(uint64_t x) {
380 return static_cast<uint8_t>(x) == x;
381}
382template <> constexpr inline bool isUInt<16>(uint64_t x) {
383 return static_cast<uint16_t>(x) == x;
384}
385template <> constexpr inline bool isUInt<32>(uint64_t x) {
386 return static_cast<uint32_t>(x) == x;
387}
388
389/// Checks if an unsigned integer is an N bit number shifted left by S.
390template <unsigned N, unsigned S>
391constexpr inline bool isShiftedUInt(uint64_t x) {
392 static_assert(
393 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
394 static_assert(N + S <= 64,
395 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
396 // Per the two static_asserts above, S must be strictly less than 64. So
397 // 1 << S is not undefined behavior.
398 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
399}
400
401/// Gets the maximum value for a N-bit unsigned integer.
402inline uint64_t maxUIntN(uint64_t N) {
403 assert(N > 0 && N <= 64 && "integer width out of range");
404
405 // uint64_t(1) << 64 is undefined behavior, so we can't do
406 // (uint64_t(1) << N) - 1
407 // without checking first that N != 64. But this works and doesn't have a
408 // branch.
409 return UINT64_MAX >> (64 - N);
410}
411
412/// Gets the minimum value for a N-bit signed integer.
413inline int64_t minIntN(int64_t N) {
414 assert(N > 0 && N <= 64 && "integer width out of range");
415
416 return -(UINT64_C(1)<<(N-1));
417}
418
419/// Gets the maximum value for a N-bit signed integer.
420inline int64_t maxIntN(int64_t N) {
421 assert(N > 0 && N <= 64 && "integer width out of range");
422
423 // This relies on two's complement wraparound when N == 64, so we convert to
424 // int64_t only at the very end to avoid UB.
425 return (UINT64_C(1) << (N - 1)) - 1;
426}
427
428/// Checks if an unsigned integer fits into the given (dynamic) bit width.
429inline bool isUIntN(unsigned N, uint64_t x) {
430 return N >= 64 || x <= maxUIntN(N);
431}
432
433/// Checks if an signed integer fits into the given (dynamic) bit width.
434inline bool isIntN(unsigned N, int64_t x) {
435 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
436}
437
438/// Return true if the argument is a non-empty sequence of ones starting at the
439/// least significant bit with the remainder zero (32 bit version).
440/// Ex. isMask_32(0x0000FFFFU) == true.
441constexpr inline bool isMask_32(uint32_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument is a non-empty sequence of ones starting at the
446/// least significant bit with the remainder zero (64 bit version).
447constexpr inline bool isMask_64(uint64_t Value) {
448 return Value && ((Value + 1) & Value) == 0;
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
453constexpr inline bool isShiftedMask_32(uint32_t Value) {
454 return Value && isMask_32((Value - 1) | Value);
455}
456
457/// Return true if the argument contains a non-empty sequence of ones with the
458/// remainder zero (64 bit version.)
459constexpr inline bool isShiftedMask_64(uint64_t Value) {
460 return Value && isMask_64((Value - 1) | Value);
461}
462
463/// Return true if the argument is a power of two > 0.
464/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
465constexpr inline bool isPowerOf2_32(uint32_t Value) {
466 return Value && !(Value & (Value - 1));
467}
468
469/// Return true if the argument is a power of two > 0 (64 bit edition.)
470constexpr inline bool isPowerOf2_64(uint64_t Value) {
471 return Value && !(Value & (Value - 1));
472}
473
474/// Return a byte-swapped representation of the 16-bit argument.
475inline uint16_t ByteSwap_16(uint16_t Value) {
476 return sys::SwapByteOrder_16(Value);
477}
478
479/// Return a byte-swapped representation of the 32-bit argument.
480inline uint32_t ByteSwap_32(uint32_t Value) {
481 return sys::SwapByteOrder_32(Value);
482}
483
484/// Return a byte-swapped representation of the 64-bit argument.
485inline uint64_t ByteSwap_64(uint64_t Value) {
486 return sys::SwapByteOrder_64(Value);
487}
488
489/// Count the number of ones from the most significant bit to the first
490/// zero bit.
491///
492/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countLeadingZeros<T>(~Value, ZB);
503}
504
505/// Count the number of ones from the least significant bit to the first
506/// zero bit.
507///
508/// Ex. countTrailingOnes(0x00FF00FF) == 8.
509/// Only unsigned integral types are allowed.
510///
511/// \param ZB the behavior on an input of all ones. Only ZB_Width and
512/// ZB_Undefined are valid arguments.
513template <typename T>
514unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
515 static_assert(std::numeric_limits<T>::is_integer &&
516 !std::numeric_limits<T>::is_signed,
517 "Only unsigned integral types are allowed.");
518 return countTrailingZeros<T>(~Value, ZB);
519}
520
521namespace detail {
522template <typename T, std::size_t SizeOfT> struct PopulationCounter {
523 static unsigned count(T Value) {
524 // Generic version, forward to 32 bits.
525 static_assert(SizeOfT <= 4, "Not implemented!");
526#if defined(__GNUC__)
527 return __builtin_popcount(Value);
528#else
529 uint32_t v = Value;
530 v = v - ((v >> 1) & 0x55555555);
531 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
532 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
533#endif
534 }
535};
536
537template <typename T> struct PopulationCounter<T, 8> {
538 static unsigned count(T Value) {
539#if defined(__GNUC__)
540 return __builtin_popcountll(Value);
541#else
542 uint64_t v = Value;
543 v = v - ((v >> 1) & 0x5555555555555555ULL);
544 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
545 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
546 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
547#endif
548 }
549};
550} // namespace detail
551
552/// Count the number of set bits in a value.
553/// Ex. countPopulation(0xF000F000) = 8
554/// Returns 0 if the word is zero.
555template <typename T>
556inline unsigned countPopulation(T Value) {
557 static_assert(std::numeric_limits<T>::is_integer &&
558 !std::numeric_limits<T>::is_signed,
559 "Only unsigned integral types are allowed.");
560 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
561}
562
563/// Compile time Log2.
564/// Valid only for positive powers of two.
565template <size_t kValue> constexpr inline size_t CTLog2() {
566 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
567 "Value is not a valid power of 2");
568 return 1 + CTLog2<kValue / 2>();
569}
570
571template <> constexpr inline size_t CTLog2<1>() { return 0; }
572
573/// Return the log base 2 of the specified value.
574inline double Log2(double Value) {
575#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
576 return __builtin_log(Value) / __builtin_log(2.0);
577#else
578 return log2(Value);
579#endif
580}
581
582/// Return the floor log base 2 of the specified value, -1 if the value is zero.
583/// (32 bit edition.)
584/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
585inline unsigned Log2_32(uint32_t Value) {
586 return 31 - countLeadingZeros(Value);
587}
588
589/// Return the floor log base 2 of the specified value, -1 if the value is zero.
590/// (64 bit edition.)
591inline unsigned Log2_64(uint64_t Value) {
592 return 63 - countLeadingZeros(Value);
593}
594
595/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
596/// (32 bit edition).
597/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
598inline unsigned Log2_32_Ceil(uint32_t Value) {
599 return 32 - countLeadingZeros(Value - 1);
600}
601
602/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
603/// (64 bit edition.)
604inline unsigned Log2_64_Ceil(uint64_t Value) {
605 return 64 - countLeadingZeros(Value - 1);
606}
607
608/// Return the greatest common divisor of the values using Euclid's algorithm.
609template <typename T>
610inline T greatestCommonDivisor(T A, T B) {
611 while (B) {
612 T Tmp = B;
613 B = A % B;
614 A = Tmp;
615 }
616 return A;
617}
618
619inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
620 return greatestCommonDivisor<uint64_t>(A, B);
621}
622
623/// This function takes a 64-bit integer and returns the bit equivalent double.
624inline double BitsToDouble(uint64_t Bits) {
625 double D;
626 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
627 memcpy(&D, &Bits, sizeof(Bits));
628 return D;
629}
630
631/// This function takes a 32-bit integer and returns the bit equivalent float.
632inline float BitsToFloat(uint32_t Bits) {
633 float F;
634 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
635 memcpy(&F, &Bits, sizeof(Bits));
636 return F;
637}
638
639/// This function takes a double and returns the bit equivalent 64-bit integer.
640/// Note that copying doubles around changes the bits of NaNs on some hosts,
641/// notably x86, so this routine cannot be used if these bits are needed.
642inline uint64_t DoubleToBits(double Double) {
643 uint64_t Bits;
644 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
645 memcpy(&Bits, &Double, sizeof(Double));
646 return Bits;
647}
648
649/// This function takes a float and returns the bit equivalent 32-bit integer.
650/// Note that copying floats around changes the bits of NaNs on some hosts,
651/// notably x86, so this routine cannot be used if these bits are needed.
652inline uint32_t FloatToBits(float Float) {
653 uint32_t Bits;
654 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
655 memcpy(&Bits, &Float, sizeof(Float));
656 return Bits;
657}
658
659/// A and B are either alignments or offsets. Return the minimum alignment that
660/// may be assumed after adding the two together.
661constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
662 // The largest power of 2 that divides both A and B.
663 //
664 // Replace "-Value" by "1+~Value" in the following commented code to avoid
665 // MSVC warning C4146
666 // return (A | B) & -(A | B);
667 return (A | B) & (1 + ~(A | B));
668}
669
670/// Returns the next power of two (in 64-bits) that is strictly greater than A.
671/// Returns zero on overflow.
672inline uint64_t NextPowerOf2(uint64_t A) {
673 A |= (A >> 1);
674 A |= (A >> 2);
675 A |= (A >> 4);
676 A |= (A >> 8);
677 A |= (A >> 16);
678 A |= (A >> 32);
679 return A + 1;
680}
681
682/// Returns the power of two which is less than or equal to the given value.
683/// Essentially, it is a floor operation across the domain of powers of two.
684inline uint64_t PowerOf2Floor(uint64_t A) {
685 if (!A) return 0;
686 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
687}
688
689/// Returns the power of two which is greater than or equal to the given value.
690/// Essentially, it is a ceil operation across the domain of powers of two.
691inline uint64_t PowerOf2Ceil(uint64_t A) {
692 if (!A)
693 return 0;
694 return NextPowerOf2(A - 1);
695}
696
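An illustrative sketch for the power-of-two floor/ceil helpers (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include "llvm/Support/MathExtras.h"

  void powerOf2FloorCeilExamples() {
    assert(llvm::PowerOf2Floor(5) == 4);
    assert(llvm::PowerOf2Floor(8) == 8);
    assert(llvm::PowerOf2Ceil(5) == 8);
    assert(llvm::PowerOf2Ceil(8) == 8);
  }
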
697/// Returns the next integer (mod 2**64) that is greater than or equal to
698/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
699///
700/// If non-zero \p Skew is specified, the return value will be a minimal
701/// integer that is greater than or equal to \p Value and equal to
702/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
703/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
704///
705/// Examples:
706/// \code
707/// alignTo(5, 8) = 8
708/// alignTo(17, 8) = 24
709/// alignTo(~0LL, 8) = 0
710/// alignTo(321, 255) = 510
711///
712/// alignTo(5, 8, 7) = 7
713/// alignTo(17, 8, 1) = 17
714/// alignTo(~0LL, 8, 3) = 3
715/// alignTo(321, 255, 42) = 552
716/// \endcode
717inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
718 assert(Align != 0u && "Align can't be 0.");
719 Skew %= Align;
720 return (Value + Align - 1 - Skew) / Align * Align + Skew;
721}
722
723/// Returns the next integer (mod 2**64) that is greater than or equal to
724/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
725template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
726 static_assert(Align != 0u, "Align must be non-zero");
727 return (Value + Align - 1) / Align * Align;
728}
729
730/// Returns the integer ceil(Numerator / Denominator).
731inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
732 return alignTo(Numerator, Denominator) / Denominator;
733}
734
735/// Returns the integer nearest(Numerator / Denominator).
736inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
737 return (Numerator + (Denominator / 2)) / Denominator;
738}
739
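An illustrative sketch for divideCeil and divideNearest (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include "llvm/Support/MathExtras.h"

  void divideExamples() {
    assert(llvm::divideCeil(7, 2) == 4);
    assert(llvm::divideCeil(8, 2) == 4);
    assert(llvm::divideNearest(7, 3) == 2);  // 7/3 is about 2.33
    assert(llvm::divideNearest(8, 3) == 3);  // 8/3 is about 2.67
  }
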
740/// Returns the largest uint64_t that is less than or equal to \p Value and is
741/// \p Skew mod \p Align. \p Align must be non-zero.
742inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
743 assert(Align != 0u && "Align can't be 0.");
744 Skew %= Align;
745 return (Value - Skew) / Align * Align + Skew;
746}
747
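An illustrative sketch for alignDown, including the skewed form (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include "llvm/Support/MathExtras.h"

  void alignDownExamples() {
    assert(llvm::alignDown(17, 8) == 16);
    assert(llvm::alignDown(5, 8) == 0);
    assert(llvm::alignDown(17, 8, 1) == 17);  // largest value <= 17 that is 1 mod 8
  }
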
748/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
749/// Requires 0 < B <= 32.
750template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
751 static_assert(B > 0, "Bit width can't be 0.");
752 static_assert(B <= 32, "Bit width out of range.");
753 return int32_t(X << (32 - B)) >> (32 - B);
754}
755
756/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
757/// Requires 0 < B <= 32.
758inline int32_t SignExtend32(uint32_t X, unsigned B) {
759 assert(B > 0 && "Bit width can't be 0.");
760 assert(B <= 32 && "Bit width out of range.");
761 return int32_t(X << (32 - B)) >> (32 - B);
762}
763
764/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
765/// Requires 0 < B <= 64.
766template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
767 static_assert(B > 0, "Bit width can't be 0.");
768 static_assert(B <= 64, "Bit width out of range.");
769 return int64_t(x << (64 - B)) >> (64 - B);
770}
771
772/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
773/// Requires 0 < B <= 64.
774inline int64_t SignExtend64(uint64_t X, unsigned B) {
775 assert(B > 0 && "Bit width can't be 0.");
776 assert(B <= 64 && "Bit width out of range.");
777 return int64_t(X << (64 - B)) >> (64 - B);
778}
779
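An illustrative sketch for the sign-extension helpers (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include "llvm/Support/MathExtras.h"

  void signExtendExamples() {
    assert(llvm::SignExtend32<4>(0xF) == -1);
    assert(llvm::SignExtend32(0x80, 8) == -128);
    assert(llvm::SignExtend64<16>(0x8000) == -32768);
    assert(llvm::SignExtend64(0x7F, 8) == 127);
  }
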
780/// Subtract two unsigned integers, X and Y, of type T and return the absolute
781/// value of the result.
782template <typename T>
783typename std::enable_if<std::is_unsigned<T>::value, T>::type
784AbsoluteDifference(T X, T Y) {
785 return std::max(X, Y) - std::min(X, Y);
786}
787
788/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
789/// maximum representable value of T on overflow. ResultOverflowed indicates if
790/// the result is larger than the maximum representable value of type T.
791template <typename T>
792typename std::enable_if<std::is_unsigned<T>::value, T>::type
793SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
794 bool Dummy;
795 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
796 // Hacker's Delight, p. 29
797 T Z = X + Y;
798 Overflowed = (Z < X || Z < Y);
799 if (Overflowed)
800 return std::numeric_limits<T>::max();
801 else
802 return Z;
803}
804
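An illustrative sketch for SaturatingAdd with uint8_t (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  void saturatingAddExamples() {
    bool Overflowed = false;
    assert(llvm::SaturatingAdd<uint8_t>(100, 100, &Overflowed) == 200 && !Overflowed);
    assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 && Overflowed);
  }
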
805/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
806/// maximum representable value of T on overflow. ResultOverflowed indicates if
807/// the result is larger than the maximum representable value of type T.
808template <typename T>
809typename std::enable_if<std::is_unsigned<T>::value, T>::type
810SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
811 bool Dummy;
812 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
813
814 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
815 // because it fails for uint16_t (where multiplication can have undefined
816 // behavior due to promotion to int), and requires a division in addition
817 // to the multiplication.
818
819 Overflowed = false;
820
821 // Log2(Z) would be either Log2Z or Log2Z + 1.
822 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
823 // will necessarily be less than Log2Max as desired.
824 int Log2Z = Log2_64(X) + Log2_64(Y);
825 const T Max = std::numeric_limits<T>::max();
826 int Log2Max = Log2_64(Max);
827 if (Log2Z < Log2Max) {
828 return X * Y;
829 }
830 if (Log2Z > Log2Max) {
831 Overflowed = true;
832 return Max;
833 }
834
835 // We're going to use the top bit, and maybe overflow one
836 // bit past it. Multiply all but the bottom bit then add
837 // that on at the end.
838 T Z = (X >> 1) * Y;
839 if (Z & ~(Max >> 1)) {
840 Overflowed = true;
841 return Max;
842 }
843 Z <<= 1;
844 if (X & 1)
845 return SaturatingAdd(Z, Y, ResultOverflowed);
846
847 return Z;
848}
849
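An illustrative sketch for SaturatingMultiply with uint8_t (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  void saturatingMultiplyExamples() {
    bool Overflowed = false;
    assert(llvm::SaturatingMultiply<uint8_t>(10, 12, &Overflowed) == 120 && !Overflowed);
    assert(llvm::SaturatingMultiply<uint8_t>(16, 20, &Overflowed) == 255 && Overflowed);
  }
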
850/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
851/// the product. Clamp the result to the maximum representable value of T on
852/// overflow. ResultOverflowed indicates if the result is larger than the
853/// maximum representable value of type T.
854template <typename T>
855typename std::enable_if<std::is_unsigned<T>::value, T>::type
856SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
857 bool Dummy;
858 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
859
860 T Product = SaturatingMultiply(X, Y, &Overflowed);
861 if (Overflowed)
862 return Product;
863
864 return SaturatingAdd(A, Product, &Overflowed);
865}
866
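An illustrative sketch for SaturatingMultiplyAdd with uint8_t (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  void saturatingMultiplyAddExamples() {
    bool Overflowed = false;
    assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 10, 50, &Overflowed) == 150 && !Overflowed);
    assert(llvm::SaturatingMultiplyAdd<uint8_t>(20, 20, 1, &Overflowed) == 255 && Overflowed);
  }
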
867/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
868extern const float huge_valf;
869
870
871/// Add two signed integers, computing the two's complement truncated result,
872/// returning true if overflow occurred.
873template <typename T>
874typename std::enable_if<std::is_signed<T>::value, T>::type
875AddOverflow(T X, T Y, T &Result) {
876#if __has_builtin(__builtin_add_overflow)
877 return __builtin_add_overflow(X, Y, &Result);
878#else
879 // Perform the unsigned addition.
880 using U = typename std::make_unsigned<T>::type;
881 const U UX = static_cast<U>(X);
882 const U UY = static_cast<U>(Y);
883 const U UResult = UX + UY;
884
885 // Convert to signed.
886 Result = static_cast<T>(UResult);
887
888 // Adding two positive numbers should result in a positive number.
889 if (X > 0 && Y > 0)
890 return Result <= 0;
891 // Adding two negatives should result in a negative number.
892 if (X < 0 && Y < 0)
893 return Result >= 0;
894 return false;
895#endif
896}
897
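An illustrative sketch for AddOverflow (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include <climits>
  #include "llvm/Support/MathExtras.h"

  void addOverflowExamples() {
    int Result = 0;
    assert(!llvm::AddOverflow(1, 2, Result) && Result == 3);
    assert(llvm::AddOverflow(INT_MAX, 1, Result));  // truncated result, overflow reported
  }
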
898/// Subtract two signed integers, computing the two's complement truncated
899/// result, returning true if an overflow occurred.
900template <typename T>
901typename std::enable_if<std::is_signed<T>::value, T>::type
902SubOverflow(T X, T Y, T &Result) {
903#if __has_builtin(__builtin_sub_overflow)
904 return __builtin_sub_overflow(X, Y, &Result);
905#else
906 // Perform the unsigned subtraction.
907 using U = typename std::make_unsigned<T>::type;
908 const U UX = static_cast<U>(X);
909 const U UY = static_cast<U>(Y);
910 const U UResult = UX - UY;
911
912 // Convert to signed.
913 Result = static_cast<T>(UResult);
914
915 // Subtracting a positive number from a negative results in a negative number.
916 if (X <= 0 && Y > 0)
917 return Result >= 0;
918 // Subtracting a negative number from a positive results in a positive number.
919 if (X >= 0 && Y < 0)
920 return Result <= 0;
921 return false;
922#endif
923}
924
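An illustrative sketch for SubOverflow (example code only):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include <climits>
  #include "llvm/Support/MathExtras.h"

  void subOverflowExamples() {
    int Result = 0;
    assert(!llvm::SubOverflow(2, 5, Result) && Result == -3);
    assert(llvm::SubOverflow(INT_MIN, 1, Result));  // truncated result, overflow reported
  }
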
925
926/// Multiply two signed integers, computing the two's complement truncated
927/// result, returning true if an overflow occurred.
928template <typename T>
929typename std::enable_if<std::is_signed<T>::value, T>::type
930MulOverflow(T X, T Y, T &Result) {
931 // Perform the unsigned multiplication on absolute values.
932 using U = typename std::make_unsigned<T>::type;
933 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
934 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
935 const U UResult = UX * UY;
936
937 // Convert to signed.
938 const bool IsNegative = (X < 0) ^ (Y < 0);
939 Result = IsNegative ? (0 - UResult) : UResult;
940
941 // If any of the args was 0, result is 0 and no overflow occurs.
942 if (UX == 0 || UY == 0)
943 return false;
944
945 // UX and UY are in [1, 2^n], where n is the number of digits.
946 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
947 // positive) divided by an argument compares to the other.
948 if (IsNegative)
949 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
950 else
951 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
952}
953
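An illustrative sketch for MulOverflow (example code only; assumes a 32-bit int):

  // Illustrative only; not from the analyzed sources.
  #include <cassert>
  #include "llvm/Support/MathExtras.h"

  void mulOverflowExamples() {
    int Result = 0;
    assert(!llvm::MulOverflow(1000, 1000, Result) && Result == 1000000);
    assert(llvm::MulOverflow(100000, 100000, Result));  // 1e10 does not fit in a 32-bit int
  }
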
954} // End llvm namespace
955
956#endif