Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4231, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
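For context, a minimal standalone sketch of the pattern this diagnostic describes; the names and the guard are illustrative and are not the actual code at line 4231:

    // Undefined: shifting a 32-bit 'unsigned int' by 32 or more is undefined
    // behaviour in C++, which is what the analyzer reports here.
    unsigned shiftRight(unsigned Value, unsigned Amount) {
      return Value >> Amount;            // UB when Amount >= 32
    }

    // One conventional fix: guard the full-width case explicitly (or widen the
    // operand to 64 bits so a shift amount of 32 stays in range).
    unsigned shiftRightSafe(unsigned Value, unsigned Amount) {
      return Amount >= 32 ? 0u : Value >> Amount;
    }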

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347=. -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-09-184146-41876-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUCallLowering.h"
18#include "AMDGPUFrameLowering.h"
19#include "AMDGPUSubtarget.h"
20#include "AMDGPUTargetMachine.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "R600MachineFunctionInfo.h"
23#include "SIInstrInfo.h"
24#include "SIMachineFunctionInfo.h"
25#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26#include "llvm/CodeGen/Analysis.h"
27#include "llvm/CodeGen/CallingConvLower.h"
28#include "llvm/CodeGen/MachineFunction.h"
29#include "llvm/CodeGen/MachineRegisterInfo.h"
30#include "llvm/CodeGen/SelectionDAG.h"
31#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
32#include "llvm/IR/DataLayout.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/Support/KnownBits.h"
35#include "llvm/Support/MathExtras.h"
36using namespace llvm;
37
38#include "AMDGPUGenCallingConv.inc"
39
40static cl::opt<bool> AMDGPUBypassSlowDiv(
41 "amdgpu-bypass-slow-div",
42 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
43 cl::init(true));
44
45// Find a larger type to do a load / store of a vector with.
46EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
47 unsigned StoreSize = VT.getStoreSizeInBits();
48 if (StoreSize <= 32)
49 return EVT::getIntegerVT(Ctx, StoreSize);
50
51 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
52 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
53}
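// A brief worked example of getEquivalentMemType above, with illustrative
// sizes (not taken from a test):
//   StoreSize = 96 (e.g. a <3 x float> store): 96 > 32 and 96 % 32 == 0, so
//   the result is EVT::getVectorVT(Ctx, MVT::i32, 96 / 32) == v3i32.
//   StoreSize = 16: the early return yields EVT::getIntegerVT(Ctx, 16) == i16.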
54
55unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
56 EVT VT = Op.getValueType();
57 KnownBits Known = DAG.computeKnownBits(Op);
58 return VT.getSizeInBits() - Known.countMinLeadingZeros();
59}
60
61unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
62 EVT VT = Op.getValueType();
63
64 // In order for this to be a signed 24-bit value, bit 23, must
65 // be a sign bit.
66 return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
67}
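// A hedged numeric illustration of numBitsUnsigned / numBitsSigned above
// (the known-bits figures are made up):
//   If computeKnownBits shows 10 known leading zeros of a 32-bit value,
//   numBitsUnsigned returns 32 - 10 = 22, so the value fits in an unsigned
//   24-bit operand. If ComputeNumSignBits reports 9 copies of the sign bit,
//   numBitsSigned returns 32 - 9 = 23, i.e. the value is representable in
//   24 signed bits, the kind of property the 24-bit multiply combines later
//   in this file test for.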
68
69AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
70 const AMDGPUSubtarget &STI)
71 : TargetLowering(TM), Subtarget(&STI) {
72 // Lower floating point store/load to integer store/load to reduce the number
73 // of patterns in tablegen.
74 setOperationAction(ISD::LOAD, MVT::f32, Promote);
75 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
76
77 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
78 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
79
80 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
81 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
82
83 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
84 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
85
86 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
87 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
88
89 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
90 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
91
92 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
93 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
94
95 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
96 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
97
98 setOperationAction(ISD::LOAD, MVT::i64, Promote);
99 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
100
101 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
102 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
103
104 setOperationAction(ISD::LOAD, MVT::f64, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
106
107 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
109
110 // There are no 64-bit extloads. These should be done as a 32-bit extload and
111 // an extension to 64-bit.
112 for (MVT VT : MVT::integer_valuetypes()) {
113 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
114 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
115 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
116 }
117
118 for (MVT VT : MVT::integer_valuetypes()) {
119 if (VT == MVT::i64)
120 continue;
121
122 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
123 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
124 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
125 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
126
127 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
128 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
129 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
130 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
131
132 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
133 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
134 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
135 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
136 }
137
138 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
139 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
140 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
141 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
142 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
143 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
144 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
145 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
146 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
147 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
148 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
149 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
150 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
151 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
152 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
153 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
154 }
155
156 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
157 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
158 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
159 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
160 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
161 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
162 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
163
164 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
165 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
166 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
167 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
168
169 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
171 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
172 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
173
174 setOperationAction(ISD::STORE, MVT::f32, Promote);
175 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
176
177 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
178 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
179
180 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
181 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
182
183 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
184 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
185
186 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
187 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
188
189 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
190 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
191
192 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
193 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
194
195 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
196 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
197
198 setOperationAction(ISD::STORE, MVT::i64, Promote);
199 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
200
201 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
202 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
203
204 setOperationAction(ISD::STORE, MVT::f64, Promote);
205 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
206
207 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
208 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
209
210 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
212 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
214
215 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
216 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
217 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
218 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
219
220 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
221 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
222 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
223 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
224 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
225 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
226 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
227
228 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
229 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
230
231 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
232 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
233
234 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
235 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
236
237 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
238 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
239
240
241 setOperationAction(ISD::Constant, MVT::i32, Legal);
242 setOperationAction(ISD::Constant, MVT::i64, Legal);
243 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
244 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
245
246 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
247 setOperationAction(ISD::BRIND, MVT::Other, Expand);
248
249 // This is totally unsupported, just custom lower to produce an error.
250 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
251
252 // Library functions. These default to Expand, but we have instructions
253 // for them.
254 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
255 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
256 setOperationAction(ISD::FPOW, MVT::f32, Legal);
257 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
258 setOperationAction(ISD::FABS, MVT::f32, Legal);
259 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
260 setOperationAction(ISD::FRINT, MVT::f32, Legal);
261 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
262 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
263 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
264
265 setOperationAction(ISD::FROUND, MVT::f32, Custom);
266 setOperationAction(ISD::FROUND, MVT::f64, Custom);
267
268 setOperationAction(ISD::FLOG, MVT::f32, Custom);
269 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
270 setOperationAction(ISD::FEXP, MVT::f32, Custom);
271
272
273 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
274 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
275
276 setOperationAction(ISD::FREM, MVT::f32, Custom);
277 setOperationAction(ISD::FREM, MVT::f64, Custom);
278
279 // Expand to fneg + fadd.
280 setOperationAction(ISD::FSUB, MVT::f64, Expand);
281
282 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
283 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
284 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
285 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
286 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
287 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
288 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
289 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
290 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
291 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
292 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
293 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
294 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
295 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
296 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
297 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
298 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
299 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
300 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
301 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
302 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
303 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
304
305 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
306 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
307 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
308
309 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
310 for (MVT VT : ScalarIntVTs) {
311 // These should use [SU]DIVREM, so set them to expand
312 setOperationAction(ISD::SDIV, VT, Expand);
313 setOperationAction(ISD::UDIV, VT, Expand);
314 setOperationAction(ISD::SREM, VT, Expand);
315 setOperationAction(ISD::UREM, VT, Expand);
316
317 // GPU does not have divrem function for signed or unsigned.
318 setOperationAction(ISD::SDIVREM, VT, Custom);
319 setOperationAction(ISD::UDIVREM, VT, Custom);
320
321 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
322 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
323 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
324
325 setOperationAction(ISD::BSWAP, VT, Expand);
326 setOperationAction(ISD::CTTZ, VT, Expand);
327 setOperationAction(ISD::CTLZ, VT, Expand);
328
329 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
330 setOperationAction(ISD::ADDC, VT, Legal);
331 setOperationAction(ISD::SUBC, VT, Legal);
332 setOperationAction(ISD::ADDE, VT, Legal);
333 setOperationAction(ISD::SUBE, VT, Legal);
334 }
335
336 // The hardware supports 32-bit ROTR, but not ROTL.
337 setOperationAction(ISD::ROTL, MVT::i32, Expand);
338 setOperationAction(ISD::ROTL, MVT::i64, Expand);
339 setOperationAction(ISD::ROTR, MVT::i64, Expand);
340
341 setOperationAction(ISD::MUL, MVT::i64, Expand);
342 setOperationAction(ISD::MULHU, MVT::i64, Expand);
343 setOperationAction(ISD::MULHS, MVT::i64, Expand);
344 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
345 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
346 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
347 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
348 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
349
350 setOperationAction(ISD::SMIN, MVT::i32, Legal);
351 setOperationAction(ISD::UMIN, MVT::i32, Legal);
352 setOperationAction(ISD::SMAX, MVT::i32, Legal);
353 setOperationAction(ISD::UMAX, MVT::i32, Legal);
354
355 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
356 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
357 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
358 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
359
360 static const MVT::SimpleValueType VectorIntTypes[] = {
361 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
362 };
363
364 for (MVT VT : VectorIntTypes) {
365 // Expand the following operations for the current type by default.
366 setOperationAction(ISD::ADD, VT, Expand);
367 setOperationAction(ISD::AND, VT, Expand);
368 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
369 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
370 setOperationAction(ISD::MUL, VT, Expand);
371 setOperationAction(ISD::MULHU, VT, Expand);
372 setOperationAction(ISD::MULHS, VT, Expand);
373 setOperationAction(ISD::OR, VT, Expand);
374 setOperationAction(ISD::SHL, VT, Expand);
375 setOperationAction(ISD::SRA, VT, Expand);
376 setOperationAction(ISD::SRL, VT, Expand);
377 setOperationAction(ISD::ROTL, VT, Expand);
378 setOperationAction(ISD::ROTR, VT, Expand);
379 setOperationAction(ISD::SUB, VT, Expand);
380 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
381 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
382 setOperationAction(ISD::SDIV, VT, Expand);
383 setOperationAction(ISD::UDIV, VT, Expand);
384 setOperationAction(ISD::SREM, VT, Expand);
385 setOperationAction(ISD::UREM, VT, Expand);
386 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
387 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
388 setOperationAction(ISD::SDIVREM, VT, Custom);
389 setOperationAction(ISD::UDIVREM, VT, Expand);
390 setOperationAction(ISD::SELECT, VT, Expand);
391 setOperationAction(ISD::VSELECT, VT, Expand);
392 setOperationAction(ISD::SELECT_CC, VT, Expand);
393 setOperationAction(ISD::XOR, VT, Expand);
394 setOperationAction(ISD::BSWAP, VT, Expand);
395 setOperationAction(ISD::CTPOP, VT, Expand);
396 setOperationAction(ISD::CTTZ, VT, Expand);
397 setOperationAction(ISD::CTLZ, VT, Expand);
398 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
399 setOperationAction(ISD::SETCC, VT, Expand);
400 }
401
402 static const MVT::SimpleValueType FloatVectorTypes[] = {
403 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
404 };
405
406 for (MVT VT : FloatVectorTypes) {
407 setOperationAction(ISD::FABS, VT, Expand);
408 setOperationAction(ISD::FMINNUM, VT, Expand);
409 setOperationAction(ISD::FMAXNUM, VT, Expand);
410 setOperationAction(ISD::FADD, VT, Expand);
411 setOperationAction(ISD::FCEIL, VT, Expand);
412 setOperationAction(ISD::FCOS, VT, Expand);
413 setOperationAction(ISD::FDIV, VT, Expand);
414 setOperationAction(ISD::FEXP2, VT, Expand);
415 setOperationAction(ISD::FEXP, VT, Expand);
416 setOperationAction(ISD::FLOG2, VT, Expand);
417 setOperationAction(ISD::FREM, VT, Expand);
418 setOperationAction(ISD::FLOG, VT, Expand);
419 setOperationAction(ISD::FLOG10, VT, Expand);
420 setOperationAction(ISD::FPOW, VT, Expand);
421 setOperationAction(ISD::FFLOOR, VT, Expand);
422 setOperationAction(ISD::FTRUNC, VT, Expand);
423 setOperationAction(ISD::FMUL, VT, Expand);
424 setOperationAction(ISD::FMA, VT, Expand);
425 setOperationAction(ISD::FRINT, VT, Expand);
426 setOperationAction(ISD::FNEARBYINT, VT, Expand);
427 setOperationAction(ISD::FSQRT, VT, Expand);
428 setOperationAction(ISD::FSIN, VT, Expand);
429 setOperationAction(ISD::FSUB, VT, Expand);
430 setOperationAction(ISD::FNEG, VT, Expand);
431 setOperationAction(ISD::VSELECT, VT, Expand);
432 setOperationAction(ISD::SELECT_CC, VT, Expand);
433 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
434 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
435 setOperationAction(ISD::SETCC, VT, Expand);
436 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
437 }
438
439 // This causes using an unrolled select operation rather than expansion with
440 // bit operations. This is in general better, but the alternative using BFI
441 // instructions may be better if the select sources are SGPRs.
442 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
443 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
444
445 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
446 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
447
448 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
449 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
450
451 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
452 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
453
454 // There are no libcalls of any kind.
455 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
456 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
457
458 setSchedulingPreference(Sched::RegPressure);
459 setJumpIsExpensive(true);
460
461 // FIXME: This is only partially true. If we have to do vector compares, any
462 // SGPR pair can be a condition register. If we have a uniform condition, we
463 // are better off doing SALU operations, where there is only one SCC. For now,
464 // we don't have a way of knowing during instruction selection if a condition
465 // will be uniform and we always use vector compares. Assume we are using
466 // vector compares until that is fixed.
467 setHasMultipleConditionRegisters(true);
468
469 setMinCmpXchgSizeInBits(32);
470 setSupportsUnalignedAtomics(false);
471
472 PredictableSelectIsExpensive = false;
473
474 // We want to find all load dependencies for long chains of stores to enable
475 // merging into very wide vectors. The problem is with vectors with > 4
476 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
477 // vectors are a legal type, even though we have to split the loads
478 // usually. When we can more precisely specify load legality per address
479 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
480 // smarter so that they can figure out what to do in 2 iterations without all
481 // N > 4 stores on the same chain.
482 GatherAllAliasesMaxDepth = 16;
483
484 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
485 // about these during lowering.
486 MaxStoresPerMemcpy = 0xffffffff;
487 MaxStoresPerMemmove = 0xffffffff;
488 MaxStoresPerMemset = 0xffffffff;
489
490 // The expansion for 64-bit division is enormous.
491 if (AMDGPUBypassSlowDiv)
492 addBypassSlowDiv(64, 32);
493
494 setTargetDAGCombine(ISD::BITCAST);
495 setTargetDAGCombine(ISD::SHL);
496 setTargetDAGCombine(ISD::SRA);
497 setTargetDAGCombine(ISD::SRL);
498 setTargetDAGCombine(ISD::TRUNCATE);
499 setTargetDAGCombine(ISD::MUL);
500 setTargetDAGCombine(ISD::MULHU);
501 setTargetDAGCombine(ISD::MULHS);
502 setTargetDAGCombine(ISD::SELECT);
503 setTargetDAGCombine(ISD::SELECT_CC);
504 setTargetDAGCombine(ISD::STORE);
505 setTargetDAGCombine(ISD::FADD);
506 setTargetDAGCombine(ISD::FSUB);
507 setTargetDAGCombine(ISD::FNEG);
508 setTargetDAGCombine(ISD::FABS);
509 setTargetDAGCombine(ISD::AssertZext);
510 setTargetDAGCombine(ISD::AssertSext);
511 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
512}
513
514//===----------------------------------------------------------------------===//
515// Target Information
516//===----------------------------------------------------------------------===//
517
518LLVM_READNONE
519static bool fnegFoldsIntoOp(unsigned Opc) {
520 switch (Opc) {
521 case ISD::FADD:
522 case ISD::FSUB:
523 case ISD::FMUL:
524 case ISD::FMA:
525 case ISD::FMAD:
526 case ISD::FMINNUM:
527 case ISD::FMAXNUM:
528 case ISD::FMINNUM_IEEE:
529 case ISD::FMAXNUM_IEEE:
530 case ISD::FSIN:
531 case ISD::FTRUNC:
532 case ISD::FRINT:
533 case ISD::FNEARBYINT:
534 case ISD::FCANONICALIZE:
535 case AMDGPUISD::RCP:
536 case AMDGPUISD::RCP_LEGACY:
537 case AMDGPUISD::RCP_IFLAG:
538 case AMDGPUISD::SIN_HW:
539 case AMDGPUISD::FMUL_LEGACY:
540 case AMDGPUISD::FMIN_LEGACY:
541 case AMDGPUISD::FMAX_LEGACY:
542 case AMDGPUISD::FMED3:
543 return true;
544 default:
545 return false;
546 }
547}
548
549/// \p returns true if the operation will definitely need to use a 64-bit
550/// encoding, and thus will use a VOP3 encoding regardless of the source
551/// modifiers.
552LLVM_READONLY
553static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
554 return N->getNumOperands() > 2 || VT == MVT::f64;
555}
556
557// Most FP instructions support source modifiers, but this could be refined
558// slightly.
559LLVM_READONLY
560static bool hasSourceMods(const SDNode *N) {
561 if (isa<MemSDNode>(N))
562 return false;
563
564 switch (N->getOpcode()) {
565 case ISD::CopyToReg:
566 case ISD::SELECT:
567 case ISD::FDIV:
568 case ISD::FREM:
569 case ISD::INLINEASM:
570 case ISD::INLINEASM_BR:
571 case AMDGPUISD::DIV_SCALE:
572 case ISD::INTRINSIC_W_CHAIN:
573
574 // TODO: Should really be looking at the users of the bitcast. These are
575 // problematic because bitcasts are used to legalize all stores to integer
576 // types.
577 case ISD::BITCAST:
578 return false;
579 case ISD::INTRINSIC_WO_CHAIN: {
580 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
581 case Intrinsic::amdgcn_interp_p1:
582 case Intrinsic::amdgcn_interp_p2:
583 case Intrinsic::amdgcn_interp_mov:
584 case Intrinsic::amdgcn_interp_p1_f16:
585 case Intrinsic::amdgcn_interp_p2_f16:
586 return false;
587 default:
588 return true;
589 }
590 }
591 default:
592 return true;
593 }
594}
595
596bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
597 unsigned CostThreshold) {
598 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
599 // it is truly free to use a source modifier in all cases. If there are
600 // multiple users but for each one will necessitate using VOP3, there will be
601 // a code size increase. Try to avoid increasing code size unless we know it
602 // will save on the instruction count.
603 unsigned NumMayIncreaseSize = 0;
604 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
605
606 // XXX - Should this limit number of uses to check?
607 for (const SDNode *U : N->uses()) {
608 if (!hasSourceMods(U))
609 return false;
610
611 if (!opMustUseVOP3Encoding(U, VT)) {
612 if (++NumMayIncreaseSize > CostThreshold)
613 return false;
614 }
615 }
616
617 return true;
618}
619
620MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
621 return MVT::i32;
622}
623
624bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
625 return true;
626}
627
628// The backend supports 32 and 64 bit floating point immediates.
629// FIXME: Why are we reporting vectors of FP immediates as legal?
630bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
631 bool ForCodeSize) const {
632 EVT ScalarVT = VT.getScalarType();
633 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
634 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
635}
636
637// We don't want to shrink f64 / f32 constants.
638bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
639 EVT ScalarVT = VT.getScalarType();
640 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
641}
642
643bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
644 ISD::LoadExtType ExtTy,
645 EVT NewVT) const {
646 // TODO: This may be worth removing. Check regression tests for diffs.
647 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
648 return false;
649
650 unsigned NewSize = NewVT.getStoreSizeInBits();
651
652 // If we are reducing to a 32-bit load or a smaller multi-dword load,
653 // this is always better.
654 if (NewSize >= 32)
655 return true;
656
657 EVT OldVT = N->getValueType(0);
658 unsigned OldSize = OldVT.getStoreSizeInBits();
659
660 MemSDNode *MN = cast<MemSDNode>(N);
661 unsigned AS = MN->getAddressSpace();
662 // Do not shrink an aligned scalar load to sub-dword.
663 // Scalar engine cannot do sub-dword loads.
664 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
665 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
666 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
667 (isa<LoadSDNode>(N) &&
668 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
669 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
670 return false;
671
672 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
673 // extloads, so doing one requires using a buffer_load. In cases where we
674 // still couldn't use a scalar load, using the wider load shouldn't really
675 // hurt anything.
676
677 // If the old size already had to be an extload, there's no harm in continuing
678 // to reduce the width.
679 return (OldSize < 32);
680}
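// A short walk-through of shouldReduceLoadWidth above, with illustrative
// types and address spaces:
//   i64 load narrowed to i32: NewSize is 32, so narrowing is always allowed.
//   Aligned i32 load from CONSTANT_ADDRESS with a uniform memory operand
//   narrowed to i16: OldSize >= 32, NewSize < 32 and the alignment test
//   passes, so the function refuses (the scalar unit has no sub-dword loads).
//   i16 extload narrowed to i8: OldSize < 32, so reducing further is fine.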
681
682bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
683 const SelectionDAG &DAG,
684 const MachineMemOperand &MMO) const {
685
686 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
687
688 if (LoadTy.getScalarType() == MVT::i32)
689 return false;
690
691 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
692 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
693
694 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
695 return false;
696
697 bool Fast = false;
698 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
699 CastTy, MMO, &Fast) &&
700 Fast;
701}
702
703// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
704// profitable with the expansion for 64-bit since it's generally good to
705// speculate things.
706// FIXME: These should really have the size as a parameter.
707bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
708 return true;
709}
710
711bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
712 return true;
713}
714
715bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
716 switch (N->getOpcode()) {
717 default:
718 return false;
719 case ISD::EntryToken:
720 case ISD::TokenFactor:
721 return true;
722 case ISD::INTRINSIC_WO_CHAIN:
723 {
724 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
725 switch (IntrID) {
726 default:
727 return false;
728 case Intrinsic::amdgcn_readfirstlane:
729 case Intrinsic::amdgcn_readlane:
730 return true;
731 }
732 }
733 break;
734 case ISD::LOAD:
735 {
736 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
737 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
738 return true;
739 return false;
740 }
741 break;
742 }
743}
744
745TargetLowering::NegatibleCost
746AMDGPUTargetLowering::getNegatibleCost(SDValue Op, SelectionDAG &DAG,
747 bool LegalOperations, bool ForCodeSize,
748 unsigned Depth) const {
749 switch (Op.getOpcode()) {
750 case ISD::FMA:
751 case ISD::FMAD: {
752 // Negating a fma is not free if it has users without source mods.
753 if (!allUsesHaveSourceMods(Op.getNode()))
754 return NegatibleCost::Expensive;
755 break;
756 }
757 default:
758 break;
759 }
760
761 return TargetLowering::getNegatibleCost(Op, DAG, LegalOperations, ForCodeSize,
762 Depth);
763}
764
765//===---------------------------------------------------------------------===//
766// Target Properties
767//===---------------------------------------------------------------------===//
768
769bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
770 assert(VT.isFloatingPoint());
771
772 // Packed operations do not have a fabs modifier.
773 return VT == MVT::f32 || VT == MVT::f64 ||
774 (Subtarget->has16BitInsts() && VT == MVT::f16);
775}
776
777bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
778 assert(VT.isFloatingPoint());
779 return VT == MVT::f32 || VT == MVT::f64 ||
780 (Subtarget->has16BitInsts() && VT == MVT::f16) ||
781 (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
782}
783
784bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
785 unsigned NumElem,
786 unsigned AS) const {
787 return true;
788}
789
790bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
791 // There are few operations which truly have vector input operands. Any vector
792 // operation is going to involve operations on each component, and a
793 // build_vector will be a copy per element, so it always makes sense to use a
794 // build_vector input in place of the extracted element to avoid a copy into a
795 // super register.
796 //
797 // We should probably only do this if all users are extracts only, but this
798 // should be the common case.
799 return true;
800}
801
802bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
803 // Truncate is just accessing a subregister.
804
805 unsigned SrcSize = Source.getSizeInBits();
806 unsigned DestSize = Dest.getSizeInBits();
807
808 return DestSize < SrcSize && DestSize % 32 == 0 ;
809}
810
811bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
812 // Truncate is just accessing a subregister.
813
814 unsigned SrcSize = Source->getScalarSizeInBits();
815 unsigned DestSize = Dest->getScalarSizeInBits();
816
817 if (DestSize== 16 && Subtarget->has16BitInsts())
818 return SrcSize >= 32;
819
820 return DestSize < SrcSize && DestSize % 32 == 0;
821}
822
823bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
824 unsigned SrcSize = Src->getScalarSizeInBits();
825 unsigned DestSize = Dest->getScalarSizeInBits();
826
827 if (SrcSize == 16 && Subtarget->has16BitInsts())
828 return DestSize >= 32;
829
830 return SrcSize == 32 && DestSize == 64;
831}
832
833bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
834 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
835 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
836 // this will enable reducing 64-bit operations the 32-bit, which is always
837 // good.
838
839 if (Src == MVT::i16)
840 return Dest == MVT::i32 ||Dest == MVT::i64 ;
841
842 return Src == MVT::i32 && Dest == MVT::i64;
843}
844
845bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
846 return isZExtFree(Val.getValueType(), VT2);
847}
848
849bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
850 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
851 // limited number of native 64-bit operations. Shrinking an operation to fit
852 // in a single 32-bit register should always be helpful. As currently used,
853 // this is much less general than the name suggests, and is only used in
854 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
855 // not profitable, and may actually be harmful.
856 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
857}
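// Illustrative cases for isNarrowingProfitable above (my own examples):
//   SrcVT = i64, DestVT = i32 -> 64 > 32 && 32 == 32 -> profitable.
//   SrcVT = i32, DestVT = i16 -> DestVT is not 32 bits -> not profitable,
//   matching the comment that sub-32-bit accesses tend to be harmful.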
858
859//===---------------------------------------------------------------------===//
860// TargetLowering Callbacks
861//===---------------------------------------------------------------------===//
862
863CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
864 bool IsVarArg) {
865 switch (CC) {
866 case CallingConv::AMDGPU_VS:
867 case CallingConv::AMDGPU_GS:
868 case CallingConv::AMDGPU_PS:
869 case CallingConv::AMDGPU_CS:
870 case CallingConv::AMDGPU_HS:
871 case CallingConv::AMDGPU_ES:
872 case CallingConv::AMDGPU_LS:
873 return CC_AMDGPU;
874 case CallingConv::C:
875 case CallingConv::Fast:
876 case CallingConv::Cold:
877 return CC_AMDGPU_Func;
878 case CallingConv::AMDGPU_KERNEL:
879 case CallingConv::SPIR_KERNEL:
880 default:
881 report_fatal_error("Unsupported calling convention for call");
882 }
883}
884
885CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
886 bool IsVarArg) {
887 switch (CC) {
888 case CallingConv::AMDGPU_KERNEL:
889 case CallingConv::SPIR_KERNEL:
890 llvm_unreachable("kernels should not be handled here");
891 case CallingConv::AMDGPU_VS:
892 case CallingConv::AMDGPU_GS:
893 case CallingConv::AMDGPU_PS:
894 case CallingConv::AMDGPU_CS:
895 case CallingConv::AMDGPU_HS:
896 case CallingConv::AMDGPU_ES:
897 case CallingConv::AMDGPU_LS:
898 return RetCC_SI_Shader;
899 case CallingConv::C:
900 case CallingConv::Fast:
901 case CallingConv::Cold:
902 return RetCC_AMDGPU_Func;
903 default:
904 report_fatal_error("Unsupported calling convention.");
905 }
906}
907
908/// The SelectionDAGBuilder will automatically promote function arguments
909/// with illegal types. However, this does not work for the AMDGPU targets
910/// since the function arguments are stored in memory as these illegal types.
911/// In order to handle this properly we need to get the original types sizes
912/// from the LLVM IR Function and fixup the ISD:InputArg values before
913/// passing them to AnalyzeFormalArguments()
914
915/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
916/// input values across multiple registers. Each item in the Ins array
917/// represents a single value that will be stored in registers. Ins[x].VT is
918/// the value type of the value that will be stored in the register, so
919/// whatever SDNode we lower the argument to needs to be this type.
920///
921/// In order to correctly lower the arguments we need to know the size of each
922/// argument. Since Ins[x].VT gives us the size of the register that will
923/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
924/// for the orignal function argument so that we can deduce the correct memory
925/// type to use for Ins[x]. In most cases the correct memory type will be
926/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
927/// we have a kernel argument of type v8i8, this argument will be split into
928/// 8 parts and each part will be represented by its own item in the Ins array.
929/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
930/// the argument before it was split. From this, we deduce that the memory type
931/// for each individual part is i8. We pass the memory type as LocVT to the
932/// calling convention analysis function and the register type (Ins[x].VT) as
933/// the ValVT.
934void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
935 CCState &State,
936 const SmallVectorImpl<ISD::InputArg> &Ins) const {
937 const MachineFunction &MF = State.getMachineFunction();
938 const Function &Fn = MF.getFunction();
939 LLVMContext &Ctx = Fn.getParent()->getContext();
940 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
941 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
942 CallingConv::ID CC = Fn.getCallingConv();
943
944 unsigned MaxAlign = 1;
945 uint64_t ExplicitArgOffset = 0;
946 const DataLayout &DL = Fn.getParent()->getDataLayout();
947
948 unsigned InIndex = 0;
949
950 for (const Argument &Arg : Fn.args()) {
951 Type *BaseArgTy = Arg.getType();
952 unsigned Align = DL.getABITypeAlignment(BaseArgTy);
953 MaxAlign = std::max(Align, MaxAlign);
954 unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
955
956 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
957 ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
958
959 // We're basically throwing away everything passed into us and starting over
960 // to get accurate in-memory offsets. The "PartOffset" is completely useless
961 // to us as computed in Ins.
962 //
963 // We also need to figure out what type legalization is trying to do to get
964 // the correct memory offsets.
965
966 SmallVector<EVT, 16> ValueVTs;
967 SmallVector<uint64_t, 16> Offsets;
968 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
969
970 for (unsigned Value = 0, NumValues = ValueVTs.size();
971 Value != NumValues; ++Value) {
972 uint64_t BasePartOffset = Offsets[Value];
973
974 EVT ArgVT = ValueVTs[Value];
975 EVT MemVT = ArgVT;
976 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
977 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
978
979 if (NumRegs == 1) {
980 // This argument is not split, so the IR type is the memory type.
981 if (ArgVT.isExtended()) {
982 // We have an extended type, like i24, so we should just use the
983 // register type.
984 MemVT = RegisterVT;
985 } else {
986 MemVT = ArgVT;
987 }
988 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
989 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
990 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
991 // We have a vector value which has been split into a vector with
992 // the same scalar type, but fewer elements. This should handle
993 // all the floating-point vector types.
994 MemVT = RegisterVT;
995 } else if (ArgVT.isVector() &&
996 ArgVT.getVectorNumElements() == NumRegs) {
997 // This arg has been split so that each element is stored in a separate
998 // register.
999 MemVT = ArgVT.getScalarType();
1000 } else if (ArgVT.isExtended()) {
1001 // We have an extended type, like i65.
1002 MemVT = RegisterVT;
1003 } else {
1004 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1005 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1006 if (RegisterVT.isInteger()) {
1007 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1008 } else if (RegisterVT.isVector()) {
1009 assert(!RegisterVT.getScalarType().isFloatingPoint());
1010 unsigned NumElements = RegisterVT.getVectorNumElements();
1011 assert(MemoryBits % NumElements == 0);
1012 // This vector type has been split into another vector type with
1013 // a different elements size.
1014 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1015 MemoryBits / NumElements);
1016 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1017 } else {
1018 llvm_unreachable("cannot deduce memory type.");
1019 }
1020 }
1021
1022 // Convert one element vectors to scalar.
1023 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1024 MemVT = MemVT.getScalarType();
1025
1026 // Round up vec3/vec5 argument.
1027 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1028 assert(MemVT.getVectorNumElements() == 3 ||
1029 MemVT.getVectorNumElements() == 5);
1030 MemVT = MemVT.getPow2VectorType(State.getContext());
1031 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1032 MemVT = MemVT.getRoundIntegerType(State.getContext());
1033 }
1034
1035 unsigned PartOffset = 0;
1036 for (unsigned i = 0; i != NumRegs; ++i) {
1037 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1038 BasePartOffset + PartOffset,
1039 MemVT.getSimpleVT(),
1040 CCValAssign::Full));
1041 PartOffset += MemVT.getStoreSize();
1042 }
1043 }
1044 }
1045}
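// To make the v8i8 example from the comment block above concrete, here is a
// sketch of the values the loop would see for such a kernel argument (the
// numbers are inferred from the code, not taken from a debug session):
//   ArgVT   = v8i8  (Ins[x].ArgVT, the full pre-split IR type)
//   NumRegs = 8     (one part per element, per the comment above)
//   ArgVT is a vector whose element count equals NumRegs, so the branch
//     MemVT = ArgVT.getScalarType();   // i8
//   is taken, and each of the eight CCValAssign entries advances PartOffset
//   by MemVT.getStoreSize() == 1 byte, reconstructing the in-memory layout
//   of the original argument.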
1046
1047SDValue AMDGPUTargetLowering::LowerReturn(
1048 SDValue Chain, CallingConv::ID CallConv,
1049 bool isVarArg,
1050 const SmallVectorImpl<ISD::OutputArg> &Outs,
1051 const SmallVectorImpl<SDValue> &OutVals,
1052 const SDLoc &DL, SelectionDAG &DAG) const {
1053 // FIXME: Fails for r600 tests
1054 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1055 // "wave terminate should not have return values");
1056 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1057}
1058
1059//===---------------------------------------------------------------------===//
1060// Target specific lowering
1061//===---------------------------------------------------------------------===//
1062
1063/// Selects the correct CCAssignFn for a given CallingConvention value.
1064CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1065 bool IsVarArg) {
1066 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1067}
1068
1069CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1070 bool IsVarArg) {
1071 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1072}
1073
1074SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1075 SelectionDAG &DAG,
1076 MachineFrameInfo &MFI,
1077 int ClobberedFI) const {
1078 SmallVector<SDValue, 8> ArgChains;
1079 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1080 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1081
1082 // Include the original chain at the beginning of the list. When this is
1083 // used by target LowerCall hooks, this helps legalize find the
1084 // CALLSEQ_BEGIN node.
1085 ArgChains.push_back(Chain);
1086
1087 // Add a chain value for each stack argument corresponding
1088 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1089 UE = DAG.getEntryNode().getNode()->use_end();
1090 U != UE; ++U) {
1091 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1092 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1093 if (FI->getIndex() < 0) {
1094 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1095 int64_t InLastByte = InFirstByte;
1096 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1097
1098 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1099 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1100 ArgChains.push_back(SDValue(L, 1));
1101 }
1102 }
1103 }
1104 }
1105
1106 // Build a tokenfactor for all the chains.
1107 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1108}
1109
1110SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1111 SmallVectorImpl<SDValue> &InVals,
1112 StringRef Reason) const {
1113 SDValue Callee = CLI.Callee;
1114 SelectionDAG &DAG = CLI.DAG;
1115
1116 const Function &Fn = DAG.getMachineFunction().getFunction();
1117
1118 StringRef FuncName("<unknown>");
1119
1120 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1121 FuncName = G->getSymbol();
1122 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1123 FuncName = G->getGlobal()->getName();
1124
1125 DiagnosticInfoUnsupported NoCalls(
1126 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1127 DAG.getContext()->diagnose(NoCalls);
1128
1129 if (!CLI.IsTailCall) {
1130 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1131 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1132 }
1133
1134 return DAG.getEntryNode();
1135}
1136
1137SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1138 SmallVectorImpl<SDValue> &InVals) const {
1139 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1140}
1141
1142SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1143 SelectionDAG &DAG) const {
1144 const Function &Fn = DAG.getMachineFunction().getFunction();
1145
1146 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1147 SDLoc(Op).getDebugLoc());
1148 DAG.getContext()->diagnose(NoDynamicAlloca);
1149 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1150 return DAG.getMergeValues(Ops, SDLoc());
1151}
1152
1153SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1154 SelectionDAG &DAG) const {
1155 switch (Op.getOpcode()) {
1156 default:
1157 Op->print(errs(), &DAG);
1158 llvm_unreachable("Custom lowering code for this"
1159 "instruction is not implemented yet!");
1160 break;
1161 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1162 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1163 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1164 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1165 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1166 case ISD::FREM: return LowerFREM(Op, DAG);
1167 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1168 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1169 case ISD::FRINT: return LowerFRINT(Op, DAG);
1170 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1171 case ISD::FROUND: return LowerFROUND(Op, DAG);
1172 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1173 case ISD::FLOG:
1174 return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
1175 case ISD::FLOG10:
1176 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1177 case ISD::FEXP:
1178 return lowerFEXP(Op, DAG);
1179 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1180 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1181 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1182 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1183 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1184 case ISD::CTTZ:
1185 case ISD::CTTZ_ZERO_UNDEF:
1186 case ISD::CTLZ:
1187 case ISD::CTLZ_ZERO_UNDEF:
1188 return LowerCTLZ_CTTZ(Op, DAG);
1189 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1190 }
1191 return Op;
1192}
1193
1194void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1195 SmallVectorImpl<SDValue> &Results,
1196 SelectionDAG &DAG) const {
1197 switch (N->getOpcode()) {
1198 case ISD::SIGN_EXTEND_INREG:
1199 // Different parts of legalization seem to interpret which type of
1200 // sign_extend_inreg is the one to check for custom lowering. The extended
1201 // from type is what really matters, but some places check for custom
1202 // lowering of the result type. This results in trying to use
1203 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1204 // nothing here and let the illegal result integer be handled normally.
1205 return;
1206 default:
1207 return;
1208 }
1209}
1210
1211bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1212 const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1213 if (!GVar || !GVar->hasInitializer())
1214 return false;
1215
1216 return !isa<UndefValue>(GVar->getInitializer());
1217}
1218
1219SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1220 SDValue Op,
1221 SelectionDAG &DAG) const {
1222
1223 const DataLayout &DL = DAG.getDataLayout();
1224 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1225 const GlobalValue *GV = G->getGlobal();
1226
1227 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1228 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1229 if (!MFI->isEntryFunction()) {
1230 const Function &Fn = DAG.getMachineFunction().getFunction();
1231 DiagnosticInfoUnsupported BadLDSDecl(
1232 Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
1233 DAG.getContext()->diagnose(BadLDSDecl);
1234 }
1235
1236 // XXX: What does the value of G->getOffset() mean?
1237 assert(G->getOffset() == 0 &&
1238 "Do not know what to do with an non-zero offset");
1239
1240 // TODO: We could emit code to handle the initialization somewhere.
1241 if (!hasDefinedInitializer(GV)) {
1242 unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1243 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1244 }
1245 }
1246
1247 const Function &Fn = DAG.getMachineFunction().getFunction();
1248 DiagnosticInfoUnsupported BadInit(
1249 Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1250 DAG.getContext()->diagnose(BadInit);
1251 return SDValue();
1252}
1253
1254SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1255 SelectionDAG &DAG) const {
1256 SmallVector<SDValue, 8> Args;
1257
1258 EVT VT = Op.getValueType();
1259 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1260 SDLoc SL(Op);
1261 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1262 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1263
1264 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1265 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1266 }
1267
1268 for (const SDUse &U : Op->ops())
1269 DAG.ExtractVectorElements(U.get(), Args);
1270
1271 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1272}
1273
1274SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1275 SelectionDAG &DAG) const {
1276
1277 SmallVector<SDValue, 8> Args;
1278 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1279 EVT VT = Op.getValueType();
1280 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1281 VT.getVectorNumElements());
1282
1283 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1284}
1285
1286/// Generate Min/Max node
1287SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1288 SDValue LHS, SDValue RHS,
1289 SDValue True, SDValue False,
1290 SDValue CC,
1291 DAGCombinerInfo &DCI) const {
1292 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1293 return SDValue();
1294
1295 SelectionDAG &DAG = DCI.DAG;
1296 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1297 switch (CCOpcode) {
1298 case ISD::SETOEQ:
1299 case ISD::SETONE:
1300 case ISD::SETUNE:
1301 case ISD::SETNE:
1302 case ISD::SETUEQ:
1303 case ISD::SETEQ:
1304 case ISD::SETFALSE:
1305 case ISD::SETFALSE2:
1306 case ISD::SETTRUE:
1307 case ISD::SETTRUE2:
1308 case ISD::SETUO:
1309 case ISD::SETO:
1310 break;
1311 case ISD::SETULE:
1312 case ISD::SETULT: {
1313 if (LHS == True)
1314 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1315 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1316 }
1317 case ISD::SETOLE:
1318 case ISD::SETOLT:
1319 case ISD::SETLE:
1320 case ISD::SETLT: {
1321 // Ordered. Assume ordered for undefined.
1322
1323 // Only do this after legalization to avoid interfering with other combines
1324 // which might occur.
1325 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1326 !DCI.isCalledByLegalizer())
1327 return SDValue();
1328
1329 // We need to permute the operands to get the correct NaN behavior. The
1330 // selected operand is the second one based on the failing compare with NaN,
1331 // so permute it based on the compare type the hardware uses.
1332 if (LHS == True)
1333 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1334 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1335 }
1336 case ISD::SETUGE:
1337 case ISD::SETUGT: {
1338 if (LHS == True)
1339 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1340 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1341 }
1342 case ISD::SETGT:
1343 case ISD::SETGE:
1344 case ISD::SETOGE:
1345 case ISD::SETOGT: {
1346 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1347 !DCI.isCalledByLegalizer())
1348 return SDValue();
1349
1350 if (LHS == True)
1351 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1352 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1353 }
1354 case ISD::SETCC_INVALID:
1355 llvm_unreachable("Invalid setcc condcode!")::llvm::llvm_unreachable_internal("Invalid setcc condcode!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 1355)
;
1356 }
1357 return SDValue();
1358}
1359
1360std::pair<SDValue, SDValue>
1361AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1362 SDLoc SL(Op);
1363
1364 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1365
1366 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1367 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1368
1369 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1370 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1371
1372 return std::make_pair(Lo, Hi);
1373}
1374
1375SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1376 SDLoc SL(Op);
1377
1378 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1379 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1380 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1381}
1382
1383SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1384 SDLoc SL(Op);
1385
1386 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1387 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1388 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1389}
1390
1391// Split a vector type into two parts. The first part is a power of two vector.
1392// The second part is whatever is left over, and is a scalar if it would
1393// otherwise be a 1-vector.
1394std::pair<EVT, EVT>
1395AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1396 EVT LoVT, HiVT;
1397 EVT EltVT = VT.getVectorElementType();
1398 unsigned NumElts = VT.getVectorNumElements();
1399 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1400 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1401 HiVT = NumElts - LoNumElts == 1
1402 ? EltVT
1403 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1404 return std::make_pair(LoVT, HiVT);
1405}
1406
1407// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1408// scalar.
1409std::pair<SDValue, SDValue>
1410AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1411 const EVT &LoVT, const EVT &HiVT,
1412 SelectionDAG &DAG) const {
1413 assert(LoVT.getVectorNumElements() +
1414 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1415 N.getValueType().getVectorNumElements() &&
1416 "More vector elements requested than available!");
1417 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1418 DAG.getVectorIdxConstant(0, DL));
1419 SDValue Hi = DAG.getNode(
1420 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1421 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1422 return std::make_pair(Lo, Hi);
1423}
1424
1425SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1426 SelectionDAG &DAG) const {
1427 LoadSDNode *Load = cast<LoadSDNode>(Op);
1428 EVT VT = Op.getValueType();
1429 SDLoc SL(Op);
1430
1431
1432 // If this is a 2 element vector, we really want to scalarize and not create
1433 // weird 1 element vectors.
1434 if (VT.getVectorNumElements() == 2) {
1435 SDValue Ops[2];
1436 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1437 return DAG.getMergeValues(Ops, SL);
1438 }
1439
1440 SDValue BasePtr = Load->getBasePtr();
1441 EVT MemVT = Load->getMemoryVT();
1442
1443 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1444
1445 EVT LoVT, HiVT;
1446 EVT LoMemVT, HiMemVT;
1447 SDValue Lo, Hi;
1448
1449 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1450 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1451 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1452
1453 unsigned Size = LoMemVT.getStoreSize();
1454 unsigned BaseAlign = Load->getAlignment();
1455 unsigned HiAlign = MinAlign(BaseAlign, Size);
1456
1457 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1458 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1459 BaseAlign, Load->getMemOperand()->getFlags());
1460 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
1461 SDValue HiLoad =
1462 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1463 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1464 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1465
1466 SDValue Join;
1467 if (LoVT == HiVT) {
1468 // This is the case that the vector is power of two so was evenly split.
1469 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1470 } else {
1471 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1472 DAG.getVectorIdxConstant(0, SL));
1473 Join = DAG.getNode(
1474 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1475 VT, Join, HiLoad,
1476 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1477 }
1478
1479 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1480 LoLoad.getValue(1), HiLoad.getValue(1))};
1481
1482 return DAG.getMergeValues(Ops, SL);
1483}
1484
1485// Widen a vector load from vec3 to vec4.
1486SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
1487 SelectionDAG &DAG) const {
1488 LoadSDNode *Load = cast<LoadSDNode>(Op);
1489 EVT VT = Op.getValueType();
1490 assert(VT.getVectorNumElements() == 3);
1491 SDValue BasePtr = Load->getBasePtr();
1492 EVT MemVT = Load->getMemoryVT();
1493 SDLoc SL(Op);
1494 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1495 unsigned BaseAlign = Load->getAlignment();
1496
1497 EVT WideVT =
1498 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1499 EVT WideMemVT =
1500 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1501 SDValue WideLoad = DAG.getExtLoad(
1502 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1503 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1504 return DAG.getMergeValues(
1505 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1506 DAG.getVectorIdxConstant(0, SL)),
1507 WideLoad.getValue(1)},
1508 SL);
1509}
1510
1511SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1512 SelectionDAG &DAG) const {
1513 StoreSDNode *Store = cast<StoreSDNode>(Op);
1514 SDValue Val = Store->getValue();
1515 EVT VT = Val.getValueType();
1516
1517 // If this is a 2 element vector, we really want to scalarize and not create
1518 // weird 1 element vectors.
1519 if (VT.getVectorNumElements() == 2)
1520 return scalarizeVectorStore(Store, DAG);
1521
1522 EVT MemVT = Store->getMemoryVT();
1523 SDValue Chain = Store->getChain();
1524 SDValue BasePtr = Store->getBasePtr();
1525 SDLoc SL(Op);
1526
1527 EVT LoVT, HiVT;
1528 EVT LoMemVT, HiMemVT;
1529 SDValue Lo, Hi;
1530
1531 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1532 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1533 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1534
1535 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1536
1537 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1538 unsigned BaseAlign = Store->getAlignment();
1539 unsigned Size = LoMemVT.getStoreSize();
1540 unsigned HiAlign = MinAlign(BaseAlign, Size);
1541
1542 SDValue LoStore =
1543 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1544 Store->getMemOperand()->getFlags());
1545 SDValue HiStore =
1546 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1547 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1548
1549 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1550}
1551
1552// This is a shortcut for integer division because we have fast i32<->f32
1553// conversions, and fast f32 reciprocal instructions. The fractional part of a
1554// float is enough to accurately represent up to a 24-bit signed integer.
1555SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1556 bool Sign) const {
1557 SDLoc DL(Op);
1558 EVT VT = Op.getValueType();
1559 SDValue LHS = Op.getOperand(0);
1560 SDValue RHS = Op.getOperand(1);
1561 MVT IntVT = MVT::i32;
1562 MVT FltVT = MVT::f32;
1563
1564 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1565 if (LHSSignBits < 9)
1566 return SDValue();
1567
1568 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1569 if (RHSSignBits < 9)
1570 return SDValue();
1571
1572 unsigned BitSize = VT.getSizeInBits();
1573 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1574 unsigned DivBits = BitSize - SignBits;
1575 if (Sign)
1576 ++DivBits;
1577
1578 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1579 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1580
1581 SDValue jq = DAG.getConstant(1, DL, IntVT);
1582
1583 if (Sign) {
1584 // char|short jq = ia ^ ib;
1585 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1586
1587 // jq = jq >> (bitsize - 2)
1588 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1589 DAG.getConstant(BitSize - 2, DL, VT));
1590
1591 // jq = jq | 0x1
1592 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1593 }
1594
1595 // int ia = (int)LHS;
1596 SDValue ia = LHS;
1597
1598 // int ib = (int)RHS;
1599 SDValue ib = RHS;
1600
1601 // float fa = (float)ia;
1602 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1603
1604 // float fb = (float)ib;
1605 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1606
1607 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1608 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1609
1610 // fq = trunc(fq);
1611 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1612
1613 // float fqneg = -fq;
1614 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1615
1616 MachineFunction &MF = DAG.getMachineFunction();
1617 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1618
1619 // float fr = mad(fqneg, fb, fa);
1620 unsigned OpCode = MFI->getMode().allFP32Denormals() ?
1621 (unsigned)AMDGPUISD::FMAD_FTZ :
1622 (unsigned)ISD::FMAD;
1623 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1624
1625 // int iq = (int)fq;
1626 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1627
1628 // fr = fabs(fr);
1629 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1630
1631 // fb = fabs(fb);
1632 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1633
1634 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1635
1636 // int cv = fr >= fb;
1637 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1638
1639 // jq = (cv ? jq : 0);
1640 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1641
1642 // dst = iq + jq;
1643 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1644
1645 // Rem needs compensation, it's easier to recompute it
1646 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1647 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1648
1649 // Truncate to number of bits this divide really is.
1650 if (Sign) {
1651 SDValue InRegSize
1652 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1653 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1654 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1655 } else {
1656 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1657 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1658 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1659 }
1660
1661 return DAG.getMergeValues({ Div, Rem }, DL);
1662}
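
An editor's sketch, not part of the original file: a scalar model of the unsigned path of the 24-bit shortcut above. The exact 1.0f/fb stands in for the hardware RCP approximation, and the helper name udivrem24 is purely illustrative.

#include <cmath>
#include <cstdint>

// Valid only when num and den fit in 24 bits, so both convert to f32 exactly.
static void udivrem24(uint32_t num, uint32_t den,
                      uint32_t &quot, uint32_t &rem) {
  float fa = static_cast<float>(num);
  float fb = static_cast<float>(den);
  float fq = std::trunc(fa * (1.0f / fb));     // quotient estimate
  uint32_t iq = static_cast<uint32_t>(fq);
  // The DAG above only applies the "+1" fix-up, relying on the error bound of
  // the hardware reciprocal; a portable model should guard both directions.
  if (num < iq * den)
    --iq;
  else if (num - iq * den >= den)
    ++iq;
  quot = iq;
  rem = num - iq * den;
}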
1663
1664void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1665 SelectionDAG &DAG,
1666 SmallVectorImpl<SDValue> &Results) const {
1667 SDLoc DL(Op);
1668 EVT VT = Op.getValueType();
1669
1670 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1671
1672 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1673
1674 SDValue One = DAG.getConstant(1, DL, HalfVT);
1675 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1676
1677 //HiLo split
1678 SDValue LHS = Op.getOperand(0);
1679 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1680 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1681
1682 SDValue RHS = Op.getOperand(1);
1683 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1684 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1685
1686 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1687 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1688
1689 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1690 LHS_Lo, RHS_Lo);
1691
1692 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1693 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1694
1695 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1696 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1697 return;
1698 }
1699
1700 if (isTypeLegal(MVT::i64)) {
1701 MachineFunction &MF = DAG.getMachineFunction();
1702 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1703
1704 // Compute denominator reciprocal.
1705 unsigned FMAD = MFI->getMode().allFP32Denormals() ?
1706 (unsigned)AMDGPUISD::FMAD_FTZ :
1707 (unsigned)ISD::FMAD;
1708
1709 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1710 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1711 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1712 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1713 Cvt_Lo);
1714 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1715 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1716 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1717 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1718 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1719 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1720 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1721 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1722 Mul1);
1723 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1724 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1725 SDValue Rcp64 = DAG.getBitcast(VT,
1726 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1727
1728 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1729 SDValue One64 = DAG.getConstant(1, DL, VT);
1730 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1731 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1732
1733 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1734 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1735 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1736 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1737 Zero);
1738 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1739 One);
1740
1741 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1742 Mulhi1_Lo, Zero1);
1743 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1744 Mulhi1_Hi, Add1_Lo.getValue(1));
1745 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1746 SDValue Add1 = DAG.getBitcast(VT,
1747 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1748
1749 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1750 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1751 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1752 Zero);
1753 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1754 One);
1755
1756 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1757 Mulhi2_Lo, Zero1);
1758 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1759 Mulhi2_Hi, Add1_Lo.getValue(1));
1760 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1761 Zero, Add2_Lo.getValue(1));
1762 SDValue Add2 = DAG.getBitcast(VT,
1763 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1764 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1765
1766 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1767
1768 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1769 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1770 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1771 Mul3_Lo, Zero1);
1772 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1773 Mul3_Hi, Sub1_Lo.getValue(1));
1774 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1775 SDValue Sub1 = DAG.getBitcast(VT,
1776 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1777
1778 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1779 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1780 ISD::SETUGE);
1781 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1782 ISD::SETUGE);
1783 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1784
1785 // TODO: Here and below portions of the code can be enclosed into if/endif.
1786 // Currently control flow is unconditional and we have 4 selects after
1787 // potential endif to substitute PHIs.
1788
1789 // if C3 != 0 ...
1790 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1791 RHS_Lo, Zero1);
1792 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1793 RHS_Hi, Sub1_Lo.getValue(1));
1794 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1795 Zero, Sub2_Lo.getValue(1));
1796 SDValue Sub2 = DAG.getBitcast(VT,
1797 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1798
1799 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1800
1801 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1802 ISD::SETUGE);
1803 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1804 ISD::SETUGE);
1805 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1806
1807 // if (C6 != 0)
1808 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1809
1810 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1811 RHS_Lo, Zero1);
1812 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1813 RHS_Hi, Sub2_Lo.getValue(1));
1814 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1815 Zero, Sub3_Lo.getValue(1));
1816 SDValue Sub3 = DAG.getBitcast(VT,
1817 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1818
1819 // endif C6
1820 // endif C3
1821
1822 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1823 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1824
1825 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1826 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1827
1828 Results.push_back(Div);
1829 Results.push_back(Rem);
1830
1831 return;
1832 }
1833
1834 // r600 expansion.
1835 // Get Speculative values
1836 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1837 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1838
1839 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1840 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1841 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1842
1843 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1844 SDValue DIV_Lo = Zero;
1845
1846 const unsigned halfBitWidth = HalfVT.getSizeInBits();
1847
1848 for (unsigned i = 0; i < halfBitWidth; ++i) {
1849 const unsigned bitPos = halfBitWidth - i - 1;
1850 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1851 // Get value of high bit
1852 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1853 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1854 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1855
1856 // Shift
1857 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1858 // Add LHS high bit
1859 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1860
1861 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1862 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1863
1864 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1865
1866 // Update REM
1867 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1868 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1869 }
1870
1871 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1872 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1873 Results.push_back(DIV);
1874 Results.push_back(REM);
1875}
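
The bit-serial loop in the r600 expansion is classic restoring division. A minimal standalone sketch of that idea follows (editor's addition, not in the original file); it runs over all 64 bits and omits the speculative high-word divide that lets the code above stop after 32 iterations.

#include <cstdint>

// Restoring long division: one quotient bit per iteration, MSB first.
// den must be non-zero.
static void udivrem64(uint64_t num, uint64_t den,
                      uint64_t &quot, uint64_t &rem) {
  quot = 0;
  rem = 0;
  for (int bit = 63; bit >= 0; --bit) {
    rem = (rem << 1) | ((num >> bit) & 1); // shift in the next dividend bit
    if (rem >= den) {                      // divisor fits: record a 1 bit
      rem -= den;
      quot |= UINT64_C(1) << bit;
    }
  }
}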
1876
1877SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1878 SelectionDAG &DAG) const {
1879 SDLoc DL(Op);
1880 EVT VT = Op.getValueType();
1881
1882 if (VT == MVT::i64) {
1883 SmallVector<SDValue, 2> Results;
1884 LowerUDIVREM64(Op, DAG, Results);
1885 return DAG.getMergeValues(Results, DL);
1886 }
1887
1888 if (VT == MVT::i32) {
1889 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1890 return Res;
1891 }
1892
1893 SDValue Num = Op.getOperand(0);
1894 SDValue Den = Op.getOperand(1);
1895
1896 // RCP = URECIP(Den) = 2^32 / Den + e
1897 // e is rounding error.
1898 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1899
1900 // RCP_LO = mul(RCP, Den)
1901 SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1902
1903 // RCP_HI = mulhu(RCP, Den)
1904 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1905
1906 // NEG_RCP_LO = -RCP_LO
1907 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1908 RCP_LO);
1909
1910 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1911 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1912 NEG_RCP_LO, RCP_LO,
1913 ISD::SETEQ);
1914 // Calculate the rounding error from the URECIP instruction
1915 // E = mulhu(ABS_RCP_LO, RCP)
1916 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1917
1918 // RCP_A_E = RCP + E
1919 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1920
1921 // RCP_S_E = RCP - E
1922 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1923
1924 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
1925 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1926 RCP_A_E, RCP_S_E,
1927 ISD::SETEQ);
1928 // Quotient = mulhu(Tmp0, Num)
1929 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1930
1931 // Num_S_Remainder = Quotient * Den
1932 SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1933
1934 // Remainder = Num - Num_S_Remainder
1935 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1936
1937 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1938 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1939 DAG.getConstant(-1, DL, VT),
1940 DAG.getConstant(0, DL, VT),
1941 ISD::SETUGE);
1942 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1943 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1944 Num_S_Remainder,
1945 DAG.getConstant(-1, DL, VT),
1946 DAG.getConstant(0, DL, VT),
1947 ISD::SETUGE);
1948 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1949 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1950 Remainder_GE_Zero);
1951
1952 // Calculate Division result:
1953
1954 // Quotient_A_One = Quotient + 1
1955 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1956 DAG.getConstant(1, DL, VT));
1957
1958 // Quotient_S_One = Quotient - 1
1959 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1960 DAG.getConstant(1, DL, VT));
1961
1962 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1963 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1964 Quotient, Quotient_A_One, ISD::SETEQ);
1965
1966 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1967 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1968 Quotient_S_One, Div, ISD::SETEQ);
1969
1970 // Calculate Rem result:
1971
1972 // Remainder_S_Den = Remainder - Den
1973 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1974
1975 // Remainder_A_Den = Remainder + Den
1976 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1977
1978 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1979 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1980 Remainder, Remainder_S_Den, ISD::SETEQ);
1981
1982 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1983 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1984 Remainder_A_Den, Rem, ISD::SETEQ);
1985 SDValue Ops[2] = {
1986 Div,
1987 Rem
1988 };
1989 return DAG.getMergeValues(Ops, DL);
1990}
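
A scalar transcription of the URECIP path above may make the chain of selects easier to follow (editor's sketch, not in the original file). The urecip_den parameter stands in for the result of AMDGPUISD::URECIP, roughly 2^32/den plus a small error, and mulhu models ISD::MULHU.

#include <cstdint>

static uint32_t mulhu(uint32_t a, uint32_t b) {
  return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
}

static void udivrem32(uint32_t num, uint32_t den, uint32_t urecip_den,
                      uint32_t &quot, uint32_t &rem) {
  uint32_t rcp = urecip_den;
  uint32_t rcp_lo = rcp * den;                 // low 32 bits of rcp * den
  uint32_t rcp_hi = mulhu(rcp, den);           // high 32 bits of rcp * den
  uint32_t abs_rcp_lo = (rcp_hi == 0) ? 0u - rcp_lo : rcp_lo;
  uint32_t e = mulhu(abs_rcp_lo, rcp);         // rounding error estimate
  uint32_t tmp0 = (rcp_hi == 0) ? rcp + e : rcp - e;
  uint32_t q = mulhu(tmp0, num);               // quotient, possibly off by one
  uint32_t r = num - q * den;
  bool rem_ge_den  = r >= den;
  bool rem_ge_zero = num >= q * den;           // num - q*den did not underflow
  if (rem_ge_den && rem_ge_zero) { ++q; r -= den; } // estimate one too small
  else if (!rem_ge_zero)         { --q; r += den; } // estimate one too large
  quot = q;
  rem = r;
}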
1991
1992SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1993 SelectionDAG &DAG) const {
1994 SDLoc DL(Op);
1995 EVT VT = Op.getValueType();
1996
1997 SDValue LHS = Op.getOperand(0);
1998 SDValue RHS = Op.getOperand(1);
1999
2000 SDValue Zero = DAG.getConstant(0, DL, VT);
2001 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2002
2003 if (VT == MVT::i32) {
2004 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2005 return Res;
2006 }
2007
2008 if (VT == MVT::i64 &&
2009 DAG.ComputeNumSignBits(LHS) > 32 &&
2010 DAG.ComputeNumSignBits(RHS) > 32) {
2011 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2012
2013 //HiLo split
2014 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2015 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2016 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2017 LHS_Lo, RHS_Lo);
2018 SDValue Res[2] = {
2019 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2020 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2021 };
2022 return DAG.getMergeValues(Res, DL);
2023 }
2024
2025 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2026 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2027 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2028 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2029
2030 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2031 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2032
2033 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2034 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2035
2036 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2037 SDValue Rem = Div.getValue(1);
2038
2039 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2040 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2041
2042 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2043 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2044
2045 SDValue Res[2] = {
2046 Div,
2047 Rem
2048 };
2049 return DAG.getMergeValues(Res, DL);
2050}
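
The add/xor/sub sequence above is the standard branch-free absolute-value and conditional-negate trick. A minimal scalar sketch (editor's addition, not in the original file):

#include <cstdint>

// Signed divrem on top of an unsigned divide. den must be non-zero;
// INT32_MIN / -1 wraps, as it does for the DAG expansion.
static void sdivrem32(int32_t num, int32_t den, int32_t &quot, int32_t &rem) {
  uint32_t nsign = num < 0 ? 0xffffffffu : 0;  // all-ones if negative
  uint32_t dsign = den < 0 ? 0xffffffffu : 0;
  uint32_t qsign = nsign ^ dsign;              // quotient sign
  uint32_t rsign = nsign;                      // remainder follows the LHS
  uint32_t un = (static_cast<uint32_t>(num) + nsign) ^ nsign; // |num|
  uint32_t ud = (static_cast<uint32_t>(den) + dsign) ^ dsign; // |den|
  uint32_t uq = un / ud;
  uint32_t ur = un % ud;
  quot = static_cast<int32_t>((uq ^ qsign) - qsign);  // conditional negate
  rem  = static_cast<int32_t>((ur ^ rsign) - rsign);
}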
2051
2052// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
2053SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2054 SDLoc SL(Op);
2055 EVT VT = Op.getValueType();
2056 SDValue X = Op.getOperand(0);
2057 SDValue Y = Op.getOperand(1);
2058
2059 // TODO: Should this propagate fast-math-flags?
2060
2061 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
2062 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
2063 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
2064
2065 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
2066}
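
A scalar restatement of the identity in the comment above (editor's addition, not in the original file). Because it uses trunc rather than floor, the result keeps the sign of x, matching fmod semantics.

#include <cmath>

// frem(x, y) == x - trunc(x / y) * y
static float frem_model(float x, float y) {
  return x - std::trunc(x / y) * y;
}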
2067
2068SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2069 SDLoc SL(Op);
2070 SDValue Src = Op.getOperand(0);
2071
2072 // result = trunc(src)
2073 // if (src > 0.0 && src != result)
2074 // result += 1.0
2075
2076 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2077
2078 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2079 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2080
2081 EVT SetCCVT =
2082 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2083
2084 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2085 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2086 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2087
2088 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2089 // TODO: Should this propagate fast-math-flags?
2090 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2091}
2092
2093static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2094 SelectionDAG &DAG) {
2095 const unsigned FractBits = 52;
2096 const unsigned ExpBits = 11;
2097
2098 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2099 Hi,
2100 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2101 DAG.getConstant(ExpBits, SL, MVT::i32));
2102 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2103 DAG.getConstant(1023, SL, MVT::i32));
2104
2105 return Exp;
2106}
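
The BFE above pulls the 11 exponent bits out of the high word and removes the IEEE-754 bias. A plain scalar equivalent (editor's addition, not in the original file):

#include <cstdint>

// hi is the upper 32 bits of an f64: 1 sign bit, 11 exponent bits, then the
// top 20 fraction bits.
static int32_t extractF64ExponentScalar(uint32_t hi) {
  uint32_t biased = (hi >> 20) & 0x7ff;   // bits [30:20]
  return static_cast<int32_t>(biased) - 1023;
}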
2107
2108SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2109 SDLoc SL(Op);
2110 SDValue Src = Op.getOperand(0);
2111
2112 assert(Op.getValueType() == MVT::f64);
2113
2114 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2115 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2116
2117 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2118
2119 // Extract the upper half, since this is where we will find the sign and
2120 // exponent.
2121 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2122
2123 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2124
2125 const unsigned FractBits = 52;
2126
2127 // Extract the sign bit.
2128 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2129 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2130
2131 // Extend back to 64-bits.
2132 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2133 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2134
2135 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2136 const SDValue FractMask
2137 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2138
2139 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2140 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2141 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2142
2143 EVT SetCCVT =
2144 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2145
2146 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2147
2148 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2149 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2150
2151 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2152 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2153
2154 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2155}
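
The mask construction above is easier to see in scalar form: once the unbiased exponent is known, truncation just clears the fraction bits that sit below the binary point. An editor's sketch, not in the original file:

#include <cstdint>
#include <cstring>

// Integer-only trunc for f64.
static double ftrunc64(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  int e = static_cast<int>((bits >> 52) & 0x7ff) - 1023;  // unbiased exponent
  uint64_t result;
  if (e < 0)
    result = bits & (UINT64_C(1) << 63);       // |x| < 1: keep only the sign
  else if (e > 51)
    result = bits;                             // already integral (or NaN/inf)
  else
    result = bits & ~((UINT64_C(1) << (52 - e)) - 1);  // clear low fraction bits
  std::memcpy(&x, &result, sizeof x);
  return x;
}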
2156
2157SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2158 SDLoc SL(Op);
2159 SDValue Src = Op.getOperand(0);
2160
2161 assert(Op.getValueType() == MVT::f64);
2162
2163 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2164 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2165 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2166
2167 // TODO: Should this propagate fast-math-flags?
2168
2169 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2170 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2171
2172 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2173
2174 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2175 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2176
2177 EVT SetCCVT =
2178 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2179 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2180
2181 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2182}
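
LowerFRINT relies on the classic 2^52 trick: adding and subtracting a copysigned 2^52 pushes the fraction bits out of the significand, so the default round-to-nearest-even mode does the rounding. An editor's sketch, not in the original file; it assumes the default FP environment and no reassociation by the compiler:

#include <cmath>

static double frint64_model(double x) {
  const double TwoPow52 = 4503599627370496.0;   // 0x1.0p+52
  const double Limit    = 4503599627370495.5;   // 0x1.fffffffffffffp+51
  double magic = std::copysign(TwoPow52, x);
  double rounded = (x + magic) - magic;  // rounds to nearest even
  // Values with |x| above the limit are already integers (or NaN/inf).
  return std::fabs(x) > Limit ? x : rounded;
}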
2183
2184SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2185 // FNEARBYINT and FRINT are the same, except in their handling of FP
2186 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2187 // rint, so just treat them as equivalent.
2188 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2189}
2190
2191// XXX - May require not supporting f32 denormals?
2192
2193// Don't handle v2f16. The extra instructions to scalarize and repack around the
2194// compare and vselect end up producing worse code than scalarizing the whole
2195// operation.
2196SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
2197 SelectionDAG &DAG) const {
2198 SDLoc SL(Op);
2199 SDValue X = Op.getOperand(0);
2200 EVT VT = Op.getValueType();
2201
2202 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2203
2204 // TODO: Should this propagate fast-math-flags?
2205
2206 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2207
2208 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2209
2210 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2211 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2212 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2213
2214 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2215
2216 EVT SetCCVT =
2217 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2218
2219 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2220
2221 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2222
2223 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2224}
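
The compare-and-select above implements round-half-away-from-zero on top of trunc. A scalar sketch (editor's addition, not in the original file):

#include <cmath>

static float fround32_model(float x) {
  float t = std::trunc(x);
  float diff = std::fabs(x - t);
  float bump = std::copysign(1.0f, x);     // +/-1 with the sign of x
  return diff >= 0.5f ? t + bump : t;
}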
2225
2226SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
2227 SDLoc SL(Op);
2228 SDValue X = Op.getOperand(0);
2229
2230 SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
2231
2232 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2233 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2234 const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
2235 const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
2236 EVT SetCCVT =
2237 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2238
2239 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2240
2241 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
2242
2243 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2244
2245 const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
2246 MVT::i64);
2247
2248 SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
2249 SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
2250 DAG.getConstant(INT64_C(0x0008000000000000), SL,
2251 MVT::i64),
2252 Exp);
2253
2254 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
2255 SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
2256 DAG.getConstant(0, SL, MVT::i64), Tmp0,
2257 ISD::SETNE);
2258
2259 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
2260 D, DAG.getConstant(0, SL, MVT::i64));
2261 SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
2262
2263 K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
2264 K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
2265
2266 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2267 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2268 SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
2269
2270 SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
2271 ExpEqNegOne,
2272 DAG.getConstantFP(1.0, SL, MVT::f64),
2273 DAG.getConstantFP(0.0, SL, MVT::f64));
2274
2275 SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
2276
2277 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
2278 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
2279
2280 return K;
2281}
2282
2283SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2284 EVT VT = Op.getValueType();
2285
2286 if (isOperationLegal(ISD::FTRUNC, VT))
2287 return LowerFROUND_LegalFTRUNC(Op, DAG);
2288
2289 if (VT == MVT::f64)
2290 return LowerFROUND64(Op, DAG);
2291
2292 llvm_unreachable("unhandled type")::llvm::llvm_unreachable_internal("unhandled type", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2292)
;
2293}
2294
2295SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2296 SDLoc SL(Op);
2297 SDValue Src = Op.getOperand(0);
2298
2299 // result = trunc(src);
2300 // if (src < 0.0 && src != result)
2301 // result += -1.0.
2302
2303 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2304
2305 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2306 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2307
2308 EVT SetCCVT =
2309 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2310
2311 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2312 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2313 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2314
2315 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2316 // TODO: Should this propagate fast-math-flags?
2317 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2318}
2319
2320SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2321 double Log2BaseInverted) const {
2322 EVT VT = Op.getValueType();
2323
2324 SDLoc SL(Op);
2325 SDValue Operand = Op.getOperand(0);
2326 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2327 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2328
2329 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2330}
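
The Log2BaseInverted constants passed in from LowerOperation follow from the change-of-base identity log_b(x) = log2(x) / log2(b): natural log uses 1/log2(e) (which equals ln 2) and log10 uses ln 2 / ln 10 (which equals log10(2)). A standalone check (editor's addition, not in the original file):

#include <cmath>

// log(x)   == log2(x) * (1 / log2(e))
// log10(x) == log2(x) * (ln 2 / ln 10)
static float ln_via_log2(float x)    { return std::log2(x) * (1.0f / 1.44269504f); }
static float log10_via_log2(float x) { return std::log2(x) * (0.693147181f / 2.30258509f); }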
2331
2332// exp2(M_LOG2E_F * f);
2333SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2334 EVT VT = Op.getValueType();
2335 SDLoc SL(Op);
2336 SDValue Src = Op.getOperand(0);
2337
2338 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2339 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2340 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2341}
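
The identity behind the FMUL + FEXP2 pair is exp(x) = exp2(x * log2(e)). A one-line scalar check (editor's addition, not in the original file):

#include <cmath>

static float exp_via_exp2(float x) { return std::exp2(x * 1.44269504f); }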
2342
2343static bool isCtlzOpc(unsigned Opc) {
2344 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2345}
2346
2347static bool isCttzOpc(unsigned Opc) {
2348 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2349}
2350
2351SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2352 SDLoc SL(Op);
2353 SDValue Src = Op.getOperand(0);
2354 bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2355 Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2356
2357 unsigned ISDOpc, NewOpc;
2358 if (isCtlzOpc(Op.getOpcode())) {
2359 ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2360 NewOpc = AMDGPUISD::FFBH_U32;
2361 } else if (isCttzOpc(Op.getOpcode())) {
2362 ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2363 NewOpc = AMDGPUISD::FFBL_B32;
2364 } else
2365 llvm_unreachable("Unexpected OPCode!!!")::llvm::llvm_unreachable_internal("Unexpected OPCode!!!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 2365)
;
2366
2367
2368 if (ZeroUndef && Src.getValueType() == MVT::i32)
2369 return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2370
2371 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2372
2373 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2374 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2375
2376 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2377 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2378
2379 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2380 *DAG.getContext(), MVT::i32);
2381
2382 SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2383 SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2384
2385 SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2386 SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2387
2388 const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2389 SDValue Add, NewOpr;
2390 if (isCtlzOpc(Op.getOpcode())) {
2391 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2392 // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2393 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2394 } else {
2395 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2396 // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2397 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2398 }
2399
2400 if (!ZeroUndef) {
2401 // Test if the full 64-bit input is zero.
2402
2403 // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2404 // which we probably don't want.
2405 SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2406 SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2407 SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2408
2409 // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2410 // with the same cycles, otherwise it is slower.
2411 // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2412 // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2413
2414 const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2415
2416 // The instruction returns -1 for 0 input, but the defined intrinsic
2417 // behavior is to return the number of bits.
2418 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2419 SrcIsZero, Bits32, NewOpr);
2420 }
2421
2422 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2423}
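
The 64-bit count is assembled from two 32-bit counts exactly as the inline comments describe, with one extra select so the all-zero input produces 64 when the non-_ZERO_UNDEF form is being lowered. An editor's sketch, not in the original file; clz32 is a portable stand-in for FFBH_U32:

#include <cstdint>

// Portable stand-in for the 32-bit FFBH_U32 instruction.
static uint32_t clz32(uint32_t x) {
  uint32_t n = 0;
  for (uint32_t m = 0x80000000u; m && !(x & m); m >>= 1)
    ++n;
  return n;
}

// ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)),
// followed by the zero-input select.
static uint32_t ctlz64(uint64_t x) {
  uint32_t lo = static_cast<uint32_t>(x);
  uint32_t hi = static_cast<uint32_t>(x >> 32);
  uint32_t r = (hi == 0) ? clz32(lo) + 32 : clz32(hi);
  return (x == 0) ? 64 : r;
}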
2424
2425SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2426 bool Signed) const {
2427 // Unsigned
2428 // cul2f(ulong u)
2429 //{
2430 // uint lz = clz(u);
2431 // uint e = (u != 0) ? 127U + 63U - lz : 0;
2432 // u = (u << lz) & 0x7fffffffffffffffUL;
2433 // ulong t = u & 0xffffffffffUL;
2434 // uint v = (e << 23) | (uint)(u >> 40);
2435 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2436 // return as_float(v + r);
2437 //}
2438 // Signed
2439 // cl2f(long l)
2440 //{
2441 // long s = l >> 63;
2442 // float r = cul2f((l + s) ^ s);
2443 // return s ? -r : r;
2444 //}
2445
2446 SDLoc SL(Op);
2447 SDValue Src = Op.getOperand(0);
2448 SDValue L = Src;
2449
2450 SDValue S;
2451 if (Signed) {
2452 const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2453 S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2454
2455 SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2456 L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2457 }
2458
2459 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2460 *DAG.getContext(), MVT::f32);
2461
2462
2463 SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2464 SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2465 SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2466 LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2467
2468 SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2469 SDValue E = DAG.getSelect(SL, MVT::i32,
2470 DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2471 DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2472 ZeroI32);
2473
2474 SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2475 DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2476 DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2477
2478 SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2479 DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2480
2481 SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2482 U, DAG.getConstant(40, SL, MVT::i64));
2483
2484 SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2485 DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2486 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2487
2488 SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2489 SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2490 SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2491
2492 SDValue One = DAG.getConstant(1, SL, MVT::i32);
2493
2494 SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2495
2496 SDValue R = DAG.getSelect(SL, MVT::i32,
2497 RCmp,
2498 One,
2499 DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2500 R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2501 R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2502
2503 if (!Signed)
2504 return R;
2505
2506 SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2507 return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2508}
2509
2510SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2511 bool Signed) const {
2512 SDLoc SL(Op);
2513 SDValue Src = Op.getOperand(0);
2514
2515 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2516
2517 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2518 DAG.getConstant(0, SL, MVT::i32));
2519 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2520 DAG.getConstant(1, SL, MVT::i32));
2521
2522 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2523 SL, MVT::f64, Hi);
2524
2525 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2526
2527 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2528 DAG.getConstant(32, SL, MVT::i32));
2529 // TODO: Should this propagate fast-math-flags?
2530 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2531}
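
The LDEXP + FADD pair computes the value as hi * 2^32 + lo, each half converted exactly and the final add rounding once. A scalar check of the unsigned case (editor's addition, not in the original file); the signed case only differs in converting the high half with a signed conversion:

#include <cmath>
#include <cstdint>

static double u64_to_f64(uint64_t v) {
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  return std::ldexp(static_cast<double>(hi), 32) + static_cast<double>(lo);
}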
2532
2533SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2534 SelectionDAG &DAG) const {
2535 // TODO: Factor out code common with LowerSINT_TO_FP.
2536 EVT DestVT = Op.getValueType();
2537 SDValue Src = Op.getOperand(0);
2538 EVT SrcVT = Src.getValueType();
2539
2540 if (SrcVT == MVT::i16) {
2541 if (DestVT == MVT::f16)
2542 return Op;
2543 SDLoc DL(Op);
2544
2545 // Promote src to i32
2546 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2547 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2548 }
2549
2550 assert(SrcVT == MVT::i64 && "operation should be legal");
2551
2552 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2553 SDLoc DL(Op);
2554
2555 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2556 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2557 SDValue FPRound =
2558 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2559
2560 return FPRound;
2561 }
2562
2563 if (DestVT == MVT::f32)
2564 return LowerINT_TO_FP32(Op, DAG, false);
2565
2566 assert(DestVT == MVT::f64);
2567 return LowerINT_TO_FP64(Op, DAG, false);
2568}
2569
2570SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2571 SelectionDAG &DAG) const {
2572 EVT DestVT = Op.getValueType();
2573
2574 SDValue Src = Op.getOperand(0);
2575 EVT SrcVT = Src.getValueType();
2576
2577 if (SrcVT == MVT::i16) {
2578 if (DestVT == MVT::f16)
2579 return Op;
2580
2581 SDLoc DL(Op);
2582 // Promote src to i32
2583 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2584 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2585 }
2586
2587 assert(SrcVT == MVT::i64 && "operation should be legal");
2588
2589 // TODO: Factor out code common with LowerUINT_TO_FP.
2590
2591 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2592 SDLoc DL(Op);
2593 SDValue Src = Op.getOperand(0);
2594
2595 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2596 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2597 SDValue FPRound =
2598 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2599
2600 return FPRound;
2601 }
2602
2603 if (DestVT == MVT::f32)
2604 return LowerINT_TO_FP32(Op, DAG, true);
2605
2606 assert(DestVT == MVT::f64);
2607 return LowerINT_TO_FP64(Op, DAG, true);
2608}
2609
2610SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2611 bool Signed) const {
2612 SDLoc SL(Op);
2613
2614 SDValue Src = Op.getOperand(0);
2615
2616 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2617
2618 SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2619 MVT::f64);
2620 SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2621 MVT::f64);
2622 // TODO: Should this propagate fast-math-flags?
2623 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2624
2625 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2626
2627
2628 SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2629
2630 SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2631 MVT::i32, FloorMul);
2632 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2633
2634 SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2635
2636 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2637}
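
Note: the two bit patterns above are K0 = 0x3df0000000000000 = 2^-32 and K1 = 0xc1f0000000000000 = -(2^32); the lowering scales the truncated input down to recover the high 32 bits, then uses the fma to compute the remainder that becomes the low 32 bits. A standalone sketch of the same decomposition with plain doubles (not code from this file; unsigned case only, and it assumes the input is already in u64 range):

#include <cmath>
#include <cstdint>

// Hi = floor(trunc(x) * 2^-32)   (FMUL by K0, FFLOOR, FP_TO_UINT)
// Lo = fma(Hi, -2^32, trunc(x))  (FMA with K1, FP_TO_UINT)
static uint64_t fp64ToU64(double X) {
  double Trunc = std::trunc(X);
  double Hi = std::floor(Trunc * (1.0 / 4294967296.0));
  double Lo = std::fma(Hi, -4294967296.0, Trunc);
  return (static_cast<uint64_t>(static_cast<uint32_t>(Hi)) << 32) |
         static_cast<uint32_t>(Lo);
}

// Example: fp64ToU64(8589934597.75) == 8589934597, i.e. hi word 2, lo word 5.
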
2638
2639SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2640 SDLoc DL(Op);
2641 SDValue N0 = Op.getOperand(0);
2642
2643 // Convert to target node to get known bits
2644 if (N0.getValueType() == MVT::f32)
2645 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2646
2647 if (getTargetMachine().Options.UnsafeFPMath) {
2648 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2649 return SDValue();
2650 }
2651
2652 assert(N0.getSimpleValueType() == MVT::f64);
2653
2654 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2655 const unsigned ExpMask = 0x7ff;
2656 const unsigned ExpBiasf64 = 1023;
2657 const unsigned ExpBiasf16 = 15;
2658 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2659 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2660 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2661 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2662 DAG.getConstant(32, DL, MVT::i64));
2663 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2664 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2665 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2666 DAG.getConstant(20, DL, MVT::i64));
2667 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2668 DAG.getConstant(ExpMask, DL, MVT::i32));
2669 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2670 // add the f16 bias (15) to get the biased exponent for the f16 format.
2671 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2672 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2673
2674 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2675 DAG.getConstant(8, DL, MVT::i32));
2676 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2677 DAG.getConstant(0xffe, DL, MVT::i32));
2678
2679 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2680 DAG.getConstant(0x1ff, DL, MVT::i32));
2681 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2682
2683 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2684 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2685
2686 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2687 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2688 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2689 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2690
2691 // N = M | (E << 12);
2692 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2693 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2694 DAG.getConstant(12, DL, MVT::i32)));
2695
2696 // B = clamp(1-E, 0, 13);
2697 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2698 One, E);
2699 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2700 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2701 DAG.getConstant(13, DL, MVT::i32));
2702
2703 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2704 DAG.getConstant(0x1000, DL, MVT::i32));
2705
2706 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2707 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2708 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2709 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2710
2711 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2712 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2713 DAG.getConstant(0x7, DL, MVT::i32));
2714 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2715 DAG.getConstant(2, DL, MVT::i32));
2716 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2717 One, Zero, ISD::SETEQ);
2718 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2719 One, Zero, ISD::SETGT);
2720 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2721 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2722
2723 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2724 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2725 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2726 I, V, ISD::SETEQ);
2727
2728 // Extract the sign bit.
2729 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2730 DAG.getConstant(16, DL, MVT::i32));
2731 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2732 DAG.getConstant(0x8000, DL, MVT::i32));
2733
2734 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2735 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2736}
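
A sanity check on the bit layout used above: an f64 keeps its 11 exponent bits in bits [62:52], which is why the code shifts the upper dword right by 20 and masks with 0x7ff before rebiasing from 1023 (f64) to 15 (f16). A minimal standalone check for the value 1.0 (illustration only, not code from this file):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double D = 1.0;                                  // f64 bits: 0x3FF0000000000000
  uint64_t U;
  std::memcpy(&U, &D, sizeof(U));
  uint32_t UH = static_cast<uint32_t>(U >> 32);    // upper dword
  uint32_t E = (UH >> 20) & 0x7ff;                 // biased f64 exponent: 1023
  assert(static_cast<int>(E) - 1023 + 15 == 15);   // rebiased f16 exponent
  return 0;
}
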
2737
2738SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2739 SelectionDAG &DAG) const {
2740 SDValue Src = Op.getOperand(0);
2741
2742 // TODO: Factor out code common with LowerFP_TO_UINT.
2743
2744 EVT SrcVT = Src.getValueType();
2745 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2746 SDLoc DL(Op);
2747
2748 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2749 SDValue FpToInt32 =
2750 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2751
2752 return FpToInt32;
2753 }
2754
2755 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2756 return LowerFP64_TO_INT(Op, DAG, true);
2757
2758 return SDValue();
2759}
2760
2761SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2762 SelectionDAG &DAG) const {
2763 SDValue Src = Op.getOperand(0);
2764
2765 // TODO: Factor out code common with LowerFP_TO_SINT.
2766
2767 EVT SrcVT = Src.getValueType();
2768 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2769 SDLoc DL(Op);
2770
2771 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2772 SDValue FpToInt32 =
2773 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2774
2775 return FpToInt32;
2776 }
2777
2778 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2779 return LowerFP64_TO_INT(Op, DAG, false);
2780
2781 return SDValue();
2782}
2783
2784SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2785 SelectionDAG &DAG) const {
2786 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2787 MVT VT = Op.getSimpleValueType();
2788 MVT ScalarVT = VT.getScalarType();
2789
2790 assert(VT.isVector());
2791
2792 SDValue Src = Op.getOperand(0);
2793 SDLoc DL(Op);
2794
2795 // TODO: Don't scalarize on Evergreen?
2796 unsigned NElts = VT.getVectorNumElements();
2797 SmallVector<SDValue, 8> Args;
2798 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2799
2800 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2801 for (unsigned I = 0; I < NElts; ++I)
2802 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2803
2804 return DAG.getBuildVector(VT, DL, Args);
2805}
2806
2807//===----------------------------------------------------------------------===//
2808// Custom DAG optimizations
2809//===----------------------------------------------------------------------===//
2810
2811static bool isU24(SDValue Op, SelectionDAG &DAG) {
2812 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2813}
2814
2815static bool isI24(SDValue Op, SelectionDAG &DAG) {
2816 EVT VT = Op.getValueType();
2817 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2818 // as unsigned 24-bit values.
2819 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2820}
2821
2822static SDValue simplifyI24(SDNode *Node24,
2823 TargetLowering::DAGCombinerInfo &DCI) {
2824 SelectionDAG &DAG = DCI.DAG;
2825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2826 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2827
2828 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2829 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2830 unsigned NewOpcode = Node24->getOpcode();
2831 if (IsIntrin) {
2832 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2833 NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2834 AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2835 }
2836
2837 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2838
2839 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2840 // the operands to have other uses, but will only perform simplifications that
2841 // involve bypassing some nodes for this user.
2842 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2843 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2844 if (DemandedLHS || DemandedRHS)
2845 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2846 DemandedLHS ? DemandedLHS : LHS,
2847 DemandedRHS ? DemandedRHS : RHS);
2848
2849 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2850 // operands if this node is the only user.
2851 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2852 return SDValue(Node24, 0);
2853 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2854 return SDValue(Node24, 0);
2855
2856 return SDValue();
2857}
2858
2859template <typename IntTy>
2860static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2861 uint32_t Width, const SDLoc &DL) {
2862 if (Width + Offset < 32) {
2863 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2864 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2865 return DAG.getConstant(Result, DL, MVT::i32);
2866 }
2867
2868 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2869}
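
Concretely, for Src0 = 0xF0, Offset = 4, Width = 4 the template above yields -1 when instantiated with int32_t and 15 with uint32_t: the left shift parks the field at the top of the word and the right shift (arithmetic in the signed case) pulls it back down with the matching extension. A minimal check of that identity (illustration only; the int32_t cast assumes the usual two's-complement wrap, as the template itself does):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Shl = 0xF0u << (32 - 4 - 4);                  // 0xF0000000
  assert((static_cast<int32_t>(Shl) >> (32 - 4)) == -1); // signed BFE
  assert((Shl >> (32 - 4)) == 15u);                      // unsigned BFE
  return 0;
}
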
2870
2871static bool hasVolatileUser(SDNode *Val) {
2872 for (SDNode *U : Val->uses()) {
2873 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2874 if (M->isVolatile())
2875 return true;
2876 }
2877 }
2878
2879 return false;
2880}
2881
2882bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2883 // i32 vectors are the canonical memory type.
2884 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2885 return false;
2886
2887 if (!VT.isByteSized())
2888 return false;
2889
2890 unsigned Size = VT.getStoreSize();
2891
2892 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2893 return false;
2894
2895 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2896 return false;
2897
2898 return true;
2899}
2900
2901// Replace load of an illegal type with a store of a bitcast to a friendlier
2902// type.
2903SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2904 DAGCombinerInfo &DCI) const {
2905 if (!DCI.isBeforeLegalize())
2906 return SDValue();
2907
2908 LoadSDNode *LN = cast<LoadSDNode>(N);
2909 if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2910 return SDValue();
2911
2912 SDLoc SL(N);
2913 SelectionDAG &DAG = DCI.DAG;
2914 EVT VT = LN->getMemoryVT();
2915
2916 unsigned Size = VT.getStoreSize();
2917 unsigned Align = LN->getAlignment();
2918 if (Align < Size && isTypeLegal(VT)) {
2919 bool IsFast;
2920 unsigned AS = LN->getAddressSpace();
2921
2922 // Expand unaligned loads earlier than legalization. Due to visitation order
2923 // problems during legalization, the emitted instructions to pack and unpack
2924 // the bytes again are not eliminated in the case of an unaligned copy.
2925 if (!allowsMisalignedMemoryAccesses(
2926 VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
2927 SDValue Ops[2];
2928
2929 if (VT.isVector())
2930 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
2931 else
2932 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2933
2934 return DAG.getMergeValues(Ops, SDLoc(N));
2935 }
2936
2937 if (!IsFast)
2938 return SDValue();
2939 }
2940
2941 if (!shouldCombineMemoryType(VT))
2942 return SDValue();
2943
2944 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2945
2946 SDValue NewLoad
2947 = DAG.getLoad(NewVT, SL, LN->getChain(),
2948 LN->getBasePtr(), LN->getMemOperand());
2949
2950 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2951 DCI.CombineTo(N, BC, NewLoad.getValue(1));
2952 return SDValue(N, 0);
2953}
2954
2955// Replace store of an illegal type with a store of a bitcast to a friendlier
2956// type.
2957SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2958 DAGCombinerInfo &DCI) const {
2959 if (!DCI.isBeforeLegalize())
2960 return SDValue();
2961
2962 StoreSDNode *SN = cast<StoreSDNode>(N);
2963 if (SN->isVolatile() || !ISD::isNormalStore(SN))
2964 return SDValue();
2965
2966 EVT VT = SN->getMemoryVT();
2967 unsigned Size = VT.getStoreSize();
2968
2969 SDLoc SL(N);
2970 SelectionDAG &DAG = DCI.DAG;
2971 unsigned Align = SN->getAlignment();
2972 if (Align < Size && isTypeLegal(VT)) {
2973 bool IsFast;
2974 unsigned AS = SN->getAddressSpace();
2975
2976 // Expand unaligned stores earlier than legalization. Due to visitation
2977 // order problems during legalization, the emitted instructions to pack and
2978 // unpack the bytes again are not eliminated in the case of an unaligned
2979 // copy.
2980 if (!allowsMisalignedMemoryAccesses(
2981 VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
2982 if (VT.isVector())
2983 return scalarizeVectorStore(SN, DAG);
2984
2985 return expandUnalignedStore(SN, DAG);
2986 }
2987
2988 if (!IsFast)
2989 return SDValue();
2990 }
2991
2992 if (!shouldCombineMemoryType(VT))
2993 return SDValue();
2994
2995 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2996 SDValue Val = SN->getValue();
2997
2998 //DCI.AddToWorklist(Val.getNode());
2999
3000 bool OtherUses = !Val.hasOneUse();
3001 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3002 if (OtherUses) {
3003 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3004 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3005 }
3006
3007 return DAG.getStore(SN->getChain(), SL, CastVal,
3008 SN->getBasePtr(), SN->getMemOperand());
3009}
3010
3011// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3012// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3013// issues.
3014SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3015 DAGCombinerInfo &DCI) const {
3016 SelectionDAG &DAG = DCI.DAG;
3017 SDValue N0 = N->getOperand(0);
3018
3019 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3020 // (vt2 (truncate (assertzext vt0:x, vt1)))
3021 if (N0.getOpcode() == ISD::TRUNCATE) {
3022 SDValue N1 = N->getOperand(1);
3023 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3024 SDLoc SL(N);
3025
3026 SDValue Src = N0.getOperand(0);
3027 EVT SrcVT = Src.getValueType();
3028 if (SrcVT.bitsGE(ExtVT)) {
3029 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3030 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3031 }
3032 }
3033
3034 return SDValue();
3035}
3036
3037SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3038 SDNode *N, DAGCombinerInfo &DCI) const {
3039 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3040 switch (IID) {
3041 case Intrinsic::amdgcn_mul_i24:
3042 case Intrinsic::amdgcn_mul_u24:
3043 return simplifyI24(N, DCI);
3044 default:
3045 return SDValue();
3046 }
3047}
3048
3049/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3050/// binary operation \p Opc to it with the corresponding constant operands.
3051SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3052 DAGCombinerInfo &DCI, const SDLoc &SL,
3053 unsigned Opc, SDValue LHS,
3054 uint32_t ValLo, uint32_t ValHi) const {
3055 SelectionDAG &DAG = DCI.DAG;
3056 SDValue Lo, Hi;
3057 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3058
3059 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3060 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3061
3062 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3063 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3064
3065 // Re-visit the ands. It's possible we eliminated one of them and it could
3066 // simplify the vector.
3067 DCI.AddToWorklist(Lo.getNode());
3068 DCI.AddToWorklist(Hi.getNode());
3069
3070 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3071 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3072}
3073
3074SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3075 DAGCombinerInfo &DCI) const {
3076 EVT VT = N->getValueType(0);
3077
3078 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3079 if (!RHS)
3080 return SDValue();
3081
3082 SDValue LHS = N->getOperand(0);
3083 unsigned RHSVal = RHS->getZExtValue();
3084 if (!RHSVal)
3085 return LHS;
3086
3087 SDLoc SL(N);
3088 SelectionDAG &DAG = DCI.DAG;
3089
3090 switch (LHS->getOpcode()) {
3091 default:
3092 break;
3093 case ISD::ZERO_EXTEND:
3094 case ISD::SIGN_EXTEND:
3095 case ISD::ANY_EXTEND: {
3096 SDValue X = LHS->getOperand(0);
3097
3098 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3099 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3100 // Prefer build_vector as the canonical form if packed types are legal.
3101 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3102 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3103 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3104 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3105 }
3106
3107 // shl (ext x) => zext (shl x), if shift does not overflow int
3108 if (VT != MVT::i64)
3109 break;
3110 KnownBits Known = DAG.computeKnownBits(X);
3111 unsigned LZ = Known.countMinLeadingZeros();
3112 if (LZ < RHSVal)
3113 break;
3114 EVT XVT = X.getValueType();
3115 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3116 return DAG.getZExtOrTrunc(Shl, SL, VT);
3117 }
3118 }
3119
3120 if (VT != MVT::i64)
3121 return SDValue();
3122
3123 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3124
3125 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3126 // common case, splitting this into a move and a 32-bit shift is faster and
3127 // the same code size.
3128 if (RHSVal < 32)
3129 return SDValue();
3130
3131 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3132
3133 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3134 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3135
3136 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3137
3138 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3139 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3140}
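
The i64 tail of this combine rewrites (shl x, C) with 32 <= C < 64 into a 32-bit shift of the low half placed into the high word, since the low word of the result is always zero. A scalar check of that equivalence (illustrative names, not from this file):

#include <cassert>
#include <cstdint>

static uint64_t shl64ViaHiWord(uint64_t X, unsigned C) {
  uint32_t Lo = static_cast<uint32_t>(X);        // ISD::TRUNCATE to i32
  uint32_t NewHi = Lo << (C - 32);               // 32-bit SHL by C - 32
  return static_cast<uint64_t>(NewHi) << 32;     // build_vector {0, NewShift}
}

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  assert(shl64ViaHiWord(X, 36) == (X << 36));
  return 0;
}
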
3141
3142SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3143 DAGCombinerInfo &DCI) const {
3144 if (N->getValueType(0) != MVT::i64)
3145 return SDValue();
3146
3147 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3148 if (!RHS)
3149 return SDValue();
3150
3151 SelectionDAG &DAG = DCI.DAG;
3152 SDLoc SL(N);
3153 unsigned RHSVal = RHS->getZExtValue();
3154
3155 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3156 if (RHSVal == 32) {
3157 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3158 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3159 DAG.getConstant(31, SL, MVT::i32));
3160
3161 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3162 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3163 }
3164
3165 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3166 if (RHSVal == 63) {
3167 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3168 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3169 DAG.getConstant(31, SL, MVT::i32));
3170 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3171 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3172 }
3173
3174 return SDValue();
3175}
3176
3177SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3178 DAGCombinerInfo &DCI) const {
3179 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3180 if (!RHS)
3181 return SDValue();
3182
3183 EVT VT = N->getValueType(0);
3184 SDValue LHS = N->getOperand(0);
3185 unsigned ShiftAmt = RHS->getZExtValue();
3186 SelectionDAG &DAG = DCI.DAG;
3187 SDLoc SL(N);
3188
3189 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3190 // this improves the ability to match BFE patterns in isel.
3191 if (LHS.getOpcode() == ISD::AND) {
3192 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3193 if (Mask->getAPIntValue().isShiftedMask() &&
3194 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3195 return DAG.getNode(
3196 ISD::AND, SL, VT,
3197 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3198 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3199 }
3200 }
3201 }
3202
3203 if (VT != MVT::i64)
3204 return SDValue();
3205
3206 if (ShiftAmt < 32)
3207 return SDValue();
3208
3209 // srl i64:x, C for C >= 32
3210 // =>
3211 // build_pair (srl hi_32(x), C - 32), 0
3212 SDValue One = DAG.getConstant(1, SL, MVT::i32);
3213 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3214
3215 SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3216 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3217
3218 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3219 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3220
3221 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3222
3223 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3224}
3225
3226SDValue AMDGPUTargetLowering::performTruncateCombine(
3227 SDNode *N, DAGCombinerInfo &DCI) const {
3228 SDLoc SL(N);
3229 SelectionDAG &DAG = DCI.DAG;
3230 EVT VT = N->getValueType(0);
3231 SDValue Src = N->getOperand(0);
3232
3233 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3234 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3235 SDValue Vec = Src.getOperand(0);
3236 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3237 SDValue Elt0 = Vec.getOperand(0);
3238 EVT EltVT = Elt0.getValueType();
3239 if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
3240 if (EltVT.isFloatingPoint()) {
3241 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3242 EltVT.changeTypeToInteger(), Elt0);
3243 }
3244
3245 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3246 }
3247 }
3248 }
3249
3250 // Equivalent of above for accessing the high element of a vector as an
3251 // integer operation.
3252 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3253 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3254 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3255 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3256 SDValue BV = stripBitcast(Src.getOperand(0));
3257 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3258 BV.getValueType().getVectorNumElements() == 2) {
3259 SDValue SrcElt = BV.getOperand(1);
3260 EVT SrcEltVT = SrcElt.getValueType();
3261 if (SrcEltVT.isFloatingPoint()) {
3262 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3263 SrcEltVT.changeTypeToInteger(), SrcElt);
3264 }
3265
3266 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3267 }
3268 }
3269 }
3270 }
3271
3272 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3273 //
3274 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3275 // i16 (trunc (srl (i32 (trunc x), K)))
3276 if (VT.getScalarSizeInBits() < 32) {
3277 EVT SrcVT = Src.getValueType();
3278 if (SrcVT.getScalarSizeInBits() > 32 &&
3279 (Src.getOpcode() == ISD::SRL ||
3280 Src.getOpcode() == ISD::SRA ||
3281 Src.getOpcode() == ISD::SHL)) {
3282 SDValue Amt = Src.getOperand(1);
3283 KnownBits Known = DAG.computeKnownBits(Amt);
3284 unsigned Size = VT.getScalarSizeInBits();
3285 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3286 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3287 EVT MidVT = VT.isVector() ?
3288 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3289 VT.getVectorNumElements()) : MVT::i32;
3290
3291 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3292 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3293 Src.getOperand(0));
3294 DCI.AddToWorklist(Trunc.getNode());
3295
3296 if (Amt.getValueType() != NewShiftVT) {
3297 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3298 DCI.AddToWorklist(Amt.getNode());
3299 }
3300
3301 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3302 Trunc, Amt);
3303 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3304 }
3305 }
3306 }
3307
3308 return SDValue();
3309}
3310
3311// We need to specifically handle i64 mul here to avoid unnecessary conversion
3312// instructions. If we only match on the legalized i64 mul expansion,
3313// SimplifyDemandedBits will be unable to remove them because there will be
3314// multiple uses due to the separate mul + mulh[su].
3315static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3316 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3317 if (Size <= 32) {
3318 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3319 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3320 }
3321
3322 // Because we want to eliminate extension instructions before the
3323 // operation, we need to create a single user here (i.e. not the separate
3324 // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3325
3326 unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3327
3328 SDValue Mul = DAG.getNode(MulOpc, SL,
3329 DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3330
3331 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3332 Mul.getValue(0), Mul.getValue(1));
3333}
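
The 24-bit forms are sufficient here because two operands known to fit in 24 bits have a product that fits in 48 bits, which is exactly the lo/hi pair MUL_LOHI_I24/U24 returns. A standalone check of that bound for the unsigned worst case (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = (1u << 24) - 1;                 // largest unsigned 24-bit value
  uint64_t P = static_cast<uint64_t>(A) * A;   // worst-case u24 * u24 product
  assert(P < (1ull << 48));                    // fits the 48-bit lo/hi result
  return 0;
}
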
3334
3335SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3336 DAGCombinerInfo &DCI) const {
3337 EVT VT = N->getValueType(0);
3338
3339 unsigned Size = VT.getSizeInBits();
3340 if (VT.isVector() || Size > 64)
3341 return SDValue();
3342
3343 // There are i16 integer mul/mad.
3344 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3345 return SDValue();
3346
3347 SelectionDAG &DAG = DCI.DAG;
3348 SDLoc DL(N);
3349
3350 SDValue N0 = N->getOperand(0);
3351 SDValue N1 = N->getOperand(1);
3352
3353 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3354 // in the source into any_extends if the result of the mul is truncated. Since
3355 // we can assume the high bits are whatever we want, use the underlying value
3356 // to avoid the unknown high bits from interfering.
3357 if (N0.getOpcode() == ISD::ANY_EXTEND)
3358 N0 = N0.getOperand(0);
3359
3360 if (N1.getOpcode() == ISD::ANY_EXTEND)
3361 N1 = N1.getOperand(0);
3362
3363 SDValue Mul;
3364
3365 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3366 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3367 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3368 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3369 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3370 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3371 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3372 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3373 } else {
3374 return SDValue();
3375 }
3376
3377 // We need to use sext even for MUL_U24, because MUL_U24 is used
3378 // for signed multiply of 8 and 16-bit types.
3379 return DAG.getSExtOrTrunc(Mul, DL, VT);
3380}
3381
3382SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3383 DAGCombinerInfo &DCI) const {
3384 EVT VT = N->getValueType(0);
3385
3386 if (!Subtarget->hasMulI24() || VT.isVector())
3387 return SDValue();
3388
3389 SelectionDAG &DAG = DCI.DAG;
3390 SDLoc DL(N);
3391
3392 SDValue N0 = N->getOperand(0);
3393 SDValue N1 = N->getOperand(1);
3394
3395 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3396 return SDValue();
3397
3398 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3399 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3400
3401 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3402 DCI.AddToWorklist(Mulhi.getNode());
3403 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3404}
3405
3406SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3407 DAGCombinerInfo &DCI) const {
3408 EVT VT = N->getValueType(0);
3409
3410 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3411 return SDValue();
3412
3413 SelectionDAG &DAG = DCI.DAG;
3414 SDLoc DL(N);
3415
3416 SDValue N0 = N->getOperand(0);
3417 SDValue N1 = N->getOperand(1);
3418
3419 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3420 return SDValue();
3421
3422 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3423 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3424
3425 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3426 DCI.AddToWorklist(Mulhi.getNode());
3427 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3428}
3429
3430SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3431 SDNode *N, DAGCombinerInfo &DCI) const {
3432 SelectionDAG &DAG = DCI.DAG;
3433
3434 // Simplify demanded bits before splitting into multiple users.
3435 if (SDValue V = simplifyI24(N, DCI))
3436 return V;
3437
3438 SDValue N0 = N->getOperand(0);
3439 SDValue N1 = N->getOperand(1);
3440
3441 bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3442
3443 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3444 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3445
3446 SDLoc SL(N);
3447
3448 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3449 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3450 return DAG.getMergeValues({ MulLo, MulHi }, SL);
3451}
3452
3453static bool isNegativeOne(SDValue Val) {
3454 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3455 return C->isAllOnesValue();
3456 return false;
3457}
3458
3459SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3460 SDValue Op,
3461 const SDLoc &DL,
3462 unsigned Opc) const {
3463 EVT VT = Op.getValueType();
3464 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3465 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3466 LegalVT != MVT::i16))
3467 return SDValue();
3468
3469 if (VT != MVT::i32)
3470 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3471
3472 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3473 if (VT != MVT::i32)
3474 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3475
3476 return FFBX;
3477}
3478
3479// The native instructions return -1 on 0 input. Optimize out a select that
3480// produces -1 on 0.
3481//
3482// TODO: If zero is not undef, we could also do this if the output is compared
3483// against the bitwidth.
3484//
3485// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3486SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3487 SDValue LHS, SDValue RHS,
3488 DAGCombinerInfo &DCI) const {
3489 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3490 if (!CmpRhs || !CmpRhs->isNullValue())
3491 return SDValue();
3492
3493 SelectionDAG &DAG = DCI.DAG;
3494 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3495 SDValue CmpLHS = Cond.getOperand(0);
3496
3497 unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3498 AMDGPUISD::FFBH_U32;
3499
3500 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3501 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3502 if (CCOpcode == ISD::SETEQ &&
3503 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3504 RHS.getOperand(0) == CmpLHS &&
3505 isNegativeOne(LHS)) {
3506 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3507 }
3508
3509 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3510 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3511 if (CCOpcode == ISD::SETNE &&
3512 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3513 LHS.getOperand(0) == CmpLHS &&
3514 isNegativeOne(RHS)) {
3515 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3516 }
3517
3518 return SDValue();
3519}
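
The combine is sound because, as the comment above says, the native count instructions already return -1 for a zero input, so the select that reproduces that value is redundant. A scalar model of the contract for the ctlz case (ffbh_u32 here is an illustrative stand-in for the machine semantics, not an LLVM API):

#include <cassert>
#include <initializer_list>

static int ffbh_u32(unsigned X) {            // models: returns -1 on 0 input
  return X == 0 ? -1 : __builtin_clz(X);
}

int main() {
  for (unsigned X : {0u, 1u, 0x80000000u}) {
    // select (setcc x, 0, eq), -1, (ctlz_zero_undef x)
    int Selected = (X == 0) ? -1 : __builtin_clz(X);
    assert(Selected == ffbh_u32(X));         // the select folds to ffbh alone
  }
  return 0;
}
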
3520
3521static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3522 unsigned Op,
3523 const SDLoc &SL,
3524 SDValue Cond,
3525 SDValue N1,
3526 SDValue N2) {
3527 SelectionDAG &DAG = DCI.DAG;
3528 EVT VT = N1.getValueType();
3529
3530 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3531 N1.getOperand(0), N2.getOperand(0));
3532 DCI.AddToWorklist(NewSelect.getNode());
3533 return DAG.getNode(Op, SL, VT, NewSelect);
3534}
3535
3536// Pull a free FP operation out of a select so it may fold into uses.
3537//
3538// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3539// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3540//
3541// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3542// select c, (fabs x), +k -> fabs (select c, x, k)
3543static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3544 SDValue N) {
3545 SelectionDAG &DAG = DCI.DAG;
3546 SDValue Cond = N.getOperand(0);
3547 SDValue LHS = N.getOperand(1);
3548 SDValue RHS = N.getOperand(2);
3549
3550 EVT VT = N.getValueType();
3551 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3552 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3553 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3554 SDLoc(N), Cond, LHS, RHS);
3555 }
3556
3557 bool Inv = false;
3558 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3559 std::swap(LHS, RHS);
3560 Inv = true;
3561 }
3562
3563 // TODO: Support vector constants.
3564 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3565 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3566 SDLoc SL(N);
3567 // If one side is an fneg/fabs and the other is a constant, we can push the
3568 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3569 SDValue NewLHS = LHS.getOperand(0);
3570 SDValue NewRHS = RHS;
3571
3572 // Careful: if the neg can be folded up, don't try to pull it back down.
3573 bool ShouldFoldNeg = true;
3574
3575 if (NewLHS.hasOneUse()) {
3576 unsigned Opc = NewLHS.getOpcode();
3577 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3578 ShouldFoldNeg = false;
3579 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3580 ShouldFoldNeg = false;
3581 }
3582
3583 if (ShouldFoldNeg) {
3584 if (LHS.getOpcode() == ISD::FNEG)
3585 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3586 else if (CRHS->isNegative())
3587 return SDValue();
3588
3589 if (Inv)
3590 std::swap(NewLHS, NewRHS);
3591
3592 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3593 Cond, NewLHS, NewRHS);
3594 DCI.AddToWorklist(NewSelect.getNode());
3595 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3596 }
3597 }
3598
3599 return SDValue();
3600}
3601
3602
3603SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3604 DAGCombinerInfo &DCI) const {
3605 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3606 return Folded;
3607
3608 SDValue Cond = N->getOperand(0);
3609 if (Cond.getOpcode() != ISD::SETCC)
3610 return SDValue();
3611
3612 EVT VT = N->getValueType(0);
3613 SDValue LHS = Cond.getOperand(0);
3614 SDValue RHS = Cond.getOperand(1);
3615 SDValue CC = Cond.getOperand(2);
3616
3617 SDValue True = N->getOperand(1);
3618 SDValue False = N->getOperand(2);
3619
3620 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3621 SelectionDAG &DAG = DCI.DAG;
3622 if (DAG.isConstantValueOfAnyType(True) &&
3623 !DAG.isConstantValueOfAnyType(False)) {
3624 // Swap cmp + select pair to move constant to false input.
3625 // This will allow using VOPC cndmasks more often.
3626 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3627
3628 SDLoc SL(N);
3629 ISD::CondCode NewCC =
3630 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3631
3632 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3633 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3634 }
3635
3636 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3637 SDValue MinMax
3638 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3639 // Revisit this node so we can catch min3/max3/med3 patterns.
3640 //DCI.AddToWorklist(MinMax.getNode());
3641 return MinMax;
3642 }
3643 }
3644
3645 // There's no reason to not do this if the condition has other uses.
3646 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3647}
3648
3649static bool isInv2Pi(const APFloat &APF) {
3650 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3651 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3652 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3653
3654 return APF.bitwiseIsEqual(KF16) ||
3655 APF.bitwiseIsEqual(KF32) ||
3656 APF.bitwiseIsEqual(KF64);
3657}
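
The three bit patterns checked here are 1/(2*pi) in half, single and double precision. A quick standalone check of the f32 value (illustration only):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Bits = 0x3e22f983;                      // the KF32 pattern above
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  assert(std::fabs(F - 1.0f / (2.0f * 3.1415927f)) < 1e-6f);   // ~ 1 / (2*pi)
  return 0;
}
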
3658
3659 // The negations of 0 and 1.0 / (2.0 * pi) do not have inline immediates, so
3660 // there is an additional cost to negate these constants.
3661bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3662 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3663 if (C->isZero() && !C->isNegative())
3664 return true;
3665
3666 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3667 return true;
3668 }
3669
3670 return false;
3671}
3672
3673static unsigned inverseMinMax(unsigned Opc) {
3674 switch (Opc) {
3675 case ISD::FMAXNUM:
3676 return ISD::FMINNUM;
3677 case ISD::FMINNUM:
3678 return ISD::FMAXNUM;
3679 case ISD::FMAXNUM_IEEE:
3680 return ISD::FMINNUM_IEEE;
3681 case ISD::FMINNUM_IEEE:
3682 return ISD::FMAXNUM_IEEE;
3683 case AMDGPUISD::FMAX_LEGACY:
3684 return AMDGPUISD::FMIN_LEGACY;
3685 case AMDGPUISD::FMIN_LEGACY:
3686 return AMDGPUISD::FMAX_LEGACY;
3687 default:
3688 llvm_unreachable("invalid min/max opcode");
3689 }
3690}
3691
3692SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3693 DAGCombinerInfo &DCI) const {
3694 SelectionDAG &DAG = DCI.DAG;
3695 SDValue N0 = N->getOperand(0);
3696 EVT VT = N->getValueType(0);
3697
3698 unsigned Opc = N0.getOpcode();
3699
3700 // If the input has multiple uses and we can either fold the negate down, or
3701 // the other uses cannot, give up. This both prevents unprofitable
3702 // transformations and infinite loops: we won't repeatedly try to fold around
3703 // a negate that has no 'good' form.
3704 if (N0.hasOneUse()) {
3705 // This may be able to fold into the source, but at a code size cost. Don't
3706 // fold if the fold into the user is free.
3707 if (allUsesHaveSourceMods(N, 0))
3708 return SDValue();
3709 } else {
3710 if (fnegFoldsIntoOp(Opc) &&
3711 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3712 return SDValue();
3713 }
3714
3715 SDLoc SL(N);
3716 switch (Opc) {
3717 case ISD::FADD: {
3718 if (!mayIgnoreSignedZero(N0))
3719 return SDValue();
3720
3721 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3722 SDValue LHS = N0.getOperand(0);
3723 SDValue RHS = N0.getOperand(1);
3724
3725 if (LHS.getOpcode() != ISD::FNEG)
3726 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3727 else
3728 LHS = LHS.getOperand(0);
3729
3730 if (RHS.getOpcode() != ISD::FNEG)
3731 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3732 else
3733 RHS = RHS.getOperand(0);
3734
3735 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3736 if (Res.getOpcode() != ISD::FADD)
3737 return SDValue(); // Op got folded away.
3738 if (!N0.hasOneUse())
3739 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3740 return Res;
3741 }
3742 case ISD::FMUL:
3743 case AMDGPUISD::FMUL_LEGACY: {
3744 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3745 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3746 SDValue LHS = N0.getOperand(0);
3747 SDValue RHS = N0.getOperand(1);
3748
3749 if (LHS.getOpcode() == ISD::FNEG)
3750 LHS = LHS.getOperand(0);
3751 else if (RHS.getOpcode() == ISD::FNEG)
3752 RHS = RHS.getOperand(0);
3753 else
3754 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3755
3756 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3757 if (Res.getOpcode() != Opc)
3758 return SDValue(); // Op got folded away.
3759 if (!N0.hasOneUse())
3760 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3761 return Res;
3762 }
3763 case ISD::FMA:
3764 case ISD::FMAD: {
3765 if (!mayIgnoreSignedZero(N0))
3766 return SDValue();
3767
3768 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3769 SDValue LHS = N0.getOperand(0);
3770 SDValue MHS = N0.getOperand(1);
3771 SDValue RHS = N0.getOperand(2);
3772
3773 if (LHS.getOpcode() == ISD::FNEG)
3774 LHS = LHS.getOperand(0);
3775 else if (MHS.getOpcode() == ISD::FNEG)
3776 MHS = MHS.getOperand(0);
3777 else
3778 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3779
3780 if (RHS.getOpcode() != ISD::FNEG)
3781 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3782 else
3783 RHS = RHS.getOperand(0);
3784
3785 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3786 if (Res.getOpcode() != Opc)
3787 return SDValue(); // Op got folded away.
3788 if (!N0.hasOneUse())
3789 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3790 return Res;
3791 }
3792 case ISD::FMAXNUM:
3793 case ISD::FMINNUM:
3794 case ISD::FMAXNUM_IEEE:
3795 case ISD::FMINNUM_IEEE:
3796 case AMDGPUISD::FMAX_LEGACY:
3797 case AMDGPUISD::FMIN_LEGACY: {
3798 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3799 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3800 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3801 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3802
3803 SDValue LHS = N0.getOperand(0);
3804 SDValue RHS = N0.getOperand(1);
3805
3806 // 0 doesn't have a negated inline immediate.
3807 // TODO: This constant check should be generalized to other operations.
3808 if (isConstantCostlierToNegate(RHS))
3809 return SDValue();
3810
3811 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3812 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3813 unsigned Opposite = inverseMinMax(Opc);
3814
3815 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3816 if (Res.getOpcode() != Opposite)
3817 return SDValue(); // Op got folded away.
3818 if (!N0.hasOneUse())
3819 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3820 return Res;
3821 }
3822 case AMDGPUISD::FMED3: {
3823 SDValue Ops[3];
3824 for (unsigned I = 0; I < 3; ++I)
3825 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3826
3827 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3828 if (Res.getOpcode() != AMDGPUISD::FMED3)
3829 return SDValue(); // Op got folded away.
3830 if (!N0.hasOneUse())
3831 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3832 return Res;
3833 }
3834 case ISD::FP_EXTEND:
3835 case ISD::FTRUNC:
3836 case ISD::FRINT:
3837 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3838 case ISD::FSIN:
3839 case ISD::FCANONICALIZE:
3840 case AMDGPUISD::RCP:
3841 case AMDGPUISD::RCP_LEGACY:
3842 case AMDGPUISD::RCP_IFLAG:
3843 case AMDGPUISD::SIN_HW: {
3844 SDValue CvtSrc = N0.getOperand(0);
3845 if (CvtSrc.getOpcode() == ISD::FNEG) {
3846 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3847 // (fneg (rcp (fneg x))) -> (rcp x)
3848 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3849 }
3850
3851 if (!N0.hasOneUse())
3852 return SDValue();
3853
3854 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3855 // (fneg (rcp x)) -> (rcp (fneg x))
3856 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3857 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3858 }
3859 case ISD::FP_ROUND: {
3860 SDValue CvtSrc = N0.getOperand(0);
3861
3862 if (CvtSrc.getOpcode() == ISD::FNEG) {
3863 // (fneg (fp_round (fneg x))) -> (fp_round x)
3864 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3865 CvtSrc.getOperand(0), N0.getOperand(1));
3866 }
3867
3868 if (!N0.hasOneUse())
3869 return SDValue();
3870
3871 // (fneg (fp_round x)) -> (fp_round (fneg x))
3872 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3873 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3874 }
3875 case ISD::FP16_TO_FP: {
3876 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3877 // f16, but legalization of f16 fneg ends up pulling it out of the source.
3878 // Put the fneg back as a legal source operation that can be matched later.
3879 SDLoc SL(N);
3880
3881 SDValue Src = N0.getOperand(0);
3882 EVT SrcVT = Src.getValueType();
3883
3884 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3885 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3886 DAG.getConstant(0x8000, SL, SrcVT));
3887 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3888 }
3889 default:
3890 return SDValue();
3891 }
3892}
3893
3894SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3895 DAGCombinerInfo &DCI) const {
3896 SelectionDAG &DAG = DCI.DAG;
3897 SDValue N0 = N->getOperand(0);
3898
3899 if (!N0.hasOneUse())
3900 return SDValue();
3901
3902 switch (N0.getOpcode()) {
3903 case ISD::FP16_TO_FP: {
3904 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3905 SDLoc SL(N);
3906 SDValue Src = N0.getOperand(0);
3907 EVT SrcVT = Src.getValueType();
3908
3909 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3910 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3911 DAG.getConstant(0x7fff, SL, SrcVT));
3912 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3913 }
3914 default:
3915 return SDValue();
3916 }
3917}
3918
3919SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3920 DAGCombinerInfo &DCI) const {
3921 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3922 if (!CFP)
3923 return SDValue();
3924
3925 // XXX - Should this flush denormals?
3926 const APFloat &Val = CFP->getValueAPF();
3927 APFloat One(Val.getSemantics(), "1.0");
3928 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3929}
3930
3931SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3932 DAGCombinerInfo &DCI) const {
3933 SelectionDAG &DAG = DCI.DAG;
3934 SDLoc DL(N);
3935
3936 switch(N->getOpcode()) {
3937 default:
3938 break;
3939 case ISD::BITCAST: {
3940 EVT DestVT = N->getValueType(0);
3941
3942 // Push casts through vector builds. This helps avoid emitting a large
3943 // number of copies when materializing floating point vector constants.
3944 //
3945 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3946 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3947 if (DestVT.isVector()) {
3948 SDValue Src = N->getOperand(0);
3949 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3950 EVT SrcVT = Src.getValueType();
3951 unsigned NElts = DestVT.getVectorNumElements();
3952
3953 if (SrcVT.getVectorNumElements() == NElts) {
3954 EVT DestEltVT = DestVT.getVectorElementType();
3955
3956 SmallVector<SDValue, 8> CastedElts;
3957 SDLoc SL(N);
3958 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3959 SDValue Elt = Src.getOperand(I);
3960 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3961 }
3962
3963 return DAG.getBuildVector(DestVT, SL, CastedElts);
3964 }
3965 }
3966 }
3967
3968 if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3969 break;
3970
3971 // Fold bitcasts of constants.
3972 //
3973 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3974 // TODO: Generalize and move to DAGCombiner
3975 SDValue Src = N->getOperand(0);
3976 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3977 if (Src.getValueType() == MVT::i64) {
3978 SDLoc SL(N);
3979 uint64_t CVal = C->getZExtValue();
3980 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3981 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3982 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3983 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
3984 }
3985 }
3986
3987 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3988 const APInt &Val = C->getValueAPF().bitcastToAPInt();
3989 SDLoc SL(N);
3990 uint64_t CVal = Val.getZExtValue();
3991 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3992 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3993 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3994
3995 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3996 }
3997
3998 break;
3999 }
4000 case ISD::SHL: {
4001 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4002 break;
4003
4004 return performShlCombine(N, DCI);
4005 }
4006 case ISD::SRL: {
4007 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4008 break;
4009
4010 return performSrlCombine(N, DCI);
4011 }
4012 case ISD::SRA: {
4013 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4014 break;
4015
4016 return performSraCombine(N, DCI);
4017 }
4018 case ISD::TRUNCATE:
4019 return performTruncateCombine(N, DCI);
4020 case ISD::MUL:
4021 return performMulCombine(N, DCI);
4022 case ISD::MULHS:
4023 return performMulhsCombine(N, DCI);
4024 case ISD::MULHU:
4025 return performMulhuCombine(N, DCI);
4026 case AMDGPUISD::MUL_I24:
4027 case AMDGPUISD::MUL_U24:
4028 case AMDGPUISD::MULHI_I24:
4029 case AMDGPUISD::MULHI_U24: {
4030 if (SDValue V = simplifyI24(N, DCI))
4031 return V;
4032 return SDValue();
4033 }
4034 case AMDGPUISD::MUL_LOHI_I24:
4035 case AMDGPUISD::MUL_LOHI_U24:
4036 return performMulLoHi24Combine(N, DCI);
4037 case ISD::SELECT:
4038 return performSelectCombine(N, DCI);
4039 case ISD::FNEG:
4040 return performFNegCombine(N, DCI);
4041 case ISD::FABS:
4042 return performFAbsCombine(N, DCI);
4043 case AMDGPUISD::BFE_I32:
4044 case AMDGPUISD::BFE_U32: {
4045 assert(!N->getValueType(0).isVector() &&
4046 "Vector handling of BFE not implemented");
4047 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4048 if (!Width)
4049 break;
4050
4051 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4052 if (WidthVal == 0)
4053 return DAG.getConstant(0, DL, MVT::i32);
4054
4055 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4056 if (!Offset)
4057 break;
4058
4059 SDValue BitsFrom = N->getOperand(0);
4060 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4061
4062 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4063
4064 if (OffsetVal == 0) {
4065 // This is already sign / zero extended, so try to fold away extra BFEs.
4066 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4067
4068 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4069 if (OpSignBits >= SignBits)
4070 return BitsFrom;
4071
4072 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4073 if (Signed) {
4074 // This is a sign_extend_inreg. Replace it to take advantage of existing
4075 // DAG Combines. If not eliminated, we will match back to BFE during
4076 // selection.
4077
4078 // TODO: The sext_inreg of extended types ends up here, although we could
4079 // handle them in a single BFE.
4080 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4081 DAG.getValueType(SmallVT));
4082 }
4083
4084 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4085 }
4086
4087 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4088 if (Signed) {
4089 return constantFoldBFE<int32_t>(DAG,
4090 CVal->getSExtValue(),
4091 OffsetVal,
4092 WidthVal,
4093 DL);
4094 }
4095
4096 return constantFoldBFE<uint32_t>(DAG,
4097 CVal->getZExtValue(),
4098 OffsetVal,
4099 WidthVal,
4100 DL);
4101 }
4102
4103 if ((OffsetVal + WidthVal) >= 32 &&
4104 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4105 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4106 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4107 BitsFrom, ShiftVal);
4108 }
4109
4110 if (BitsFrom.hasOneUse()) {
4111 APInt Demanded = APInt::getBitsSet(32,
4112 OffsetVal,
4113 OffsetVal + WidthVal);
4114
4115 KnownBits Known;
4116 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4117 !DCI.isBeforeLegalizeOps());
4118 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4119 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4120 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4121 DCI.CommitTargetLoweringOpt(TLO);
4122 }
4123 }
4124
4125 break;
4126 }
4127 case ISD::LOAD:
4128 return performLoadCombine(N, DCI);
4129 case ISD::STORE:
4130 return performStoreCombine(N, DCI);
4131 case AMDGPUISD::RCP:
4132 case AMDGPUISD::RCP_IFLAG:
4133 return performRcpCombine(N, DCI);
4134 case ISD::AssertZext:
4135 case ISD::AssertSext:
4136 return performAssertSZExtCombine(N, DCI);
4137 case ISD::INTRINSIC_WO_CHAIN:
4138 return performIntrinsicWOChainCombine(N, DCI);
4139 }
4140 return SDValue();
4141}
4142
4143//===----------------------------------------------------------------------===//
4144// Helper functions
4145//===----------------------------------------------------------------------===//
4146
4147SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4148 const TargetRegisterClass *RC,
4149 unsigned Reg, EVT VT,
4150 const SDLoc &SL,
4151 bool RawReg) const {
4152 MachineFunction &MF = DAG.getMachineFunction();
4153 MachineRegisterInfo &MRI = MF.getRegInfo();
4154 unsigned VReg;
4155
4156 if (!MRI.isLiveIn(Reg)) {
4157 VReg = MRI.createVirtualRegister(RC);
4158 MRI.addLiveIn(Reg, VReg);
4159 } else {
4160 VReg = MRI.getLiveInVirtReg(Reg);
4161 }
4162
4163 if (RawReg)
4164 return DAG.getRegister(VReg, VT);
4165
4166 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4167}
4168
4169// This may be called multiple times, and nothing prevents creating multiple
4170// objects at the same offset. See if we already defined this object.
4171static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4172 int64_t Offset) {
4173 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4174 if (MFI.getObjectOffset(I) == Offset) {
4175 assert(MFI.getObjectSize(I) == Size);
4176 return I;
4177 }
4178 }
4179
4180 return MFI.CreateFixedObject(Size, Offset, true);
4181}
4182
4183SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4184 EVT VT,
4185 const SDLoc &SL,
4186 int64_t Offset) const {
4187 MachineFunction &MF = DAG.getMachineFunction();
4188 MachineFrameInfo &MFI = MF.getFrameInfo();
4189 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4190
4191 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4192 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4193
4194 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
4195 MachineMemOperand::MODereferenceable |
4196 MachineMemOperand::MOInvariant);
4197}
4198
4199SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4200 const SDLoc &SL,
4201 SDValue Chain,
4202 SDValue ArgVal,
4203 int64_t Offset) const {
4204 MachineFunction &MF = DAG.getMachineFunction();
4205 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4206
4207 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4208 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
4209 MachineMemOperand::MODereferenceable);
4210 return Store;
4211}
4212
4213SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4214 const TargetRegisterClass *RC,
4215 EVT VT, const SDLoc &SL,
4216 const ArgDescriptor &Arg) const {
4217 assert(Arg && "Attempting to load missing argument");
1. Assuming the condition is true
2. '?' condition is true
4218
4219 SDValue V = Arg.isRegister() ?
3. '?' condition is true
4220 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4221 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4222
4223 if (!Arg.isMasked())
4. Calling 'ArgDescriptor::isMasked'
7. Returning from 'ArgDescriptor::isMasked'
8. Taking false branch
4224 return V;
4225
4226 unsigned Mask = Arg.getMask();
4227 unsigned Shift = countTrailingZeros<unsigned>(Mask);
9. Calling 'countTrailingZeros<unsigned int>'
16. Returning from 'countTrailingZeros<unsigned int>'
17. 'Shift' initialized to 32
4228 V = DAG.getNode(ISD::SRL, SL, VT, V,
4229 DAG.getShiftAmountConstant(Shift, VT, SL));
4230 return DAG.getNode(ISD::AND, SL, VT, V,
4231 DAG.getConstant(Mask >> Shift, SL, VT));
18. The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4232}
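
The warning fires because countTrailingZeros is called with its default ZB_Width behavior, so a mask that is "masked" (Mask != ~0u) but happens to have no bits set yields Shift == 32, and Mask >> 32 on a 32-bit unsigned value is undefined behavior. The following standalone C++ sketch (not part of the LLVM sources; all names here are illustrative) reproduces the path and shows one hypothetical way to guard it:

// Standalone sketch of the flagged path; not the LLVM implementation.
#include <cassert>
#include <cstdint>

// Mirrors llvm::countTrailingZeros with ZB_Width semantics: an input of 0
// maps to the full bit width (32), which is the value the analyzer tracks.
static unsigned ctz32(uint32_t Val) {
  return Val == 0 ? 32u : static_cast<unsigned>(__builtin_ctz(Val));
}

// Hypothetical guarded variant: only shift when the mask has a set bit.
uint32_t extractMaskedField(uint32_t V, uint32_t Mask) {
  assert(Mask != 0 && "an empty mask would shift by the full bit width");
  unsigned Shift = ctz32(Mask);
  return (V >> Shift) & (Mask >> Shift); // Shift < 32 here, so well defined
}

Whether the assertion, an early return, or a proof that the mask is always non-zero is the right fix is a judgment call for the maintainers; the sketch only demonstrates where the undefined shift comes from.
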
4233
4234uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4235 const MachineFunction &MF, const ImplicitParameter Param) const {
4236 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4237 const AMDGPUSubtarget &ST =
4238 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4239 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4240 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4241 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4242 ExplicitArgOffset;
4243 switch (Param) {
4244 case GRID_DIM:
4245 return ArgOffset;
4246 case GRID_OFFSET:
4247 return ArgOffset + 4;
4248 }
4249 llvm_unreachable("unexpected implicit parameter type");
4250}
4251
4252 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4253
4254const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4255 switch ((AMDGPUISD::NodeType)Opcode) {
4256 case AMDGPUISD::FIRST_NUMBER: break;
4257 // AMDIL DAG nodes
4258 NODE_NAME_CASE(UMUL);
4259 NODE_NAME_CASE(BRANCH_COND);
4260
4261 // AMDGPU DAG nodes
4262 NODE_NAME_CASE(IF)
4263 NODE_NAME_CASE(ELSE)
4264 NODE_NAME_CASE(LOOP)
4265 NODE_NAME_CASE(CALL)
4266 NODE_NAME_CASE(TC_RETURN)
4267 NODE_NAME_CASE(TRAP)
4268 NODE_NAME_CASE(RET_FLAG)
4269 NODE_NAME_CASE(RETURN_TO_EPILOG)
4270 NODE_NAME_CASE(ENDPGM)
4271 NODE_NAME_CASE(DWORDADDR)
4272 NODE_NAME_CASE(FRACT)
4273 NODE_NAME_CASE(SETCC)
4274 NODE_NAME_CASE(SETREG)
4275 NODE_NAME_CASE(DENORM_MODE)
4276 NODE_NAME_CASE(FMA_W_CHAIN)
4277 NODE_NAME_CASE(FMUL_W_CHAIN)
4278 NODE_NAME_CASE(CLAMP)
4279 NODE_NAME_CASE(COS_HW)
4280 NODE_NAME_CASE(SIN_HW)
4281 NODE_NAME_CASE(FMAX_LEGACY)
4282 NODE_NAME_CASE(FMIN_LEGACY)
4283 NODE_NAME_CASE(FMAX3)
4284 NODE_NAME_CASE(SMAX3)
4285 NODE_NAME_CASE(UMAX3)
4286 NODE_NAME_CASE(FMIN3)
4287 NODE_NAME_CASE(SMIN3)
4288 NODE_NAME_CASE(UMIN3)
4289 NODE_NAME_CASE(FMED3)
4290 NODE_NAME_CASE(SMED3)
4291 NODE_NAME_CASE(UMED3)
4292 NODE_NAME_CASE(FDOT2)
4293 NODE_NAME_CASE(URECIP)
4294 NODE_NAME_CASE(DIV_SCALE)
4295 NODE_NAME_CASE(DIV_FMAS)
4296 NODE_NAME_CASE(DIV_FIXUP)
4297 NODE_NAME_CASE(FMAD_FTZ)
4298 NODE_NAME_CASE(TRIG_PREOP)
4299 NODE_NAME_CASE(RCP)
4300 NODE_NAME_CASE(RSQ)
4301 NODE_NAME_CASE(RCP_LEGACY)
4302 NODE_NAME_CASE(RSQ_LEGACY)
4303 NODE_NAME_CASE(RCP_IFLAG)
4304 NODE_NAME_CASE(FMUL_LEGACY)
4305 NODE_NAME_CASE(RSQ_CLAMP)
4306 NODE_NAME_CASE(LDEXP)
4307 NODE_NAME_CASE(FP_CLASS)
4308 NODE_NAME_CASE(DOT4)
4309 NODE_NAME_CASE(CARRY)
4310 NODE_NAME_CASE(BORROW)
4311 NODE_NAME_CASE(BFE_U32)
4312 NODE_NAME_CASE(BFE_I32)
4313 NODE_NAME_CASE(BFI)
4314 NODE_NAME_CASE(BFM)
4315 NODE_NAME_CASE(FFBH_U32)
4316 NODE_NAME_CASE(FFBH_I32)
4317 NODE_NAME_CASE(FFBL_B32)
4318 NODE_NAME_CASE(MUL_U24)
4319 NODE_NAME_CASE(MUL_I24)
4320 NODE_NAME_CASE(MULHI_U24)
4321 NODE_NAME_CASE(MULHI_I24)
4322 NODE_NAME_CASE(MUL_LOHI_U24)
4323 NODE_NAME_CASE(MUL_LOHI_I24)
4324 NODE_NAME_CASE(MAD_U24)
4325 NODE_NAME_CASE(MAD_I24)
4326 NODE_NAME_CASE(MAD_I64_I32)
4327 NODE_NAME_CASE(MAD_U64_U32)
4328 NODE_NAME_CASE(PERM)
4329 NODE_NAME_CASE(TEXTURE_FETCH)
4330 NODE_NAME_CASE(R600_EXPORT)
4331 NODE_NAME_CASE(CONST_ADDRESS)
4332 NODE_NAME_CASE(REGISTER_LOAD)
4333 NODE_NAME_CASE(REGISTER_STORE)
4334 NODE_NAME_CASE(SAMPLE)
4335 NODE_NAME_CASE(SAMPLEB)
4336 NODE_NAME_CASE(SAMPLED)
4337 NODE_NAME_CASE(SAMPLEL)
4338 NODE_NAME_CASE(CVT_F32_UBYTE0)
4339 NODE_NAME_CASE(CVT_F32_UBYTE1)
4340 NODE_NAME_CASE(CVT_F32_UBYTE2)
4341 NODE_NAME_CASE(CVT_F32_UBYTE3)
4342 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4343 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4344 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4345 NODE_NAME_CASE(CVT_PK_I16_I32)
4346 NODE_NAME_CASE(CVT_PK_U16_U32)
4347 NODE_NAME_CASE(FP_TO_FP16)
4348 NODE_NAME_CASE(FP16_ZEXT)
4349 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4350 NODE_NAME_CASE(CONST_DATA_PTR)
4351 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4352 NODE_NAME_CASE(LDS)
4353 NODE_NAME_CASE(DUMMY_CHAIN)
4354 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4355 NODE_NAME_CASE(LOAD_D16_HI)
4356 NODE_NAME_CASE(LOAD_D16_LO)
4357 NODE_NAME_CASE(LOAD_D16_HI_I8)
4358 NODE_NAME_CASE(LOAD_D16_HI_U8)
4359 NODE_NAME_CASE(LOAD_D16_LO_I8)
4360 NODE_NAME_CASE(LOAD_D16_LO_U8)
4361 NODE_NAME_CASE(STORE_MSKOR)
4362 NODE_NAME_CASE(LOAD_CONSTANT)
4363 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4364 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4365 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4366 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4367 NODE_NAME_CASE(DS_ORDERED_COUNT)
4368 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4369 NODE_NAME_CASE(ATOMIC_INC)
4370 NODE_NAME_CASE(ATOMIC_DEC)
4371 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4372 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4373 NODE_NAME_CASE(BUFFER_LOAD)
4374 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4375 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4376 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4377 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4378 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4379 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4380 NODE_NAME_CASE(SBUFFER_LOAD)
4381 NODE_NAME_CASE(BUFFER_STORE)
4382 NODE_NAME_CASE(BUFFER_STORE_BYTE)
4383 NODE_NAME_CASE(BUFFER_STORE_SHORT)
4384 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4385 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4386 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4387 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4388 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4389 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4390 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4391 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4392 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4393 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4394 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4395 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4396 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4397 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4398 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4399 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4400 NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
4401 NODE_NAME_CASE(ATOMIC_PK_FADD)
4402
4403 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4404 }
4405 return nullptr;
4406}
4407
4408SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4409 SelectionDAG &DAG, int Enabled,
4410 int &RefinementSteps,
4411 bool &UseOneConstNR,
4412 bool Reciprocal) const {
4413 EVT VT = Operand.getValueType();
4414
4415 if (VT == MVT::f32) {
4416 RefinementSteps = 0;
4417 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4418 }
4419
4420 // TODO: There is also f64 rsq instruction, but the documentation is less
4421 // clear on its precision.
4422
4423 return SDValue();
4424}
4425
4426SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4427 SelectionDAG &DAG, int Enabled,
4428 int &RefinementSteps) const {
4429 EVT VT = Operand.getValueType();
4430
4431 if (VT == MVT::f32) {
4432 // Reciprocal, < 1 ulp error.
4433 //
4434 // This reciprocal approximation converges to < 0.5 ulp error with one
4435 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4436
4437 RefinementSteps = 0;
4438 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4439 }
4440
4441 // TODO: There is also f64 rcp instruction, but the documentation is less
4442 // clear on its precision.
4443
4444 return SDValue();
4445}
4446
4447void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4448 const SDValue Op, KnownBits &Known,
4449 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4450
4451 Known.resetAll(); // Don't know anything.
4452
4453 unsigned Opc = Op.getOpcode();
4454
4455 switch (Opc) {
4456 default:
4457 break;
4458 case AMDGPUISD::CARRY:
4459 case AMDGPUISD::BORROW: {
4460 Known.Zero = APInt::getHighBitsSet(32, 31);
4461 break;
4462 }
4463
4464 case AMDGPUISD::BFE_I32:
4465 case AMDGPUISD::BFE_U32: {
4466 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4467 if (!CWidth)
4468 return;
4469
4470 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4471
4472 if (Opc == AMDGPUISD::BFE_U32)
4473 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4474
4475 break;
4476 }
4477 case AMDGPUISD::FP_TO_FP16:
4478 case AMDGPUISD::FP16_ZEXT: {
4479 unsigned BitWidth = Known.getBitWidth();
4480
4481 // High bits are zero.
4482 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4483 break;
4484 }
4485 case AMDGPUISD::MUL_U24:
4486 case AMDGPUISD::MUL_I24: {
4487 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4488 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4489 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4490 RHSKnown.countMinTrailingZeros();
4491 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4492 // Skip extra check if all bits are known zeros.
4493 if (TrailZ >= 32)
4494 break;
4495
4496 // Truncate to 24 bits.
4497 LHSKnown = LHSKnown.trunc(24);
4498 RHSKnown = RHSKnown.trunc(24);
4499
4500 if (Opc == AMDGPUISD::MUL_I24) {
4501 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
4502 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
4503 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4504 if (MaxValBits >= 32)
4505 break;
4506 bool LHSNegative = LHSKnown.isNegative();
4507 bool LHSNonNegative = LHSKnown.isNonNegative();
4508 bool LHSPositive = LHSKnown.isStrictlyPositive();
4509 bool RHSNegative = RHSKnown.isNegative();
4510 bool RHSNonNegative = RHSKnown.isNonNegative();
4511 bool RHSPositive = RHSKnown.isStrictlyPositive();
4512
4513 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4514 Known.Zero.setHighBits(32 - MaxValBits);
4515 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4516 Known.One.setHighBits(32 - MaxValBits);
4517 } else {
4518 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
4519 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
4520 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4521 if (MaxValBits >= 32)
4522 break;
4523 Known.Zero.setHighBits(32 - MaxValBits);
4524 }
4525 break;
4526 }
4527 case AMDGPUISD::PERM: {
4528 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4529 if (!CMask)
4530 return;
4531
4532 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4533 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4534 unsigned Sel = CMask->getZExtValue();
4535
4536 for (unsigned I = 0; I < 32; I += 8) {
4537 unsigned SelBits = Sel & 0xff;
4538 if (SelBits < 4) {
4539 SelBits *= 8;
4540 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4541 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4542 } else if (SelBits < 7) {
4543 SelBits = (SelBits & 3) * 8;
4544 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4545 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4546 } else if (SelBits == 0x0c) {
4547 Known.Zero |= 0xFFull << I;
4548 } else if (SelBits > 0x0c) {
4549 Known.One |= 0xFFull << I;
4550 }
4551 Sel >>= 8;
4552 }
4553 break;
4554 }
4555 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4556 Known.Zero.setHighBits(24);
4557 break;
4558 }
4559 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4560 Known.Zero.setHighBits(16);
4561 break;
4562 }
4563 case AMDGPUISD::LDS: {
4564 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4565 unsigned Align = GA->getGlobal()->getAlignment();
4566
4567 Known.Zero.setHighBits(16);
4568 if (Align)
4569 Known.Zero.setLowBits(Log2_32(Align));
4570 break;
4571 }
4572 case ISD::INTRINSIC_WO_CHAIN: {
4573 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4574 switch (IID) {
4575 case Intrinsic::amdgcn_mbcnt_lo:
4576 case Intrinsic::amdgcn_mbcnt_hi: {
4577 const GCNSubtarget &ST =
4578 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4579 // These return at most the wavefront size - 1.
4580 unsigned Size = Op.getValueType().getSizeInBits();
4581 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4582 break;
4583 }
4584 default:
4585 break;
4586 }
4587 }
4588 }
4589}
4590
4591unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4592 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4593 unsigned Depth) const {
4594 switch (Op.getOpcode()) {
4595 case AMDGPUISD::BFE_I32: {
4596 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4597 if (!Width)
4598 return 1;
4599
4600 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4601 if (!isNullConstant(Op.getOperand(1)))
4602 return SignBits;
4603
4604 // TODO: Could probably figure something out with non-0 offsets.
4605 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4606 return std::max(SignBits, Op0SignBits);
4607 }
4608
4609 case AMDGPUISD::BFE_U32: {
4610 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4611 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4612 }
4613
4614 case AMDGPUISD::CARRY:
4615 case AMDGPUISD::BORROW:
4616 return 31;
4617 case AMDGPUISD::BUFFER_LOAD_BYTE:
4618 return 25;
4619 case AMDGPUISD::BUFFER_LOAD_SHORT:
4620 return 17;
4621 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4622 return 24;
4623 case AMDGPUISD::BUFFER_LOAD_USHORT:
4624 return 16;
4625 case AMDGPUISD::FP_TO_FP16:
4626 case AMDGPUISD::FP16_ZEXT:
4627 return 16;
4628 default:
4629 return 1;
4630 }
4631}
4632
4633bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4634 const SelectionDAG &DAG,
4635 bool SNaN,
4636 unsigned Depth) const {
4637 unsigned Opcode = Op.getOpcode();
4638 switch (Opcode) {
4639 case AMDGPUISD::FMIN_LEGACY:
4640 case AMDGPUISD::FMAX_LEGACY: {
4641 if (SNaN)
4642 return true;
4643
4644 // TODO: Can check no nans on one of the operands for each one, but which
4645 // one?
4646 return false;
4647 }
4648 case AMDGPUISD::FMUL_LEGACY:
4649 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4650 if (SNaN)
4651 return true;
4652 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4653 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4654 }
4655 case AMDGPUISD::FMED3:
4656 case AMDGPUISD::FMIN3:
4657 case AMDGPUISD::FMAX3:
4658 case AMDGPUISD::FMAD_FTZ: {
4659 if (SNaN)
4660 return true;
4661 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4662 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4663 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4664 }
4665 case AMDGPUISD::CVT_F32_UBYTE0:
4666 case AMDGPUISD::CVT_F32_UBYTE1:
4667 case AMDGPUISD::CVT_F32_UBYTE2:
4668 case AMDGPUISD::CVT_F32_UBYTE3:
4669 return true;
4670
4671 case AMDGPUISD::RCP:
4672 case AMDGPUISD::RSQ:
4673 case AMDGPUISD::RCP_LEGACY:
4674 case AMDGPUISD::RSQ_LEGACY:
4675 case AMDGPUISD::RSQ_CLAMP: {
4676 if (SNaN)
4677 return true;
4678
4679 // TODO: Need is known positive check.
4680 return false;
4681 }
4682 case AMDGPUISD::LDEXP:
4683 case AMDGPUISD::FRACT: {
4684 if (SNaN)
4685 return true;
4686 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4687 }
4688 case AMDGPUISD::DIV_SCALE:
4689 case AMDGPUISD::DIV_FMAS:
4690 case AMDGPUISD::DIV_FIXUP:
4691 case AMDGPUISD::TRIG_PREOP:
4692 // TODO: Refine on operands.
4693 return SNaN;
4694 case AMDGPUISD::SIN_HW:
4695 case AMDGPUISD::COS_HW: {
4696 // TODO: Need check for infinity
4697 return SNaN;
4698 }
4699 case ISD::INTRINSIC_WO_CHAIN: {
4700 unsigned IntrinsicID
4701 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4702 // TODO: Handle more intrinsics
4703 switch (IntrinsicID) {
4704 case Intrinsic::amdgcn_cubeid:
4705 return true;
4706
4707 case Intrinsic::amdgcn_frexp_mant: {
4708 if (SNaN)
4709 return true;
4710 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4711 }
4712 case Intrinsic::amdgcn_cvt_pkrtz: {
4713 if (SNaN)
4714 return true;
4715 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4716 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4717 }
4718 case Intrinsic::amdgcn_fdot2:
4719 // TODO: Refine on operand
4720 return SNaN;
4721 default:
4722 return false;
4723 }
4724 }
4725 default:
4726 return false;
4727 }
4728}
4729
4730TargetLowering::AtomicExpansionKind
4731AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4732 switch (RMW->getOperation()) {
4733 case AtomicRMWInst::Nand:
4734 case AtomicRMWInst::FAdd:
4735 case AtomicRMWInst::FSub:
4736 return AtomicExpansionKind::CmpXChg;
4737 default:
4738 return AtomicExpansionKind::None;
4739 }
4740}

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/CodeGen/Register.h"
14#include "llvm/IR/Function.h"
15#include "llvm/Pass.h"
16
17namespace llvm {
18
19class Function;
20class raw_ostream;
21class GCNSubtarget;
22class TargetMachine;
23class TargetRegisterClass;
24class TargetRegisterInfo;
25
26struct ArgDescriptor {
27private:
28 friend struct AMDGPUFunctionArgInfo;
29 friend class AMDGPUArgumentUsageInfo;
30
31 union {
32 Register Reg;
33 unsigned StackOffset;
34 };
35
36 // Bitmask to locate argument within the register.
37 unsigned Mask;
38
39 bool IsStack : 1;
40 bool IsSet : 1;
41
42public:
43 ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
44 bool IsStack = false, bool IsSet = false)
45 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
46
47 static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
48 return ArgDescriptor(Reg, Mask, false, true);
49 }
50
51 static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
52 return ArgDescriptor(Offset, Mask, true, true);
53 }
54
55 static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 Register getRegister() const {
72 assert(!IsStack);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
5. Assuming the condition is true
6. Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
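
Note that isMasked() only checks that Mask differs from ~0u, so a descriptor built with an all-zero mask counts as masked while getMask() returns 0; that is exactly the state assumed at steps 5 and 6 above. A minimal sketch of that state, assuming this header is on the include path and using a made-up register number:

// Illustrative only; the register number and zero mask are hypothetical.
#include "AMDGPUArgumentUsageInfo.h"
#include <cassert>
using namespace llvm;

void demoZeroMaskDescriptor() {
  ArgDescriptor D = ArgDescriptor::createRegister(Register(1), /*Mask=*/0);
  assert(D.isMasked());       // 0 != ~0u, so the masked path is taken
  assert(D.getMask() == 0);   // countTrailingZeros(0) yields 32 downstream
}
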
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr = 0;
145
146 // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
147 ArgDescriptor WorkItemIDX;
148 ArgDescriptor WorkItemIDY;
149 ArgDescriptor WorkItemIDZ;
150
151 std::pair<const ArgDescriptor *, const TargetRegisterClass *>
152 getPreloadedValue(PreloadedValue Value) const;
153};
154
155class AMDGPUArgumentUsageInfo : public ImmutablePass {
156private:
157 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
158 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
159
160public:
161 static char ID;
162
163 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesAll();
167 }
168
169 bool doInitialization(Module &M) override;
170 bool doFinalization(Module &M) override;
171
172 void print(raw_ostream &OS, const Module *M = nullptr) const override;
173
174 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
175 ArgInfoMap[&F] = ArgInfo;
176 }
177
178 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
179 auto I = ArgInfoMap.find(&F);
180 if (I == ArgInfoMap.end()) {
181 assert(F.isDeclaration());
182 return ExternFunctionInfo;
183 }
184
185 return I->second;
186 }
187};
188
189} // end namespace llvm
190
191#endif

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <algorithm>
18#include <cassert>
19#include <climits>
20#include <cmath>
21#include <cstdint>
22#include <cstring>
23#include <limits>
24#include <type_traits>
25
26#ifdef __ANDROID_NDK__
27#include <android/api-level.h>
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and MathExtras.h is popular.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44/// The behavior an operation has on an input of 0.
45enum ZeroBehavior {
46 /// The returned value is undefined.
47 ZB_Undefined,
48 /// The returned value is numeric_limits<T>::max()
49 ZB_Max,
50 /// The returned value is numeric_limits<T>::digits
51 ZB_Width
52};
53
54/// Mathematical constants.
55namespace numbers {
56// TODO: Track C++20 std::numbers.
57// TODO: Favor using the hexadecimal FP constants (requires C++17).
58constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
59 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
60 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
61 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
62 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
63 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
64 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
65 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
66 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
67 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
68 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
69 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
70 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
71 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
72 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
73constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
74 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
75 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
76 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
77 log2ef = 1.44269504F, // (0x1.715476P+0)
78 log10ef = .434294482F, // (0x1.bcb7b2P-2)
79 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
80 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
81 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
82 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
83 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
84 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
85 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
86 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
87 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
88} // namespace numbers
89
90namespace detail {
91template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
92 static unsigned count(T Val, ZeroBehavior) {
93 if (!Val)
94 return std::numeric_limits<T>::digits;
95 if (Val & 0x1)
96 return 0;
97
98 // Bisection method.
99 unsigned ZeroBits = 0;
100 T Shift = std::numeric_limits<T>::digits >> 1;
101 T Mask = std::numeric_limits<T>::max() >> Shift;
102 while (Shift) {
103 if ((Val & Mask) == 0) {
104 Val >>= Shift;
105 ZeroBits |= Shift;
106 }
107 Shift >>= 1;
108 Mask >>= Shift;
109 }
110 return ZeroBits;
111 }
112};
113
114 #if defined(__GNUC__) || defined(_MSC_VER)
115template <typename T> struct TrailingZerosCounter<T, 4> {
116 static unsigned count(T Val, ZeroBehavior ZB) {
117 if (ZB != ZB_Undefined && Val == 0)
10.1. 'ZB' is not equal to ZB_Undefined
11. Assuming 'Val' is equal to 0
12. Taking true branch
118 return 32;
13. Returning the value 32
119
120 #if __has_builtin(__builtin_ctz) || defined(__GNUC__)
121 return __builtin_ctz(Val);
122#elif defined(_MSC_VER)
123 unsigned long Index;
124 _BitScanForward(&Index, Val);
125 return Index;
126#endif
127 }
128};
129
130#if !defined(_MSC_VER) || defined(_M_X64)
131template <typename T> struct TrailingZerosCounter<T, 8> {
132 static unsigned count(T Val, ZeroBehavior ZB) {
133 if (ZB != ZB_Undefined && Val == 0)
134 return 64;
135
136 #if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
137 return __builtin_ctzll(Val);
138#elif defined(_MSC_VER)
139 unsigned long Index;
140 _BitScanForward64(&Index, Val);
141 return Index;
142#endif
143 }
144};
145#endif
146#endif
147} // namespace detail
148
149/// Count number of 0's from the least significant bit to the most
150/// stopping at the first 1.
151///
152/// Only unsigned integral types are allowed.
153///
154/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
155/// valid arguments.
156template <typename T>
157unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
158 static_assert(std::numeric_limits<T>::is_integer &&
159 !std::numeric_limits<T>::is_signed,
160 "Only unsigned integral types are allowed.");
161 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
10. Calling 'TrailingZerosCounter::count'
14. Returning from 'TrailingZerosCounter::count'
15. Returning the value 32
162}
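
A quick check of the zero-input behavior documented above (sketch only, assuming an LLVM build environment where this header is available); it is this ZB_Width default that produces the value 32 seen at steps 13-17 of the report:

// Sketch only: verifies the documented ZB_Width behavior for a zero input.
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::countTrailingZeros(0u) == 32u);   // default is ZB_Width
  assert(llvm::countTrailingZeros(0x8u) == 3u);  // ordinary non-zero case
  // With ZB_Undefined the result for a zero input is unspecified; callers
  // must prove the value is non-zero before relying on it.
  return 0;
}
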
163
164namespace detail {
165template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
166 static unsigned count(T Val, ZeroBehavior) {
167 if (!Val)
168 return std::numeric_limits<T>::digits;
169
170 // Bisection method.
171 unsigned ZeroBits = 0;
172 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
173 T Tmp = Val >> Shift;
174 if (Tmp)
175 Val = Tmp;
176 else
177 ZeroBits |= Shift;
178 }
179 return ZeroBits;
180 }
181};
182
183 #if defined(__GNUC__) || defined(_MSC_VER)
184template <typename T> struct LeadingZerosCounter<T, 4> {
185 static unsigned count(T Val, ZeroBehavior ZB) {
186 if (ZB != ZB_Undefined && Val == 0)
187 return 32;
188
189 #if __has_builtin(__builtin_clz) || defined(__GNUC__)
190 return __builtin_clz(Val);
191#elif defined(_MSC_VER)
192 unsigned long Index;
193 _BitScanReverse(&Index, Val);
194 return Index ^ 31;
195#endif
196 }
197};
198
199#if !defined(_MSC_VER) || defined(_M_X64)
200template <typename T> struct LeadingZerosCounter<T, 8> {
201 static unsigned count(T Val, ZeroBehavior ZB) {
202 if (ZB != ZB_Undefined && Val == 0)
203 return 64;
204
205 #if __has_builtin(__builtin_clzll) || defined(__GNUC__)
206 return __builtin_clzll(Val);
207#elif defined(_MSC_VER)
208 unsigned long Index;
209 _BitScanReverse64(&Index, Val);
210 return Index ^ 63;
211#endif
212 }
213};
214#endif
215#endif
216} // namespace detail
217
218/// Count number of 0's from the most significant bit to the least
219/// stopping at the first 1.
220///
221/// Only unsigned integral types are allowed.
222///
223/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
224/// valid arguments.
225template <typename T>
226unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
227 static_assert(std::numeric_limits<T>::is_integer &&
228 !std::numeric_limits<T>::is_signed,
229 "Only unsigned integral types are allowed.");
230 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
231}
232
233/// Get the index of the first set bit starting from the least
234/// significant bit.
235///
236/// Only unsigned integral types are allowed.
237///
238/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
239/// valid arguments.
240template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
241 if (ZB == ZB_Max && Val == 0)
242 return std::numeric_limits<T>::max();
243
244 return countTrailingZeros(Val, ZB_Undefined);
245}
246
247/// Create a bitmask with the N right-most bits set to 1, and all other
248/// bits set to 0. Only unsigned types are allowed.
249template <typename T> T maskTrailingOnes(unsigned N) {
250 static_assert(std::is_unsigned<T>::value, "Invalid type!");
251 const unsigned Bits = CHAR_BIT * sizeof(T);
252 assert(N <= Bits && "Invalid bit index");
253 return N == 0 ? 0 : (T(-1) >> (Bits - N));
254}
255
256/// Create a bitmask with the N left-most bits set to 1, and all other
257/// bits set to 0. Only unsigned types are allowed.
258template <typename T> T maskLeadingOnes(unsigned N) {
259 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
260}
261
262/// Create a bitmask with the N right-most bits set to 0, and all other
263/// bits set to 1. Only unsigned types are allowed.
264template <typename T> T maskTrailingZeros(unsigned N) {
265 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
266}
267
268/// Create a bitmask with the N left-most bits set to 0, and all other
269/// bits set to 1. Only unsigned types are allowed.
270template <typename T> T maskLeadingZeros(unsigned N) {
271 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
272}
273
274/// Get the index of the last set bit starting from the least
275/// significant bit.
276///
277/// Only unsigned integral types are allowed.
278///
279/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
280/// valid arguments.
281template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
282 if (ZB == ZB_Max && Val == 0)
283 return std::numeric_limits<T>::max();
284
285 // Use ^ instead of - because both gcc and llvm can remove the associated ^
286 // in the __builtin_clz intrinsic on x86.
287 return countLeadingZeros(Val, ZB_Undefined) ^
288 (std::numeric_limits<T>::digits - 1);
289}
290
291/// Macro compressed bit reversal table for 256 bits.
292///
293/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
294static const unsigned char BitReverseTable256[256] = {
295#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
296#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
297#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
298 R6(0), R6(2), R6(1), R6(3)
299#undef R2
300#undef R4
301#undef R6
302};
303
304/// Reverse the bits in \p Val.
305template <typename T>
306T reverseBits(T Val) {
307 unsigned char in[sizeof(Val)];
308 unsigned char out[sizeof(Val)];
309 std::memcpy(in, &Val, sizeof(Val));
310 for (unsigned i = 0; i < sizeof(Val); ++i)
311 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
312 std::memcpy(&Val, out, sizeof(Val));
313 return Val;
314}
315
316// NOTE: The following support functions use the _32/_64 extensions instead of
317// type overloading so that signed and unsigned integers can be used without
318// ambiguity.
319
320/// Return the high 32 bits of a 64 bit value.
321constexpr inline uint32_t Hi_32(uint64_t Value) {
322 return static_cast<uint32_t>(Value >> 32);
323}
324
325/// Return the low 32 bits of a 64 bit value.
326constexpr inline uint32_t Lo_32(uint64_t Value) {
327 return static_cast<uint32_t>(Value);
328}
329
330/// Make a 64-bit integer from a high / low pair of 32-bit integers.
331constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
332 return ((uint64_t)High << 32) | (uint64_t)Low;
333}
334
335/// Checks if an integer fits into the given bit width.
336template <unsigned N> constexpr inline bool isInt(int64_t x) {
337 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
338}
339// Template specializations to get better code for common cases.
340template <> constexpr inline bool isInt<8>(int64_t x) {
341 return static_cast<int8_t>(x) == x;
342}
343template <> constexpr inline bool isInt<16>(int64_t x) {
344 return static_cast<int16_t>(x) == x;
345}
346template <> constexpr inline bool isInt<32>(int64_t x) {
347 return static_cast<int32_t>(x) == x;
348}
349
350/// Checks if a signed integer is an N bit number shifted left by S.
351template <unsigned N, unsigned S>
352constexpr inline bool isShiftedInt(int64_t x) {
353 static_assert(
354 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
355 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
356 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
357}
358
359/// Checks if an unsigned integer fits into the given bit width.
360///
361/// This is written as two functions rather than as simply
362///
363/// return N >= 64 || X < (UINT64_C(1) << N);
364///
365/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
366/// left too many places.
367template <unsigned N>
368constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t X) {
374 return true;
375}
376
377// Template specializations to get better code for common cases.
378template <> constexpr inline bool isUInt<8>(uint64_t x) {
379 return static_cast<uint8_t>(x) == x;
380}
381template <> constexpr inline bool isUInt<16>(uint64_t x) {
382 return static_cast<uint16_t>(x) == x;
383}
384template <> constexpr inline bool isUInt<32>(uint64_t x) {
385 return static_cast<uint32_t>(x) == x;
386}
387
388/// Checks if a unsigned integer is an N bit number shifted left by S.
389template <unsigned N, unsigned S>
390constexpr inline bool isShiftedUInt(uint64_t x) {
391 static_assert(
392 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
393 static_assert(N + S <= 64,
394 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
395 // Per the two static_asserts above, S must be strictly less than 64. So
396 // 1 << S is not undefined behavior.
397 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
398}
399
400/// Gets the maximum value for a N-bit unsigned integer.
401inline uint64_t maxUIntN(uint64_t N) {
402 assert(N > 0 && N <= 64 && "integer width out of range");
403
404 // uint64_t(1) << 64 is undefined behavior, so we can't do
405 // (uint64_t(1) << N) - 1
406 // without checking first that N != 64. But this works and doesn't have a
407 // branch.
408 return UINT64_MAX >> (64 - N);
409}
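
The expression above sidesteps the undefined uint64_t(1) << 64 case by shifting UINT64_MAX right instead, so N == 64 becomes a shift by zero. A couple of spot checks (sketch only, same assumptions as the previous example):

// Sketch only: spot-checks the branch-free formulation of maxUIntN.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::maxUIntN(1)  == 1u);
  assert(llvm::maxUIntN(8)  == 255u);
  assert(llvm::maxUIntN(64) == UINT64_MAX);  // shift by 0, no undefined behavior
  return 0;
}
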
410
411/// Gets the minimum value for a N-bit signed integer.
412inline int64_t minIntN(int64_t N) {
413 assert(N > 0 && N <= 64 && "integer width out of range");
414
415 return -(UINT64_C(1)<<(N-1));
416}
417
418/// Gets the maximum value for a N-bit signed integer.
419inline int64_t maxIntN(int64_t N) {
420 assert(N > 0 && N <= 64 && "integer width out of range");
421
422 // This relies on two's complement wraparound when N == 64, so we convert to
423 // int64_t only at the very end to avoid UB.
424 return (UINT64_C(1) << (N - 1)) - 1;
425}
426
427/// Checks if an unsigned integer fits into the given (dynamic) bit width.
428inline bool isUIntN(unsigned N, uint64_t x) {
429 return N >= 64 || x <= maxUIntN(N);
430}
431
432/// Checks if an signed integer fits into the given (dynamic) bit width.
433inline bool isIntN(unsigned N, int64_t x) {
434 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
435}
436
437/// Return true if the argument is a non-empty sequence of ones starting at the
438/// least significant bit with the remainder zero (32 bit version).
439/// Ex. isMask_32(0x0000FFFFU) == true.
440constexpr inline bool isMask_32(uint32_t Value) {
441 return Value && ((Value + 1) & Value) == 0;
442}
443
444/// Return true if the argument is a non-empty sequence of ones starting at the
445/// least significant bit with the remainder zero (64 bit version).
446constexpr inline bool isMask_64(uint64_t Value) {
447 return Value && ((Value + 1) & Value) == 0;
448}
449
450/// Return true if the argument contains a non-empty sequence of ones with the
451/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
452constexpr inline bool isShiftedMask_32(uint32_t Value) {
453 return Value && isMask_32((Value - 1) | Value);
454}
455
456/// Return true if the argument contains a non-empty sequence of ones with the
457/// remainder zero (64 bit version.)
458constexpr inline bool isShiftedMask_64(uint64_t Value) {
459 return Value && isMask_64((Value - 1) | Value);
460}
461
462/// Return true if the argument is a power of two > 0.
463/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
464constexpr inline bool isPowerOf2_32(uint32_t Value) {
465 return Value && !(Value & (Value - 1));
466}
467
468/// Return true if the argument is a power of two > 0 (64 bit edition.)
469constexpr inline bool isPowerOf2_64(uint64_t Value) {
470 return Value && !(Value & (Value - 1));
471}
472
473/// Count the number of ones from the most significant bit to the first
474/// zero bit.
475///
476/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
477/// Only unsigned integral types are allowed.
478///
479/// \param ZB the behavior on an input of all ones. Only ZB_Width and
480/// ZB_Undefined are valid arguments.
481template <typename T>
482unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
483 static_assert(std::numeric_limits<T>::is_integer &&
484 !std::numeric_limits<T>::is_signed,
485 "Only unsigned integral types are allowed.");
486 return countLeadingZeros<T>(~Value, ZB);
487}
488
489/// Count the number of ones from the least significant bit to the first
490/// zero bit.
491///
492/// Ex. countTrailingOnes(0x00FF00FF) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countTrailingZeros<T>(~Value, ZB);
503}
504
505namespace detail {
506template <typename T, std::size_t SizeOfT> struct PopulationCounter {
507 static unsigned count(T Value) {
508 // Generic version, forward to 32 bits.
509 static_assert(SizeOfT <= 4, "Not implemented!");
510 #if defined(__GNUC__)
511 return __builtin_popcount(Value);
512#else
513 uint32_t v = Value;
514 v = v - ((v >> 1) & 0x55555555);
515 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
516 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
517#endif
518 }
519};
520
521template <typename T> struct PopulationCounter<T, 8> {
522 static unsigned count(T Value) {
523 #if defined(__GNUC__)
524 return __builtin_popcountll(Value);
525#else
526 uint64_t v = Value;
527 v = v - ((v >> 1) & 0x5555555555555555ULL);
528 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
529 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
530 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
531#endif
532 }
533};
534} // namespace detail
535
536/// Count the number of set bits in a value.
537/// Ex. countPopulation(0xF000F000) = 8
538/// Returns 0 if the word is zero.
539template <typename T>
540inline unsigned countPopulation(T Value) {
541 static_assert(std::numeric_limits<T>::is_integer &&
542 !std::numeric_limits<T>::is_signed,
543 "Only unsigned integral types are allowed.");
544 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
545}
546
547/// Compile time Log2.
548/// Valid only for positive powers of two.
549template <size_t kValue> constexpr inline size_t CTLog2() {
550 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
551 "Value is not a valid power of 2");
552 return 1 + CTLog2<kValue / 2>();
553}
554
555template <> constexpr inline size_t CTLog2<1>() { return 0; }
556
557/// Return the log base 2 of the specified value.
558inline double Log2(double Value) {
559#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
560 return __builtin_log(Value) / __builtin_log(2.0);
561#else
562 return log2(Value);
563#endif
564}
565
566/// Return the floor log base 2 of the specified value, -1 if the value is zero.
567/// (32 bit edition.)
568/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
569inline unsigned Log2_32(uint32_t Value) {
570 return 31 - countLeadingZeros(Value);
571}
572
573/// Return the floor log base 2 of the specified value, -1 if the value is zero.
574/// (64 bit edition.)
575inline unsigned Log2_64(uint64_t Value) {
576 return 63 - countLeadingZeros(Value);
577}
578
579/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
580/// (32 bit edition).
581/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
582inline unsigned Log2_32_Ceil(uint32_t Value) {
583 return 32 - countLeadingZeros(Value - 1);
584}
585
586/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
587/// (64 bit edition.)
588inline unsigned Log2_64_Ceil(uint64_t Value) {
589 return 64 - countLeadingZeros(Value - 1);
590}
591
592/// Return the greatest common divisor of the values using Euclid's algorithm.
593template <typename T>
594inline T greatestCommonDivisor(T A, T B) {
595 while (B) {
596 T Tmp = B;
597 B = A % B;
598 A = Tmp;
599 }
600 return A;
601}
602
603inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
604 return greatestCommonDivisor<uint64_t>(A, B);
605}
606
607/// This function takes a 64-bit integer and returns the bit equivalent double.
608inline double BitsToDouble(uint64_t Bits) {
609 double D;
610 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
611 memcpy(&D, &Bits, sizeof(Bits));
612 return D;
613}
614
615/// This function takes a 32-bit integer and returns the bit equivalent float.
616inline float BitsToFloat(uint32_t Bits) {
617 float F;
618 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
619 memcpy(&F, &Bits, sizeof(Bits));
620 return F;
621}
622
623/// This function takes a double and returns the bit equivalent 64-bit integer.
624/// Note that copying doubles around changes the bits of NaNs on some hosts,
625/// notably x86, so this routine cannot be used if these bits are needed.
626inline uint64_t DoubleToBits(double Double) {
627 uint64_t Bits;
628 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
629 memcpy(&Bits, &Double, sizeof(Double));
630 return Bits;
631}
632
633/// This function takes a float and returns the bit equivalent 32-bit integer.
634/// Note that copying floats around changes the bits of NaNs on some hosts,
635/// notably x86, so this routine cannot be used if these bits are needed.
636inline uint32_t FloatToBits(float Float) {
637 uint32_t Bits;
638 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
639 memcpy(&Bits, &Float, sizeof(Float));
640 return Bits;
641}
642
643/// A and B are either alignments or offsets. Return the minimum alignment that
644/// may be assumed after adding the two together.
645constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
646 // The largest power of 2 that divides both A and B.
647 //
648 // Replace "-Value" by "1+~Value" in the following commented code to avoid
649 // MSVC warning C4146
650 // return (A | B) & -(A | B);
651 return (A | B) & (1 + ~(A | B));
652}
653
654/// Returns the next power of two (in 64-bits) that is strictly greater than A.
655/// Returns zero on overflow.
656inline uint64_t NextPowerOf2(uint64_t A) {
657 A |= (A >> 1);
658 A |= (A >> 2);
659 A |= (A >> 4);
660 A |= (A >> 8);
661 A |= (A >> 16);
662 A |= (A >> 32);
663 return A + 1;
664}
665
666/// Returns the power of two which is less than or equal to the given value.
667/// Essentially, it is a floor operation across the domain of powers of two.
668inline uint64_t PowerOf2Floor(uint64_t A) {
669 if (!A) return 0;
670 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
671}
672
673/// Returns the power of two which is greater than or equal to the given value.
674/// Essentially, it is a ceil operation across the domain of powers of two.
675inline uint64_t PowerOf2Ceil(uint64_t A) {
676 if (!A)
677 return 0;
678 return NextPowerOf2(A - 1);
679}
680
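// Illustrative usage (not part of this header): the floor/ceil pair brackets a
// value between the nearest powers of two on either side.
inline void powerOf2Example() {
  uint64_t F = PowerOf2Floor(100); // 64
  uint64_t C = PowerOf2Ceil(100);  // 128
  uint64_t E = PowerOf2Ceil(64);   // 64, exact powers map to themselves
  (void)F; (void)C; (void)E;
}
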
681/// Returns the next integer (mod 2**64) that is greater than or equal to
682/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
683///
684/// If non-zero \p Skew is specified, the return value will be a minimal
685/// integer that is greater than or equal to \p Value and equal to
686/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
687/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
688///
689/// Examples:
690/// \code
691/// alignTo(5, 8) = 8
692/// alignTo(17, 8) = 24
693/// alignTo(~0LL, 8) = 0
694/// alignTo(321, 255) = 510
695///
696/// alignTo(5, 8, 7) = 7
697/// alignTo(17, 8, 1) = 17
698/// alignTo(~0LL, 8, 3) = 3
699/// alignTo(321, 255, 42) = 552
700/// \endcode
701inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
702 assert(Align != 0u && "Align can't be 0.");
703 Skew %= Align;
704 return (Value + Align - 1 - Skew) / Align * Align + Skew;
705}
706
707/// Returns the next integer (mod 2**64) that is greater than or equal to
708/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
709template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
710 static_assert(Align != 0u, "Align must be non-zero");
711 return (Value + Align - 1) / Align * Align;
712}
713
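// Illustrative usage (not part of this header): a hypothetical offset
// computation showing the plain, skewed, and compile-time forms above.
inline void alignToExample() {
  uint64_t NextSlot = alignTo(100, 16);  // 112
  uint64_t Skewed = alignTo(100, 16, 4); // 100, since 100 == 16 * 6 + 4
  uint64_t Bumped = alignTo(101, 16, 4); // 116, the next value that is 4 mod 16
  (void)NextSlot; (void)Skewed; (void)Bumped;
}
static_assert(alignTo<8>(17) == 24, "template form usable in constant expressions");
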
714/// Returns the integer ceil(Numerator / Denominator).
715inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
716 return alignTo(Numerator, Denominator) / Denominator;
717}
718
719/// Returns the integer nearest(Numerator / Denominator).
720inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
721 return (Numerator + (Denominator / 2)) / Denominator;
722}
723
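// Illustrative usage (not part of this header): both helpers stay in integer
// arithmetic; ceil rounds up, nearest rounds half-way cases up.
inline void divideExample() {
  uint64_t Chunks = divideCeil(10, 3);  // 4
  uint64_t Near = divideNearest(10, 3); // 3
  uint64_t Half = divideNearest(5, 2);  // 3, ties round up
  (void)Chunks; (void)Near; (void)Half;
}
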
724/// Returns the largest uint64_t less than or equal to \p Value that is
725/// \p Skew mod \p Align. \p Align must be non-zero.
726inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
727 assert(Align != 0u && "Align can't be 0.");
728 Skew %= Align;
729 return (Value - Skew) / Align * Align + Skew;
730}
731
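// Illustrative usage (not part of this header): alignDown is the rounding-down
// counterpart of alignTo, with the same optional skew.
inline void alignDownExample() {
  uint64_t Base = alignDown(100, 16);          // 96
  uint64_t SkewedBase = alignDown(100, 16, 4); // 100
  uint64_t Lower = alignDown(99, 16, 4);       // 84, largest value <= 99 that is 4 mod 16
  (void)Base; (void)SkewedBase; (void)Lower;
}
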
732/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
733/// Requires 0 < B <= 32.
734template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
735 static_assert(B > 0, "Bit width can't be 0.");
736 static_assert(B <= 32, "Bit width out of range.");
737 return int32_t(X << (32 - B)) >> (32 - B);
738}
739
740/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
741/// Requires 0 < B <= 32.
742inline int32_t SignExtend32(uint32_t X, unsigned B) {
743 assert(B > 0 && "Bit width can't be 0.");
744 assert(B <= 32 && "Bit width out of range.");
745 return int32_t(X << (32 - B)) >> (32 - B);
746}
747
748/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
749/// Requires 0 < B <= 64.
750template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
751 static_assert(B > 0, "Bit width can't be 0.");
752 static_assert(B <= 64, "Bit width out of range.");
753 return int64_t(x << (64 - B)) >> (64 - B);
754}
755
756/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
757/// Requires 0 < B <= 64.
758inline int64_t SignExtend64(uint64_t X, unsigned B) {
759 assert(B > 0 && "Bit width can't be 0.");
760 assert(B <= 64 && "Bit width out of range.");
761 return int64_t(X << (64 - B)) >> (64 - B);
762}
763
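// Illustrative usage (not part of this header): extracting a signed bitfield
// typically masks the low bits and then sign-extends them with these helpers.
static_assert(SignExtend32<4>(0xF) == -1, "0b1111 is -1 in 4 bits");
static_assert(SignExtend32<8>(0x7F) == 127, "sign bit clear, value unchanged");
static_assert(SignExtend64<16>(0x8000) == -32768, "0x8000 is INT16_MIN");
inline int32_t signExtendRuntimeExample(uint32_t Field, unsigned Width) {
  return SignExtend32(Field, Width); // e.g. SignExtend32(0x80, 8) == -128
}
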
764/// Subtract two unsigned integers, X and Y, of type T and return the absolute
765/// value of the result.
766template <typename T>
767std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
768 return std::max(X, Y) - std::min(X, Y);
769}
770
771/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
772/// maximum representable value of T on overflow. ResultOverflowed indicates if
773/// the result is larger than the maximum representable value of type T.
774template <typename T>
775std::enable_if_t<std::is_unsigned<T>::value, T>
776SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
777 bool Dummy;
778 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
779 // Hacker's Delight, p. 29
780 T Z = X + Y;
781 Overflowed = (Z < X || Z < Y);
782 if (Overflowed)
783 return std::numeric_limits<T>::max();
784 else
785 return Z;
786}
787
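// Illustrative usage (not part of this header): the overflow flag is optional;
// pass a bool* only when the caller needs to distinguish clamping from an
// exact result.
inline void saturatingAddExample() {
  bool Ov = false;
  uint8_t A = SaturatingAdd<uint8_t>(200, 100, &Ov); // 255, Ov == true
  uint8_t B = SaturatingAdd<uint8_t>(20, 30);        // 50, no clamping
  (void)A; (void)B; (void)Ov;
}
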
788/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
789/// maximum representable value of T on overflow. ResultOverflowed indicates if
790/// the result is larger than the maximum representable value of type T.
791template <typename T>
792std::enable_if_t<std::is_unsigned<T>::value, T>
793SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
794 bool Dummy;
795 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
796
797 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
798 // because it fails for uint16_t (where multiplication can have undefined
799 // behavior due to promotion to int), and requires a division in addition
800 // to the multiplication.
801
802 Overflowed = false;
803
804 // Log2(Z) would be either Log2Z or Log2Z + 1.
805 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
806 // will necessarily be less than Log2Max as desired.
807 int Log2Z = Log2_64(X) + Log2_64(Y);
808 const T Max = std::numeric_limits<T>::max();
809 int Log2Max = Log2_64(Max);
810 if (Log2Z < Log2Max) {
811 return X * Y;
812 }
813 if (Log2Z > Log2Max) {
814 Overflowed = true;
815 return Max;
816 }
817
818 // We're going to use the top bit, and maybe overflow one
819 // bit past it. Multiply all but the bottom bit then add
820 // that on at the end.
821 T Z = (X >> 1) * Y;
822 if (Z & ~(Max >> 1)) {
823 Overflowed = true;
824 return Max;
825 }
826 Z <<= 1;
827 if (X & 1)
828 return SaturatingAdd(Z, Y, ResultOverflowed);
829
830 return Z;
831}
832
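// Illustrative usage (not part of this header): clamping keeps downstream size
// computations well-defined even when the exact product would wrap.
inline void saturatingMultiplyExample() {
  bool Ov = false;
  uint16_t P = SaturatingMultiply<uint16_t>(300, 300, &Ov); // 65535, Ov == true
  uint16_t Q = SaturatingMultiply<uint16_t>(200, 300);      // 60000, exact
  (void)P; (void)Q; (void)Ov;
}
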
833/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
834/// the product. Clamp the result to the maximum representable value of T on
835/// overflow. ResultOverflowed indicates if the result is larger than the
836/// maximum representable value of type T.
837template <typename T>
838std::enable_if_t<std::is_unsigned<T>::value, T>
839SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
840 bool Dummy;
841 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
842
843 T Product = SaturatingMultiply(X, Y, &Overflowed);
844 if (Overflowed)
845 return Product;
846
847 return SaturatingAdd(A, Product, &Overflowed);
848}
849
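// Illustrative usage (not part of this header): a hypothetical saturating
// "Elements * ElementSize + HeaderSize" size computation.
inline uint64_t saturatingSizeExample(uint64_t Elements, uint64_t ElementSize,
                                      uint64_t HeaderSize) {
  bool Ov = false;
  uint64_t Bytes = SaturatingMultiplyAdd(Elements, ElementSize, HeaderSize, &Ov);
  // Bytes is already clamped on overflow; Ov reports whether clamping occurred.
  return Bytes;
}
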
850/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
851extern const float huge_valf;
852
853
854/// Add two signed integers, computing the two's complement truncated result,
855/// returning true if overflow occurred.
856template <typename T>
857std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
858#if __has_builtin(__builtin_add_overflow)
859 return __builtin_add_overflow(X, Y, &Result);
860#else
861 // Perform the unsigned addition.
862 using U = std::make_unsigned_t<T>;
863 const U UX = static_cast<U>(X);
864 const U UY = static_cast<U>(Y);
865 const U UResult = UX + UY;
866
867 // Convert to signed.
868 Result = static_cast<T>(UResult);
869
870 // Adding two positive numbers should result in a positive number.
871 if (X > 0 && Y > 0)
872 return Result <= 0;
873 // Adding two negatives should result in a negative number.
874 if (X < 0 && Y < 0)
875 return Result >= 0;
876 return false;
877#endif
878}
879
880/// Subtract two signed integers, computing the two's complement truncated
881/// result, returning true if an overflow occurred.
882template <typename T>
883std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
884#if __has_builtin(__builtin_sub_overflow)
885 return __builtin_sub_overflow(X, Y, &Result);
886#else
887 // Perform the unsigned subtraction.
888 using U = std::make_unsigned_t<T>;
889 const U UX = static_cast<U>(X);
890 const U UY = static_cast<U>(Y);
891 const U UResult = UX - UY;
892
893 // Convert to signed.
894 Result = static_cast<T>(UResult);
895
896 // Subtracting a positive number from a negative results in a negative number.
897 if (X <= 0 && Y > 0)
898 return Result >= 0;
899 // Subtracting a negative number from a positive results in a positive number.
900 if (X >= 0 && Y < 0)
901 return Result <= 0;
902 return false;
903#endif
904}
905
906/// Multiply two signed integers, computing the two's complement truncated
907/// result, returning true if an overflow occurred.
908template <typename T>
909std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
910 // Perform the unsigned multiplication on absolute values.
911 using U = std::make_unsigned_t<T>;
912 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
913 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
914 const U UResult = UX * UY;
915
916 // Convert to signed.
917 const bool IsNegative = (X < 0) ^ (Y < 0);
918 Result = IsNegative ? (0 - UResult) : UResult;
919
920 // If any of the args was 0, result is 0 and no overflow occurs.
921 if (UX == 0 || UY == 0)
922 return false;
923
924 // UX and UY are in [1, 2^n], where n is the number of digits.
925 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
926 // positive) divided by an argument compares to the other.
927 if (IsNegative)
928 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
929 else
930 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
931}
932
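// Illustrative usage (not part of this header): the three checked-arithmetic
// helpers return true on overflow and write the wrapped two's complement
// result either way.
inline bool overflowCheckExample() {
  int32_t R;
  bool A = AddOverflow<int32_t>(INT32_MAX, 1, R);   // true, R == INT32_MIN
  bool S = SubOverflow<int32_t>(INT32_MIN, 1, R);   // true, R == INT32_MAX
  bool M = MulOverflow<int32_t>(100000, 100000, R); // true, 10^10 does not fit
  return A && S && M;
}
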
933} // End llvm namespace
934
935#endif