Bug Summary

File: /build/source/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4560, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
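
For context: in C++, shifting a 32-bit value by 32 or more bits is undefined behavior, so the analyzer flags any path on which the shift amount can reach 32. The snippet below is a minimal, hypothetical sketch of that pattern and one common guard; it is not the code at line 4560 (which lies outside the excerpt shown here), and the names are illustrative only.

    #include <cstdint>

    // Hypothetical example of the flagged pattern: if ShiftAmt can reach 32,
    // the shift below is undefined for a 32-bit unsigned value.
    uint32_t shiftDownUnguarded(uint32_t Value, unsigned ShiftAmt) {
      return Value >> ShiftAmt; // UB when ShiftAmt >= 32
    }

    // A conventional guard: treat a full-width shift as producing zero.
    uint32_t shiftDownGuarded(uint32_t Value, unsigned ShiftAmt) {
      return ShiftAmt >= 32 ? 0u : Value >> ShiftAmt;
    }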

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/source/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "SIMachineFunctionInfo.h"
20#include "llvm/CodeGen/Analysis.h"
21#include "llvm/CodeGen/MachineFrameInfo.h"
22#include "llvm/IR/DiagnosticInfo.h"
23#include "llvm/IR/IntrinsicsAMDGPU.h"
24#include "llvm/Support/CommandLine.h"
25#include "llvm/Support/KnownBits.h"
26#include "llvm/Target/TargetMachine.h"
27
28using namespace llvm;
29
30#include "AMDGPUGenCallingConv.inc"
31
32static cl::opt<bool> AMDGPUBypassSlowDiv(
33 "amdgpu-bypass-slow-div",
34 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
35 cl::init(true));
36
37// Find a larger type to do a load / store of a vector with.
38EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
39 unsigned StoreSize = VT.getStoreSizeInBits();
40 if (StoreSize <= 32)
41 return EVT::getIntegerVT(Ctx, StoreSize);
42
43 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
44 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
45}
46
47unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
48 return DAG.computeKnownBits(Op).countMaxActiveBits();
49}
50
51unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
52 // In order for this to be a signed 24-bit value, bit 23 must
53 // be a sign bit.
54 return DAG.ComputeMaxSignificantBits(Op);
55}
56
57AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
58 const AMDGPUSubtarget &STI)
59 : TargetLowering(TM), Subtarget(&STI) {
60 // Lower floating point store/load to integer store/load to reduce the number
61 // of patterns in tablegen.
62 setOperationAction(ISD::LOAD, MVT::f32, Promote);
63 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
64
65 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
66 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
67
68 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
69 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
70
71 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
72 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
73
74 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
75 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
76
77 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
78 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
79
80 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
81 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
82
83 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
84 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
85
86 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
87 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
88
89 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
90 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
91
92 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
93 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
94
95 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
96 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
97
98 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
99 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
100
101 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
102 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
103
104 setOperationAction(ISD::LOAD, MVT::i64, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
106
107 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
109
110 setOperationAction(ISD::LOAD, MVT::f64, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
112
113 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
115
116 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
118
119 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
120 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
121
122 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
123 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
124
125 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
126 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
127
128 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
129 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
130
131 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
132 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
133
134 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
135 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
136
137 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
138 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
139
140 // There are no 64-bit extloads. These should be done as a 32-bit extload and
141 // an extension to 64-bit.
142 for (MVT VT : MVT::integer_valuetypes())
143 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
144 Expand);
145
146 for (MVT VT : MVT::integer_valuetypes()) {
147 if (VT == MVT::i64)
148 continue;
149
150 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
151 setLoadExtAction(Op, VT, MVT::i1, Promote);
152 setLoadExtAction(Op, VT, MVT::i8, Legal);
153 setLoadExtAction(Op, VT, MVT::i16, Legal);
154 setLoadExtAction(Op, VT, MVT::i32, Expand);
155 }
156 }
157
158 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
159 for (auto MemVT :
160 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
161 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
162 Expand);
163
164 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
165 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
166 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
167 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
168 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
169 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
171 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
172
173 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
174 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
175 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
177 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
178 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
179
180 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
183 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
184 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
185 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
187
188 setOperationAction(ISD::STORE, MVT::f32, Promote);
189 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
190
191 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
192 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
193
194 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
195 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
196
197 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
198 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
199
200 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
201 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
202
203 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
204 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
205
206 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
207 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
208
209 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
210 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
211
212 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
213 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
214
215 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
216 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
217
218 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
219 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
220
221 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
222 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
223
224 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
225 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
226
227 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
228 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
229
230 setOperationAction(ISD::STORE, MVT::i64, Promote);
231 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
232
233 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
234 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
235
236 setOperationAction(ISD::STORE, MVT::f64, Promote);
237 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
238
239 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
240 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
241
242 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
243 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
244
245 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
247
248 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
250
251 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
253
254 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
256
257 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
259
260 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
262
263 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
264 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
265
266 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
267 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
268 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
269 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
270
271 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
272 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
273 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
274 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
275
276 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
277 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
278 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
279 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
280 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
281 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
282 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
283 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
284
285 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
286 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
287 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
288
289 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
290 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
291
292 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
293 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
294 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
295 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
296
297 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
298 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
299 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
300 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
301
302 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
303 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
304
305 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
306 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
307 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
308 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
309 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
310 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
311 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
312
313 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
314 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
315
316 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
317
318 // This is totally unsupported, just custom lower to produce an error.
319 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
320
321 // Library functions. These default to Expand, but we have instructions
322 // for them.
323 setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
324 ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
325 ISD::FMAXNUM},
326 MVT::f32, Legal);
327
328 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
329
330 setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
331
332 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
333
334 setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom);
335
336 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
337
338 if (Subtarget->has16BitInsts())
339 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
340 else
341 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
342
343 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
344 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
345 // default unless marked custom/legal.
346 setOperationAction(
347 ISD::IS_FPCLASS,
348 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
349 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
350 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
351 Custom);
352
353 // Expand to fneg + fadd.
354 setOperationAction(ISD::FSUB, MVT::f64, Expand);
355
356 setOperationAction(ISD::CONCAT_VECTORS,
357 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
358 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
359 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
360 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
361 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
362 Custom);
363 setOperationAction(
364 ISD::EXTRACT_SUBVECTOR,
365 {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32,
366 MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32,
367 MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32,
368 MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32,
369 MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
370 MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
371 MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
372 MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
373 MVT::v16f64, MVT::v16i64},
374 Custom);
375
376 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
377 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
378
379 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
380 for (MVT VT : ScalarIntVTs) {
381 // These should use [SU]DIVREM, so set them to expand
382 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
383 Expand);
384
385 // GPU does not have divrem function for signed or unsigned.
386 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
387
388 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
389 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
390
391 setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
392
393 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
394 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
395 }
396
397 // The hardware supports 32-bit FSHR, but not FSHL.
398 setOperationAction(ISD::FSHR, MVT::i32, Legal);
399
400 // The hardware supports 32-bit ROTR, but not ROTL.
401 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
402 setOperationAction(ISD::ROTR, MVT::i64, Expand);
403
404 setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
405
406 setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
407 setOperationAction(
408 {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
409 MVT::i64, Custom);
410 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
411
412 setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
413 Legal);
414
415 setOperationAction(
416 {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
417 MVT::i64, Custom);
418
419 static const MVT::SimpleValueType VectorIntTypes[] = {
420 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
421 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
422
423 for (MVT VT : VectorIntTypes) {
424 // Expand the following operations for the current type by default.
425 setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
426 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
427 ISD::MULHS, ISD::OR, ISD::SHL,
428 ISD::SRA, ISD::SRL, ISD::ROTL,
429 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
430 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
431 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
432 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
433 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
434 ISD::XOR, ISD::BSWAP, ISD::CTPOP,
435 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
436 ISD::SETCC},
437 VT, Expand);
438 }
439
440 static const MVT::SimpleValueType FloatVectorTypes[] = {
441 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
442 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
443
444 for (MVT VT : FloatVectorTypes) {
445 setOperationAction(
446 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD,
447 ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2,
448 ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG,
449 ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC,
450 ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
451 ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG,
452 ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
453 ISD::SETCC, ISD::FCANONICALIZE},
454 VT, Expand);
455 }
456
457 // This causes us to use an unrolled select operation rather than expansion with
458 // bit operations. This is in general better, but the alternative using BFI
459 // instructions may be better if the select sources are SGPRs.
460 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
461 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
462
463 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
464 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
465
466 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
467 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
468
469 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
470 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
471
472 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
473 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
474
475 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
476 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
477
478 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
479 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
480
481 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
482 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
483
484 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
485 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
486
487 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
488 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
489
490 // There are no libcalls of any kind.
491 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
492 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
493
494 setSchedulingPreference(Sched::RegPressure);
495 setJumpIsExpensive(true);
496
497 // FIXME: This is only partially true. If we have to do vector compares, any
498 // SGPR pair can be a condition register. If we have a uniform condition, we
499 // are better off doing SALU operations, where there is only one SCC. For now,
500 // we don't have a way of knowing during instruction selection if a condition
501 // will be uniform and we always use vector compares. Assume we are using
502 // vector compares until that is fixed.
503 setHasMultipleConditionRegisters(true);
504
505 setMinCmpXchgSizeInBits(32);
506 setSupportsUnalignedAtomics(false);
507
508 PredictableSelectIsExpensive = false;
509
510 // We want to find all load dependencies for long chains of stores to enable
511 // merging into very wide vectors. The problem is with vectors with > 4
512 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
513 // vectors are a legal type, even though we have to split the loads
514 // usually. When we can more precisely specify load legality per address
515 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
516 // smarter so that they can figure out what to do in 2 iterations without all
517 // N > 4 stores on the same chain.
518 GatherAllAliasesMaxDepth = 16;
519
520 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
521 // about these during lowering.
522 MaxStoresPerMemcpy = 0xffffffff;
523 MaxStoresPerMemmove = 0xffffffff;
524 MaxStoresPerMemset = 0xffffffff;
525
526 // The expansion for 64-bit division is enormous.
527 if (AMDGPUBypassSlowDiv)
528 addBypassSlowDiv(64, 32);
529
530 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
531 ISD::SRA, ISD::SRL,
532 ISD::TRUNCATE, ISD::MUL,
533 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
534 ISD::MULHU, ISD::MULHS,
535 ISD::SELECT, ISD::SELECT_CC,
536 ISD::STORE, ISD::FADD,
537 ISD::FSUB, ISD::FNEG,
538 ISD::FABS, ISD::AssertZext,
539 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
540}
541
542bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
543 if (getTargetMachine().Options.NoSignedZerosFPMath)
544 return true;
545
546 const auto Flags = Op.getNode()->getFlags();
547 if (Flags.hasNoSignedZeros())
548 return true;
549
550 return false;
551}
552
553//===----------------------------------------------------------------------===//
554// Target Information
555//===----------------------------------------------------------------------===//
556
557LLVM_READNONE
558static bool fnegFoldsIntoOpcode(unsigned Opc) {
559 switch (Opc) {
560 case ISD::FADD:
561 case ISD::FSUB:
562 case ISD::FMUL:
563 case ISD::FMA:
564 case ISD::FMAD:
565 case ISD::FMINNUM:
566 case ISD::FMAXNUM:
567 case ISD::FMINNUM_IEEE:
568 case ISD::FMAXNUM_IEEE:
569 case ISD::SELECT:
570 case ISD::FSIN:
571 case ISD::FTRUNC:
572 case ISD::FRINT:
573 case ISD::FNEARBYINT:
574 case ISD::FCANONICALIZE:
575 case AMDGPUISD::RCP:
576 case AMDGPUISD::RCP_LEGACY:
577 case AMDGPUISD::RCP_IFLAG:
578 case AMDGPUISD::SIN_HW:
579 case AMDGPUISD::FMUL_LEGACY:
580 case AMDGPUISD::FMIN_LEGACY:
581 case AMDGPUISD::FMAX_LEGACY:
582 case AMDGPUISD::FMED3:
583 // TODO: handle llvm.amdgcn.fma.legacy
584 return true;
585 case ISD::BITCAST:
586 llvm_unreachable("bitcast is special cased");
587 default:
588 return false;
589 }
590}
591
592static bool fnegFoldsIntoOp(const SDNode *N) {
593 unsigned Opc = N->getOpcode();
594 if (Opc == ISD::BITCAST) {
595 // TODO: Is there a benefit to checking the conditions performFNegCombine
596 // does? We don't for the other cases.
597 SDValue BCSrc = N->getOperand(0);
598 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
599 return BCSrc.getNumOperands() == 2 &&
600 BCSrc.getOperand(1).getValueSizeInBits() == 32;
601 }
602
603 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
604 }
605
606 return fnegFoldsIntoOpcode(Opc);
607}
608
609/// \returns true if the operation will definitely need to use a 64-bit
610/// encoding, and thus will use a VOP3 encoding regardless of the source
611/// modifiers.
612LLVM_READONLY
613static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
614 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
615 VT == MVT::f64;
616}
617
618/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
619/// type used by ISD::SELECT.
620LLVM_READONLY
621static bool selectSupportsSourceMods(const SDNode *N) {
622 // TODO: Only applies if select will be vector
623 return N->getValueType(0) == MVT::f32;
624}
625
626// Most FP instructions support source modifiers, but this could be refined
627// slightly.
628LLVM_READONLY
629static bool hasSourceMods(const SDNode *N) {
630 if (isa<MemSDNode>(N))
631 return false;
632
633 switch (N->getOpcode()) {
634 case ISD::CopyToReg:
635 case ISD::FDIV:
636 case ISD::FREM:
637 case ISD::INLINEASM:
638 case ISD::INLINEASM_BR:
639 case AMDGPUISD::DIV_SCALE:
640 case ISD::INTRINSIC_W_CHAIN:
641
642 // TODO: Should really be looking at the users of the bitcast. These are
643 // problematic because bitcasts are used to legalize all stores to integer
644 // types.
645 case ISD::BITCAST:
646 return false;
647 case ISD::INTRINSIC_WO_CHAIN: {
648 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
649 case Intrinsic::amdgcn_interp_p1:
650 case Intrinsic::amdgcn_interp_p2:
651 case Intrinsic::amdgcn_interp_mov:
652 case Intrinsic::amdgcn_interp_p1_f16:
653 case Intrinsic::amdgcn_interp_p2_f16:
654 return false;
655 default:
656 return true;
657 }
658 }
659 case ISD::SELECT:
660 return selectSupportsSourceMods(N);
661 default:
662 return true;
663 }
664}
665
666bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
667 unsigned CostThreshold) {
668 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
669 // it is truly free to use a source modifier in all cases. If there are
670 // multiple users, but using a source modifier for each one would require a
671 // VOP3 encoding, there will be a code size increase. Try to avoid increasing
672 // code size unless we know it will save on the instruction count.
673 unsigned NumMayIncreaseSize = 0;
674 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
675
676 assert(!N->use_empty());
677
678 // XXX - Should this limit number of uses to check?
679 for (const SDNode *U : N->uses()) {
680 if (!hasSourceMods(U))
681 return false;
682
683 if (!opMustUseVOP3Encoding(U, VT)) {
684 if (++NumMayIncreaseSize > CostThreshold)
685 return false;
686 }
687 }
688
689 return true;
690}
691
692EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
693 ISD::NodeType ExtendKind) const {
694 assert(!VT.isVector() && "only scalar expected");
695
696 // Round to the next multiple of 32-bits.
697 unsigned Size = VT.getSizeInBits();
698 if (Size <= 32)
699 return MVT::i32;
700 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
701}
702
703MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
704 return MVT::i32;
705}
706
707bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
708 return true;
709}
710
711// The backend supports 32 and 64 bit floating point immediates.
712// FIXME: Why are we reporting vectors of FP immediates as legal?
713bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
714 bool ForCodeSize) const {
715 EVT ScalarVT = VT.getScalarType();
716 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
717 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
718}
719
720// We don't want to shrink f64 / f32 constants.
721bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
722 EVT ScalarVT = VT.getScalarType();
723 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
724}
725
726bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
727 ISD::LoadExtType ExtTy,
728 EVT NewVT) const {
729 // TODO: This may be worth removing. Check regression tests for diffs.
730 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
731 return false;
732
733 unsigned NewSize = NewVT.getStoreSizeInBits();
734
735 // If we are reducing to a 32-bit load or a smaller multi-dword load,
736 // this is always better.
737 if (NewSize >= 32)
738 return true;
739
740 EVT OldVT = N->getValueType(0);
741 unsigned OldSize = OldVT.getStoreSizeInBits();
742
743 MemSDNode *MN = cast<MemSDNode>(N);
744 unsigned AS = MN->getAddressSpace();
745 // Do not shrink an aligned scalar load to sub-dword.
746 // Scalar engine cannot do sub-dword loads.
747 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
748 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
749 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
750 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
751 MN->isInvariant())) &&
752 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
753 return false;
754
755 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
756 // extloads, so doing one requires using a buffer_load. In cases where we
757 // still couldn't use a scalar load, using the wider load shouldn't really
758 // hurt anything.
759
760 // If the old size already had to be an extload, there's no harm in continuing
761 // to reduce the width.
762 return (OldSize < 32);
763}
764
765bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
766 const SelectionDAG &DAG,
767 const MachineMemOperand &MMO) const {
768
769 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
770
771 if (LoadTy.getScalarType() == MVT::i32)
772 return false;
773
774 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
775 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
776
777 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
778 return false;
779
780 unsigned Fast = 0;
781 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
782 CastTy, MMO, &Fast) &&
783 Fast;
784}
785
786// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
787// profitable with the expansion for 64-bit since it's generally good to
788// speculate things.
789bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
790 return true;
791}
792
793bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
794 return true;
795}
796
797bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
798 switch (N->getOpcode()) {
799 case ISD::EntryToken:
800 case ISD::TokenFactor:
801 return true;
802 case ISD::INTRINSIC_WO_CHAIN: {
803 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
804 switch (IntrID) {
805 case Intrinsic::amdgcn_readfirstlane:
806 case Intrinsic::amdgcn_readlane:
807 return true;
808 }
809 return false;
810 }
811 case ISD::LOAD:
812 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
813 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
814 return true;
815 return false;
816 case AMDGPUISD::SETCC: // ballot-style instruction
817 return true;
818 }
819 return false;
820}
821
822SDValue AMDGPUTargetLowering::getNegatedExpression(
823 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
824 NegatibleCost &Cost, unsigned Depth) const {
825
826 switch (Op.getOpcode()) {
827 case ISD::FMA:
828 case ISD::FMAD: {
829 // Negating a fma is not free if it has users without source mods.
830 if (!allUsesHaveSourceMods(Op.getNode()))
831 return SDValue();
832 break;
833 }
834 case AMDGPUISD::RCP: {
835 SDValue Src = Op.getOperand(0);
836 EVT VT = Op.getValueType();
837 SDLoc SL(Op);
838
839 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
840 ForCodeSize, Cost, Depth + 1);
841 if (NegSrc)
842 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
843 return SDValue();
844 }
845 default:
846 break;
847 }
848
849 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
850 ForCodeSize, Cost, Depth);
851}
852
853//===---------------------------------------------------------------------===//
854// Target Properties
855//===---------------------------------------------------------------------===//
856
857bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
858 assert(VT.isFloatingPoint());
859
860 // Packed operations do not have a fabs modifier.
861 return VT == MVT::f32 || VT == MVT::f64 ||
862 (Subtarget->has16BitInsts() && VT == MVT::f16);
863}
864
865bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
866 assert(VT.isFloatingPoint());
867 // Report this based on the end legalized type.
868 VT = VT.getScalarType();
869 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
870}
871
872bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
873 unsigned NumElem,
874 unsigned AS) const {
875 return true;
876}
877
878bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
879 // There are few operations which truly have vector input operands. Any vector
880 // operation is going to involve operations on each component, and a
881 // build_vector will be a copy per element, so it always makes sense to use a
882 // build_vector input in place of the extracted element to avoid a copy into a
883 // super register.
884 //
885 // We should probably only do this if all users are extracts only, but this
886 // should be the common case.
887 return true;
888}
889
890bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
891 // Truncate is just accessing a subregister.
892
893 unsigned SrcSize = Source.getSizeInBits();
894 unsigned DestSize = Dest.getSizeInBits();
895
896 return DestSize < SrcSize && DestSize % 32 == 0;
897}
898
899bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
900 // Truncate is just accessing a subregister.
901
902 unsigned SrcSize = Source->getScalarSizeInBits();
903 unsigned DestSize = Dest->getScalarSizeInBits();
904
905 if (DestSize == 16 && Subtarget->has16BitInsts())
906 return SrcSize >= 32;
907
908 return DestSize < SrcSize && DestSize % 32 == 0;
909}
910
911bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
912 unsigned SrcSize = Src->getScalarSizeInBits();
913 unsigned DestSize = Dest->getScalarSizeInBits();
914
915 if (SrcSize == 16 && Subtarget->has16BitInsts())
916 return DestSize >= 32;
917
918 return SrcSize == 32 && DestSize == 64;
919}
920
921bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
922 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
923 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
924 // this will enable reducing 64-bit operations to 32-bit, which is always
925 // good.
926
927 if (Src == MVT::i16)
928 return Dest == MVT::i32 || Dest == MVT::i64;
929
930 return Src == MVT::i32 && Dest == MVT::i64;
931}
932
933bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
934 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
935 // limited number of native 64-bit operations. Shrinking an operation to fit
936 // in a single 32-bit register should always be helpful. As currently used,
937 // this is much less general than the name suggests, and is only used in
938 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
939 // not profitable, and may actually be harmful.
940 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
941}
942
943bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
944 const SDNode* N, CombineLevel Level) const {
945 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
946 N->getOpcode() == ISD::SRL) &&
947 "Expected shift op");
948 // Always commute pre-type legalization and right shifts.
949 // We're looking for shl(or(x,y),z) patterns.
950 if (Level < CombineLevel::AfterLegalizeTypes ||
951 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
952 return true;
953
954 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
955 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
956 (N->use_begin()->getOpcode() == ISD::SRA ||
957 N->use_begin()->getOpcode() == ISD::SRL))
958 return false;
959
960 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
961 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
962 if (LHS.getOpcode() != ISD::SHL)
963 return false;
964 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
965 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
966 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
967 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
968 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
969 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
970 };
971 SDValue LHS = N->getOperand(0).getOperand(0);
972 SDValue RHS = N->getOperand(0).getOperand(1);
973 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
974}
975
976//===---------------------------------------------------------------------===//
977// TargetLowering Callbacks
978//===---------------------------------------------------------------------===//
979
980CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
981 bool IsVarArg) {
982 switch (CC) {
983 case CallingConv::AMDGPU_VS:
984 case CallingConv::AMDGPU_GS:
985 case CallingConv::AMDGPU_PS:
986 case CallingConv::AMDGPU_CS:
987 case CallingConv::AMDGPU_HS:
988 case CallingConv::AMDGPU_ES:
989 case CallingConv::AMDGPU_LS:
990 return CC_AMDGPU;
991 case CallingConv::C:
992 case CallingConv::Fast:
993 case CallingConv::Cold:
994 return CC_AMDGPU_Func;
995 case CallingConv::AMDGPU_Gfx:
996 return CC_SI_Gfx;
997 case CallingConv::AMDGPU_KERNEL:
998 case CallingConv::SPIR_KERNEL:
999 default:
1000 report_fatal_error("Unsupported calling convention for call");
1001 }
1002}
1003
1004CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1005 bool IsVarArg) {
1006 switch (CC) {
1007 case CallingConv::AMDGPU_KERNEL:
1008 case CallingConv::SPIR_KERNEL:
1009 llvm_unreachable("kernels should not be handled here");
1010 case CallingConv::AMDGPU_VS:
1011 case CallingConv::AMDGPU_GS:
1012 case CallingConv::AMDGPU_PS:
1013 case CallingConv::AMDGPU_CS:
1014 case CallingConv::AMDGPU_HS:
1015 case CallingConv::AMDGPU_ES:
1016 case CallingConv::AMDGPU_LS:
1017 return RetCC_SI_Shader;
1018 case CallingConv::AMDGPU_Gfx:
1019 return RetCC_SI_Gfx;
1020 case CallingConv::C:
1021 case CallingConv::Fast:
1022 case CallingConv::Cold:
1023 return RetCC_AMDGPU_Func;
1024 default:
1025 report_fatal_error("Unsupported calling convention.");
1026 }
1027}
1028
1029/// The SelectionDAGBuilder will automatically promote function arguments
1030/// with illegal types. However, this does not work for the AMDGPU targets
1031/// since the function arguments are stored in memory as these illegal types.
1032/// In order to handle this properly we need to get the original type sizes
1033/// from the LLVM IR Function and fix up the ISD::InputArg values before
1034/// passing them to AnalyzeFormalArguments()
1035
1036/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1037/// input values across multiple registers. Each item in the Ins array
1038/// represents a single value that will be stored in registers. Ins[x].VT is
1039/// the value type of the value that will be stored in the register, so
1040/// whatever SDNode we lower the argument to needs to be this type.
1041///
1042/// In order to correctly lower the arguments we need to know the size of each
1043/// argument. Since Ins[x].VT gives us the size of the register that will
1044/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1045/// for the original function argument so that we can deduce the correct memory
1046/// type to use for Ins[x]. In most cases the correct memory type will be
1047/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1048/// we have a kernel argument of type v8i8, this argument will be split into
1049/// 8 parts and each part will be represented by its own item in the Ins array.
1050/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1051/// the argument before it was split. From this, we deduce that the memory type
1052/// for each individual part is i8. We pass the memory type as LocVT to the
1053/// calling convention analysis function and the register type (Ins[x].VT) as
1054/// the ValVT.
1055void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1056 CCState &State,
1057 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1058 const MachineFunction &MF = State.getMachineFunction();
1059 const Function &Fn = MF.getFunction();
1060 LLVMContext &Ctx = Fn.getParent()->getContext();
1061 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1062 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1063 CallingConv::ID CC = Fn.getCallingConv();
1064
1065 Align MaxAlign = Align(1);
1066 uint64_t ExplicitArgOffset = 0;
1067 const DataLayout &DL = Fn.getParent()->getDataLayout();
1068
1069 unsigned InIndex = 0;
1070
1071 for (const Argument &Arg : Fn.args()) {
1072 const bool IsByRef = Arg.hasByRefAttr();
1073 Type *BaseArgTy = Arg.getType();
1074 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1075 Align Alignment = DL.getValueOrABITypeAlignment(
1076 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1077 MaxAlign = std::max(Alignment, MaxAlign);
1078 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1079
1080 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1081 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1082
1083 // We're basically throwing away everything passed into us and starting over
1084 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1085 // to us as computed in Ins.
1086 //
1087 // We also need to figure out what type legalization is trying to do to get
1088 // the correct memory offsets.
1089
1090 SmallVector<EVT, 16> ValueVTs;
1091 SmallVector<uint64_t, 16> Offsets;
1092 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1093
1094 for (unsigned Value = 0, NumValues = ValueVTs.size();
1095 Value != NumValues; ++Value) {
1096 uint64_t BasePartOffset = Offsets[Value];
1097
1098 EVT ArgVT = ValueVTs[Value];
1099 EVT MemVT = ArgVT;
1100 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1101 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1102
1103 if (NumRegs == 1) {
1104 // This argument is not split, so the IR type is the memory type.
1105 if (ArgVT.isExtended()) {
1106 // We have an extended type, like i24, so we should just use the
1107 // register type.
1108 MemVT = RegisterVT;
1109 } else {
1110 MemVT = ArgVT;
1111 }
1112 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1113 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1114 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1115 // We have a vector value which has been split into a vector with
1116 // the same scalar type, but fewer elements. This should handle
1117 // all the floating-point vector types.
1118 MemVT = RegisterVT;
1119 } else if (ArgVT.isVector() &&
1120 ArgVT.getVectorNumElements() == NumRegs) {
1121 // This arg has been split so that each element is stored in a separate
1122 // register.
1123 MemVT = ArgVT.getScalarType();
1124 } else if (ArgVT.isExtended()) {
1125 // We have an extended type, like i65.
1126 MemVT = RegisterVT;
1127 } else {
1128 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1129 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1130 if (RegisterVT.isInteger()) {
1131 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1132 } else if (RegisterVT.isVector()) {
1133 assert(!RegisterVT.getScalarType().isFloatingPoint());
1134 unsigned NumElements = RegisterVT.getVectorNumElements();
1135 assert(MemoryBits % NumElements == 0);
1136 // This vector type has been split into another vector type with
1137 // a different elements size.
1138 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1139 MemoryBits / NumElements);
1140 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1141 } else {
1142 llvm_unreachable("cannot deduce memory type.");
1143 }
1144 }
1145
1146 // Convert one element vectors to scalar.
1147 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1148 MemVT = MemVT.getScalarType();
1149
1150 // Round up vec3/vec5 argument.
1151 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1152 assert(MemVT.getVectorNumElements() == 3 ||
1153 MemVT.getVectorNumElements() == 5 ||
1154 (MemVT.getVectorNumElements() >= 9 &&
1155 MemVT.getVectorNumElements() <= 12));
1156 MemVT = MemVT.getPow2VectorType(State.getContext());
1157 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1158 MemVT = MemVT.getRoundIntegerType(State.getContext());
1159 }
1160
1161 unsigned PartOffset = 0;
1162 for (unsigned i = 0; i != NumRegs; ++i) {
1163 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1164 BasePartOffset + PartOffset,
1165 MemVT.getSimpleVT(),
1166 CCValAssign::Full));
1167 PartOffset += MemVT.getStoreSize();
1168 }
1169 }
1170 }
1171}
1172
1173SDValue AMDGPUTargetLowering::LowerReturn(
1174 SDValue Chain, CallingConv::ID CallConv,
1175 bool isVarArg,
1176 const SmallVectorImpl<ISD::OutputArg> &Outs,
1177 const SmallVectorImpl<SDValue> &OutVals,
1178 const SDLoc &DL, SelectionDAG &DAG) const {
1179 // FIXME: Fails for r600 tests
1180 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1181 // "wave terminate should not have return values");
1182 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1183}
1184
1185//===---------------------------------------------------------------------===//
1186// Target specific lowering
1187//===---------------------------------------------------------------------===//
1188
1189/// Selects the correct CCAssignFn for a given CallingConvention value.
1190CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1191 bool IsVarArg) {
1192 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1193}
1194
1195CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1196 bool IsVarArg) {
1197 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1198}
1199
1200SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1201 SelectionDAG &DAG,
1202 MachineFrameInfo &MFI,
1203 int ClobberedFI) const {
1204 SmallVector<SDValue, 8> ArgChains;
1205 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1206 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1207
1208 // Include the original chain at the beginning of the list. When this is
1209 // used by target LowerCall hooks, this helps legalize find the
1210 // CALLSEQ_BEGIN node.
1211 ArgChains.push_back(Chain);
1212
1213 // Add a chain value for each stack argument corresponding to the clobbered slot.
1214 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1215 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1216 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1217 if (FI->getIndex() < 0) {
1218 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1219 int64_t InLastByte = InFirstByte;
1220 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1221
1222 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1223 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1224 ArgChains.push_back(SDValue(L, 1));
1225 }
1226 }
1227 }
1228 }
1229
1230 // Build a tokenfactor for all the chains.
1231 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1232}
1233
1234SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1235 SmallVectorImpl<SDValue> &InVals,
1236 StringRef Reason) const {
1237 SDValue Callee = CLI.Callee;
1238 SelectionDAG &DAG = CLI.DAG;
1239
1240 const Function &Fn = DAG.getMachineFunction().getFunction();
1241
1242 StringRef FuncName("<unknown>");
1243
1244 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1245 FuncName = G->getSymbol();
1246 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1247 FuncName = G->getGlobal()->getName();
1248
1249 DiagnosticInfoUnsupported NoCalls(
1250 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1251 DAG.getContext()->diagnose(NoCalls);
1252
1253 if (!CLI.IsTailCall) {
1254 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1255 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1256 }
1257
1258 return DAG.getEntryNode();
1259}
1260
1261SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1262 SmallVectorImpl<SDValue> &InVals) const {
1263 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1264}
1265
1266SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1267 SelectionDAG &DAG) const {
1268 const Function &Fn = DAG.getMachineFunction().getFunction();
1269
1270 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1271 SDLoc(Op).getDebugLoc());
1272 DAG.getContext()->diagnose(NoDynamicAlloca);
1273 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1274 return DAG.getMergeValues(Ops, SDLoc());
1275}
1276
1277SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1278 SelectionDAG &DAG) const {
1279 switch (Op.getOpcode()) {
1280 default:
1281 Op->print(errs(), &DAG);
1282 llvm_unreachable("Custom lowering code for this "
1283 "instruction is not implemented yet!");
1284 break;
1285 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1286 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1287 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1288 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1289 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1290 case ISD::FREM: return LowerFREM(Op, DAG);
1291 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1292 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1293 case ISD::FRINT: return LowerFRINT(Op, DAG);
1294 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1295 case ISD::FROUNDEVEN:
1296 return LowerFROUNDEVEN(Op, DAG);
1297 case ISD::FROUND: return LowerFROUND(Op, DAG);
1298 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1299 case ISD::FLOG:
1300 return LowerFLOG(Op, DAG, numbers::ln2f);
1301 case ISD::FLOG10:
1302 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1303 case ISD::FEXP:
1304 return lowerFEXP(Op, DAG);
1305 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1306 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1307 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1308 case ISD::FP_TO_SINT:
1309 case ISD::FP_TO_UINT:
1310 return LowerFP_TO_INT(Op, DAG);
1311 case ISD::CTTZ:
1312 case ISD::CTTZ_ZERO_UNDEF:
1313 case ISD::CTLZ:
1314 case ISD::CTLZ_ZERO_UNDEF:
1315 return LowerCTLZ_CTTZ(Op, DAG);
1316 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1317 }
1318 return Op;
1319}
1320
1321void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1322 SmallVectorImpl<SDValue> &Results,
1323 SelectionDAG &DAG) const {
1324 switch (N->getOpcode()) {
1325 case ISD::SIGN_EXTEND_INREG:
1326 // Different parts of legalization seem to interpret which type of
1327 // sign_extend_inreg is the one to check for custom lowering. The extended
1328 // from type is what really matters, but some places check for custom
1329 // lowering of the result type. This results in trying to use
1330 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1331 // nothing here and let the illegal result integer be handled normally.
1332 return;
1333 default:
1334 return;
1335 }
1336}
1337
1338SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1339 SDValue Op,
1340 SelectionDAG &DAG) const {
1341
1342 const DataLayout &DL = DAG.getDataLayout();
1343 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1344 const GlobalValue *GV = G->getGlobal();
1345
1346 if (!MFI->isModuleEntryFunction()) {
1347 if (std::optional<uint32_t> Address =
1348 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
1349 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1350 }
1351 }
1352
1353 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1354 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1355 if (!MFI->isModuleEntryFunction() &&
1356 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1357 SDLoc DL(Op);
1358 const Function &Fn = DAG.getMachineFunction().getFunction();
1359 DiagnosticInfoUnsupported BadLDSDecl(
1360 Fn, "local memory global used by non-kernel function",
1361 DL.getDebugLoc(), DS_Warning);
1362 DAG.getContext()->diagnose(BadLDSDecl);
1363
1364 // We currently don't have a way to correctly allocate LDS objects that
1365 // aren't directly associated with a kernel. We do force inlining of
1366 // functions that use local objects. However, if these dead functions are
1367 // not eliminated, we don't want a compile time error. Just emit a warning
1368 // and a trap, since there should be no callable path here.
1369 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1370 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1371 Trap, DAG.getRoot());
1372 DAG.setRoot(OutputChain);
1373 return DAG.getUNDEF(Op.getValueType());
1374 }
1375
1376 // XXX: What does the value of G->getOffset() mean?
1377 assert(G->getOffset() == 0 &&
1378 "Do not know what to do with an non-zero offset");
1379
1380 // TODO: We could emit code to handle the initialization somewhere.
1381 // We ignore the initializer for now and legalize it to allow selection.
1382 // The initializer will anyway get errored out during assembly emission.
1383 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1384 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1385 }
1386 return SDValue();
1387}
1388
1389SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1390 SelectionDAG &DAG) const {
1391 SmallVector<SDValue, 8> Args;
1392 SDLoc SL(Op);
1393
1394 EVT VT = Op.getValueType();
1395 if (VT.getVectorElementType().getSizeInBits() < 32) {
1396 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1397 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1398 unsigned NewNumElt = OpBitSize / 32;
1399 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1400 : EVT::getVectorVT(*DAG.getContext(),
1401 MVT::i32, NewNumElt);
1402 for (const SDUse &U : Op->ops()) {
1403 SDValue In = U.get();
1404 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1405 if (NewNumElt > 1)
1406 DAG.ExtractVectorElements(NewIn, Args);
1407 else
1408 Args.push_back(NewIn);
1409 }
1410
1411 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1412 NewNumElt * Op.getNumOperands());
1413 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1414 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1415 }
1416 }
1417
1418 for (const SDUse &U : Op->ops())
1419 DAG.ExtractVectorElements(U.get(), Args);
1420
1421 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1422}
1423
1424SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1425 SelectionDAG &DAG) const {
1426 SDLoc SL(Op);
1427 SmallVector<SDValue, 8> Args;
1428 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1429 EVT VT = Op.getValueType();
1430 EVT SrcVT = Op.getOperand(0).getValueType();
1431
1432 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1433 unsigned NumElt = VT.getVectorNumElements();
1434 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1435 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1436
1437 // We have some TableGen patterns for when the extracted vector is exactly
1438 // the low or high half of the operand.
1439 if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt))
1440 return Op;
1441
1442 // Extract 32-bit registers at a time.
1443 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1444 EVT NewVT = NumElt == 2
1445 ? MVT::i32
1446 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1447 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1448
1449 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1450 if (NumElt == 2)
1451 Tmp = Args[0];
1452 else
1453 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1454
1455 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1456 }
1457
1458 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1459 VT.getVectorNumElements());
1460
1461 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1462}
1463
1464// TODO: Handle fabs too
1465static SDValue peekFNeg(SDValue Val) {
1466 if (Val.getOpcode() == ISD::FNEG)
1467 return Val.getOperand(0);
1468
1469 return Val;
1470}
1471SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1472 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1473 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1474 SelectionDAG &DAG = DCI.DAG;
1475 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1476 switch (CCOpcode) {
1477 case ISD::SETOEQ:
1478 case ISD::SETONE:
1479 case ISD::SETUNE:
1480 case ISD::SETNE:
1481 case ISD::SETUEQ:
1482 case ISD::SETEQ:
1483 case ISD::SETFALSE:
1484 case ISD::SETFALSE2:
1485 case ISD::SETTRUE:
1486 case ISD::SETTRUE2:
1487 case ISD::SETUO:
1488 case ISD::SETO:
1489 break;
1490 case ISD::SETULE:
1491 case ISD::SETULT: {
1492 if (LHS == True)
1493 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1494 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1495 }
1496 case ISD::SETOLE:
1497 case ISD::SETOLT:
1498 case ISD::SETLE:
1499 case ISD::SETLT: {
1500 // Ordered. Assume ordered for undefined.
1501
1502 // Only do this after legalization to avoid interfering with other combines
1503 // which might occur.
1504 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1505 !DCI.isCalledByLegalizer())
1506 return SDValue();
1507
1508 // We need to permute the operands to get the correct NaN behavior. The
1509 // selected operand is the second one based on the failing compare with NaN,
1510 // so permute it based on the compare type the hardware uses.
1511 if (LHS == True)
1512 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1513 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1514 }
1515 case ISD::SETUGE:
1516 case ISD::SETUGT: {
1517 if (LHS == True)
1518 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1519 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1520 }
1521 case ISD::SETGT:
1522 case ISD::SETGE:
1523 case ISD::SETOGE:
1524 case ISD::SETOGT: {
1525 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1526 !DCI.isCalledByLegalizer())
1527 return SDValue();
1528
1529 if (LHS == True)
1530 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1531 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1532 }
1533 case ISD::SETCC_INVALID:
1534 llvm_unreachable("Invalid setcc condcode!");
1535 }
1536 return SDValue();
1537}
1538
1539/// Generate Min/Max node
1540SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1541 SDValue LHS, SDValue RHS,
1542 SDValue True, SDValue False,
1543 SDValue CC,
1544 DAGCombinerInfo &DCI) const {
1545 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1546 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1547
1548 SelectionDAG &DAG = DCI.DAG;
1549
1550 // If we can't directly match this, try to see if we can fold an fneg to
1551 // match.
1552
1553 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1554 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1555 SDValue NegTrue = peekFNeg(True);
1556
1557 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1558 // fmin/fmax.
1559 //
1560 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1561 // -> fneg (fmin_legacy lhs, K)
1562 //
1563 // TODO: Use getNegatedExpression
1564 if (LHS == NegTrue && CFalse && CRHS) {
1565 APFloat NegRHS = neg(CRHS->getValueAPF());
1566 if (NegRHS == CFalse->getValueAPF()) {
1567 SDValue Combined =
1568 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1569 if (Combined)
1570 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1571 return SDValue();
1572 }
1573 }
1574
1575 return SDValue();
1576}
1577
1578std::pair<SDValue, SDValue>
1579AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1580 SDLoc SL(Op);
1581
1582 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1583
1584 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1585 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1586
1587 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1588 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1589
1590 return std::pair(Lo, Hi);
1591}
1592
1593SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1594 SDLoc SL(Op);
1595
1596 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1597 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1598 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1599}
1600
1601SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1602 SDLoc SL(Op);
1603
1604 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1605 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1606 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1607}
1608
1609// Split a vector type into two parts. The first part is a power of two vector.
1610// The second part is whatever is left over, and is a scalar if it would
1611// otherwise be a 1-vector.
1612std::pair<EVT, EVT>
1613AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1614 EVT LoVT, HiVT;
1615 EVT EltVT = VT.getVectorElementType();
1616 unsigned NumElts = VT.getVectorNumElements();
1617 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1618 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1619 HiVT = NumElts - LoNumElts == 1
1620 ? EltVT
1621 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1622 return std::pair(LoVT, HiVT);
1623}
1624
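A minimal host-side sketch of the split rule above, mirroring the PowerOf2Ceil((N + 1) / 2) choice; powerOf2Ceil and splitElementCounts are illustrative stand-ins (assumed names, not LLVM APIs):

#include <cassert>
#include <utility>

// Mirror of getSplitDestVTs: the low part gets PowerOf2Ceil((N + 1) / 2)
// elements, the high part gets what is left (a scalar when only 1 remains).
static unsigned powerOf2Ceil(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

static std::pair<unsigned, unsigned> splitElementCounts(unsigned NumElts) {
  unsigned Lo = powerOf2Ceil((NumElts + 1) / 2);
  return {Lo, NumElts - Lo}; // a Hi count of 1 corresponds to a scalar type
}

int main() {
  assert(splitElementCounts(3) == std::make_pair(2u, 1u)); // v3 -> v2 + scalar
  assert(splitElementCounts(5) == std::make_pair(4u, 1u)); // v5 -> v4 + scalar
  assert(splitElementCounts(7) == std::make_pair(4u, 3u)); // v7 -> v4 + v3
  return 0;
}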
1625// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1626// scalar.
1627std::pair<SDValue, SDValue>
1628AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1629 const EVT &LoVT, const EVT &HiVT,
1630 SelectionDAG &DAG) const {
1631 assert(LoVT.getVectorNumElements() +
1632 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1633 N.getValueType().getVectorNumElements() &&
1634 "More vector elements requested than available!");
1635 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1636 DAG.getVectorIdxConstant(0, DL));
1637 SDValue Hi = DAG.getNode(
1638 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1639 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1640 return std::pair(Lo, Hi);
1641}
1642
1643SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1644 SelectionDAG &DAG) const {
1645 LoadSDNode *Load = cast<LoadSDNode>(Op);
1646 EVT VT = Op.getValueType();
1647 SDLoc SL(Op);
1648
1649
1650 // If this is a 2 element vector, we really want to scalarize and not create
1651 // weird 1 element vectors.
1652 if (VT.getVectorNumElements() == 2) {
1653 SDValue Ops[2];
1654 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1655 return DAG.getMergeValues(Ops, SL);
1656 }
1657
1658 SDValue BasePtr = Load->getBasePtr();
1659 EVT MemVT = Load->getMemoryVT();
1660
1661 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1662
1663 EVT LoVT, HiVT;
1664 EVT LoMemVT, HiMemVT;
1665 SDValue Lo, Hi;
1666
1667 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1668 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1669 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1670
1671 unsigned Size = LoMemVT.getStoreSize();
1672 Align BaseAlign = Load->getAlign();
1673 Align HiAlign = commonAlignment(BaseAlign, Size);
1674
1675 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1676 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1677 BaseAlign, Load->getMemOperand()->getFlags());
1678 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1679 SDValue HiLoad =
1680 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1681 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1682 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1683
1684 SDValue Join;
1685 if (LoVT == HiVT) {
1686 // This is the case that the vector is power of two so was evenly split.
1687 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1688 } else {
1689 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1690 DAG.getVectorIdxConstant(0, SL));
1691 Join = DAG.getNode(
1692 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1693 VT, Join, HiLoad,
1694 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1695 }
1696
1697 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1698 LoLoad.getValue(1), HiLoad.getValue(1))};
1699
1700 return DAG.getMergeValues(Ops, SL);
1701}
1702
1703SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1704 SelectionDAG &DAG) const {
1705 LoadSDNode *Load = cast<LoadSDNode>(Op);
1706 EVT VT = Op.getValueType();
1707 SDValue BasePtr = Load->getBasePtr();
1708 EVT MemVT = Load->getMemoryVT();
1709 SDLoc SL(Op);
1710 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1711 Align BaseAlign = Load->getAlign();
1712 unsigned NumElements = MemVT.getVectorNumElements();
1713
1714 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1715 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1716 if (NumElements != 3 ||
1717 (BaseAlign < Align(8) &&
1718 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1719 return SplitVectorLoad(Op, DAG);
1720
1721 assert(NumElements == 3);
1722
1723 EVT WideVT =
1724 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1725 EVT WideMemVT =
1726 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1727 SDValue WideLoad = DAG.getExtLoad(
1728 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1729 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1730 return DAG.getMergeValues(
1731 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1732 DAG.getVectorIdxConstant(0, SL)),
1733 WideLoad.getValue(1)},
1734 SL);
1735}
1736
1737SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1738 SelectionDAG &DAG) const {
1739 StoreSDNode *Store = cast<StoreSDNode>(Op);
1740 SDValue Val = Store->getValue();
1741 EVT VT = Val.getValueType();
1742
1743 // If this is a 2 element vector, we really want to scalarize and not create
1744 // weird 1 element vectors.
1745 if (VT.getVectorNumElements() == 2)
1746 return scalarizeVectorStore(Store, DAG);
1747
1748 EVT MemVT = Store->getMemoryVT();
1749 SDValue Chain = Store->getChain();
1750 SDValue BasePtr = Store->getBasePtr();
1751 SDLoc SL(Op);
1752
1753 EVT LoVT, HiVT;
1754 EVT LoMemVT, HiMemVT;
1755 SDValue Lo, Hi;
1756
1757 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1758 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1759 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1760
1761 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1762
1763 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1764 Align BaseAlign = Store->getAlign();
1765 unsigned Size = LoMemVT.getStoreSize();
1766 Align HiAlign = commonAlignment(BaseAlign, Size);
1767
1768 SDValue LoStore =
1769 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1770 Store->getMemOperand()->getFlags());
1771 SDValue HiStore =
1772 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1773 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1774
1775 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1776}
1777
1778// This is a shortcut for integer division because we have fast i32<->f32
1779// conversions, and fast f32 reciprocal instructions. The fractional part of a
1780// float is enough to accurately represent up to a 24-bit signed integer.
1781SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1782 bool Sign) const {
1783 SDLoc DL(Op);
1784 EVT VT = Op.getValueType();
1785 SDValue LHS = Op.getOperand(0);
1786 SDValue RHS = Op.getOperand(1);
1787 MVT IntVT = MVT::i32;
1788 MVT FltVT = MVT::f32;
1789
1790 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1791 if (LHSSignBits < 9)
1792 return SDValue();
1793
1794 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1795 if (RHSSignBits < 9)
1796 return SDValue();
1797
1798 unsigned BitSize = VT.getSizeInBits();
1799 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1800 unsigned DivBits = BitSize - SignBits;
1801 if (Sign)
1802 ++DivBits;
1803
1804 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1805 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1806
1807 SDValue jq = DAG.getConstant(1, DL, IntVT);
1808
1809 if (Sign) {
1810 // char|short jq = ia ^ ib;
1811 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1812
1813 // jq = jq >> (bitsize - 2)
1814 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1815 DAG.getConstant(BitSize - 2, DL, VT));
1816
1817 // jq = jq | 0x1
1818 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1819 }
1820
1821 // int ia = (int)LHS;
1822 SDValue ia = LHS;
1823
1824 // int ib = (int)RHS;
1825 SDValue ib = RHS;
1826
1827 // float fa = (float)ia;
1828 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1829
1830 // float fb = (float)ib;
1831 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1832
1833 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1834 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1835
1836 // fq = trunc(fq);
1837 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1838
1839 // float fqneg = -fq;
1840 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1841
1842 MachineFunction &MF = DAG.getMachineFunction();
1843
1844 bool UseFmadFtz = false;
1845 if (Subtarget->isGCN()) {
1846 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1847 UseFmadFtz = MFI->getMode().allFP32Denormals();
1848 }
1849
1850 // float fr = mad(fqneg, fb, fa);
1851 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1852 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1853 : (unsigned)ISD::FMAD;
1854 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1855
1856 // int iq = (int)fq;
1857 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1858
1859 // fr = fabs(fr);
1860 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1861
1862 // fb = fabs(fb);
1863 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1864
1865 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1866
1867 // int cv = fr >= fb;
1868 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1869
1870 // jq = (cv ? jq : 0);
1871 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1872
1873 // dst = iq + jq;
1874 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1875
1876 // Rem needs compensation; it's easier to recompute it.
1877 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1878 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1879
1880 // Truncate to number of bits this divide really is.
1881 if (Sign) {
1882 SDValue InRegSize
1883 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1884 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1885 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1886 } else {
1887 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1888 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1889 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1890 }
1891
1892 return DAG.getMergeValues({ Div, Rem }, DL);
1893}
1894
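A rough scalar sketch of the DIVREM24 idea above, with ordinary C++ float division standing in for the hardware RCP (an assumption; the real lowering's accuracy argument is tied to the hardware reciprocal). divrem24 is a hypothetical helper and is only exercised on small operands here:

#include <cassert>
#include <cmath>
#include <cstdint>

// Signed 24-bit divrem via float math: truncated float quotient, then a
// +/-1 correction when the absolute remainder shows the quotient fell short.
static void divrem24(int32_t ia, int32_t ib, int32_t &Div, int32_t &Rem) {
  int32_t jq = ((ia ^ ib) >> 30) | 1;          // +1 or -1, sign of the result
  float fa = (float)ia, fb = (float)ib;
  float fq = std::trunc(fa * (1.0f / fb));     // truncated quotient estimate
  int32_t iq = (int32_t)fq;
  float fr = std::fabs(std::fma(-fq, fb, fa)); // |ia - iq*ib| as a float
  Div = (fr >= std::fabs(fb)) ? iq + jq : iq;  // correct by one if needed
  Rem = ia - Div * ib;
}

int main() {
  for (int32_t a : {83, -83, 12345, -1}) {
    for (int32_t b : {7, -7, 113}) {
      int32_t d, r;
      divrem24(a, b, d, r);
      assert(d == a / b && r == a % b);
    }
  }
  return 0;
}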
1895void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1896 SelectionDAG &DAG,
1897 SmallVectorImpl<SDValue> &Results) const {
1898 SDLoc DL(Op);
1899 EVT VT = Op.getValueType();
1900
1901 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1902
1903 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1904
1905 SDValue One = DAG.getConstant(1, DL, HalfVT);
1906 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1907
1908 //HiLo split
1909 SDValue LHS_Lo, LHS_Hi;
1910 SDValue LHS = Op.getOperand(0);
1911 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
1912
1913 SDValue RHS_Lo, RHS_Hi;
1914 SDValue RHS = Op.getOperand(1);
1915 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
1916
1917 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1918 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1919
1920 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1921 LHS_Lo, RHS_Lo);
1922
1923 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1924 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1925
1926 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1927 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1928 return;
1929 }
1930
1931 if (isTypeLegal(MVT::i64)) {
1932 // The algorithm here is based on ideas from "Software Integer Division",
1933 // Tom Rodeheffer, August 2008.
1934
1935 MachineFunction &MF = DAG.getMachineFunction();
1936 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1937
1938 // Compute denominator reciprocal.
1939 unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1940 (unsigned)ISD::FMA :
1941 !MFI->getMode().allFP32Denormals() ?
1942 (unsigned)ISD::FMAD :
1943 (unsigned)AMDGPUISD::FMAD_FTZ;
1944
1945 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1946 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1947 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1948 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1949 Cvt_Lo);
1950 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1951 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1952 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1953 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1954 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1955 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1956 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1957 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1958 Mul1);
1959 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1960 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1961 SDValue Rcp64 = DAG.getBitcast(VT,
1962 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1963
1964 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1965 SDValue One64 = DAG.getConstant(1, DL, VT);
1966 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1967 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1968
1969 // First round of UNR (Unsigned integer Newton-Raphson).
1970 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1971 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1972 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1973 SDValue Mulhi1_Lo, Mulhi1_Hi;
1974 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
1975 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
1976 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
1977 Mulhi1_Lo, Zero1);
1978 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
1979 Mulhi1_Hi, Add1_Lo.getValue(1));
1980 SDValue Add1 = DAG.getBitcast(VT,
1981 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1982
1983 // Second round of UNR.
1984 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1985 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1986 SDValue Mulhi2_Lo, Mulhi2_Hi;
1987 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
1988 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
1989 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
1990 Mulhi2_Lo, Zero1);
1991 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
1992 Mulhi2_Hi, Add2_Lo.getValue(1));
1993 SDValue Add2 = DAG.getBitcast(VT,
1994 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1995
1996 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1997
1998 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1999
2000 SDValue Mul3_Lo, Mul3_Hi;
2001 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2002 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2003 Mul3_Lo, Zero1);
2004 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2005 Mul3_Hi, Sub1_Lo.getValue(1));
2006 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2007 SDValue Sub1 = DAG.getBitcast(VT,
2008 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2009
2010 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2011 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2012 ISD::SETUGE);
2013 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2014 ISD::SETUGE);
2015 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2016
2017 // TODO: Here and below portions of the code can be enclosed into if/endif.
2018 // Currently control flow is unconditional and we have 4 selects after
2019 // potential endif to substitute PHIs.
2020
2021 // if C3 != 0 ...
2022 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2023 RHS_Lo, Zero1);
2024 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2025 RHS_Hi, Sub1_Lo.getValue(1));
2026 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2027 Zero, Sub2_Lo.getValue(1));
2028 SDValue Sub2 = DAG.getBitcast(VT,
2029 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2030
2031 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2032
2033 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2034 ISD::SETUGE);
2035 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2036 ISD::SETUGE);
2037 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2038
2039 // if (C6 != 0)
2040 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2041
2042 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2043 RHS_Lo, Zero1);
2044 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2045 RHS_Hi, Sub2_Lo.getValue(1));
2046 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2047 Zero, Sub3_Lo.getValue(1));
2048 SDValue Sub3 = DAG.getBitcast(VT,
2049 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2050
2051 // endif C6
2052 // endif C3
2053
2054 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2055 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2056
2057 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2058 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2059
2060 Results.push_back(Div);
2061 Results.push_back(Rem);
2062
2063 return;
2064 }
2065
2066 // r600 expansion.
2067 // Get Speculative values
2068 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2069 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2070
2071 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2072 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2073 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2074
2075 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2076 SDValue DIV_Lo = Zero;
2077
2078 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2079
2080 for (unsigned i = 0; i < halfBitWidth; ++i) {
2081 const unsigned bitPos = halfBitWidth - i - 1;
2082 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2083 // Get value of high bit
2084 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2085 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2086 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2087
2088 // Shift
2089 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2090 // Add LHS high bit
2091 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2092
2093 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2094 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2095
2096 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2097
2098 // Update REM
2099 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2100 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2101 }
2102
2103 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2104 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2105 Results.push_back(DIV);
2106 Results.push_back(REM);
2107}
2108
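The f32 constants fed to getConstantFP in the reciprocal setup above are bit patterns: 0x4f800000 is 2^32, 0x2f800000 is 2^-32, 0xcf800000 is -2^32, and 0x5f7ffffc sits just below 2^64. A small standalone check; bitsToFloat here is a hypothetical stand-in for APInt(32, ...).bitsToFloat():

#include <cassert>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F)); // reinterpret the IEEE-754 bit pattern
  return F;
}

int main() {
  assert(bitsToFloat(0x4f800000u) == 4294967296.0f);         //  2^32
  assert(bitsToFloat(0x2f800000u) == 1.0f / 4294967296.0f);  //  2^-32
  assert(bitsToFloat(0xcf800000u) == -4294967296.0f);        // -2^32
  assert(bitsToFloat(0x5f7ffffcu) < 18446744073709551616.0f); // just below 2^64
  return 0;
}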
2109SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2110 SelectionDAG &DAG) const {
2111 SDLoc DL(Op);
2112 EVT VT = Op.getValueType();
2113
2114 if (VT == MVT::i64) {
2115 SmallVector<SDValue, 2> Results;
2116 LowerUDIVREM64(Op, DAG, Results);
2117 return DAG.getMergeValues(Results, DL);
2118 }
2119
2120 if (VT == MVT::i32) {
2121 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2122 return Res;
2123 }
2124
2125 SDValue X = Op.getOperand(0);
2126 SDValue Y = Op.getOperand(1);
2127
2128 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2129 // algorithm used here.
2130
2131 // Initial estimate of inv(y).
2132 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2133
2134 // One round of UNR.
2135 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2136 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2137 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2138 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2139
2140 // Quotient/remainder estimate.
2141 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2142 SDValue R =
2143 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2144
2145 // First quotient/remainder refinement.
2146 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2147 SDValue One = DAG.getConstant(1, DL, VT);
2148 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2149 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2150 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2151 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2152 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2153
2154 // Second quotient/remainder refinement.
2155 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2156 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2157 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2158 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2159 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2160
2161 return DAG.getMergeValues({Q, R}, DL);
2162}
2163
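A scalar model of the i32 expansion above: a reciprocal estimate, one Newton-Raphson step on the inverse, the quotient/remainder estimate, then the two conditional refinements. The estimate (floor(2^32 / y), clamped to UINT32_MAX) is a stand-in assumption for AMDGPUISD::URECIP; udivrem32 and umulhi are illustrative names:

#include <cassert>
#include <cstdint>

static uint32_t umulhi(uint32_t a, uint32_t b) {
  return (uint32_t)(((uint64_t)a * b) >> 32);
}

static void udivrem32(uint32_t x, uint32_t y, uint32_t &Q, uint32_t &R) {
  assert(y != 0 && "division by zero not modeled");
  // Stand-in for URECIP: an estimate of 2^32 / y, clamped to 32 bits.
  uint64_t Est = 0x100000000ull / y;
  uint32_t Z = Est > 0xffffffffull ? 0xffffffffu : (uint32_t)Est;
  // One round of unsigned Newton-Raphson on the inverse.
  uint32_t NegY = 0u - y;
  uint32_t NegYZ = NegY * Z;
  Z = Z + umulhi(Z, NegYZ);
  // Quotient/remainder estimate.
  Q = umulhi(x, Z);
  R = x - Q * y;
  // Two conditional refinements, as in the lowering above.
  for (int i = 0; i < 2; ++i) {
    if (R >= y) {
      Q += 1;
      R -= y;
    }
  }
}

int main() {
  uint32_t q, r;
  udivrem32(100, 7, q, r);
  assert(q == 14 && r == 2);
  udivrem32(0xffffffffu, 3, q, r);
  assert(q == 0x55555555u && r == 0);
  udivrem32(123456789u, 1u, q, r);
  assert(q == 123456789u && r == 0);
  return 0;
}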
2164SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2165 SelectionDAG &DAG) const {
2166 SDLoc DL(Op);
2167 EVT VT = Op.getValueType();
2168
2169 SDValue LHS = Op.getOperand(0);
2170 SDValue RHS = Op.getOperand(1);
2171
2172 SDValue Zero = DAG.getConstant(0, DL, VT);
2173 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2174
2175 if (VT == MVT::i32) {
2176 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2177 return Res;
2178 }
2179
2180 if (VT == MVT::i64 &&
2181 DAG.ComputeNumSignBits(LHS) > 32 &&
2182 DAG.ComputeNumSignBits(RHS) > 32) {
2183 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2184
2185 //HiLo split
2186 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2187 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2188 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2189 LHS_Lo, RHS_Lo);
2190 SDValue Res[2] = {
2191 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2192 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2193 };
2194 return DAG.getMergeValues(Res, DL);
2195 }
2196
2197 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2198 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2199 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2200 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2201
2202 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2203 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2204
2205 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2206 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2207
2208 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2209 SDValue Rem = Div.getValue(1);
2210
2211 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2212 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2213
2214 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2215 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2216
2217 SDValue Res[2] = {
2218 Div,
2219 Rem
2220 };
2221 return DAG.getMergeValues(Res, DL);
2222}
2223
2224// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2225SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2226 SDLoc SL(Op);
2227 EVT VT = Op.getValueType();
2228 auto Flags = Op->getFlags();
2229 SDValue X = Op.getOperand(0);
2230 SDValue Y = Op.getOperand(1);
2231
2232 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2233 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2234 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2235 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2236 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2237}
2238
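A worked instance of the identity in the comment above; fremLowered is a hypothetical helper, and because the fdiv is rounded, this expansion can differ slightly from a correctly rounded fmod for very large quotients:

#include <cassert>
#include <cmath>

// frem(x, y) lowered as fma(-trunc(x / y), y, x), i.e. x - trunc(x/y) * y.
static double fremLowered(double x, double y) {
  return std::fma(-std::trunc(x / y), y, x);
}

int main() {
  assert(fremLowered(5.5, 2.0) == 1.5);   // trunc(2.75) = 2, 5.5 - 4.0 = 1.5
  assert(fremLowered(-5.5, 2.0) == -1.5); // remainder keeps the sign of x
  assert(fremLowered(7.0, 3.5) == 0.0);
  return 0;
}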
2239SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2240 SDLoc SL(Op);
2241 SDValue Src = Op.getOperand(0);
2242
2243 // result = trunc(src)
2244 // if (src > 0.0 && src != result)
2245 // result += 1.0
2246
2247 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2248
2249 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2250 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2251
2252 EVT SetCCVT =
2253 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2254
2255 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2256 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2257 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2258
2259 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2260 // TODO: Should this propagate fast-math-flags?
2261 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2262}
2263
2264static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2265 SelectionDAG &DAG) {
2266 const unsigned FractBits = 52;
2267 const unsigned ExpBits = 11;
2268
2269 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2270 Hi,
2271 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2272 DAG.getConstant(ExpBits, SL, MVT::i32));
2273 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2274 DAG.getConstant(1023, SL, MVT::i32));
2275
2276 return Exp;
2277}
2278
2279SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2280 SDLoc SL(Op);
2281 SDValue Src = Op.getOperand(0);
2282
2283 assert(Op.getValueType() == MVT::f64);
2284
2285 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2286
2287 // Extract the upper half, since this is where we will find the sign and
2288 // exponent.
2289 SDValue Hi = getHiHalf64(Src, DAG);
2290
2291 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2292
2293 const unsigned FractBits = 52;
2294
2295 // Extract the sign bit.
2296 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2297 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2298
2299 // Extend back to 64-bits.
2300 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2301 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2302
2303 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2304 const SDValue FractMask
2305 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2306
2307 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2308 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2309 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2310
2311 EVT SetCCVT =
2312 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2313
2314 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2315
2316 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2317 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2318
2319 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2320 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2321
2322 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2323}
2324
2325SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2326 SDLoc SL(Op);
2327 SDValue Src = Op.getOperand(0);
2328
2329 assert(Op.getValueType() == MVT::f64);
2330
2331 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2332 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2333 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2334
2335 // TODO: Should this propagate fast-math-flags?
2336
2337 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2338 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2339
2340 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2341
2342 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2343 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2344
2345 EVT SetCCVT =
2346 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2347 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2348
2349 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2350}
2351
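A scalar sketch of the 0x1.0p+52 trick used above, assuming the default round-to-nearest-even mode; rintLowered is an illustrative stand-in:

#include <cassert>
#include <cmath>

// Round-to-nearest-integer for f64: adding and subtracting copysign(2^52, x)
// forces the fraction bits to be rounded away in the current rounding mode.
static double rintLowered(double x) {
  const double C1 = 0x1.0p+52;             // 2^52
  const double C2 = 0x1.fffffffffffffp+51; // largest f64 with a fraction
  double s = std::copysign(C1, x);
  double t = (x + s) - s;
  return std::fabs(x) > C2 ? x : t;        // large values are already integral
}

int main() {
  assert(rintLowered(2.5) == 2.0);   // ties go to even
  assert(rintLowered(3.5) == 4.0);
  assert(rintLowered(-1.25) == -1.0);
  assert(rintLowered(0x1.8p+53) == 0x1.8p+53); // passes through unchanged
  return 0;
}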
2352SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2353 // FNEARBYINT and FRINT are the same, except in their handling of FP
2354 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2355 // rint, so just treat them as equivalent.
2356 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2357}
2358
2359SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2360 SelectionDAG &DAG) const {
2361 auto VT = Op.getValueType();
2362 auto Arg = Op.getOperand(0u);
2363 return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
2364}
2365
2366// XXX - May require not supporting f32 denormals?
2367
2368// Don't handle v2f16. The extra instructions to scalarize and repack around the
2369// compare and vselect end up producing worse code than scalarizing the whole
2370// operation.
2371SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2372 SDLoc SL(Op);
2373 SDValue X = Op.getOperand(0);
2374 EVT VT = Op.getValueType();
2375
2376 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2377
2378 // TODO: Should this propagate fast-math-flags?
2379
2380 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2381
2382 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2383
2384 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2385 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2386 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2387
2388 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2389
2390 EVT SetCCVT =
2391 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2392
2393 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2394
2395 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2396
2397 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2398}
2399
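A scalar model of the expansion above (truncate, then add copysign(1.0, x) when the dropped fraction is at least one half); roundLowered is a hypothetical helper:

#include <cassert>
#include <cmath>

// Round half away from zero, as FROUND is lowered above.
static double roundLowered(double x) {
  double t = std::trunc(x);
  double diff = x - t;
  double sel = (std::fabs(diff) >= 0.5) ? std::copysign(1.0, x) : 0.0;
  return t + sel;
}

int main() {
  assert(roundLowered(2.5) == 3.0);   // halfway rounds away from zero
  assert(roundLowered(-2.5) == -3.0);
  assert(roundLowered(1.49) == 1.0);
  assert(roundLowered(-0.75) == -1.0);
  return 0;
}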
2400SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2401 SDLoc SL(Op);
2402 SDValue Src = Op.getOperand(0);
2403
2404 // result = trunc(src);
2405 // if (src < 0.0 && src != result)
2406 // result += -1.0.
2407
2408 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2409
2410 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2411 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2412
2413 EVT SetCCVT =
2414 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2415
2416 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2417 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2418 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2419
2420 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2421 // TODO: Should this propagate fast-math-flags?
2422 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2423}
2424
2425SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2426 double Log2BaseInverted) const {
2427 EVT VT = Op.getValueType();
2428
2429 SDLoc SL(Op);
2430 SDValue Operand = Op.getOperand(0);
2431 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2432 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2433
2434 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2435}
2436
2437// exp2(M_LOG2E_F * f);
2438SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2439 EVT VT = Op.getValueType();
2440 SDLoc SL(Op);
2441 SDValue Src = Op.getOperand(0);
2442
2443 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2444 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2445 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2446}
2447
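A minimal sketch of the identity used above, exp(x) = exp2(x * log2(e)); expLowered and the tolerance in the check are assumptions for illustration:

#include <cassert>
#include <cmath>

// Only a base-2 exponential is needed once the argument is pre-scaled.
static float expLowered(float x) {
  const float Log2E = 1.4426950408889634f; // numbers::log2e
  return std::exp2(x * Log2E);
}

int main() {
  // The expansion tracks expf closely for moderate inputs.
  for (float x : {-4.0f, -1.0f, 0.0f, 0.5f, 3.0f}) {
    float Ref = std::exp(x);
    assert(std::fabs(expLowered(x) - Ref) <= std::fabs(Ref) * 1e-6f + 1e-7f);
  }
  return 0;
}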
2448static bool isCtlzOpc(unsigned Opc) {
2449 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2450}
2451
2452static bool isCttzOpc(unsigned Opc) {
2453 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2454}
2455
2456SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2457 SDLoc SL(Op);
2458 SDValue Src = Op.getOperand(0);
2459
2460 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2461 bool Ctlz = isCtlzOpc(Op.getOpcode());
2462 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2463
2464 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2465 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2466
2467 if (Src.getValueType() == MVT::i32) {
2468 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2469 // (cttz hi:lo) -> (umin (ffbl src), 32)
2470 // (ctlz_zero_undef src) -> (ffbh src)
2471 // (cttz_zero_undef src) -> (ffbl src)
2472 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2473 if (!ZeroUndef) {
2474 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2475 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2476 }
2477 return NewOpr;
2478 }
2479
2480 SDValue Lo, Hi;
2481 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2482
2483 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2484 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2485
2486 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2487 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2488 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2489 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2490
2491 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2492 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2493 if (Ctlz)
2494 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2495 else
2496 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2497
2498 SDValue NewOpr;
2499 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2500 if (!ZeroUndef) {
2501 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2502 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2503 }
2504
2505 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2506}
2507
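A scalar model of the 64-bit ctlz expansion above, assuming the hardware ffbh convention of returning -1 for a zero input (which is why UADDSAT appears in the non-zero-undef form); ffbh32, uaddsat32 and ctlz64 are illustrative stand-ins:

#include <cassert>
#include <cstdint>

// Model of v_ffbh_u32: count leading zeros, all-ones when the input is zero.
static uint32_t ffbh32(uint32_t x) {
  if (x == 0)
    return 0xffffffffu;
  uint32_t n = 0;
  for (uint32_t bit = 0x80000000u; !(x & bit); bit >>= 1)
    ++n;
  return n;
}

static uint32_t uaddsat32(uint32_t a, uint32_t b) {
  uint32_t s = a + b;
  return s < a ? 0xffffffffu : s; // saturate on wraparound
}

// (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
static uint32_t ctlz64(uint64_t v) {
  uint32_t Lo = (uint32_t)v, Hi = (uint32_t)(v >> 32);
  uint32_t OprLo = uaddsat32(ffbh32(Lo), 32);
  uint32_t OprHi = ffbh32(Hi);
  uint32_t Min = OprLo < OprHi ? OprLo : OprHi;
  return Min < 64 ? Min : 64; // clamp for the all-zero input
}

int main() {
  assert(ctlz64(0) == 64);
  assert(ctlz64(1) == 63);
  assert(ctlz64(0x0000000100000000ull) == 31);
  assert(ctlz64(0x8000000000000000ull) == 0);
  return 0;
}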
2508SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2509 bool Signed) const {
2510 // The regular method converting a 64-bit integer to float roughly consists of
2511 // 2 steps: normalization and rounding. In fact, after normalization, the
2512 // conversion from a 64-bit integer to a float is essentially the same as the
2513 // one from a 32-bit integer. The only difference is that it has more
2514 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2515 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2516 // converted into the correct float number. The basic steps for the unsigned
2517 // conversion are illustrated in the following pseudo code:
2518 //
2519 // f32 uitofp(i64 u) {
2520 // i32 hi, lo = split(u);
2521 // // Only count the leading zeros in hi as we have native support of the
2522 // // conversion from i32 to f32. If hi is all 0s, the conversion is
2523 // // reduced to a 32-bit one automatically.
2524 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2525 // u <<= shamt;
2526 // hi, lo = split(u);
2527 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2528 // // convert it as a 32-bit integer and scale the result back.
2529 // return uitofp(hi) * 2^(32 - shamt);
2530 // }
2531 //
2532 // The signed one follows the same principle but uses 'ffbh_i32' to count its
2533 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2534 // converted, followed by negation based on its sign bit.
2535
2536 SDLoc SL(Op);
2537 SDValue Src = Op.getOperand(0);
2538
2539 SDValue Lo, Hi;
2540 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2541 SDValue Sign;
2542 SDValue ShAmt;
2543 if (Signed && Subtarget->isGCN()) {
2544 // We also need to consider the sign bit in Lo if Hi has just sign bits,
2545 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2546 // account. That is, the maximal shift is
2547 // - 32 if Lo and Hi have opposite signs;
2548 // - 33 if Lo and Hi have the same sign.
2549 //
2550 // Or, MaxShAmt = 33 + OppositeSign, where
2551 //
2552 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2553 // - -1 if Lo and Hi have opposite signs; and
2554 // - 0 otherwise.
2555 //
2556 // All in all, ShAmt is calculated as
2557 //
2558 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2559 //
2560 // or
2561 //
2562 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2563 //
2564 // to reduce the critical path.
2565 SDValue OppositeSign = DAG.getNode(
2566 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2567 DAG.getConstant(31, SL, MVT::i32));
2568 SDValue MaxShAmt =
2569 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2570 OppositeSign);
2571 // Count the leading sign bits.
2572 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2573 // Different from unsigned conversion, the shift should be one bit less to
2574 // preserve the sign bit.
2575 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2576 DAG.getConstant(1, SL, MVT::i32));
2577 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2578 } else {
2579 if (Signed) {
2580 // Without 'ffbh_i32', only leading zeros could be counted. Take the
2581 // absolute value first.
2582 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2583 DAG.getConstant(63, SL, MVT::i64));
2584 SDValue Abs =
2585 DAG.getNode(ISD::XOR, SL, MVT::i64,
2586 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2587 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2588 }
2589 // Count the leading zeros.
2590 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2591 // The shift amount for signed integers is [0, 32].
2592 }
2593 // Normalize the given 64-bit integer.
2594 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2595 // Split it again.
2596 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2597 // Calculate the adjust bit for rounding.
2598 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2599 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2600 DAG.getConstant(1, SL, MVT::i32), Lo);
2601 // Get the 32-bit normalized integer.
2602 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2603 // Convert the normalized 32-bit integer into f32.
2604 unsigned Opc =
2605 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2606 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2607
2608 // Finally, need to scale back the converted floating number as the original
2609 // 64-bit integer is converted as a 32-bit one.
2610 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2611 ShAmt);
2612 // On GCN, use LDEXP directly.
2613 if (Subtarget->isGCN())
2614 return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2615
2616 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2617 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2618 // exponent is enough to avoid overflowing into the sign bit.
2619 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2620 DAG.getConstant(23, SL, MVT::i32));
2621 SDValue IVal =
2622 DAG.getNode(ISD::ADD, SL, MVT::i32,
2623 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2624 if (Signed) {
2625 // Set the sign bit.
2626 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2627 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2628 DAG.getConstant(31, SL, MVT::i32));
2629 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2630 }
2631 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2632}
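// Editorial sketch (not part of this file): the unsigned scheme from the
// comment above LowerINT_TO_FP32, written as plain scalar C++. The helper name
// is invented, and the host's __builtin_clz / std::ldexp stand in for the
// ffbh/ldexp operations; assumes <cmath> and <cstdint> are available.
static float uitofp64ToF32Reference(uint64_t U) {
  uint32_t Hi = static_cast<uint32_t>(U >> 32);
  unsigned ShAmt = Hi ? __builtin_clz(Hi) : 32; // clz(hi); 32 if hi is all 0s.
  U <<= ShAmt;
  Hi = static_cast<uint32_t>(U >> 32);
  uint32_t Lo = static_cast<uint32_t>(U);
  Hi |= (Lo != 0) ? 1 : 0;                      // Fold the sticky bits into hi.
  // Convert as a 32-bit integer and scale the result back.
  return std::ldexp(static_cast<float>(Hi), 32 - static_cast<int>(ShAmt));
}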
2633
2634SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2635 bool Signed) const {
2636 SDLoc SL(Op);
2637 SDValue Src = Op.getOperand(0);
2638
2639 SDValue Lo, Hi;
2640 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2641
2642 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2643 SL, MVT::f64, Hi);
2644
2645 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2646
2647 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2648 DAG.getConstant(32, SL, MVT::i32));
2649 // TODO: Should this propagate fast-math-flags?
2650 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2651}
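// Editorial sketch (not part of this file): LowerINT_TO_FP64 in scalar form.
// The helper name is invented and the host's std::ldexp stands in for
// AMDGPUISD::LDEXP; assumes <cmath> and <cstdint>.
static double sitofp64ToF64Reference(int64_t S) {
  uint32_t Lo = static_cast<uint32_t>(S);     // Unsigned low half.
  int32_t Hi = static_cast<int32_t>(S >> 32); // Signed high half.
  // Both 32-bit halves convert exactly to f64; only the final add rounds.
  return std::ldexp(static_cast<double>(Hi), 32) + static_cast<double>(Lo);
}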
2652
2653SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2654 SelectionDAG &DAG) const {
2655 // TODO: Factor out code common with LowerSINT_TO_FP.
2656 EVT DestVT = Op.getValueType();
2657 SDValue Src = Op.getOperand(0);
2658 EVT SrcVT = Src.getValueType();
2659
2660 if (SrcVT == MVT::i16) {
2661 if (DestVT == MVT::f16)
2662 return Op;
2663 SDLoc DL(Op);
2664
2665 // Promote src to i32
2666 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2667 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2668 }
2669
2670 assert(SrcVT == MVT::i64 && "operation should be legal");
2671
2672 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2673 SDLoc DL(Op);
2674
2675 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2676 SDValue FPRoundFlag =
2677 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2678 SDValue FPRound =
2679 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2680
2681 return FPRound;
2682 }
2683
2684 if (DestVT == MVT::f32)
2685 return LowerINT_TO_FP32(Op, DAG, false);
2686
2687 assert(DestVT == MVT::f64);
2688 return LowerINT_TO_FP64(Op, DAG, false);
2689}
2690
2691SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2692 SelectionDAG &DAG) const {
2693 EVT DestVT = Op.getValueType();
2694
2695 SDValue Src = Op.getOperand(0);
2696 EVT SrcVT = Src.getValueType();
2697
2698 if (SrcVT == MVT::i16) {
2699 if (DestVT == MVT::f16)
2700 return Op;
2701
2702 SDLoc DL(Op);
2703 // Promote src to i32
2704 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2705 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2706 }
2707
2708 assert(SrcVT == MVT::i64 && "operation should be legal");
2709
2710 // TODO: Factor out code common with LowerUINT_TO_FP.
2711
2712 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2713 SDLoc DL(Op);
2714 SDValue Src = Op.getOperand(0);
2715
2716 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2717 SDValue FPRoundFlag =
2718 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2719 SDValue FPRound =
2720 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2721
2722 return FPRound;
2723 }
2724
2725 if (DestVT == MVT::f32)
2726 return LowerINT_TO_FP32(Op, DAG, true);
2727
2728 assert(DestVT == MVT::f64);
2729 return LowerINT_TO_FP64(Op, DAG, true);
2730}
2731
2732SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2733 bool Signed) const {
2734 SDLoc SL(Op);
2735
2736 SDValue Src = Op.getOperand(0);
2737 EVT SrcVT = Src.getValueType();
2738
2739 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2740
2741 // The basic idea of converting a floating point number into a pair of 32-bit
2742 // integers is illustrated as follows:
2743 //
2744 // tf := trunc(val);
2745 // hif := floor(tf * 2^-32);
2746 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2747 // hi := fptoi(hif);
2748 // lo := fptoi(lof);
2749 //
2750 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2751 SDValue Sign;
2752 if (Signed && SrcVT == MVT::f32) {
2753 // However, a 32-bit floating point number has only a 23-bit mantissa,
2754 // which is not enough to hold all the significant bits of `lof` if val is
2755 // negative. To avoid the loss of precision, we need to take the absolute
2756 // value after truncating and flip the result back based on the original
2757 // signedness.
2758 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2759 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2760 DAG.getConstant(31, SL, MVT::i32));
2761 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2762 }
2763
2764 SDValue K0, K1;
2765 if (SrcVT == MVT::f64) {
2766 K0 = DAG.getConstantFP(
2767 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
2768 SrcVT);
2769 K1 = DAG.getConstantFP(
2770 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
2771 SrcVT);
2772 } else {
2773 K0 = DAG.getConstantFP(
2774 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
2775 K1 = DAG.getConstantFP(
2776 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
2777 }
2778 // TODO: Should this propagate fast-math-flags?
2779 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2780
2781 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2782
2783 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2784
2785 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2786 : ISD::FP_TO_UINT,
2787 SL, MVT::i32, FloorMul);
2788 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2789
2790 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2791 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2792
2793 if (Signed && SrcVT == MVT::f32) {
2794 assert(Sign);
2795 // Flip the result based on the signedness, which is either all 0s or 1s.
2796 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2797 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2798 // r := xor(r, sign) - sign;
2799 Result =
2800 DAG.getNode(ISD::SUB, SL, MVT::i64,
2801 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2802 }
2803
2804 return Result;
2805}
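// Editorial sketch (not part of this file): the unsigned f64 case of the split
// above in scalar form. The helper name is invented and the input is assumed
// to be non-negative and in range; requires <cmath> and <cstdint>.
static uint64_t fptoui64Reference(double Val) {
  double Tf = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1p-32);   // tf * 2^-32, floored.
  double Lof = std::fma(Hif, -0x1p32, Tf); // tf - hif * 2^32, always >= 0.
  uint32_t Hi = static_cast<uint32_t>(Hif);
  uint32_t Lo = static_cast<uint32_t>(Lof);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}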
2806
2807SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2808 SDLoc DL(Op);
2809 SDValue N0 = Op.getOperand(0);
2810
2811 // Convert to target node to get known bits
2812 if (N0.getValueType() == MVT::f32)
2813 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2814
2815 if (getTargetMachine().Options.UnsafeFPMath) {
2816 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2817 return SDValue();
2818 }
2819
2820 assert(N0.getSimpleValueType() == MVT::f64);
2821
2822 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2823 const unsigned ExpMask = 0x7ff;
2824 const unsigned ExpBiasf64 = 1023;
2825 const unsigned ExpBiasf16 = 15;
2826 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2827 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2828 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2829 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2830 DAG.getConstant(32, DL, MVT::i64));
2831 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2832 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2833 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2834 DAG.getConstant(20, DL, MVT::i64));
2835 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2836 DAG.getConstant(ExpMask, DL, MVT::i32));
2837 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2838 // add the f16 bias (15) to get the biased exponent for the f16 format.
2839 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2840 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
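  // Worked example (editorial): for N0 == 1.0 the f64 exponent field is
  // 0x3ff == 1023, so E becomes 1023 - 1023 + 15 == 15, which is exactly the
  // biased exponent of 1.0 in the f16 format.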
2841
2842 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2843 DAG.getConstant(8, DL, MVT::i32));
2844 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2845 DAG.getConstant(0xffe, DL, MVT::i32));
2846
2847 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2848 DAG.getConstant(0x1ff, DL, MVT::i32));
2849 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2850
2851 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2852 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2853
2854 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2855 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2856 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2857 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2858
2859 // N = M | (E << 12);
2860 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2861 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2862 DAG.getConstant(12, DL, MVT::i32)));
2863
2864 // B = clamp(1-E, 0, 13);
2865 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2866 One, E);
2867 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2868 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2869 DAG.getConstant(13, DL, MVT::i32));
2870
2871 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2872 DAG.getConstant(0x1000, DL, MVT::i32));
2873
2874 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2875 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2876 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2877 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2878
2879 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2880 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2881 DAG.getConstant(0x7, DL, MVT::i32));
2882 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2883 DAG.getConstant(2, DL, MVT::i32));
2884 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2885 One, Zero, ISD::SETEQ);
2886 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2887 One, Zero, ISD::SETGT);
2888 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2889 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2890
2891 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2892 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2893 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2894 I, V, ISD::SETEQ);
2895
2896 // Extract the sign bit.
2897 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2898 DAG.getConstant(16, DL, MVT::i32));
2899 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2900 DAG.getConstant(0x8000, DL, MVT::i32));
2901
2902 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2903 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2904}
2905
2906SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2907 SelectionDAG &DAG) const {
2908 SDValue Src = Op.getOperand(0);
2909 unsigned OpOpcode = Op.getOpcode();
2910 EVT SrcVT = Src.getValueType();
2911 EVT DestVT = Op.getValueType();
2912
2913 // Will be selected natively
2914 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2915 return Op;
2916
2917 // Promote i16 to i32
2918 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2919 SDLoc DL(Op);
2920
2921 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2922 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2923 }
2924
2925 if (SrcVT == MVT::f16 ||
2926 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2927 SDLoc DL(Op);
2928
2929 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2930 unsigned Ext =
2931 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2932 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2933 }
2934
2935 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2936 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2937
2938 return SDValue();
2939}
2940
2941SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2942 SelectionDAG &DAG) const {
2943 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2944 MVT VT = Op.getSimpleValueType();
2945 MVT ScalarVT = VT.getScalarType();
2946
2947 assert(VT.isVector());
2948
2949 SDValue Src = Op.getOperand(0);
2950 SDLoc DL(Op);
2951
2952 // TODO: Don't scalarize on Evergreen?
2953 unsigned NElts = VT.getVectorNumElements();
2954 SmallVector<SDValue, 8> Args;
2955 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2956
2957 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2958 for (unsigned I = 0; I < NElts; ++I)
2959 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2960
2961 return DAG.getBuildVector(VT, DL, Args);
2962}
2963
2964//===----------------------------------------------------------------------===//
2965// Custom DAG optimizations
2966//===----------------------------------------------------------------------===//
2967
2968static bool isU24(SDValue Op, SelectionDAG &DAG) {
2969 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2970}
2971
2972static bool isI24(SDValue Op, SelectionDAG &DAG) {
2973 EVT VT = Op.getValueType();
2974 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2975 // as unsigned 24-bit values.
2976 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2977}
2978
2979static SDValue simplifyMul24(SDNode *Node24,
2980 TargetLowering::DAGCombinerInfo &DCI) {
2981 SelectionDAG &DAG = DCI.DAG;
2982 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2983 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2984
2985 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2986 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2987 unsigned NewOpcode = Node24->getOpcode();
2988 if (IsIntrin) {
2989 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2990 switch (IID) {
2991 case Intrinsic::amdgcn_mul_i24:
2992 NewOpcode = AMDGPUISD::MUL_I24;
2993 break;
2994 case Intrinsic::amdgcn_mul_u24:
2995 NewOpcode = AMDGPUISD::MUL_U24;
2996 break;
2997 case Intrinsic::amdgcn_mulhi_i24:
2998 NewOpcode = AMDGPUISD::MULHI_I24;
2999 break;
3000 case Intrinsic::amdgcn_mulhi_u24:
3001 NewOpcode = AMDGPUISD::MULHI_U24;
3002 break;
3003 default:
3004 llvm_unreachable("Expected 24-bit mul intrinsic");
3005 }
3006 }
3007
3008 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3009
3010 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3011 // the operands to have other uses, but will only perform simplifications that
3012 // involve bypassing some nodes for this user.
3013 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3014 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3015 if (DemandedLHS || DemandedRHS)
3016 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3017 DemandedLHS ? DemandedLHS : LHS,
3018 DemandedRHS ? DemandedRHS : RHS);
3019
3020 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3021 // operands if this node is the only user.
3022 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3023 return SDValue(Node24, 0);
3024 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3025 return SDValue(Node24, 0);
3026
3027 return SDValue();
3028}
3029
3030template <typename IntTy>
3031static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3032 uint32_t Width, const SDLoc &DL) {
3033 if (Width + Offset < 32) {
3034 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3035 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3036 return DAG.getConstant(Result, DL, MVT::i32);
3037 }
3038
3039 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3040}
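// Worked example (editorial): extracting the 4-bit field at Offset = 4 from
// Src0 = 0xF0 gives Shl = 0xF0 << 24 = 0xF0000000; the signed variant then
// yields int32_t(0xF0000000) >> 28 = -1 (sign-extended 0b1111), while the
// unsigned variant yields uint32_t(0xF0000000) >> 28 = 0xF.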
3041
3042static bool hasVolatileUser(SDNode *Val) {
3043 for (SDNode *U : Val->uses()) {
3044 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3045 if (M->isVolatile())
3046 return true;
3047 }
3048 }
3049
3050 return false;
3051}
3052
3053bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3054 // i32 vectors are the canonical memory type.
3055 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3056 return false;
3057
3058 if (!VT.isByteSized())
3059 return false;
3060
3061 unsigned Size = VT.getStoreSize();
3062
3063 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3064 return false;
3065
3066 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3067 return false;
3068
3069 return true;
3070}
3071
3072// Replace load of an illegal type with a store of a bitcast to a friendlier
3073// type.
3074SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3075 DAGCombinerInfo &DCI) const {
3076 if (!DCI.isBeforeLegalize())
3077 return SDValue();
3078
3079 LoadSDNode *LN = cast<LoadSDNode>(N);
3080 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3081 return SDValue();
3082
3083 SDLoc SL(N);
3084 SelectionDAG &DAG = DCI.DAG;
3085 EVT VT = LN->getMemoryVT();
3086
3087 unsigned Size = VT.getStoreSize();
3088 Align Alignment = LN->getAlign();
3089 if (Alignment < Size && isTypeLegal(VT)) {
3090 unsigned IsFast;
3091 unsigned AS = LN->getAddressSpace();
3092
3093 // Expand unaligned loads earlier than legalization. Due to visitation order
3094 // problems during legalization, the emitted instructions to pack and unpack
3095 // the bytes again are not eliminated in the case of an unaligned copy.
3096 if (!allowsMisalignedMemoryAccesses(
3097 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3098 if (VT.isVector())
3099 return SplitVectorLoad(SDValue(LN, 0), DAG);
3100
3101 SDValue Ops[2];
3102 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3103
3104 return DAG.getMergeValues(Ops, SDLoc(N));
3105 }
3106
3107 if (!IsFast)
3108 return SDValue();
3109 }
3110
3111 if (!shouldCombineMemoryType(VT))
3112 return SDValue();
3113
3114 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3115
3116 SDValue NewLoad
3117 = DAG.getLoad(NewVT, SL, LN->getChain(),
3118 LN->getBasePtr(), LN->getMemOperand());
3119
3120 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3121 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3122 return SDValue(N, 0);
3123}
3124
3125// Replace store of an illegal type with a store of a bitcast to a friendlier
3126// type.
3127SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3128 DAGCombinerInfo &DCI) const {
3129 if (!DCI.isBeforeLegalize())
3130 return SDValue();
3131
3132 StoreSDNode *SN = cast<StoreSDNode>(N);
3133 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3134 return SDValue();
3135
3136 EVT VT = SN->getMemoryVT();
3137 unsigned Size = VT.getStoreSize();
3138
3139 SDLoc SL(N);
3140 SelectionDAG &DAG = DCI.DAG;
3141 Align Alignment = SN->getAlign();
3142 if (Alignment < Size && isTypeLegal(VT)) {
3143 unsigned IsFast;
3144 unsigned AS = SN->getAddressSpace();
3145
3146 // Expand unaligned stores earlier than legalization. Due to visitation
3147 // order problems during legalization, the emitted instructions to pack and
3148 // unpack the bytes again are not eliminated in the case of an unaligned
3149 // copy.
3150 if (!allowsMisalignedMemoryAccesses(
3151 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3152 if (VT.isVector())
3153 return SplitVectorStore(SDValue(SN, 0), DAG);
3154
3155 return expandUnalignedStore(SN, DAG);
3156 }
3157
3158 if (!IsFast)
3159 return SDValue();
3160 }
3161
3162 if (!shouldCombineMemoryType(VT))
3163 return SDValue();
3164
3165 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3166 SDValue Val = SN->getValue();
3167
3168 //DCI.AddToWorklist(Val.getNode());
3169
3170 bool OtherUses = !Val.hasOneUse();
3171 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3172 if (OtherUses) {
3173 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3174 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3175 }
3176
3177 return DAG.getStore(SN->getChain(), SL, CastVal,
3178 SN->getBasePtr(), SN->getMemOperand());
3179}
3180
3181// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3182// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3183// issues.
3184SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3185 DAGCombinerInfo &DCI) const {
3186 SelectionDAG &DAG = DCI.DAG;
3187 SDValue N0 = N->getOperand(0);
3188
3189 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3190 // (vt2 (truncate (assertzext vt0:x, vt1)))
3191 if (N0.getOpcode() == ISD::TRUNCATE) {
3192 SDValue N1 = N->getOperand(1);
3193 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3194 SDLoc SL(N);
3195
3196 SDValue Src = N0.getOperand(0);
3197 EVT SrcVT = Src.getValueType();
3198 if (SrcVT.bitsGE(ExtVT)) {
3199 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3200 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3201 }
3202 }
3203
3204 return SDValue();
3205}
3206
3207SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3208 SDNode *N, DAGCombinerInfo &DCI) const {
3209 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3210 switch (IID) {
3211 case Intrinsic::amdgcn_mul_i24:
3212 case Intrinsic::amdgcn_mul_u24:
3213 case Intrinsic::amdgcn_mulhi_i24:
3214 case Intrinsic::amdgcn_mulhi_u24:
3215 return simplifyMul24(N, DCI);
3216 case Intrinsic::amdgcn_fract:
3217 case Intrinsic::amdgcn_rsq:
3218 case Intrinsic::amdgcn_rcp_legacy:
3219 case Intrinsic::amdgcn_rsq_legacy:
3220 case Intrinsic::amdgcn_rsq_clamp:
3221 case Intrinsic::amdgcn_ldexp: {
3222 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3223 SDValue Src = N->getOperand(1);
3224 return Src.isUndef() ? Src : SDValue();
3225 }
3226 default:
3227 return SDValue();
3228 }
3229}
3230
3231/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3232/// binary operation \p Opc to it with the corresponding constant operands.
3233SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3234 DAGCombinerInfo &DCI, const SDLoc &SL,
3235 unsigned Opc, SDValue LHS,
3236 uint32_t ValLo, uint32_t ValHi) const {
3237 SelectionDAG &DAG = DCI.DAG;
3238 SDValue Lo, Hi;
3239 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3240
3241 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3242 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3243
3244 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3245 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3246
3247 // Re-visit the ands. It's possible we eliminated one of them and it could
3248 // simplify the vector.
3249 DCI.AddToWorklist(Lo.getNode());
3250 DCI.AddToWorklist(Hi.getNode());
3251
3252 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3253 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3254}
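// Editorial sketch (not part of this file): the split above relies on bitwise
// operations acting independently on each 32-bit half, e.g. for AND (helper
// name invented; requires <cstdint>):
static uint64_t and64ViaHalves(uint64_t LHS, uint32_t ValLo, uint32_t ValHi) {
  uint32_t Lo = static_cast<uint32_t>(LHS) & ValLo;
  uint32_t Hi = static_cast<uint32_t>(LHS >> 32) & ValHi;
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}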
3255
3256SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3257 DAGCombinerInfo &DCI) const {
3258 EVT VT = N->getValueType(0);
3259
3260 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3261 if (!RHS)
3262 return SDValue();
3263
3264 SDValue LHS = N->getOperand(0);
3265 unsigned RHSVal = RHS->getZExtValue();
3266 if (!RHSVal)
3267 return LHS;
3268
3269 SDLoc SL(N);
3270 SelectionDAG &DAG = DCI.DAG;
3271
3272 switch (LHS->getOpcode()) {
3273 default:
3274 break;
3275 case ISD::ZERO_EXTEND:
3276 case ISD::SIGN_EXTEND:
3277 case ISD::ANY_EXTEND: {
3278 SDValue X = LHS->getOperand(0);
3279
3280 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3281 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3282 // Prefer build_vector as the canonical form if packed types are legal.
3283 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3284 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3285 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3286 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3287 }
3288
3289 // shl (ext x) => zext (shl x), if shift does not overflow int
3290 if (VT != MVT::i64)
3291 break;
3292 KnownBits Known = DAG.computeKnownBits(X);
3293 unsigned LZ = Known.countMinLeadingZeros();
3294 if (LZ < RHSVal)
3295 break;
3296 EVT XVT = X.getValueType();
3297 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3298 return DAG.getZExtOrTrunc(Shl, SL, VT);
3299 }
3300 }
3301
3302 if (VT != MVT::i64)
3303 return SDValue();
3304
3305 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3306
3307 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3308 // common case, splitting this into a move and a 32-bit shift is faster and
3309 // the same code size.
3310 if (RHSVal < 32)
3311 return SDValue();
3312
3313 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3314
3315 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3316 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3317
3318 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3319
3320 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3321 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3322}
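// Editorial sketch (not part of this file): the identity used above for
// RHSVal >= 32, in scalar form (helper name invented; requires <cstdint>):
static uint64_t shl64ByLargeAmount(uint64_t X, unsigned C) {
  // Valid for 32 <= C < 64: the low 32 bits become zero and the high 32 bits
  // are the low half of X shifted left by C - 32.
  uint32_t NewHi = static_cast<uint32_t>(X) << (C - 32);
  return static_cast<uint64_t>(NewHi) << 32;
}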
3323
3324SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3325 DAGCombinerInfo &DCI) const {
3326 if (N->getValueType(0) != MVT::i64)
3327 return SDValue();
3328
3329 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3330 if (!RHS)
3331 return SDValue();
3332
3333 SelectionDAG &DAG = DCI.DAG;
3334 SDLoc SL(N);
3335 unsigned RHSVal = RHS->getZExtValue();
3336
3337 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3338 if (RHSVal == 32) {
3339 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3340 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3341 DAG.getConstant(31, SL, MVT::i32));
3342
3343 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3344 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3345 }
3346
3347 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3348 if (RHSVal == 63) {
3349 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3350 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3351 DAG.getConstant(31, SL, MVT::i32));
3352 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3353 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3354 }
3355
3356 return SDValue();
3357}
3358
3359SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3360 DAGCombinerInfo &DCI) const {
3361 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3362 if (!RHS)
3363 return SDValue();
3364
3365 EVT VT = N->getValueType(0);
3366 SDValue LHS = N->getOperand(0);
3367 unsigned ShiftAmt = RHS->getZExtValue();
3368 SelectionDAG &DAG = DCI.DAG;
3369 SDLoc SL(N);
3370
3371 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3372 // this improves the ability to match BFE patterns in isel.
3373 if (LHS.getOpcode() == ISD::AND) {
3374 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3375 unsigned MaskIdx, MaskLen;
3376 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3377 MaskIdx == ShiftAmt) {
3378 return DAG.getNode(
3379 ISD::AND, SL, VT,
3380 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3381 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3382 }
3383 }
3384 }
3385
3386 if (VT != MVT::i64)
3387 return SDValue();
3388
3389 if (ShiftAmt < 32)
3390 return SDValue();
3391
3392 // srl i64:x, C for C >= 32
3393 // =>
3394 // build_pair (srl hi_32(x), C - 32), 0
3395 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3396
3397 SDValue Hi = getHiHalf64(LHS, DAG);
3398
3399 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3400 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3401
3402 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3403
3404 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3405}
3406
3407SDValue AMDGPUTargetLowering::performTruncateCombine(
3408 SDNode *N, DAGCombinerInfo &DCI) const {
3409 SDLoc SL(N);
3410 SelectionDAG &DAG = DCI.DAG;
3411 EVT VT = N->getValueType(0);
3412 SDValue Src = N->getOperand(0);
3413
3414 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3415 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3416 SDValue Vec = Src.getOperand(0);
3417 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3418 SDValue Elt0 = Vec.getOperand(0);
3419 EVT EltVT = Elt0.getValueType();
3420 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3421 if (EltVT.isFloatingPoint()) {
3422 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3423 EltVT.changeTypeToInteger(), Elt0);
3424 }
3425
3426 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3427 }
3428 }
3429 }
3430
3431 // Equivalent of above for accessing the high element of a vector as an
3432 // integer operation.
3433 // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
3434 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3435 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3436 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3437 SDValue BV = stripBitcast(Src.getOperand(0));
3438 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3439 BV.getValueType().getVectorNumElements() == 2) {
3440 SDValue SrcElt = BV.getOperand(1);
3441 EVT SrcEltVT = SrcElt.getValueType();
3442 if (SrcEltVT.isFloatingPoint()) {
3443 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3444 SrcEltVT.changeTypeToInteger(), SrcElt);
3445 }
3446
3447 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3448 }
3449 }
3450 }
3451 }
3452
3453 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3454 //
3455 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3456 // i16 (trunc (srl (i32 (trunc x), K)))
3457 if (VT.getScalarSizeInBits() < 32) {
3458 EVT SrcVT = Src.getValueType();
3459 if (SrcVT.getScalarSizeInBits() > 32 &&
3460 (Src.getOpcode() == ISD::SRL ||
3461 Src.getOpcode() == ISD::SRA ||
3462 Src.getOpcode() == ISD::SHL)) {
3463 SDValue Amt = Src.getOperand(1);
3464 KnownBits Known = DAG.computeKnownBits(Amt);
3465
3466 // - For left shifts, do the transform as long as the shift
3467 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
3468 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
3469 // losing information stored in the high bits when truncating.
3470 const unsigned MaxCstSize =
3471 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
3472 if (Known.getMaxValue().ule(MaxCstSize)) {
3473 EVT MidVT = VT.isVector() ?
3474 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3475 VT.getVectorNumElements()) : MVT::i32;
3476
3477 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3478 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3479 Src.getOperand(0));
3480 DCI.AddToWorklist(Trunc.getNode());
3481
3482 if (Amt.getValueType() != NewShiftVT) {
3483 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3484 DCI.AddToWorklist(Amt.getNode());
3485 }
3486
3487 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3488 Trunc, Amt);
3489 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3490 }
3491 }
3492 }
3493
3494 return SDValue();
3495}
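// Concrete instance of the MaxCstSize bound above (editorial note): for
//   i16 (trunc (srl i64:x, K))
// the result needs bits [K, K+15] of x, which still lie within the low 32 bits
// whenever K <= 32 - 16 = 16, so shifting the truncated i32 value is
// equivalent; a left shift only needs K <= 31 to remain a legal i32 shift.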
3496
3497// We need to specifically handle i64 mul here to avoid unnecessary conversion
3498// instructions. If we only match on the legalized i64 mul expansion,
3499// SimplifyDemandedBits will be unable to remove them because there will be
3500// multiple uses due to the separate mul + mulh[su].
3501static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3502 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3503 if (Size <= 32) {
3504 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3505 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3506 }
3507
3508 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3509 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3510
3511 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3512 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3513
3514 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3515}
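// Editorial sketch (not part of this file): for operands known to fit in 24
// bits, the lo/hi pair built above reproduces the full product (helper name
// invented; requires <cstdint>):
static uint64_t mul24PairReference(uint32_t A, uint32_t B) {
  uint64_t Full = static_cast<uint64_t>(A) * B;       // At most 48 bits.
  uint32_t MulLo = static_cast<uint32_t>(Full);       // mul_u24
  uint32_t MulHi = static_cast<uint32_t>(Full >> 32); // mulhi_u24
  return (static_cast<uint64_t>(MulHi) << 32) | MulLo;
}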
3516
3517SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3518 DAGCombinerInfo &DCI) const {
3519 EVT VT = N->getValueType(0);
3520
3521 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3522 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3523 // unnecessarily). isDivergent() is used as an approximation of whether the
3524 // value is in an SGPR.
3525 if (!N->isDivergent())
3526 return SDValue();
3527
3528 unsigned Size = VT.getSizeInBits();
3529 if (VT.isVector() || Size > 64)
3530 return SDValue();
3531
3532 // There are i16 integer mul/mad.
3533 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3534 return SDValue();
3535
3536 SelectionDAG &DAG = DCI.DAG;
3537 SDLoc DL(N);
3538
3539 SDValue N0 = N->getOperand(0);
3540 SDValue N1 = N->getOperand(1);
3541
3542 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3543 // in the source into any_extends if the result of the mul is truncated. Since
3544 // we can assume the high bits are whatever we want, use the underlying value
3545 // to keep the unknown high bits from interfering.
3546 if (N0.getOpcode() == ISD::ANY_EXTEND)
3547 N0 = N0.getOperand(0);
3548
3549 if (N1.getOpcode() == ISD::ANY_EXTEND)
3550 N1 = N1.getOperand(0);
3551
3552 SDValue Mul;
3553
3554 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3555 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3556 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3557 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3558 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3559 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3560 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3561 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3562 } else {
3563 return SDValue();
3564 }
3565
3566 // We need to use sext even for MUL_U24, because MUL_U24 is used
3567 // for signed multiply of 8 and 16-bit types.
3568 return DAG.getSExtOrTrunc(Mul, DL, VT);
3569}
3570
3571SDValue
3572AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3573 DAGCombinerInfo &DCI) const {
3574 if (N->getValueType(0) != MVT::i32)
3575 return SDValue();
3576
3577 SelectionDAG &DAG = DCI.DAG;
3578 SDLoc DL(N);
3579
3580 SDValue N0 = N->getOperand(0);
3581 SDValue N1 = N->getOperand(1);
3582
3583 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3584 // in the source into any_extends if the result of the mul is truncated. Since
3585 // we can assume the high bits are whatever we want, use the underlying value
3586 // to keep the unknown high bits from interfering.
3587 if (N0.getOpcode() == ISD::ANY_EXTEND)
3588 N0 = N0.getOperand(0);
3589 if (N1.getOpcode() == ISD::ANY_EXTEND)
3590 N1 = N1.getOperand(0);
3591
3592 // Try to use two fast 24-bit multiplies (one for each half of the result)
3593 // instead of one slow extending multiply.
3594 unsigned LoOpcode, HiOpcode;
3595 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3596 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3597 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3598 LoOpcode = AMDGPUISD::MUL_U24;
3599 HiOpcode = AMDGPUISD::MULHI_U24;
3600 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3601 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3602 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3603 LoOpcode = AMDGPUISD::MUL_I24;
3604 HiOpcode = AMDGPUISD::MULHI_I24;
3605 } else {
3606 return SDValue();
3607 }
3608
3609 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3610 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3611 DCI.CombineTo(N, Lo, Hi);
3612 return SDValue(N, 0);
3613}
3614
3615SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3616 DAGCombinerInfo &DCI) const {
3617 EVT VT = N->getValueType(0);
3618
3619 if (!Subtarget->hasMulI24() || VT.isVector())
3620 return SDValue();
3621
3622 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3623 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3624 // unnecessarily). isDivergent() is used as an approximation of whether the
3625 // value is in an SGPR.
3626 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3627 // valu op anyway)
3628 if (Subtarget->hasSMulHi() && !N->isDivergent())
3629 return SDValue();
3630
3631 SelectionDAG &DAG = DCI.DAG;
3632 SDLoc DL(N);
3633
3634 SDValue N0 = N->getOperand(0);
3635 SDValue N1 = N->getOperand(1);
3636
3637 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3638 return SDValue();
3639
3640 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3641 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3642
3643 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3644 DCI.AddToWorklist(Mulhi.getNode());
3645 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3646}
3647
3648SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3649 DAGCombinerInfo &DCI) const {
3650 EVT VT = N->getValueType(0);
3651
3652 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3653 return SDValue();
3654
3655 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3656 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3657 // unnecessarily). isDivergent() is used as an approximation of whether the
3658 // value is in an SGPR.
3659 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3660 // valu op anyway)
3661 if (Subtarget->hasSMulHi() && !N->isDivergent())
3662 return SDValue();
3663
3664 SelectionDAG &DAG = DCI.DAG;
3665 SDLoc DL(N);
3666
3667 SDValue N0 = N->getOperand(0);
3668 SDValue N1 = N->getOperand(1);
3669
3670 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3671 return SDValue();
3672
3673 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3674 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3675
3676 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3677 DCI.AddToWorklist(Mulhi.getNode());
3678 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3679}
3680
3681SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3682 SDValue Op,
3683 const SDLoc &DL,
3684 unsigned Opc) const {
3685 EVT VT = Op.getValueType();
3686 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3687 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3688 LegalVT != MVT::i16))
3689 return SDValue();
3690
3691 if (VT != MVT::i32)
3692 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3693
3694 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3695 if (VT != MVT::i32)
3696 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3697
3698 return FFBX;
3699}
3700
3701// The native instructions return -1 on 0 input. Optimize out a select that
3702// produces -1 on 0.
3703//
3704// TODO: If zero is not undef, we could also do this if the output is compared
3705// against the bitwidth.
3706//
3707// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3708SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3709 SDValue LHS, SDValue RHS,
3710 DAGCombinerInfo &DCI) const {
3711 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3712 if (!CmpRhs || !CmpRhs->isZero())
3713 return SDValue();
3714
3715 SelectionDAG &DAG = DCI.DAG;
3716 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3717 SDValue CmpLHS = Cond.getOperand(0);
3718
3719 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3720 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3721 if (CCOpcode == ISD::SETEQ &&
3722 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3723 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
3724 unsigned Opc =
3725 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3726 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3727 }
3728
3729 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3730 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3731 if (CCOpcode == ISD::SETNE &&
3732 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3733 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
3734 unsigned Opc =
3735 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3736
3737 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3738 }
3739
3740 return SDValue();
3741}
3742
3743static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3744 unsigned Op,
3745 const SDLoc &SL,
3746 SDValue Cond,
3747 SDValue N1,
3748 SDValue N2) {
3749 SelectionDAG &DAG = DCI.DAG;
3750 EVT VT = N1.getValueType();
3751
3752 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3753 N1.getOperand(0), N2.getOperand(0));
3754 DCI.AddToWorklist(NewSelect.getNode());
3755 return DAG.getNode(Op, SL, VT, NewSelect);
3756}
3757
3758// Pull a free FP operation out of a select so it may fold into uses.
3759//
3760// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3761// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3762//
3763// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3764// select c, (fabs x), +k -> fabs (select c, x, k)
3765SDValue
3766AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3767 SDValue N) const {
3768 SelectionDAG &DAG = DCI.DAG;
3769 SDValue Cond = N.getOperand(0);
3770 SDValue LHS = N.getOperand(1);
3771 SDValue RHS = N.getOperand(2);
3772
3773 EVT VT = N.getValueType();
3774 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3775 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3776 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
3777 return SDValue();
3778
3779 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3780 SDLoc(N), Cond, LHS, RHS);
3781 }
3782
3783 bool Inv = false;
3784 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3785 std::swap(LHS, RHS);
3786 Inv = true;
3787 }
3788
3789 // TODO: Support vector constants.
3790 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3791 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
3792 !selectSupportsSourceMods(N.getNode())) {
3793 SDLoc SL(N);
3794 // If one side is an fneg/fabs and the other is a constant, we can push the
3795 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3796 SDValue NewLHS = LHS.getOperand(0);
3797 SDValue NewRHS = RHS;
3798
3799 // Careful: if the neg can be folded up, don't try to pull it back down.
3800 bool ShouldFoldNeg = true;
3801
3802 if (NewLHS.hasOneUse()) {
3803 unsigned Opc = NewLHS.getOpcode();
3804 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
3805 ShouldFoldNeg = false;
3806 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3807 ShouldFoldNeg = false;
3808 }
3809
3810 if (ShouldFoldNeg) {
3811 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
3812 return SDValue();
3813
3814 // We're going to be forced to use a source modifier anyway, there's no
3815 // point to pulling the negate out unless we can get a size reduction by
3816 // negating the constant.
3817 //
3818 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
3819 // about cheaper constants.
3820 if (NewLHS.getOpcode() == ISD::FABS &&
3821 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
3822 return SDValue();
3823
3824 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
3825 return SDValue();
3826
3827 if (LHS.getOpcode() == ISD::FNEG)
3828 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3829
3830 if (Inv)
3831 std::swap(NewLHS, NewRHS);
3832
3833 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3834 Cond, NewLHS, NewRHS);
3835 DCI.AddToWorklist(NewSelect.getNode());
3836 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3837 }
3838 }
3839
3840 return SDValue();
3841}
3842
3843SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3844 DAGCombinerInfo &DCI) const {
3845 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3846 return Folded;
3847
3848 SDValue Cond = N->getOperand(0);
3849 if (Cond.getOpcode() != ISD::SETCC)
3850 return SDValue();
3851
3852 EVT VT = N->getValueType(0);
3853 SDValue LHS = Cond.getOperand(0);
3854 SDValue RHS = Cond.getOperand(1);
3855 SDValue CC = Cond.getOperand(2);
3856
3857 SDValue True = N->getOperand(1);
3858 SDValue False = N->getOperand(2);
3859
3860 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3861 SelectionDAG &DAG = DCI.DAG;
3862 if (DAG.isConstantValueOfAnyType(True) &&
3863 !DAG.isConstantValueOfAnyType(False)) {
3864 // Swap cmp + select pair to move constant to false input.
3865 // This will allow using VOPC cndmasks more often.
3866 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3867
3868 SDLoc SL(N);
3869 ISD::CondCode NewCC =
3870 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3871
3872 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3873 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3874 }
3875
3876 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3877 SDValue MinMax
3878 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3879 // Revisit this node so we can catch min3/max3/med3 patterns.
3880 //DCI.AddToWorklist(MinMax.getNode());
3881 return MinMax;
3882 }
3883 }
3884
3885 // There's no reason to not do this if the condition has other uses.
3886 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3887}
3888
3889static bool isInv2Pi(const APFloat &APF) {
3890 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3891 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3892 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3893
3894 return APF.bitwiseIsEqual(KF16) ||
3895 APF.bitwiseIsEqual(KF32) ||
3896 APF.bitwiseIsEqual(KF64);
3897}
3898
3899 // 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
3900// additional cost to negate them.
3901TargetLowering::NegatibleCost
3902AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
3903 if (C->isZero())
3904 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
3905
3906 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3907 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
3908
3909 return NegatibleCost::Neutral;
3910}
3911
3912bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3913 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3914 return getConstantNegateCost(C) == NegatibleCost::Expensive;
3915 return false;
3916}
3917
3918bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
3919 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3920 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
3921 return false;
3922}
3923
3924static unsigned inverseMinMax(unsigned Opc) {
3925 switch (Opc) {
3926 case ISD::FMAXNUM:
3927 return ISD::FMINNUM;
3928 case ISD::FMINNUM:
3929 return ISD::FMAXNUM;
3930 case ISD::FMAXNUM_IEEE:
3931 return ISD::FMINNUM_IEEE;
3932 case ISD::FMINNUM_IEEE:
3933 return ISD::FMAXNUM_IEEE;
3934 case AMDGPUISD::FMAX_LEGACY:
3935 return AMDGPUISD::FMIN_LEGACY;
3936 case AMDGPUISD::FMIN_LEGACY:
3937 return AMDGPUISD::FMAX_LEGACY;
3938 default:
3939 llvm_unreachable("invalid min/max opcode");
3940 }
3941}
3942
3943/// \return true if it's profitable to try to push an fneg into its source
3944/// instruction.
3945bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
3946 // If the input has multiple uses and we can either fold the negate down, or
3947 // the other uses cannot, give up. This both prevents unprofitable
3948 // transformations and infinite loops: we won't repeatedly try to fold around
3949 // a negate that has no 'good' form.
3950 if (N0.hasOneUse()) {
3951 // This may be able to fold into the source, but at a code size cost. Don't
3952 // fold if the fold into the user is free.
3953 if (allUsesHaveSourceMods(N, 0))
3954 return false;
3955 } else {
3956 if (fnegFoldsIntoOp(N0.getNode()) &&
3957 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3958 return false;
3959 }
3960
3961 return true;
3962}
3963
3964SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3965 DAGCombinerInfo &DCI) const {
3966 SelectionDAG &DAG = DCI.DAG;
3967 SDValue N0 = N->getOperand(0);
3968 EVT VT = N->getValueType(0);
3969
3970 unsigned Opc = N0.getOpcode();
3971
3972 if (!shouldFoldFNegIntoSrc(N, N0))
3973 return SDValue();
3974
3975 SDLoc SL(N);
3976 switch (Opc) {
3977 case ISD::FADD: {
3978 if (!mayIgnoreSignedZero(N0))
3979 return SDValue();
3980
3981 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3982 SDValue LHS = N0.getOperand(0);
3983 SDValue RHS = N0.getOperand(1);
3984
3985 if (LHS.getOpcode() != ISD::FNEG)
3986 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3987 else
3988 LHS = LHS.getOperand(0);
3989
3990 if (RHS.getOpcode() != ISD::FNEG)
3991 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3992 else
3993 RHS = RHS.getOperand(0);
3994
3995 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3996 if (Res.getOpcode() != ISD::FADD)
3997 return SDValue(); // Op got folded away.
3998 if (!N0.hasOneUse())
3999 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4000 return Res;
4001 }
4002 case ISD::FMUL:
4003 case AMDGPUISD::FMUL_LEGACY: {
4004 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4005 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4006 SDValue LHS = N0.getOperand(0);
4007 SDValue RHS = N0.getOperand(1);
4008
4009 if (LHS.getOpcode() == ISD::FNEG)
4010 LHS = LHS.getOperand(0);
4011 else if (RHS.getOpcode() == ISD::FNEG)
4012 RHS = RHS.getOperand(0);
4013 else
4014 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4015
4016 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4017 if (Res.getOpcode() != Opc)
4018 return SDValue(); // Op got folded away.
4019 if (!N0.hasOneUse())
4020 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4021 return Res;
4022 }
4023 case ISD::FMA:
4024 case ISD::FMAD: {
4025 // TODO: handle llvm.amdgcn.fma.legacy
4026 if (!mayIgnoreSignedZero(N0))
4027 return SDValue();
4028
4029 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4030 SDValue LHS = N0.getOperand(0);
4031 SDValue MHS = N0.getOperand(1);
4032 SDValue RHS = N0.getOperand(2);
4033
4034 if (LHS.getOpcode() == ISD::FNEG)
4035 LHS = LHS.getOperand(0);
4036 else if (MHS.getOpcode() == ISD::FNEG)
4037 MHS = MHS.getOperand(0);
4038 else
4039 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4040
4041 if (RHS.getOpcode() != ISD::FNEG)
4042 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4043 else
4044 RHS = RHS.getOperand(0);
4045
4046 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4047 if (Res.getOpcode() != Opc)
4048 return SDValue(); // Op got folded away.
4049 if (!N0.hasOneUse())
4050 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4051 return Res;
4052 }
4053 case ISD::FMAXNUM:
4054 case ISD::FMINNUM:
4055 case ISD::FMAXNUM_IEEE:
4056 case ISD::FMINNUM_IEEE:
4057 case AMDGPUISD::FMAX_LEGACY:
4058 case AMDGPUISD::FMIN_LEGACY: {
4059 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4060 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4061 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4062 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4063
4064 SDValue LHS = N0.getOperand(0);
4065 SDValue RHS = N0.getOperand(1);
4066
4067 // 0 doesn't have a negated inline immediate.
4068 // TODO: This constant check should be generalized to other operations.
4069 if (isConstantCostlierToNegate(RHS))
4070 return SDValue();
4071
4072 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4073 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4074 unsigned Opposite = inverseMinMax(Opc);
4075
4076 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4077 if (Res.getOpcode() != Opposite)
4078 return SDValue(); // Op got folded away.
4079 if (!N0.hasOneUse())
4080 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4081 return Res;
4082 }
4083 case AMDGPUISD::FMED3: {
4084 SDValue Ops[3];
4085 for (unsigned I = 0; I < 3; ++I)
4086 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4087
4088 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4089 if (Res.getOpcode() != AMDGPUISD::FMED3)
4090 return SDValue(); // Op got folded away.
4091
4092 if (!N0.hasOneUse()) {
4093 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4094 DAG.ReplaceAllUsesWith(N0, Neg);
4095
4096 for (SDNode *U : Neg->uses())
4097 DCI.AddToWorklist(U);
4098 }
4099
4100 return Res;
4101 }
4102 case ISD::FP_EXTEND:
4103 case ISD::FTRUNC:
4104 case ISD::FRINT:
4105 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4106 case ISD::FSIN:
4107 case ISD::FCANONICALIZE:
4108 case AMDGPUISD::RCP:
4109 case AMDGPUISD::RCP_LEGACY:
4110 case AMDGPUISD::RCP_IFLAG:
4111 case AMDGPUISD::SIN_HW: {
4112 SDValue CvtSrc = N0.getOperand(0);
4113 if (CvtSrc.getOpcode() == ISD::FNEG) {
4114 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4115 // (fneg (rcp (fneg x))) -> (rcp x)
4116 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4117 }
4118
4119 if (!N0.hasOneUse())
4120 return SDValue();
4121
4122 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4123 // (fneg (rcp x)) -> (rcp (fneg x))
4124 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4125 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4126 }
4127 case ISD::FP_ROUND: {
4128 SDValue CvtSrc = N0.getOperand(0);
4129
4130 if (CvtSrc.getOpcode() == ISD::FNEG) {
4131 // (fneg (fp_round (fneg x))) -> (fp_round x)
4132 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4133 CvtSrc.getOperand(0), N0.getOperand(1));
4134 }
4135
4136 if (!N0.hasOneUse())
4137 return SDValue();
4138
4139 // (fneg (fp_round x)) -> (fp_round (fneg x))
4140 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4141 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4142 }
4143 case ISD::FP16_TO_FP: {
4144 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4145 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4146 // Put the fneg back as a legal source operation that can be matched later.
4147 SDLoc SL(N);
4148
4149 SDValue Src = N0.getOperand(0);
4150 EVT SrcVT = Src.getValueType();
4151
4152 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4153 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4154 DAG.getConstant(0x8000, SL, SrcVT));
4155 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4156 }
4157 case ISD::SELECT: {
4158 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4159 // TODO: Invert conditions of foldFreeOpFromSelect
4160 return SDValue();
4161 }
4162 case ISD::BITCAST: {
4163 SDLoc SL(N);
4164 SDValue BCSrc = N0.getOperand(0);
4165 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4166 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4167 if (HighBits.getValueType().getSizeInBits() != 32 ||
4168 !fnegFoldsIntoOp(HighBits.getNode()))
4169 return SDValue();
4170
4171 // f64 fneg only really needs to operate on the high half of the
4172 // register, so try to force it to an f32 operation to help make use of
4173 // source modifiers.
4174 //
4175 //
4176 // fneg (f64 (bitcast (build_vector x, y))) ->
4177 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4178 // (fneg (bitcast i32:y to f32)))
4179
4180 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4181 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4182 SDValue CastBack =
4183 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4184
4185 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4186 Ops.back() = CastBack;
4187 DCI.AddToWorklist(NegHi.getNode());
4188 SDValue Build =
4189 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4190 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4191
4192 if (!N0.hasOneUse())
4193 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4194 return Result;
4195 }
4196
4197 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4198 BCSrc.hasOneUse()) {
4199 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4200 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4201
4202 // TODO: Cast back result for multiple uses is beneficial in some cases.
4203
4204 SDValue LHS =
4205 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4206 SDValue RHS =
4207 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4208
4209 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4210 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4211
4212 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4213 NegRHS);
4214 }
4215
4216 return SDValue();
4217 }
4218 default:
4219 return SDValue();
4220 }
4221}
4222
4223SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4224 DAGCombinerInfo &DCI) const {
4225 SelectionDAG &DAG = DCI.DAG;
4226 SDValue N0 = N->getOperand(0);
4227
4228 if (!N0.hasOneUse())
4229 return SDValue();
4230
4231 switch (N0.getOpcode()) {
4232 case ISD::FP16_TO_FP: {
4233 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4234 SDLoc SL(N);
4235 SDValue Src = N0.getOperand(0);
4236 EVT SrcVT = Src.getValueType();
4237
4238 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4239 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4240 DAG.getConstant(0x7fff, SL, SrcVT));
4241 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4242 }
4243 default:
4244 return SDValue();
4245 }
4246}
4247
4248SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4249 DAGCombinerInfo &DCI) const {
4250 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4251 if (!CFP)
4252 return SDValue();
4253
4254 // XXX - Should this flush denormals?
4255 const APFloat &Val = CFP->getValueAPF();
4256 APFloat One(Val.getSemantics(), "1.0");
4257 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4258}
4259
4260SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4261 DAGCombinerInfo &DCI) const {
4262 SelectionDAG &DAG = DCI.DAG;
4263 SDLoc DL(N);
4264
4265 switch(N->getOpcode()) {
4266 default:
4267 break;
4268 case ISD::BITCAST: {
4269 EVT DestVT = N->getValueType(0);
4270
4271 // Push casts through vector builds. This helps avoid emitting a large
4272 // number of copies when materializing floating point vector constants.
4273 //
4274 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4275 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4276 if (DestVT.isVector()) {
4277 SDValue Src = N->getOperand(0);
4278 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4279 EVT SrcVT = Src.getValueType();
4280 unsigned NElts = DestVT.getVectorNumElements();
4281
4282 if (SrcVT.getVectorNumElements() == NElts) {
4283 EVT DestEltVT = DestVT.getVectorElementType();
4284
4285 SmallVector<SDValue, 8> CastedElts;
4286 SDLoc SL(N);
4287 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4288 SDValue Elt = Src.getOperand(I);
4289 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4290 }
4291
4292 return DAG.getBuildVector(DestVT, SL, CastedElts);
4293 }
4294 }
4295 }
4296
4297 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4298 break;
4299
4300 // Fold bitcasts of constants.
4301 //
4302 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4303 // TODO: Generalize and move to DAGCombiner
4304 SDValue Src = N->getOperand(0);
4305 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4306 SDLoc SL(N);
4307 uint64_t CVal = C->getZExtValue();
4308 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4309 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4310 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4311 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4312 }
4313
4314 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4315 const APInt &Val = C->getValueAPF().bitcastToAPInt();
4316 SDLoc SL(N);
4317 uint64_t CVal = Val.getZExtValue();
4318 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4319 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4320 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4321
4322 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4323 }
4324
4325 break;
4326 }
4327 case ISD::SHL: {
4328 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4329 break;
4330
4331 return performShlCombine(N, DCI);
4332 }
4333 case ISD::SRL: {
4334 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4335 break;
4336
4337 return performSrlCombine(N, DCI);
4338 }
4339 case ISD::SRA: {
4340 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4341 break;
4342
4343 return performSraCombine(N, DCI);
4344 }
4345 case ISD::TRUNCATE:
4346 return performTruncateCombine(N, DCI);
4347 case ISD::MUL:
4348 return performMulCombine(N, DCI);
4349 case ISD::SMUL_LOHI:
4350 case ISD::UMUL_LOHI:
4351 return performMulLoHiCombine(N, DCI);
4352 case ISD::MULHS:
4353 return performMulhsCombine(N, DCI);
4354 case ISD::MULHU:
4355 return performMulhuCombine(N, DCI);
4356 case AMDGPUISD::MUL_I24:
4357 case AMDGPUISD::MUL_U24:
4358 case AMDGPUISD::MULHI_I24:
4359 case AMDGPUISD::MULHI_U24:
4360 return simplifyMul24(N, DCI);
4361 case ISD::SELECT:
4362 return performSelectCombine(N, DCI);
4363 case ISD::FNEG:
4364 return performFNegCombine(N, DCI);
4365 case ISD::FABS:
4366 return performFAbsCombine(N, DCI);
4367 case AMDGPUISD::BFE_I32:
4368 case AMDGPUISD::BFE_U32: {
4369 assert(!N->getValueType(0).isVector() &&
4370 "Vector handling of BFE not implemented");
4371 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4372 if (!Width)
4373 break;
4374
4375 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4376 if (WidthVal == 0)
4377 return DAG.getConstant(0, DL, MVT::i32);
4378
4379 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4380 if (!Offset)
4381 break;
4382
4383 SDValue BitsFrom = N->getOperand(0);
4384 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4385
4386 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4387
4388 if (OffsetVal == 0) {
4389 // This is already sign / zero extended, so try to fold away extra BFEs.
4390 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4391
4392 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4393 if (OpSignBits >= SignBits)
4394 return BitsFrom;
4395
4396 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4397 if (Signed) {
4398 // This is a sign_extend_inreg. Replace it to take advantage of existing
4399 // DAG Combines. If not eliminated, we will match back to BFE during
4400 // selection.
4401
4402 // TODO: The sext_inreg of extended types ends, although we could
4403 // handle them in a single BFE.
4404 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4405 DAG.getValueType(SmallVT));
4406 }
4407
4408 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4409 }
4410
4411 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4412 if (Signed) {
4413 return constantFoldBFE<int32_t>(DAG,
4414 CVal->getSExtValue(),
4415 OffsetVal,
4416 WidthVal,
4417 DL);
4418 }
4419
4420 return constantFoldBFE<uint32_t>(DAG,
4421 CVal->getZExtValue(),
4422 OffsetVal,
4423 WidthVal,
4424 DL);
4425 }
4426
4427 if ((OffsetVal + WidthVal) >= 32 &&
4428 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4429 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4430 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4431 BitsFrom, ShiftVal);
4432 }
4433
4434 if (BitsFrom.hasOneUse()) {
4435 APInt Demanded = APInt::getBitsSet(32,
4436 OffsetVal,
4437 OffsetVal + WidthVal);
4438
4439 KnownBits Known;
4440 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4441 !DCI.isBeforeLegalizeOps());
4442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4443 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4444 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4445 DCI.CommitTargetLoweringOpt(TLO);
4446 }
4447 }
4448
4449 break;
4450 }
4451 case ISD::LOAD:
4452 return performLoadCombine(N, DCI);
4453 case ISD::STORE:
4454 return performStoreCombine(N, DCI);
4455 case AMDGPUISD::RCP:
4456 case AMDGPUISD::RCP_IFLAG:
4457 return performRcpCombine(N, DCI);
4458 case ISD::AssertZext:
4459 case ISD::AssertSext:
4460 return performAssertSZExtCombine(N, DCI);
4461 case ISD::INTRINSIC_WO_CHAIN:
4462 return performIntrinsicWOChainCombine(N, DCI);
4463 }
4464 return SDValue();
4465}
4466
4467//===----------------------------------------------------------------------===//
4468// Helper functions
4469//===----------------------------------------------------------------------===//
4470
4471SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4472 const TargetRegisterClass *RC,
4473 Register Reg, EVT VT,
4474 const SDLoc &SL,
4475 bool RawReg) const {
4476 MachineFunction &MF = DAG.getMachineFunction();
4477 MachineRegisterInfo &MRI = MF.getRegInfo();
4478 Register VReg;
4479
4480 if (!MRI.isLiveIn(Reg)) {
4481 VReg = MRI.createVirtualRegister(RC);
4482 MRI.addLiveIn(Reg, VReg);
4483 } else {
4484 VReg = MRI.getLiveInVirtReg(Reg);
4485 }
4486
4487 if (RawReg)
4488 return DAG.getRegister(VReg, VT);
4489
4490 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4491}
4492
4493// This may be called multiple times, and nothing prevents creating multiple
4494// objects at the same offset. See if we already defined this object.
4495static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4496 int64_t Offset) {
4497 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4498 if (MFI.getObjectOffset(I) == Offset) {
4499 assert(MFI.getObjectSize(I) == Size);
4500 return I;
4501 }
4502 }
4503
4504 return MFI.CreateFixedObject(Size, Offset, true);
4505}
4506
4507SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4508 EVT VT,
4509 const SDLoc &SL,
4510 int64_t Offset) const {
4511 MachineFunction &MF = DAG.getMachineFunction();
4512 MachineFrameInfo &MFI = MF.getFrameInfo();
4513 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4514
4515 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4516 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4517
4518 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4519 MachineMemOperand::MODereferenceable |
4520 MachineMemOperand::MOInvariant);
4521}
4522
4523SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4524 const SDLoc &SL,
4525 SDValue Chain,
4526 SDValue ArgVal,
4527 int64_t Offset) const {
4528 MachineFunction &MF = DAG.getMachineFunction();
4529 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4530 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4531
4532 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4533 // Stores to the argument stack area are relative to the stack pointer.
4534 SDValue SP =
4535 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4536 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4537 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4538 MachineMemOperand::MODereferenceable);
4539 return Store;
4540}
4541
4542SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4543 const TargetRegisterClass *RC,
4544 EVT VT, const SDLoc &SL,
4545 const ArgDescriptor &Arg) const {
4546 assert(Arg && "Attempting to load missing argument");
1. Assuming the condition is true
2. '?' condition is true
4547
4548 SDValue V = Arg.isRegister() ?
3. '?' condition is true
4549 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4550 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4551
4552 if (!Arg.isMasked())
4. Taking false branch
4553 return V;
4554
4555 unsigned Mask = Arg.getMask();
4556 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5. Calling 'countr_zero<unsigned int>'
12. Returning from 'countr_zero<unsigned int>'
13. 'Shift' initialized to 32
4557 V = DAG.getNode(ISD::SRL, SL, VT, V,
4558 DAG.getShiftAmountConstant(Shift, VT, SL));
4559 return DAG.getNode(ISD::AND, SL, VT, V,
4560 DAG.getConstant(Mask >> Shift, SL, VT));
14. The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4561}
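
The path above ends at source line 4560: once the analyzer assumes the argument mask is zero, countr_zero returns 32 and the expression 'Mask >> Shift' shifts an unsigned int by its full width, which is undefined. Whether a zero mask can actually reach loadInputValue is not visible from this report. The following is a hypothetical, standalone sketch of the scalar arithmetic only (the helper name is illustrative and is not part of the LLVM sources listed here); it shows one way to keep the shifts well defined by rejecting a zero mask up front:

  // Hypothetical sketch (not the upstream fix): guard against a zero mask so
  // that countr_zero cannot return the full bit width.
  #include <cassert>
  #include <cstdint>
  #include "llvm/ADT/bit.h"

  static uint32_t unpackMaskedArg(uint32_t Bits, uint32_t Mask) {
    assert(Mask != 0 && "a masked argument must cover at least one bit");
    unsigned Shift = llvm::countr_zero(Mask); // guaranteed < 32 when Mask != 0
    return (Bits >> Shift) & (Mask >> Shift); // both shifts are now well defined
  }
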
4562
4563uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4564 const MachineFunction &MF, const ImplicitParameter Param) const {
4565 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4566 const AMDGPUSubtarget &ST =
4567 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4568 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4569 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4570 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4571 ExplicitArgOffset;
4572 switch (Param) {
4573 case FIRST_IMPLICIT:
4574 return ArgOffset;
4575 case PRIVATE_BASE:
4576 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
4577 case SHARED_BASE:
4578 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
4579 case QUEUE_PTR:
4580 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
4581 }
4582 llvm_unreachable("unexpected implicit parameter type");
4583}
4584
4585 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4586
4587const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4588 switch ((AMDGPUISD::NodeType)Opcode) {
4589 case AMDGPUISD::FIRST_NUMBER: break;
4590 // AMDIL DAG nodes
4591 NODE_NAME_CASE(UMUL);
4592 NODE_NAME_CASE(BRANCH_COND);
4593
4594 // AMDGPU DAG nodes
4595 NODE_NAME_CASE(IF)
4596 NODE_NAME_CASE(ELSE)
4597 NODE_NAME_CASE(LOOP)
4598 NODE_NAME_CASE(CALL)
4599 NODE_NAME_CASE(TC_RETURN)
4600 NODE_NAME_CASE(TC_RETURN_GFX)
4601 NODE_NAME_CASE(TRAP)
4602 NODE_NAME_CASE(RET_GLUE)
4603 NODE_NAME_CASE(RETURN_TO_EPILOG)
4604 NODE_NAME_CASE(ENDPGM)
4605 NODE_NAME_CASE(DWORDADDR)
4606 NODE_NAME_CASE(FRACT)
4607 NODE_NAME_CASE(SETCC)
4608 NODE_NAME_CASE(SETREG)
4609 NODE_NAME_CASE(DENORM_MODE)
4610 NODE_NAME_CASE(FMA_W_CHAIN)
4611 NODE_NAME_CASE(FMUL_W_CHAIN)
4612 NODE_NAME_CASE(CLAMP)
4613 NODE_NAME_CASE(COS_HW)
4614 NODE_NAME_CASE(SIN_HW)
4615 NODE_NAME_CASE(FMAX_LEGACY)
4616 NODE_NAME_CASE(FMIN_LEGACY)
4617 NODE_NAME_CASE(FMAX3)
4618 NODE_NAME_CASE(SMAX3)
4619 NODE_NAME_CASE(UMAX3)
4620 NODE_NAME_CASE(FMIN3)
4621 NODE_NAME_CASE(SMIN3)
4622 NODE_NAME_CASE(UMIN3)
4623 NODE_NAME_CASE(FMED3)
4624 NODE_NAME_CASE(SMED3)
4625 NODE_NAME_CASE(UMED3)
4626 NODE_NAME_CASE(FDOT2)
4627 NODE_NAME_CASE(URECIP)
4628 NODE_NAME_CASE(DIV_SCALE)
4629 NODE_NAME_CASE(DIV_FMAS)
4630 NODE_NAME_CASE(DIV_FIXUP)
4631 NODE_NAME_CASE(FMAD_FTZ)
4632 NODE_NAME_CASE(RCP)
4633 NODE_NAME_CASE(RSQ)
4634 NODE_NAME_CASE(RCP_LEGACY)
4635 NODE_NAME_CASE(RCP_IFLAG)
4636 NODE_NAME_CASE(FMUL_LEGACY)
4637 NODE_NAME_CASE(RSQ_CLAMP)
4638 NODE_NAME_CASE(LDEXP)
4639 NODE_NAME_CASE(FP_CLASS)
4640 NODE_NAME_CASE(DOT4)
4641 NODE_NAME_CASE(CARRY)
4642 NODE_NAME_CASE(BORROW)
4643 NODE_NAME_CASE(BFE_U32)
4644 NODE_NAME_CASE(BFE_I32)
4645 NODE_NAME_CASE(BFI)
4646 NODE_NAME_CASE(BFM)
4647 NODE_NAME_CASE(FFBH_U32)
4648 NODE_NAME_CASE(FFBH_I32)
4649 NODE_NAME_CASE(FFBL_B32)
4650 NODE_NAME_CASE(MUL_U24)
4651 NODE_NAME_CASE(MUL_I24)
4652 NODE_NAME_CASE(MULHI_U24)
4653 NODE_NAME_CASE(MULHI_I24)
4654 NODE_NAME_CASE(MAD_U24)
4655 NODE_NAME_CASE(MAD_I24)
4656 NODE_NAME_CASE(MAD_I64_I32)
4657 NODE_NAME_CASE(MAD_U64_U32)
4658 NODE_NAME_CASE(PERM)
4659 NODE_NAME_CASE(TEXTURE_FETCH)
4660 NODE_NAME_CASE(R600_EXPORT)
4661 NODE_NAME_CASE(CONST_ADDRESS)
4662 NODE_NAME_CASE(REGISTER_LOAD)
4663 NODE_NAME_CASE(REGISTER_STORE)
4664 NODE_NAME_CASE(SAMPLE)
4665 NODE_NAME_CASE(SAMPLEB)
4666 NODE_NAME_CASE(SAMPLED)
4667 NODE_NAME_CASE(SAMPLEL)
4668 NODE_NAME_CASE(CVT_F32_UBYTE0)
4669 NODE_NAME_CASE(CVT_F32_UBYTE1)
4670 NODE_NAME_CASE(CVT_F32_UBYTE2)
4671 NODE_NAME_CASE(CVT_F32_UBYTE3)
4672 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4673 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4674 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4675 NODE_NAME_CASE(CVT_PK_I16_I32)
4676 NODE_NAME_CASE(CVT_PK_U16_U32)
4677 NODE_NAME_CASE(FP_TO_FP16)
4678 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4679 NODE_NAME_CASE(CONST_DATA_PTR)
4680 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4681 NODE_NAME_CASE(LDS)
4682 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
4683 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
4684 NODE_NAME_CASE(DUMMY_CHAIN)
4685 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4686 NODE_NAME_CASE(LOAD_D16_HI)
4687 NODE_NAME_CASE(LOAD_D16_LO)
4688 NODE_NAME_CASE(LOAD_D16_HI_I8)
4689 NODE_NAME_CASE(LOAD_D16_HI_U8)
4690 NODE_NAME_CASE(LOAD_D16_LO_I8)
4691 NODE_NAME_CASE(LOAD_D16_LO_U8)
4692 NODE_NAME_CASE(STORE_MSKOR)
4693 NODE_NAME_CASE(LOAD_CONSTANT)
4694 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4695 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4696 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4697 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4698 NODE_NAME_CASE(DS_ORDERED_COUNT)
4699 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4700 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4701 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4702 NODE_NAME_CASE(BUFFER_LOAD)
4703 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4704 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4705 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4706 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4707 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4708 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
4709 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4710 NODE_NAME_CASE(SBUFFER_LOAD)
4711 NODE_NAME_CASE(BUFFER_STORE)
4712 NODE_NAME_CASE(BUFFER_STORE_BYTE)
4713 NODE_NAME_CASE(BUFFER_STORE_SHORT)
4714 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4715 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4716 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4717 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4718 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4719 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4720 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4721 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4722 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4723 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4724 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4725 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4726 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4727 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4728 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4729 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4730 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4731 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4732 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4733
4734 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4735 }
4736 return nullptr;
4737}
4738
4739SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4740 SelectionDAG &DAG, int Enabled,
4741 int &RefinementSteps,
4742 bool &UseOneConstNR,
4743 bool Reciprocal) const {
4744 EVT VT = Operand.getValueType();
4745
4746 if (VT == MVT::f32) {
4747 RefinementSteps = 0;
4748 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4749 }
4750
4751 // TODO: There is also an f64 rsq instruction, but the documentation is less
4752 // clear on its precision.
4753
4754 return SDValue();
4755}
4756
4757SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4758 SelectionDAG &DAG, int Enabled,
4759 int &RefinementSteps) const {
4760 EVT VT = Operand.getValueType();
4761
4762 if (VT == MVT::f32) {
4763 // Reciprocal, < 1 ulp error.
4764 //
4765 // This reciprocal approximation converges to < 0.5 ulp error with one
4766 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4767
4768 RefinementSteps = 0;
4769 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4770 }
4771
4772 // TODO: There is also an f64 rcp instruction, but the documentation is less
4773 // clear on its precision.
4774
4775 return SDValue();
4776}
4777
4778static unsigned workitemIntrinsicDim(unsigned ID) {
4779 switch (ID) {
4780 case Intrinsic::amdgcn_workitem_id_x:
4781 return 0;
4782 case Intrinsic::amdgcn_workitem_id_y:
4783 return 1;
4784 case Intrinsic::amdgcn_workitem_id_z:
4785 return 2;
4786 default:
4787 llvm_unreachable("not a workitem intrinsic");
4788 }
4789}
4790
4791void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4792 const SDValue Op, KnownBits &Known,
4793 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4794
4795 Known.resetAll(); // Don't know anything.
4796
4797 unsigned Opc = Op.getOpcode();
4798
4799 switch (Opc) {
4800 default:
4801 break;
4802 case AMDGPUISD::CARRY:
4803 case AMDGPUISD::BORROW: {
4804 Known.Zero = APInt::getHighBitsSet(32, 31);
4805 break;
4806 }
4807
4808 case AMDGPUISD::BFE_I32:
4809 case AMDGPUISD::BFE_U32: {
4810 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4811 if (!CWidth)
4812 return;
4813
4814 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4815
4816 if (Opc == AMDGPUISD::BFE_U32)
4817 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4818
4819 break;
4820 }
4821 case AMDGPUISD::FP_TO_FP16: {
4822 unsigned BitWidth = Known.getBitWidth();
4823
4824 // High bits are zero.
4825 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4826 break;
4827 }
4828 case AMDGPUISD::MUL_U24:
4829 case AMDGPUISD::MUL_I24: {
4830 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4831 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4832 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4833 RHSKnown.countMinTrailingZeros();
4834 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4835 // Skip extra check if all bits are known zeros.
4836 if (TrailZ >= 32)
4837 break;
4838
4839 // Truncate to 24 bits.
4840 LHSKnown = LHSKnown.trunc(24);
4841 RHSKnown = RHSKnown.trunc(24);
4842
4843 if (Opc == AMDGPUISD::MUL_I24) {
4844 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4845 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4846 unsigned MaxValBits = LHSValBits + RHSValBits;
4847 if (MaxValBits > 32)
4848 break;
4849 unsigned SignBits = 32 - MaxValBits + 1;
4850 bool LHSNegative = LHSKnown.isNegative();
4851 bool LHSNonNegative = LHSKnown.isNonNegative();
4852 bool LHSPositive = LHSKnown.isStrictlyPositive();
4853 bool RHSNegative = RHSKnown.isNegative();
4854 bool RHSNonNegative = RHSKnown.isNonNegative();
4855 bool RHSPositive = RHSKnown.isStrictlyPositive();
4856
4857 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4858 Known.Zero.setHighBits(SignBits);
4859 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4860 Known.One.setHighBits(SignBits);
4861 } else {
4862 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4863 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4864 unsigned MaxValBits = LHSValBits + RHSValBits;
4865 if (MaxValBits >= 32)
4866 break;
4867 Known.Zero.setBitsFrom(MaxValBits);
4868 }
4869 break;
4870 }
4871 case AMDGPUISD::PERM: {
4872 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4873 if (!CMask)
4874 return;
4875
4876 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4877 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4878 unsigned Sel = CMask->getZExtValue();
4879
4880 for (unsigned I = 0; I < 32; I += 8) {
4881 unsigned SelBits = Sel & 0xff;
4882 if (SelBits < 4) {
4883 SelBits *= 8;
4884 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4885 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4886 } else if (SelBits < 7) {
4887 SelBits = (SelBits & 3) * 8;
4888 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4889 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4890 } else if (SelBits == 0x0c) {
4891 Known.Zero |= 0xFFull << I;
4892 } else if (SelBits > 0x0c) {
4893 Known.One |= 0xFFull << I;
4894 }
4895 Sel >>= 8;
4896 }
4897 break;
4898 }
4899 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4900 Known.Zero.setHighBits(24);
4901 break;
4902 }
4903 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4904 Known.Zero.setHighBits(16);
4905 break;
4906 }
4907 case AMDGPUISD::LDS: {
4908 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4909 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4910
4911 Known.Zero.setHighBits(16);
4912 Known.Zero.setLowBits(Log2(Alignment));
4913 break;
4914 }
4915 case ISD::INTRINSIC_WO_CHAIN: {
4916 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4917 switch (IID) {
4918 case Intrinsic::amdgcn_workitem_id_x:
4919 case Intrinsic::amdgcn_workitem_id_y:
4920 case Intrinsic::amdgcn_workitem_id_z: {
4921 unsigned MaxValue = Subtarget->getMaxWorkitemID(
4922 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
4923 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
4924 break;
4925 }
4926 default:
4927 break;
4928 }
4929 }
4930 }
4931}
4932
4933unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4934 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4935 unsigned Depth) const {
4936 switch (Op.getOpcode()) {
4937 case AMDGPUISD::BFE_I32: {
4938 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4939 if (!Width)
4940 return 1;
4941
4942 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4943 if (!isNullConstant(Op.getOperand(1)))
4944 return SignBits;
4945
4946 // TODO: Could probably figure something out with non-0 offsets.
4947 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4948 return std::max(SignBits, Op0SignBits);
4949 }
4950
4951 case AMDGPUISD::BFE_U32: {
4952 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4953 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4954 }
4955
4956 case AMDGPUISD::CARRY:
4957 case AMDGPUISD::BORROW:
4958 return 31;
4959 case AMDGPUISD::BUFFER_LOAD_BYTE:
4960 return 25;
4961 case AMDGPUISD::BUFFER_LOAD_SHORT:
4962 return 17;
4963 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4964 return 24;
4965 case AMDGPUISD::BUFFER_LOAD_USHORT:
4966 return 16;
4967 case AMDGPUISD::FP_TO_FP16:
4968 return 16;
4969 default:
4970 return 1;
4971 }
4972}
4973
4974unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4975 GISelKnownBits &Analysis, Register R,
4976 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4977 unsigned Depth) const {
4978 const MachineInstr *MI = MRI.getVRegDef(R);
4979 if (!MI)
4980 return 1;
4981
4982 // TODO: Check range metadata on MMO.
4983 switch (MI->getOpcode()) {
4984 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4985 return 25;
4986 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4987 return 17;
4988 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4989 return 24;
4990 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4991 return 16;
4992 default:
4993 return 1;
4994 }
4995}
4996
4997bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4998 const SelectionDAG &DAG,
4999 bool SNaN,
5000 unsigned Depth) const {
5001 unsigned Opcode = Op.getOpcode();
5002 switch (Opcode) {
5003 case AMDGPUISD::FMIN_LEGACY:
5004 case AMDGPUISD::FMAX_LEGACY: {
5005 if (SNaN)
5006 return true;
5007
5008 // TODO: Can check no nans on one of the operands for each one, but which
5009 // one?
5010 return false;
5011 }
5012 case AMDGPUISD::FMUL_LEGACY:
5013 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5014 if (SNaN)
5015 return true;
5016 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5017 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5018 }
5019 case AMDGPUISD::FMED3:
5020 case AMDGPUISD::FMIN3:
5021 case AMDGPUISD::FMAX3:
5022 case AMDGPUISD::FMAD_FTZ: {
5023 if (SNaN)
5024 return true;
5025 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5026 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5027 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5028 }
5029 case AMDGPUISD::CVT_F32_UBYTE0:
5030 case AMDGPUISD::CVT_F32_UBYTE1:
5031 case AMDGPUISD::CVT_F32_UBYTE2:
5032 case AMDGPUISD::CVT_F32_UBYTE3:
5033 return true;
5034
5035 case AMDGPUISD::RCP:
5036 case AMDGPUISD::RSQ:
5037 case AMDGPUISD::RCP_LEGACY:
5038 case AMDGPUISD::RSQ_CLAMP: {
5039 if (SNaN)
5040 return true;
5041
5042 // TODO: Need is known positive check.
5043 return false;
5044 }
5045 case AMDGPUISD::LDEXP:
5046 case AMDGPUISD::FRACT: {
5047 if (SNaN)
5048 return true;
5049 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5050 }
5051 case AMDGPUISD::DIV_SCALE:
5052 case AMDGPUISD::DIV_FMAS:
5053 case AMDGPUISD::DIV_FIXUP:
5054 // TODO: Refine on operands.
5055 return SNaN;
5056 case AMDGPUISD::SIN_HW:
5057 case AMDGPUISD::COS_HW: {
5058 // TODO: Need check for infinity
5059 return SNaN;
5060 }
5061 case ISD::INTRINSIC_WO_CHAIN: {
5062 unsigned IntrinsicID
5063 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5064 // TODO: Handle more intrinsics
5065 switch (IntrinsicID) {
5066 case Intrinsic::amdgcn_cubeid:
5067 return true;
5068
5069 case Intrinsic::amdgcn_frexp_mant: {
5070 if (SNaN)
5071 return true;
5072 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5073 }
5074 case Intrinsic::amdgcn_cvt_pkrtz: {
5075 if (SNaN)
5076 return true;
5077 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5078 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5079 }
5080 case Intrinsic::amdgcn_rcp:
5081 case Intrinsic::amdgcn_rsq:
5082 case Intrinsic::amdgcn_rcp_legacy:
5083 case Intrinsic::amdgcn_rsq_legacy:
5084 case Intrinsic::amdgcn_rsq_clamp: {
5085 if (SNaN)
5086 return true;
5087
5088 // TODO: Need is known positive check.
5089 return false;
5090 }
5091 case Intrinsic::amdgcn_trig_preop:
5092 case Intrinsic::amdgcn_fdot2:
5093 // TODO: Refine on operand
5094 return SNaN;
5095 case Intrinsic::amdgcn_fma_legacy:
5096 if (SNaN)
5097 return true;
5098 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5099 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5100 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5101 default:
5102 return false;
5103 }
5104 }
5105 default:
5106 return false;
5107 }
5108}
5109
5110TargetLowering::AtomicExpansionKind
5111AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
5112 switch (RMW->getOperation()) {
5113 case AtomicRMWInst::Nand:
5114 case AtomicRMWInst::FAdd:
5115 case AtomicRMWInst::FSub:
5116 case AtomicRMWInst::FMax:
5117 case AtomicRMWInst::FMin:
5118 return AtomicExpansionKind::CmpXChg;
5119 default: {
5120 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
5121 unsigned Size = IntTy->getBitWidth();
5122 if (Size == 32 || Size == 64)
5123 return AtomicExpansionKind::None;
5124 }
5125
5126 return AtomicExpansionKind::CmpXChg;
5127 }
5128 }
5129}
5130
5131bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
5132 unsigned Opc, LLT Ty1, LLT Ty2) const {
5133 return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
5134 Ty2 == LLT::scalar(32);
5135}

/build/source/llvm/include/llvm/ADT/bit.h

1//===-- llvm/ADT/bit.h - C++20 <bit> ----------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements the C++20 <bit> header.
11///
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_ADT_BIT_H
15#define LLVM_ADT_BIT_H
16
17#include "llvm/Support/Compiler.h"
18#include <cstdint>
19#include <limits>
20#include <type_traits>
21
22#if !__has_builtin(__builtin_bit_cast)
23#include <cstring>
24#endif
25
26#if defined(_MSC_VER) && !defined(_DEBUG)
27#include <cstdlib> // for _byteswap_{ushort,ulong,uint64}
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and bit.h is popular via MathExtras.h.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44// This implementation of bit_cast is different from the C++20 one in two ways:
45// - It isn't constexpr because that requires compiler support.
46// - It requires trivially-constructible To, to avoid UB in the implementation.
47template <
48 typename To, typename From,
49 typename = std::enable_if_t<sizeof(To) == sizeof(From)>,
50 typename = std::enable_if_t<std::is_trivially_constructible<To>::value>,
51 typename = std::enable_if_t<std::is_trivially_copyable<To>::value>,
52 typename = std::enable_if_t<std::is_trivially_copyable<From>::value>>
53[[nodiscard]] inline To bit_cast(const From &from) noexcept {
54#if __has_builtin(__builtin_bit_cast)
55 return __builtin_bit_cast(To, from);
56#else
57 To to;
58 std::memcpy(&to, &from, sizeof(To));
59 return to;
60#endif
61}
62
63/// Reverses the bytes in the given integer value V.
64template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
65[[nodiscard]] constexpr T byteswap(T V) noexcept {
66 if constexpr (sizeof(T) == 1) {
67 return V;
68 } else if constexpr (sizeof(T) == 2) {
69 uint16_t UV = V;
70#if defined(_MSC_VER) && !defined(_DEBUG)
71 // The DLL version of the runtime lacks these functions (bug!?), but in a
72 // release build they're replaced with BSWAP instructions anyway.
73 return _byteswap_ushort(UV);
74#else
75 uint16_t Hi = UV << 8;
76 uint16_t Lo = UV >> 8;
77 return Hi | Lo;
78#endif
79 } else if constexpr (sizeof(T) == 4) {
80 uint32_t UV = V;
81#if __has_builtin(__builtin_bswap32)
82 return __builtin_bswap32(UV);
83#elif defined(_MSC_VER) && !defined(_DEBUG)
84 return _byteswap_ulong(UV);
85#else
86 uint32_t Byte0 = UV & 0x000000FF;
87 uint32_t Byte1 = UV & 0x0000FF00;
88 uint32_t Byte2 = UV & 0x00FF0000;
89 uint32_t Byte3 = UV & 0xFF000000;
90 return (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24);
91#endif
92 } else if constexpr (sizeof(T) == 8) {
93 uint64_t UV = V;
94#if __has_builtin(__builtin_bswap64)
95 return __builtin_bswap64(UV);
96#elif defined(_MSC_VER) && !defined(_DEBUG)
97 return _byteswap_uint64(UV);
98#else
99 uint64_t Hi = llvm::byteswap<uint32_t>(UV);
100 uint32_t Lo = llvm::byteswap<uint32_t>(UV >> 32);
101 return (Hi << 32) | Lo;
102#endif
103 } else {
104 static_assert(!sizeof(T *), "Don't know how to handle the given type.");
105 return 0;
106 }
107}
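
As a quick illustration of byteswap's behavior on the __builtin_bswap and generic fallback paths shown above (these checks are illustrative and are not part of bit.h; they assume a compiler such as the clang invocation in this report, where the builtins are usable in constant expressions):

  #include <cstdint>
  #include "llvm/ADT/bit.h"

  // Each check swaps the byte order end-to-end.
  static_assert(llvm::byteswap<uint16_t>(0x1234) == 0x3412, "16-bit swap");
  static_assert(llvm::byteswap<uint32_t>(0x12345678u) == 0x78563412u, "32-bit swap");
  static_assert(llvm::byteswap<uint64_t>(0x0102030405060708ull) ==
                0x0807060504030201ull, "64-bit swap");
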
108
109template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
110[[nodiscard]] constexpr inline bool has_single_bit(T Value) noexcept {
111 return (Value != 0) && ((Value & (Value - 1)) == 0);
112}
113
114namespace detail {
115template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
116 static unsigned count(T Val) {
117 if (!Val)
118 return std::numeric_limits<T>::digits;
119 if (Val & 0x1)
120 return 0;
121
122 // Bisection method.
123 unsigned ZeroBits = 0;
124 T Shift = std::numeric_limits<T>::digits >> 1;
125 T Mask = std::numeric_limits<T>::max() >> Shift;
126 while (Shift) {
127 if ((Val & Mask) == 0) {
128 Val >>= Shift;
129 ZeroBits |= Shift;
130 }
131 Shift >>= 1;
132 Mask >>= Shift;
133 }
134 return ZeroBits;
135 }
136};
137
138#if defined(__GNUC__) || defined(_MSC_VER)
139template <typename T> struct TrailingZerosCounter<T, 4> {
140 static unsigned count(T Val) {
141 if (Val == 0)
7. Assuming 'Val' is equal to 0
8. Taking true branch
142 return 32;
9. Returning the value 32
143
144#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
145 return __builtin_ctz(Val);
146#elif defined(_MSC_VER)
147 unsigned long Index;
148 _BitScanForward(&Index, Val);
149 return Index;
150#endif
151 }
152};
153
154#if !defined(_MSC_VER) || defined(_M_X64)
155template <typename T> struct TrailingZerosCounter<T, 8> {
156 static unsigned count(T Val) {
157 if (Val == 0)
158 return 64;
159
160#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
161 return __builtin_ctzll(Val);
162#elif defined(_MSC_VER)
163 unsigned long Index;
164 _BitScanForward64(&Index, Val);
165 return Index;
166#endif
167 }
168};
169#endif
170#endif
171} // namespace detail
172
173/// Count number of 0's from the least significant bit to the most
174/// stopping at the first 1.
175///
176/// Only unsigned integral types are allowed.
177///
178/// Returns std::numeric_limits<T>::digits on an input of 0.
179template <typename T> [[nodiscard]] int countr_zero(T Val) {
180 static_assert(std::is_unsigned_v<T>,
181 "Only unsigned integral types are allowed.");
182 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val);
6. Calling 'TrailingZerosCounter::count'
10. Returning from 'TrailingZerosCounter::count'
11. Returning the value 32
183}
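
The zero-input contract documented above is exactly what drives the reported path: with Val == 0 the 32-bit counter returns 32, which then initializes 'Shift' in loadInputValue. A small standalone check of that behavior (illustrative only, assuming unsigned int has 32 value bits):

  #include <cassert>
  #include <limits>
  #include "llvm/ADT/bit.h"

  int main() {
    // countr_zero(0) returns the full bit width, which is what makes
    // 'Shift' equal to 32 on the reported path.
    assert(llvm::countr_zero(0u) == std::numeric_limits<unsigned>::digits);
    // For any nonzero mask the result is strictly less than 32, so the
    // subsequent 'Mask >> Shift' stays well defined.
    assert(llvm::countr_zero(0x8000u) == 15);
    return 0;
  }
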
184
185namespace detail {
186template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
187 static unsigned count(T Val) {
188 if (!Val)
189 return std::numeric_limits<T>::digits;
190
191 // Bisection method.
192 unsigned ZeroBits = 0;
193 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
194 T Tmp = Val >> Shift;
195 if (Tmp)
196 Val = Tmp;
197 else
198 ZeroBits |= Shift;
199 }
200 return ZeroBits;
201 }
202};
203
204#if defined(__GNUC__) || defined(_MSC_VER)
205template <typename T> struct LeadingZerosCounter<T, 4> {
206 static unsigned count(T Val) {
207 if (Val == 0)
208 return 32;
209
210#if __has_builtin(__builtin_clz) || defined(__GNUC__)
211 return __builtin_clz(Val);
212#elif defined(_MSC_VER)
213 unsigned long Index;
214 _BitScanReverse(&Index, Val);
215 return Index ^ 31;
216#endif
217 }
218};
219
220#if !defined(_MSC_VER) || defined(_M_X64)
221template <typename T> struct LeadingZerosCounter<T, 8> {
222 static unsigned count(T Val) {
223 if (Val == 0)
224 return 64;
225
226#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
227 return __builtin_clzll(Val);
228#elif defined(_MSC_VER)
229 unsigned long Index;
230 _BitScanReverse64(&Index, Val);
231 return Index ^ 63;
232#endif
233 }
234};
235#endif
236#endif
237} // namespace detail
238
239/// Count number of 0's from the most significant bit to the least
240/// stopping at the first 1.
241///
242/// Only unsigned integral types are allowed.
243///
244/// Returns std::numeric_limits<T>::digits on an input of 0.
245template <typename T> [[nodiscard]] int countl_zero(T Val) {
246 static_assert(std::is_unsigned_v<T>,
247 "Only unsigned integral types are allowed.");
248 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val);
249}
250
251/// Count the number of ones from the most significant bit to the first
252/// zero bit.
253///
254/// Ex. countl_one(0xFF0FFF00) == 8.
255/// Only unsigned integral types are allowed.
256///
257/// Returns std::numeric_limits<T>::digits on an input of all ones.
258template <typename T> [[nodiscard]] int countl_one(T Value) {
259 static_assert(std::is_unsigned_v<T>,
260 "Only unsigned integral types are allowed.");
261 return llvm::countl_zero<T>(~Value);
262}
263
264/// Count the number of ones from the least significant bit to the first
265/// zero bit.
266///
267/// Ex. countr_one(0x00FF00FF) == 8.
268/// Only unsigned integral types are allowed.
269///
270/// Returns std::numeric_limits<T>::digits on an input of all ones.
271template <typename T> [[nodiscard]] int countr_one(T Value) {
272 static_assert(std::is_unsigned_v<T>,
273 "Only unsigned integral types are allowed.");
274 return llvm::countr_zero<T>(~Value);
275}
276
277/// Returns the number of bits needed to represent Value if Value is nonzero.
278/// Returns 0 otherwise.
279///
280/// Ex. bit_width(5) == 3.
281template <typename T> [[nodiscard]] int bit_width(T Value) {
282 static_assert(std::is_unsigned_v<T>,
283 "Only unsigned integral types are allowed.");
284 return std::numeric_limits<T>::digits - llvm::countl_zero(Value);
285}
286
287/// Returns the largest integral power of two no greater than Value if Value is
288/// nonzero. Returns 0 otherwise.
289///
290/// Ex. bit_floor(5) == 4.
291template <typename T> [[nodiscard]] T bit_floor(T Value) {
292 static_assert(std::is_unsigned_v<T>,
293 "Only unsigned integral types are allowed.");
294 if (!Value)
295 return 0;
296 return T(1) << (llvm::bit_width(Value) - 1);
297}
298
299/// Returns the smallest integral power of two no smaller than Value if Value is
300/// nonzero. Returns 1 otherwise.
301///
302/// Ex. bit_ceil(5) == 8.
303///
304/// The return value is undefined if the input is larger than the largest power
305/// of two representable in T.
306template <typename T> [[nodiscard]] T bit_ceil(T Value) {
307 static_assert(std::is_unsigned_v<T>,
308 "Only unsigned integral types are allowed.");
309 if (Value < 2)
310 return 1;
311 return T(1) << llvm::bit_width<T>(Value - 1u);
312}
313
314namespace detail {
315template <typename T, std::size_t SizeOfT> struct PopulationCounter {
316 static int count(T Value) {
317 // Generic version, forward to 32 bits.
318 static_assert(SizeOfT <= 4, "Not implemented!");
319#if defined(__GNUC__)
320 return (int)__builtin_popcount(Value);
321#else
322 uint32_t v = Value;
323 v = v - ((v >> 1) & 0x55555555);
324 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
325 return int(((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24);
326#endif
327 }
328};
329
330template <typename T> struct PopulationCounter<T, 8> {
331 static int count(T Value) {
332#if defined(__GNUC__)
333 return (int)__builtin_popcountll(Value);
334#else
335 uint64_t v = Value;
336 v = v - ((v >> 1) & 0x5555555555555555ULL);
337 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
338 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
339 return int((uint64_t)(v * 0x0101010101010101ULL) >> 56);
340#endif
341 }
342};
343} // namespace detail
344
345/// Count the number of set bits in a value.
346/// Ex. popcount(0xF000F000) = 8
347/// Returns 0 if the word is zero.
348template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
349[[nodiscard]] inline int popcount(T Value) noexcept {
350 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
351}
352
353// Forward-declare rotr so that rotl can use it.
354template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
355[[nodiscard]] constexpr T rotr(T V, int R);
356
357template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
358[[nodiscard]] constexpr T rotl(T V, int R) {
359 unsigned N = std::numeric_limits<T>::digits;
360
361 R = R % N;
362 if (!R)
363 return V;
364
365 if (R < 0)
366 return llvm::rotr(V, -R);
367
368 return (V << R) | (V >> (N - R));
369}
370
371template <typename T, typename> [[nodiscard]] constexpr T rotr(T V, int R) {
372 unsigned N = std::numeric_limits<T>::digits;
373
374 R = R % N;
375 if (!R)
376 return V;
377
378 if (R < 0)
379 return llvm::rotl(V, -R);
380
381 return (V >> R) | (V << (N - R));
382}
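
A couple of illustrative compile-time checks for the rotate helpers defined above (not part of bit.h):

  #include <cstdint>
  #include "llvm/ADT/bit.h"

  // Rotation preserves all bits; bits shifted out one end re-enter the other.
  static_assert(llvm::rotl<uint32_t>(0x12345678u, 8) == 0x34567812u, "rotl by 8");
  static_assert(llvm::rotr<uint32_t>(0x12345678u, 8) == 0x78123456u, "rotr by 8");
  static_assert(llvm::rotl<uint8_t>(0x81, 1) == 0x03, "rotl wraps the high bit around");
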
383
384} // namespace llvm
385
386#endif