Bug Summary

File: build/source/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4486, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
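
The flagged expression at line 4486 falls outside the excerpt reproduced below, so the following is only a minimal, self-contained sketch of the bug class this checker reports, not the code at that line: the C++ standard makes a shift undefined when the shift amount is greater than or equal to the bit width of the (promoted) left operand, so masks built as "0xffffffffu >> (32 - Width)" break when Width is 0. The helper name maskLowBits and its constants are illustrative assumptions, not taken from AMDGPUISelLowering.cpp.

#include <cstdint>
#include <iostream>

// Hypothetical helper, not from the LLVM sources: returns the low 'Width'
// bits of a 32-bit value.
static uint32_t maskLowBits(uint32_t Value, unsigned Width) {
  // Version the analyzer would flag: when Width == 0 the shift amount is 32,
  // which is >= the width of 'unsigned int', so the result is undefined.
  //   return Value & (0xffffffffu >> (32 - Width));

  // Guarded version: every shift amount stays inside [0, 31].
  if (Width == 0)
    return 0;
  if (Width >= 32)
    return Value;
  return Value & ((1u << Width) - 1u);
}

int main() {
  std::cout << std::hex << maskLowBits(0xdeadbeefu, 8) << '\n';  // prints "ef"
  std::cout << std::hex << maskLowBits(0xdeadbeefu, 32) << '\n'; // prints "deadbeef"
  return 0;
}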

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1679443490 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-03-22-005342-16304-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/source/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/CodeGen/MachineFrameInfo.h"
23#include "llvm/IR/DiagnosticInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
25#include "llvm/Support/CommandLine.h"
26#include "llvm/Support/KnownBits.h"
27#include "llvm/Target/TargetMachine.h"
28
29using namespace llvm;
30
31#include "AMDGPUGenCallingConv.inc"
32
33static cl::opt<bool> AMDGPUBypassSlowDiv(
34 "amdgpu-bypass-slow-div",
35 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
36 cl::init(true));
37
38// Find a larger type to do a load / store of a vector with.
39EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
40 unsigned StoreSize = VT.getStoreSizeInBits();
41 if (StoreSize <= 32)
42 return EVT::getIntegerVT(Ctx, StoreSize);
43
44 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
45 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
46}
47
48unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
49 return DAG.computeKnownBits(Op).countMaxActiveBits();
50}
51
52unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
53 // In order for this to be a signed 24-bit value, bit 23 must
54 // be a sign bit.
55 return DAG.ComputeMaxSignificantBits(Op);
56}
57
58AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
59 const AMDGPUSubtarget &STI)
60 : TargetLowering(TM), Subtarget(&STI) {
61 // Lower floating point store/load to integer store/load to reduce the number
62 // of patterns in tablegen.
63 setOperationAction(ISD::LOAD, MVT::f32, Promote);
64 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
65
66 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
67 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
68
69 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
70 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
71
72 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
73 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
74
75 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
77
78 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
80
81 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
83
84 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
86
87 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
89
90 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
92
93 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
95
96 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
98
99 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
101
102 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
104
105 setOperationAction(ISD::LOAD, MVT::i64, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
107
108 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
110
111 setOperationAction(ISD::LOAD, MVT::f64, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
113
114 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
116
117 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
119
120 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
122
123 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
125
126 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
128
129 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
131
132 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
133 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
134
135 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
136 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
137
138 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
139 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
140
141 // There are no 64-bit extloads. These should be done as a 32-bit extload and
142 // an extension to 64-bit.
143 for (MVT VT : MVT::integer_valuetypes())
144 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
145 Expand);
146
147 for (MVT VT : MVT::integer_valuetypes()) {
148 if (VT == MVT::i64)
149 continue;
150
151 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
152 setLoadExtAction(Op, VT, MVT::i1, Promote);
153 setLoadExtAction(Op, VT, MVT::i8, Legal);
154 setLoadExtAction(Op, VT, MVT::i16, Legal);
155 setLoadExtAction(Op, VT, MVT::i32, Expand);
156 }
157 }
158
159 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
160 for (auto MemVT :
161 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
162 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
163 Expand);
164
165 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
166 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
167 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
168 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
169 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
171 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
172 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
173
174 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
175 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
177 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
178 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
180
181 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
183 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
184 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
185 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
188
189 setOperationAction(ISD::STORE, MVT::f32, Promote);
190 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
191
192 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
193 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
194
195 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
196 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
197
198 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
199 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
200
201 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
202 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
203
204 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
205 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
206
207 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
208 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
209
210 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
211 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
212
213 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
214 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
215
216 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
217 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
218
219 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
220 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
221
222 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
223 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
224
225 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
226 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
227
228 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
229 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
230
231 setOperationAction(ISD::STORE, MVT::i64, Promote);
232 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
233
234 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
235 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
236
237 setOperationAction(ISD::STORE, MVT::f64, Promote);
238 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
239
240 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
241 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
242
243 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
244 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
245
246 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
247 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
248
249 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
250 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
251
252 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
253 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
254
255 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
256 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
257
258 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
259 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
260
261 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
263
264 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
265 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
266
267 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
268 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
269 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
270 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
271
272 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
273 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
274 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
275 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
276
277 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
278 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
279 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
280 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
281 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
282 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
283 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
284 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
285
286 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
287 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
288 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
289
290 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
291 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
292
293 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
294 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
295 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
296 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
297
298 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
299 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
300 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
301 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
302
303 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
304 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
305
306 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
307 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
308 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
309 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
310 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
311 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
312 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
313
314 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
315 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
316
317 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
318
319 // This is totally unsupported, just custom lower to produce an error.
320 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
321
322 // Library functions. These default to Expand, but we have instructions
323 // for them.
324 setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
325 ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
326 ISD::FMAXNUM},
327 MVT::f32, Legal);
328
329 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
330
331 setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
332
333 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
334
335 setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom);
336
337 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
338
339 if (Subtarget->has16BitInsts())
340 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
341 else
342 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
343
344 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
345 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
346 // default unless marked custom/legal.
347 setOperationAction(
348 ISD::IS_FPCLASS,
349 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
350 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
351 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
352 Custom);
353
354 // Expand to fneg + fadd.
355 setOperationAction(ISD::FSUB, MVT::f64, Expand);
356
357 setOperationAction(ISD::CONCAT_VECTORS,
358 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
359 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
360 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
361 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
362 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
363 Custom);
364 setOperationAction(
365 ISD::EXTRACT_SUBVECTOR,
366 {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32,
367 MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32,
368 MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32,
369 MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32,
370 MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
371 MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
372 MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
373 MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
374 MVT::v16f64, MVT::v16i64},
375 Custom);
376
377 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
378 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
379
380 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
381 for (MVT VT : ScalarIntVTs) {
382 // These should use [SU]DIVREM, so set them to expand
383 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
384 Expand);
385
386 // GPU does not have divrem function for signed or unsigned.
387 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
388
389 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
390 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
391
392 setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
393
394 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
395 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
396 }
397
398 // The hardware supports 32-bit FSHR, but not FSHL.
399 setOperationAction(ISD::FSHR, MVT::i32, Legal);
400
401 // The hardware supports 32-bit ROTR, but not ROTL.
402 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
403 setOperationAction(ISD::ROTR, MVT::i64, Expand);
404
405 setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
406
407 setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
408 setOperationAction(
409 {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
410 MVT::i64, Custom);
411 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
412
413 setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
414 Legal);
415
416 setOperationAction(
417 {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
418 MVT::i64, Custom);
419
420 static const MVT::SimpleValueType VectorIntTypes[] = {
421 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
422 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
423
424 for (MVT VT : VectorIntTypes) {
425 // Expand the following operations for the current type by default.
426 setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
427 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
428 ISD::MULHS, ISD::OR, ISD::SHL,
429 ISD::SRA, ISD::SRL, ISD::ROTL,
430 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
431 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
432 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
433 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
434 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
435 ISD::XOR, ISD::BSWAP, ISD::CTPOP,
436 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
437 ISD::SETCC},
438 VT, Expand);
439 }
440
441 static const MVT::SimpleValueType FloatVectorTypes[] = {
442 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
443 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
444
445 for (MVT VT : FloatVectorTypes) {
446 setOperationAction(
447 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD,
448 ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2,
449 ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG,
450 ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC,
451 ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
452 ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG,
453 ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
454 ISD::SETCC, ISD::FCANONICALIZE},
455 VT, Expand);
456 }
457
458 // This causes using an unrolled select operation rather than expansion with
459 // bit operations. This is in general better, but the alternative using BFI
460 // instructions may be better if the select sources are SGPRs.
461 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
462 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
463
464 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
465 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
466
467 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
468 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
469
470 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
471 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
472
473 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
474 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
475
476 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
477 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
478
479 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
480 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
481
482 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
483 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
484
485 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
486 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
487
488 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
489 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
490
491 // There are no libcalls of any kind.
492 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
493 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
494
495 setSchedulingPreference(Sched::RegPressure);
496 setJumpIsExpensive(true);
497
498 // FIXME: This is only partially true. If we have to do vector compares, any
499 // SGPR pair can be a condition register. If we have a uniform condition, we
500 // are better off doing SALU operations, where there is only one SCC. For now,
501 // we don't have a way of knowing during instruction selection if a condition
502 // will be uniform and we always use vector compares. Assume we are using
503 // vector compares until that is fixed.
504 setHasMultipleConditionRegisters(true);
505
506 setMinCmpXchgSizeInBits(32);
507 setSupportsUnalignedAtomics(false);
508
509 PredictableSelectIsExpensive = false;
510
511 // We want to find all load dependencies for long chains of stores to enable
512 // merging into very wide vectors. The problem is with vectors with > 4
513 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
514 // vectors are a legal type, even though we have to split the loads
515 // usually. When we can more precisely specify load legality per address
516 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
517 // smarter so that they can figure out what to do in 2 iterations without all
518 // N > 4 stores on the same chain.
519 GatherAllAliasesMaxDepth = 16;
520
521 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
522 // about these during lowering.
523 MaxStoresPerMemcpy = 0xffffffff;
524 MaxStoresPerMemmove = 0xffffffff;
525 MaxStoresPerMemset = 0xffffffff;
526
527 // The expansion for 64-bit division is enormous.
528 if (AMDGPUBypassSlowDiv)
529 addBypassSlowDiv(64, 32);
530
531 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
532 ISD::SRA, ISD::SRL,
533 ISD::TRUNCATE, ISD::MUL,
534 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
535 ISD::MULHU, ISD::MULHS,
536 ISD::SELECT, ISD::SELECT_CC,
537 ISD::STORE, ISD::FADD,
538 ISD::FSUB, ISD::FNEG,
539 ISD::FABS, ISD::AssertZext,
540 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
541}
542
543bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
544 if (getTargetMachine().Options.NoSignedZerosFPMath)
545 return true;
546
547 const auto Flags = Op.getNode()->getFlags();
548 if (Flags.hasNoSignedZeros())
549 return true;
550
551 return false;
552}
553
554//===----------------------------------------------------------------------===//
555// Target Information
556//===----------------------------------------------------------------------===//
557
558LLVM_READNONE
559static bool fnegFoldsIntoOp(unsigned Opc) {
560 switch (Opc) {
561 case ISD::FADD:
562 case ISD::FSUB:
563 case ISD::FMUL:
564 case ISD::FMA:
565 case ISD::FMAD:
566 case ISD::FMINNUM:
567 case ISD::FMAXNUM:
568 case ISD::FMINNUM_IEEE:
569 case ISD::FMAXNUM_IEEE:
570 case ISD::SELECT:
571 case ISD::FSIN:
572 case ISD::FTRUNC:
573 case ISD::FRINT:
574 case ISD::FNEARBYINT:
575 case ISD::FCANONICALIZE:
576 case AMDGPUISD::RCP:
577 case AMDGPUISD::RCP_LEGACY:
578 case AMDGPUISD::RCP_IFLAG:
579 case AMDGPUISD::SIN_HW:
580 case AMDGPUISD::FMUL_LEGACY:
581 case AMDGPUISD::FMIN_LEGACY:
582 case AMDGPUISD::FMAX_LEGACY:
583 case AMDGPUISD::FMED3:
584 // TODO: handle llvm.amdgcn.fma.legacy
585 return true;
586 default:
587 return false;
588 }
589}
590
591/// \p returns true if the operation will definitely need to use a 64-bit
592/// encoding, and thus will use a VOP3 encoding regardless of the source
593/// modifiers.
594LLVM_READONLY
595static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
596 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
597 VT == MVT::f64;
598}
599
600/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
601/// type for ISD::SELECT.
602LLVM_READONLY
603static bool selectSupportsSourceMods(const SDNode *N) {
604 // TODO: Only applies if select will be vector
605 return N->getValueType(0) == MVT::f32;
606}
607
608// Most FP instructions support source modifiers, but this could be refined
609// slightly.
610LLVM_READONLY
611static bool hasSourceMods(const SDNode *N) {
612 if (isa<MemSDNode>(N))
613 return false;
614
615 switch (N->getOpcode()) {
616 case ISD::CopyToReg:
617 case ISD::FDIV:
618 case ISD::FREM:
619 case ISD::INLINEASM:
620 case ISD::INLINEASM_BR:
621 case AMDGPUISD::DIV_SCALE:
622 case ISD::INTRINSIC_W_CHAIN:
623
624 // TODO: Should really be looking at the users of the bitcast. These are
625 // problematic because bitcasts are used to legalize all stores to integer
626 // types.
627 case ISD::BITCAST:
628 return false;
629 case ISD::INTRINSIC_WO_CHAIN: {
630 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
631 case Intrinsic::amdgcn_interp_p1:
632 case Intrinsic::amdgcn_interp_p2:
633 case Intrinsic::amdgcn_interp_mov:
634 case Intrinsic::amdgcn_interp_p1_f16:
635 case Intrinsic::amdgcn_interp_p2_f16:
636 return false;
637 default:
638 return true;
639 }
640 }
641 case ISD::SELECT:
642 return selectSupportsSourceMods(N);
643 default:
644 return true;
645 }
646}
647
648bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
649 unsigned CostThreshold) {
650 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
651 // it is truly free to use a source modifier in all cases. If there are
652 // multiple users but for each one will necessitate using VOP3, there will be
653 // a code size increase. Try to avoid increasing code size unless we know it
654 // will save on the instruction count.
655 unsigned NumMayIncreaseSize = 0;
656 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
657
658 assert(!N->use_empty());
659
660 // XXX - Should this limit number of uses to check?
661 for (const SDNode *U : N->uses()) {
662 if (!hasSourceMods(U))
663 return false;
664
665 if (!opMustUseVOP3Encoding(U, VT)) {
666 if (++NumMayIncreaseSize > CostThreshold)
667 return false;
668 }
669 }
670
671 return true;
672}
673
674EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
675 ISD::NodeType ExtendKind) const {
676 assert(!VT.isVector() && "only scalar expected");
677
678 // Round to the next multiple of 32-bits.
679 unsigned Size = VT.getSizeInBits();
680 if (Size <= 32)
681 return MVT::i32;
682 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
683}
684
685MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
686 return MVT::i32;
687}
688
689bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
690 return true;
691}
692
693// The backend supports 32 and 64 bit floating point immediates.
694// FIXME: Why are we reporting vectors of FP immediates as legal?
695bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
696 bool ForCodeSize) const {
697 EVT ScalarVT = VT.getScalarType();
698 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
699 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
700}
701
702// We don't want to shrink f64 / f32 constants.
703bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
704 EVT ScalarVT = VT.getScalarType();
705 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
706}
707
708bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
709 ISD::LoadExtType ExtTy,
710 EVT NewVT) const {
711 // TODO: This may be worth removing. Check regression tests for diffs.
712 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
713 return false;
714
715 unsigned NewSize = NewVT.getStoreSizeInBits();
716
717 // If we are reducing to a 32-bit load or a smaller multi-dword load,
718 // this is always better.
719 if (NewSize >= 32)
720 return true;
721
722 EVT OldVT = N->getValueType(0);
723 unsigned OldSize = OldVT.getStoreSizeInBits();
724
725 MemSDNode *MN = cast<MemSDNode>(N);
726 unsigned AS = MN->getAddressSpace();
727 // Do not shrink an aligned scalar load to sub-dword.
728 // Scalar engine cannot do sub-dword loads.
729 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
730 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
731 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
732 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
733 MN->isInvariant())) &&
734 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
735 return false;
736
737 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
738 // extloads, so doing one requires using a buffer_load. In cases where we
739 // still couldn't use a scalar load, using the wider load shouldn't really
740 // hurt anything.
741
742 // If the old size already had to be an extload, there's no harm in continuing
743 // to reduce the width.
744 return (OldSize < 32);
745}
746
747bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
748 const SelectionDAG &DAG,
749 const MachineMemOperand &MMO) const {
750
751 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
752
753 if (LoadTy.getScalarType() == MVT::i32)
754 return false;
755
756 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
757 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
758
759 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
760 return false;
761
762 unsigned Fast = 0;
763 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
764 CastTy, MMO, &Fast) &&
765 Fast;
766}
767
768// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
769// profitable with the expansion for 64-bit since it's generally good to
770// speculate things.
771bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
772 return true;
773}
774
775bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
776 return true;
777}
778
779bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
780 switch (N->getOpcode()) {
781 case ISD::EntryToken:
782 case ISD::TokenFactor:
783 return true;
784 case ISD::INTRINSIC_WO_CHAIN: {
785 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
786 switch (IntrID) {
787 case Intrinsic::amdgcn_readfirstlane:
788 case Intrinsic::amdgcn_readlane:
789 return true;
790 }
791 return false;
792 }
793 case ISD::LOAD:
794 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
795 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
796 return true;
797 return false;
798 case AMDGPUISD::SETCC: // ballot-style instruction
799 return true;
800 }
801 return false;
802}
803
804SDValue AMDGPUTargetLowering::getNegatedExpression(
805 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
806 NegatibleCost &Cost, unsigned Depth) const {
807
808 switch (Op.getOpcode()) {
809 case ISD::FMA:
810 case ISD::FMAD: {
811 // Negating a fma is not free if it has users without source mods.
812 if (!allUsesHaveSourceMods(Op.getNode()))
813 return SDValue();
814 break;
815 }
816 case AMDGPUISD::RCP: {
817 SDValue Src = Op.getOperand(0);
818 EVT VT = Op.getValueType();
819 SDLoc SL(Op);
820
821 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
822 ForCodeSize, Cost, Depth + 1);
823 if (NegSrc)
824 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
825 return SDValue();
826 }
827 default:
828 break;
829 }
830
831 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
832 ForCodeSize, Cost, Depth);
833}
834
835//===---------------------------------------------------------------------===//
836// Target Properties
837//===---------------------------------------------------------------------===//
838
839bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
840 assert(VT.isFloatingPoint());
841
842 // Packed operations do not have a fabs modifier.
843 return VT == MVT::f32 || VT == MVT::f64 ||
844 (Subtarget->has16BitInsts() && VT == MVT::f16);
845}
846
847bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
848 assert(VT.isFloatingPoint());
849 // Report this based on the end legalized type.
850 VT = VT.getScalarType();
851 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
852}
853
854bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
855 unsigned NumElem,
856 unsigned AS) const {
857 return true;
858}
859
860bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
861 // There are few operations which truly have vector input operands. Any vector
862 // operation is going to involve operations on each component, and a
863 // build_vector will be a copy per element, so it always makes sense to use a
864 // build_vector input in place of the extracted element to avoid a copy into a
865 // super register.
866 //
867 // We should probably only do this if all users are extracts only, but this
868 // should be the common case.
869 return true;
870}
871
872bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
873 // Truncate is just accessing a subregister.
874
875 unsigned SrcSize = Source.getSizeInBits();
876 unsigned DestSize = Dest.getSizeInBits();
877
878 return DestSize < SrcSize && DestSize % 32 == 0 ;
879}
880
881bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
882 // Truncate is just accessing a subregister.
883
884 unsigned SrcSize = Source->getScalarSizeInBits();
885 unsigned DestSize = Dest->getScalarSizeInBits();
886
887 if (DestSize== 16 && Subtarget->has16BitInsts())
888 return SrcSize >= 32;
889
890 return DestSize < SrcSize && DestSize % 32 == 0;
891}
892
893bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
894 unsigned SrcSize = Src->getScalarSizeInBits();
895 unsigned DestSize = Dest->getScalarSizeInBits();
896
897 if (SrcSize == 16 && Subtarget->has16BitInsts())
898 return DestSize >= 32;
899
900 return SrcSize == 32 && DestSize == 64;
901}
902
903bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
904 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
905 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
906 // this will enable reducing 64-bit operations the 32-bit, which is always
907 // good.
908
909 if (Src == MVT::i16)
910 return Dest == MVT::i32 ||Dest == MVT::i64 ;
911
912 return Src == MVT::i32 && Dest == MVT::i64;
913}
914
915bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
916 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
917 // limited number of native 64-bit operations. Shrinking an operation to fit
918 // in a single 32-bit register should always be helpful. As currently used,
919 // this is much less general than the name suggests, and is only used in
920 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
921 // not profitable, and may actually be harmful.
922 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
923}
924
925bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
926 const SDNode* N, CombineLevel Level) const {
927 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
928 N->getOpcode() == ISD::SRL) &&
929 "Expected shift op");
930 // Always commute pre-type legalization and right shifts.
931 // We're looking for shl(or(x,y),z) patterns.
932 if (Level < CombineLevel::AfterLegalizeTypes ||
933 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
934 return true;
935
936 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
937 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
938 (N->use_begin()->getOpcode() == ISD::SRA ||
939 N->use_begin()->getOpcode() == ISD::SRL))
940 return false;
941
942 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
943 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
944 if (LHS.getOpcode() != ISD::SHL)
945 return false;
946 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
947 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
948 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
949 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
950 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
951 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
952 };
953 SDValue LHS = N->getOperand(0).getOperand(0);
954 SDValue RHS = N->getOperand(0).getOperand(1);
955 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
956}
957
958//===---------------------------------------------------------------------===//
959// TargetLowering Callbacks
960//===---------------------------------------------------------------------===//
961
962CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
963 bool IsVarArg) {
964 switch (CC) {
965 case CallingConv::AMDGPU_VS:
966 case CallingConv::AMDGPU_GS:
967 case CallingConv::AMDGPU_PS:
968 case CallingConv::AMDGPU_CS:
969 case CallingConv::AMDGPU_HS:
970 case CallingConv::AMDGPU_ES:
971 case CallingConv::AMDGPU_LS:
972 return CC_AMDGPU;
973 case CallingConv::C:
974 case CallingConv::Fast:
975 case CallingConv::Cold:
976 return CC_AMDGPU_Func;
977 case CallingConv::AMDGPU_Gfx:
978 return CC_SI_Gfx;
979 case CallingConv::AMDGPU_KERNEL:
980 case CallingConv::SPIR_KERNEL:
981 default:
982 report_fatal_error("Unsupported calling convention for call");
983 }
984}
985
986CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
987 bool IsVarArg) {
988 switch (CC) {
989 case CallingConv::AMDGPU_KERNEL:
990 case CallingConv::SPIR_KERNEL:
991 llvm_unreachable("kernels should not be handled here");
992 case CallingConv::AMDGPU_VS:
993 case CallingConv::AMDGPU_GS:
994 case CallingConv::AMDGPU_PS:
995 case CallingConv::AMDGPU_CS:
996 case CallingConv::AMDGPU_HS:
997 case CallingConv::AMDGPU_ES:
998 case CallingConv::AMDGPU_LS:
999 return RetCC_SI_Shader;
1000 case CallingConv::AMDGPU_Gfx:
1001 return RetCC_SI_Gfx;
1002 case CallingConv::C:
1003 case CallingConv::Fast:
1004 case CallingConv::Cold:
1005 return RetCC_AMDGPU_Func;
1006 default:
1007 report_fatal_error("Unsupported calling convention.");
1008 }
1009}
1010
1011/// The SelectionDAGBuilder will automatically promote function arguments
1012/// with illegal types. However, this does not work for the AMDGPU targets
1013/// since the function arguments are stored in memory as these illegal types.
1014/// In order to handle this properly we need to get the original types sizes
1015/// from the LLVM IR Function and fixup the ISD:InputArg values before
1016/// passing them to AnalyzeFormalArguments()
1017
1018/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1019/// input values across multiple registers. Each item in the Ins array
1020/// represents a single value that will be stored in registers. Ins[x].VT is
1021/// the value type of the value that will be stored in the register, so
1022/// whatever SDNode we lower the argument to needs to be this type.
1023///
1024/// In order to correctly lower the arguments we need to know the size of each
1025/// argument. Since Ins[x].VT gives us the size of the register that will
1026/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1027/// for the original function argument so that we can deduce the correct memory
1028/// type to use for Ins[x]. In most cases the correct memory type will be
1029/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1030/// we have a kernel argument of type v8i8, this argument will be split into
1031/// 8 parts and each part will be represented by its own item in the Ins array.
1032/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1033/// the argument before it was split. From this, we deduce that the memory type
1034/// for each individual part is i8. We pass the memory type as LocVT to the
1035/// calling convention analysis function and the register type (Ins[x].VT) as
1036/// the ValVT.
1037void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1038 CCState &State,
1039 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1040 const MachineFunction &MF = State.getMachineFunction();
1041 const Function &Fn = MF.getFunction();
1042 LLVMContext &Ctx = Fn.getParent()->getContext();
1043 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1044 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1045 CallingConv::ID CC = Fn.getCallingConv();
1046
1047 Align MaxAlign = Align(1);
1048 uint64_t ExplicitArgOffset = 0;
1049 const DataLayout &DL = Fn.getParent()->getDataLayout();
1050
1051 unsigned InIndex = 0;
1052
1053 for (const Argument &Arg : Fn.args()) {
1054 const bool IsByRef = Arg.hasByRefAttr();
1055 Type *BaseArgTy = Arg.getType();
1056 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1057 Align Alignment = DL.getValueOrABITypeAlignment(
1058 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1059 MaxAlign = std::max(Alignment, MaxAlign);
1060 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1061
1062 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1063 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1064
1065 // We're basically throwing away everything passed into us and starting over
1066 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1067 // to us as computed in Ins.
1068 //
1069 // We also need to figure out what type legalization is trying to do to get
1070 // the correct memory offsets.
1071
1072 SmallVector<EVT, 16> ValueVTs;
1073 SmallVector<uint64_t, 16> Offsets;
1074 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1075
1076 for (unsigned Value = 0, NumValues = ValueVTs.size();
1077 Value != NumValues; ++Value) {
1078 uint64_t BasePartOffset = Offsets[Value];
1079
1080 EVT ArgVT = ValueVTs[Value];
1081 EVT MemVT = ArgVT;
1082 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1083 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1084
1085 if (NumRegs == 1) {
1086 // This argument is not split, so the IR type is the memory type.
1087 if (ArgVT.isExtended()) {
1088 // We have an extended type, like i24, so we should just use the
1089 // register type.
1090 MemVT = RegisterVT;
1091 } else {
1092 MemVT = ArgVT;
1093 }
1094 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1095 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1096 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1097 // We have a vector value which has been split into a vector with
1098 // the same scalar type, but fewer elements. This should handle
1099 // all the floating-point vector types.
1100 MemVT = RegisterVT;
1101 } else if (ArgVT.isVector() &&
1102 ArgVT.getVectorNumElements() == NumRegs) {
1103 // This arg has been split so that each element is stored in a separate
1104 // register.
1105 MemVT = ArgVT.getScalarType();
1106 } else if (ArgVT.isExtended()) {
1107 // We have an extended type, like i65.
1108 MemVT = RegisterVT;
1109 } else {
1110 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1111 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1112 if (RegisterVT.isInteger()) {
1113 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1114 } else if (RegisterVT.isVector()) {
1115 assert(!RegisterVT.getScalarType().isFloatingPoint());
1116 unsigned NumElements = RegisterVT.getVectorNumElements();
1117 assert(MemoryBits % NumElements == 0);
1118 // This vector type has been split into another vector type with
1119 // a different elements size.
1120 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1121 MemoryBits / NumElements);
1122 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1123 } else {
1124 llvm_unreachable("cannot deduce memory type.");
1125 }
1126 }
1127
1128 // Convert one element vectors to scalar.
1129 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1130 MemVT = MemVT.getScalarType();
1131
1132 // Round up vec3/vec5 argument.
1133 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1134 assert(MemVT.getVectorNumElements() == 3 ||
1135 MemVT.getVectorNumElements() == 5 ||
1136 (MemVT.getVectorNumElements() >= 9 &&
1137 MemVT.getVectorNumElements() <= 12));
1138 MemVT = MemVT.getPow2VectorType(State.getContext());
1139 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1140 MemVT = MemVT.getRoundIntegerType(State.getContext());
1141 }
1142
1143 unsigned PartOffset = 0;
1144 for (unsigned i = 0; i != NumRegs; ++i) {
1145 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1146 BasePartOffset + PartOffset,
1147 MemVT.getSimpleVT(),
1148 CCValAssign::Full));
1149 PartOffset += MemVT.getStoreSize();
1150 }
1151 }
1152 }
1153}
1154
1155SDValue AMDGPUTargetLowering::LowerReturn(
1156 SDValue Chain, CallingConv::ID CallConv,
1157 bool isVarArg,
1158 const SmallVectorImpl<ISD::OutputArg> &Outs,
1159 const SmallVectorImpl<SDValue> &OutVals,
1160 const SDLoc &DL, SelectionDAG &DAG) const {
1161 // FIXME: Fails for r600 tests
1162 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1163 // "wave terminate should not have return values");
1164 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1165}
1166
1167//===---------------------------------------------------------------------===//
1168// Target specific lowering
1169//===---------------------------------------------------------------------===//
1170
1171/// Selects the correct CCAssignFn for a given CallingConvention value.
1172CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1173 bool IsVarArg) {
1174 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1175}
1176
1177CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1178 bool IsVarArg) {
1179 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1180}
1181
1182SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1183 SelectionDAG &DAG,
1184 MachineFrameInfo &MFI,
1185 int ClobberedFI) const {
1186 SmallVector<SDValue, 8> ArgChains;
1187 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1188 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1189
1190 // Include the original chain at the beginning of the list. When this is
1191 // used by target LowerCall hooks, this helps legalize find the
1192 // CALLSEQ_BEGIN node.
1193 ArgChains.push_back(Chain);
1194
1195 // Add a chain value for each stack argument corresponding
1196 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1197 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1198 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1199 if (FI->getIndex() < 0) {
1200 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1201 int64_t InLastByte = InFirstByte;
1202 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1203
1204 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1205 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1206 ArgChains.push_back(SDValue(L, 1));
1207 }
1208 }
1209 }
1210 }
1211
1212 // Build a tokenfactor for all the chains.
1213 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1214}
1215
1216SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1217 SmallVectorImpl<SDValue> &InVals,
1218 StringRef Reason) const {
1219 SDValue Callee = CLI.Callee;
1220 SelectionDAG &DAG = CLI.DAG;
1221
1222 const Function &Fn = DAG.getMachineFunction().getFunction();
1223
1224 StringRef FuncName("<unknown>");
1225
1226 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1227 FuncName = G->getSymbol();
1228 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1229 FuncName = G->getGlobal()->getName();
1230
1231 DiagnosticInfoUnsupported NoCalls(
1232 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1233 DAG.getContext()->diagnose(NoCalls);
1234
1235 if (!CLI.IsTailCall) {
1236 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1237 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1238 }
1239
1240 return DAG.getEntryNode();
1241}
1242
1243SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1244 SmallVectorImpl<SDValue> &InVals) const {
1245 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1246}
1247
1248SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1249 SelectionDAG &DAG) const {
1250 const Function &Fn = DAG.getMachineFunction().getFunction();
1251
1252 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1253 SDLoc(Op).getDebugLoc());
1254 DAG.getContext()->diagnose(NoDynamicAlloca);
1255 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1256 return DAG.getMergeValues(Ops, SDLoc());
1257}
1258
1259SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1260 SelectionDAG &DAG) const {
1261 switch (Op.getOpcode()) {
1262 default:
1263 Op->print(errs(), &DAG);
1264    llvm_unreachable("Custom lowering code for this "
1265                     "instruction is not implemented yet!");
1266 break;
1267 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1268 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1269 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1270 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1271 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1272 case ISD::FREM: return LowerFREM(Op, DAG);
1273 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1274 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1275 case ISD::FRINT: return LowerFRINT(Op, DAG);
1276 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1277 case ISD::FROUNDEVEN:
1278 return LowerFROUNDEVEN(Op, DAG);
1279 case ISD::FROUND: return LowerFROUND(Op, DAG);
1280 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1281 case ISD::FLOG:
1282 return LowerFLOG(Op, DAG, numbers::ln2f);
1283 case ISD::FLOG10:
1284 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1285 case ISD::FEXP:
1286 return lowerFEXP(Op, DAG);
1287 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1288 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1289 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1290 case ISD::FP_TO_SINT:
1291 case ISD::FP_TO_UINT:
1292 return LowerFP_TO_INT(Op, DAG);
1293 case ISD::CTTZ:
1294 case ISD::CTTZ_ZERO_UNDEF:
1295 case ISD::CTLZ:
1296 case ISD::CTLZ_ZERO_UNDEF:
1297 return LowerCTLZ_CTTZ(Op, DAG);
1298 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1299 }
1300 return Op;
1301}
1302
1303void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1304 SmallVectorImpl<SDValue> &Results,
1305 SelectionDAG &DAG) const {
1306 switch (N->getOpcode()) {
1307 case ISD::SIGN_EXTEND_INREG:
1308 // Different parts of legalization seem to interpret which type of
1309 // sign_extend_inreg is the one to check for custom lowering. The extended
1310 // from type is what really matters, but some places check for custom
1311 // lowering of the result type. This results in trying to use
1312 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1313 // nothing here and let the illegal result integer be handled normally.
1314 return;
1315 default:
1316 return;
1317 }
1318}
1319
1320SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1321 SDValue Op,
1322 SelectionDAG &DAG) const {
1323
1324 const DataLayout &DL = DAG.getDataLayout();
1325 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1326 const GlobalValue *GV = G->getGlobal();
1327
1328 if (!MFI->isModuleEntryFunction()) {
1329 if (std::optional<uint32_t> Address =
1330 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
1331 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1332 }
1333 }
1334
1335 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1336 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1337 if (!MFI->isModuleEntryFunction() &&
1338 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1339 SDLoc DL(Op);
1340 const Function &Fn = DAG.getMachineFunction().getFunction();
1341 DiagnosticInfoUnsupported BadLDSDecl(
1342 Fn, "local memory global used by non-kernel function",
1343 DL.getDebugLoc(), DS_Warning);
1344 DAG.getContext()->diagnose(BadLDSDecl);
1345
1346 // We currently don't have a way to correctly allocate LDS objects that
1347 // aren't directly associated with a kernel. We do force inlining of
1348 // functions that use local objects. However, if these dead functions are
1349 // not eliminated, we don't want a compile time error. Just emit a warning
1350 // and a trap, since there should be no callable path here.
1351 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1352 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1353 Trap, DAG.getRoot());
1354 DAG.setRoot(OutputChain);
1355 return DAG.getUNDEF(Op.getValueType());
1356 }
1357
1358 // XXX: What does the value of G->getOffset() mean?
1359    assert(G->getOffset() == 0 &&
1360           "Do not know what to do with an non-zero offset");
1361
1362 // TODO: We could emit code to handle the initialization somewhere.
1363 // We ignore the initializer for now and legalize it to allow selection.
1364 // The initializer will anyway get errored out during assembly emission.
1365 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1366 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1367 }
1368 return SDValue();
1369}
1370
1371SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1372 SelectionDAG &DAG) const {
1373 SmallVector<SDValue, 8> Args;
1374 SDLoc SL(Op);
1375
1376 EVT VT = Op.getValueType();
1377 if (VT.getVectorElementType().getSizeInBits() < 32) {
1378 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1379 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1380 unsigned NewNumElt = OpBitSize / 32;
1381 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1382 : EVT::getVectorVT(*DAG.getContext(),
1383 MVT::i32, NewNumElt);
1384 for (const SDUse &U : Op->ops()) {
1385 SDValue In = U.get();
1386 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1387 if (NewNumElt > 1)
1388 DAG.ExtractVectorElements(NewIn, Args);
1389 else
1390 Args.push_back(NewIn);
1391 }
1392
1393 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1394 NewNumElt * Op.getNumOperands());
1395 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1396 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1397 }
1398 }
1399
1400 for (const SDUse &U : Op->ops())
1401 DAG.ExtractVectorElements(U.get(), Args);
1402
1403 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1404}
1405
1406SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1407 SelectionDAG &DAG) const {
1408
1409 SmallVector<SDValue, 8> Args;
1410 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1411 EVT VT = Op.getValueType();
1412 EVT SrcVT = Op.getOperand(0).getValueType();
1413
1414 // For these types, we have some TableGen patterns except if the index is 1
1415 if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1416 (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1417 Start != 1)
1418 return Op;
1419
1420 if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1421 (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1422 (Start == 0 || Start == 4))
1423 return Op;
1424
1425 if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
1426 (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
1427 (Start == 0 || Start == 8))
1428 return Op;
1429
1430 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1431 VT.getVectorNumElements());
1432
1433 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1434}
1435
1436// TODO: Handle fabs too
1437static SDValue peekFNeg(SDValue Val) {
1438 if (Val.getOpcode() == ISD::FNEG)
1439 return Val.getOperand(0);
1440
1441 return Val;
1442}
1443SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1444 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1445 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1446 SelectionDAG &DAG = DCI.DAG;
1447 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1448 switch (CCOpcode) {
1449 case ISD::SETOEQ:
1450 case ISD::SETONE:
1451 case ISD::SETUNE:
1452 case ISD::SETNE:
1453 case ISD::SETUEQ:
1454 case ISD::SETEQ:
1455 case ISD::SETFALSE:
1456 case ISD::SETFALSE2:
1457 case ISD::SETTRUE:
1458 case ISD::SETTRUE2:
1459 case ISD::SETUO:
1460 case ISD::SETO:
1461 break;
1462 case ISD::SETULE:
1463 case ISD::SETULT: {
1464 if (LHS == True)
1465 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1466 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1467 }
1468 case ISD::SETOLE:
1469 case ISD::SETOLT:
1470 case ISD::SETLE:
1471 case ISD::SETLT: {
1472 // Ordered. Assume ordered for undefined.
1473
1474 // Only do this after legalization to avoid interfering with other combines
1475 // which might occur.
1476 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1477 !DCI.isCalledByLegalizer())
1478 return SDValue();
1479
1480 // We need to permute the operands to get the correct NaN behavior. The
1481 // selected operand is the second one based on the failing compare with NaN,
1482 // so permute it based on the compare type the hardware uses.
1483 if (LHS == True)
1484 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1485 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1486 }
1487 case ISD::SETUGE:
1488 case ISD::SETUGT: {
1489 if (LHS == True)
1490 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1491 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1492 }
1493 case ISD::SETGT:
1494 case ISD::SETGE:
1495 case ISD::SETOGE:
1496 case ISD::SETOGT: {
1497 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1498 !DCI.isCalledByLegalizer())
1499 return SDValue();
1500
1501 if (LHS == True)
1502 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1503 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1504 }
1505 case ISD::SETCC_INVALID:
1506    llvm_unreachable("Invalid setcc condcode!");
1507 }
1508 return SDValue();
1509}
1510
1511/// Generate Min/Max node
1512SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1513 SDValue LHS, SDValue RHS,
1514 SDValue True, SDValue False,
1515 SDValue CC,
1516 DAGCombinerInfo &DCI) const {
1517 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1518 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1519
1520 SelectionDAG &DAG = DCI.DAG;
1521
1522 // If we can't directly match this, try to see if we can fold an fneg to
1523 // match.
1524
1525 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1526 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1527 SDValue NegTrue = peekFNeg(True);
1528
1529 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1530 // fmin/fmax.
1531 //
1532 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1533 // -> fneg (fmin_legacy lhs, K)
1534 //
1535 // TODO: Use getNegatedExpression
1536 if (LHS == NegTrue && CFalse && CRHS) {
1537 APFloat NegRHS = neg(CRHS->getValueAPF());
1538 if (NegRHS == CFalse->getValueAPF()) {
1539 SDValue Combined =
1540 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1541 if (Combined)
1542 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1543 return SDValue();
1544 }
1545 }
1546
1547 return SDValue();
1548}
1549
1550std::pair<SDValue, SDValue>
1551AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1552 SDLoc SL(Op);
1553
1554 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1555
1556 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1557 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1558
1559 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1560 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1561
1562 return std::pair(Lo, Hi);
1563}
1564
1565SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1566 SDLoc SL(Op);
1567
1568 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1569 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1570 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1571}
1572
1573SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1574 SDLoc SL(Op);
1575
1576 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1577 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1578 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1579}
1580
1581// Split a vector type into two parts. The first part is a power of two vector.
1582// The second part is whatever is left over, and is a scalar if it would
1583// otherwise be a 1-vector.
1584std::pair<EVT, EVT>
1585AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1586 EVT LoVT, HiVT;
1587 EVT EltVT = VT.getVectorElementType();
1588 unsigned NumElts = VT.getVectorNumElements();
1589 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1590 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1591 HiVT = NumElts - LoNumElts == 1
1592 ? EltVT
1593 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1594 return std::pair(LoVT, HiVT);
1595}
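
For reference, a minimal sketch of the same split rule on plain element counts (the helper names are illustrative, not part of the LLVM source):

#include <utility>

// Lo gets PowerOf2Ceil((N + 1) / 2) elements, Hi gets whatever is left,
// e.g. 3 -> {2, 1}, 5 -> {4, 1}, 7 -> {4, 3}, 8 -> {4, 4}.
static unsigned powerOf2Ceil(unsigned X) {
  unsigned P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

static std::pair<unsigned, unsigned> splitElementCounts(unsigned N) {
  unsigned Lo = powerOf2Ceil((N + 1) / 2);
  return {Lo, N - Lo};
}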
1596
1597// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1598// scalar.
1599std::pair<SDValue, SDValue>
1600AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1601 const EVT &LoVT, const EVT &HiVT,
1602 SelectionDAG &DAG) const {
1603    assert(LoVT.getVectorNumElements() +
1604               (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1605           N.getValueType().getVectorNumElements() &&
1606           "More vector elements requested than available!");
1607 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1608 DAG.getVectorIdxConstant(0, DL));
1609 SDValue Hi = DAG.getNode(
1610 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1611 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1612 return std::pair(Lo, Hi);
1613}
1614
1615SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1616 SelectionDAG &DAG) const {
1617 LoadSDNode *Load = cast<LoadSDNode>(Op);
1618 EVT VT = Op.getValueType();
1619 SDLoc SL(Op);
1620
1621
1622 // If this is a 2 element vector, we really want to scalarize and not create
1623 // weird 1 element vectors.
1624 if (VT.getVectorNumElements() == 2) {
1625 SDValue Ops[2];
1626 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1627 return DAG.getMergeValues(Ops, SL);
1628 }
1629
1630 SDValue BasePtr = Load->getBasePtr();
1631 EVT MemVT = Load->getMemoryVT();
1632
1633 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1634
1635 EVT LoVT, HiVT;
1636 EVT LoMemVT, HiMemVT;
1637 SDValue Lo, Hi;
1638
1639 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1640 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1641 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1642
1643 unsigned Size = LoMemVT.getStoreSize();
1644 Align BaseAlign = Load->getAlign();
1645 Align HiAlign = commonAlignment(BaseAlign, Size);
1646
1647 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1648 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1649 BaseAlign, Load->getMemOperand()->getFlags());
1650 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1651 SDValue HiLoad =
1652 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1653 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1654 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1655
1656 SDValue Join;
1657 if (LoVT == HiVT) {
1658 // This is the case that the vector is power of two so was evenly split.
1659 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1660 } else {
1661 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1662 DAG.getVectorIdxConstant(0, SL));
1663 Join = DAG.getNode(
1664 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1665 VT, Join, HiLoad,
1666 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1667 }
1668
1669 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1670 LoLoad.getValue(1), HiLoad.getValue(1))};
1671
1672 return DAG.getMergeValues(Ops, SL);
1673}
1674
1675SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1676 SelectionDAG &DAG) const {
1677 LoadSDNode *Load = cast<LoadSDNode>(Op);
1678 EVT VT = Op.getValueType();
1679 SDValue BasePtr = Load->getBasePtr();
1680 EVT MemVT = Load->getMemoryVT();
1681 SDLoc SL(Op);
1682 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1683 Align BaseAlign = Load->getAlign();
1684 unsigned NumElements = MemVT.getVectorNumElements();
1685
1686 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1687 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1688 if (NumElements != 3 ||
1689 (BaseAlign < Align(8) &&
1690 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1691 return SplitVectorLoad(Op, DAG);
1692
1693    assert(NumElements == 3);
1694
1695 EVT WideVT =
1696 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1697 EVT WideMemVT =
1698 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1699 SDValue WideLoad = DAG.getExtLoad(
1700 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1701 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1702 return DAG.getMergeValues(
1703 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1704 DAG.getVectorIdxConstant(0, SL)),
1705 WideLoad.getValue(1)},
1706 SL);
1707}
1708
1709SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1710 SelectionDAG &DAG) const {
1711 StoreSDNode *Store = cast<StoreSDNode>(Op);
1712 SDValue Val = Store->getValue();
1713 EVT VT = Val.getValueType();
1714
1715 // If this is a 2 element vector, we really want to scalarize and not create
1716 // weird 1 element vectors.
1717 if (VT.getVectorNumElements() == 2)
1718 return scalarizeVectorStore(Store, DAG);
1719
1720 EVT MemVT = Store->getMemoryVT();
1721 SDValue Chain = Store->getChain();
1722 SDValue BasePtr = Store->getBasePtr();
1723 SDLoc SL(Op);
1724
1725 EVT LoVT, HiVT;
1726 EVT LoMemVT, HiMemVT;
1727 SDValue Lo, Hi;
1728
1729 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1730 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1731 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1732
1733 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1734
1735 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1736 Align BaseAlign = Store->getAlign();
1737 unsigned Size = LoMemVT.getStoreSize();
1738 Align HiAlign = commonAlignment(BaseAlign, Size);
1739
1740 SDValue LoStore =
1741 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1742 Store->getMemOperand()->getFlags());
1743 SDValue HiStore =
1744 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1745 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1746
1747 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1748}
1749
1750// This is a shortcut for integer division because we have fast i32<->f32
1751// conversions, and fast f32 reciprocal instructions. The fractional part of a
1752// float is enough to accurately represent up to a 24-bit signed integer.
1753SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1754 bool Sign) const {
1755 SDLoc DL(Op);
1756 EVT VT = Op.getValueType();
1757 SDValue LHS = Op.getOperand(0);
1758 SDValue RHS = Op.getOperand(1);
1759 MVT IntVT = MVT::i32;
1760 MVT FltVT = MVT::f32;
1761
1762 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1763 if (LHSSignBits < 9)
1764 return SDValue();
1765
1766 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1767 if (RHSSignBits < 9)
1768 return SDValue();
1769
1770 unsigned BitSize = VT.getSizeInBits();
1771 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1772 unsigned DivBits = BitSize - SignBits;
1773 if (Sign)
1774 ++DivBits;
1775
1776 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1777 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1778
1779 SDValue jq = DAG.getConstant(1, DL, IntVT);
1780
1781 if (Sign) {
1782 // char|short jq = ia ^ ib;
1783 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1784
1785 // jq = jq >> (bitsize - 2)
1786 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1787 DAG.getConstant(BitSize - 2, DL, VT));
1788
1789 // jq = jq | 0x1
1790 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1791 }
1792
1793 // int ia = (int)LHS;
1794 SDValue ia = LHS;
1795
1796    // int ib = (int)RHS;
1797 SDValue ib = RHS;
1798
1799 // float fa = (float)ia;
1800 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1801
1802 // float fb = (float)ib;
1803 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1804
1805 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1806 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1807
1808 // fq = trunc(fq);
1809 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1810
1811 // float fqneg = -fq;
1812 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1813
1814 MachineFunction &MF = DAG.getMachineFunction();
1815
1816 bool UseFmadFtz = false;
1817 if (Subtarget->isGCN()) {
1818 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1819 UseFmadFtz = MFI->getMode().allFP32Denormals();
1820 }
1821
1822 // float fr = mad(fqneg, fb, fa);
1823 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1824 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1825 : (unsigned)ISD::FMAD;
1826 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1827
1828 // int iq = (int)fq;
1829 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1830
1831 // fr = fabs(fr);
1832 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1833
1834 // fb = fabs(fb);
1835 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1836
1837 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1838
1839 // int cv = fr >= fb;
1840 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1841
1842 // jq = (cv ? jq : 0);
1843 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1844
1845 // dst = iq + jq;
1846 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1847
1848 // Rem needs compensation, it's easier to recompute it
1849 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1850 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1851
1852 // Truncate to number of bits this divide really is.
1853 if (Sign) {
1854 SDValue InRegSize
1855 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1856 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1857 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1858 } else {
1859    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1860 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1861 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1862 }
1863
1864 return DAG.getMergeValues({ Div, Rem }, DL);
1865}
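
A minimal scalar sketch of the unsigned path above, assuming both operands fit in the range the sign-bit checks guarantee and using an exact 1.0f / FB in place of the hardware RCP approximation (the helper name is illustrative, not part of the LLVM source):

#include <cmath>
#include <cstdint>

static void udivrem24(uint32_t A, uint32_t B, uint32_t &Q, uint32_t &R) {
  float FA = (float)A, FB = (float)B;
  float FQ = std::trunc(FA * (1.0f / FB));            // quotient estimate
  float FR = std::fabs(std::fma(-FQ, FB, FA));        // |A - FQ * B|
  Q = (uint32_t)FQ + (FR >= std::fabs(FB) ? 1u : 0u); // fix a possible off-by-one
  R = A - Q * B;                                      // recompute the remainder
}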
1866
1867void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1868 SelectionDAG &DAG,
1869 SmallVectorImpl<SDValue> &Results) const {
1870 SDLoc DL(Op);
1871 EVT VT = Op.getValueType();
1872
1873    assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1874
1875 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1876
1877 SDValue One = DAG.getConstant(1, DL, HalfVT);
1878 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1879
1880 //HiLo split
1881 SDValue LHS = Op.getOperand(0);
1882 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1883 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1884
1885 SDValue RHS = Op.getOperand(1);
1886 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1887 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1888
1889 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1890 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1891
1892 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1893 LHS_Lo, RHS_Lo);
1894
1895 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1896 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1897
1898 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1899 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1900 return;
1901 }
1902
1903 if (isTypeLegal(MVT::i64)) {
1904 // The algorithm here is based on ideas from "Software Integer Division",
1905 // Tom Rodeheffer, August 2008.
1906
1907 MachineFunction &MF = DAG.getMachineFunction();
1908 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1909
1910 // Compute denominator reciprocal.
1911 unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1912 (unsigned)ISD::FMA :
1913 !MFI->getMode().allFP32Denormals() ?
1914 (unsigned)ISD::FMAD :
1915 (unsigned)AMDGPUISD::FMAD_FTZ;
1916
1917 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1918 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1919 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1920 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1921 Cvt_Lo);
1922 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1923 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1924 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1925 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1926 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1927 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1928 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1929 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1930 Mul1);
1931 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1932 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1933 SDValue Rcp64 = DAG.getBitcast(VT,
1934 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1935
1936 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1937 SDValue One64 = DAG.getConstant(1, DL, VT);
1938 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1939 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1940
1941 // First round of UNR (Unsigned integer Newton-Raphson).
1942 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1943 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1944 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1945 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1946 Zero);
1947 SDValue Mulhi1_Hi =
1948 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1949 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1950 Mulhi1_Lo, Zero1);
1951 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1952 Mulhi1_Hi, Add1_Lo.getValue(1));
1953 SDValue Add1 = DAG.getBitcast(VT,
1954 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1955
1956 // Second round of UNR.
1957 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1958 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1959 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1960 Zero);
1961 SDValue Mulhi2_Hi =
1962 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1963 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1964 Mulhi2_Lo, Zero1);
1965 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1966 Mulhi2_Hi, Add2_Lo.getValue(1));
1967 SDValue Add2 = DAG.getBitcast(VT,
1968 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1969
1970 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1971
1972 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1973
1974 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1975 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1976 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1977 Mul3_Lo, Zero1);
1978 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1979 Mul3_Hi, Sub1_Lo.getValue(1));
1980 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1981 SDValue Sub1 = DAG.getBitcast(VT,
1982 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1983
1984 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1985 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1986 ISD::SETUGE);
1987 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1988 ISD::SETUGE);
1989 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1990
1991 // TODO: Here and below portions of the code can be enclosed into if/endif.
1992 // Currently control flow is unconditional and we have 4 selects after
1993 // potential endif to substitute PHIs.
1994
1995 // if C3 != 0 ...
1996 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1997 RHS_Lo, Zero1);
1998 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1999 RHS_Hi, Sub1_Lo.getValue(1));
2000 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
2001 Zero, Sub2_Lo.getValue(1));
2002 SDValue Sub2 = DAG.getBitcast(VT,
2003 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2004
2005 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2006
2007 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2008 ISD::SETUGE);
2009 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2010 ISD::SETUGE);
2011 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2012
2013 // if (C6 != 0)
2014 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2015
2016 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
2017 RHS_Lo, Zero1);
2018 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
2019 RHS_Hi, Sub2_Lo.getValue(1));
2020 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
2021 Zero, Sub3_Lo.getValue(1));
2022 SDValue Sub3 = DAG.getBitcast(VT,
2023 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2024
2025 // endif C6
2026 // endif C3
2027
2028 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2029 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2030
2031 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2032 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2033
2034 Results.push_back(Div);
2035 Results.push_back(Rem);
2036
2037 return;
2038 }
2039
2040    // r600 expansion.
2041 // Get Speculative values
2042 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2043 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2044
2045 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2046 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2047 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2048
2049 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2050 SDValue DIV_Lo = Zero;
2051
2052 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2053
2054 for (unsigned i = 0; i < halfBitWidth; ++i) {
2055 const unsigned bitPos = halfBitWidth - i - 1;
2056 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2057 // Get value of high bit
2058 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2059 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2060 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2061
2062 // Shift
2063 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2064 // Add LHS high bit
2065 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2066
2067 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2068 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2069
2070 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2071
2072 // Update REM
2073 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2074 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2075 }
2076
2077 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2078 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2079 Results.push_back(DIV);
2080 Results.push_back(REM);
2081}
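
The r600 fallback above is a restoring long division; a minimal scalar sketch of the same loop shape, without the speculative high-half handling and assuming a nonzero divisor (helper name is illustrative):

#include <cstdint>

static void udivrem64(uint64_t LHS, uint64_t RHS, uint64_t &Div, uint64_t &Rem) {
  Div = 0;
  Rem = 0;
  for (int Bit = 63; Bit >= 0; --Bit) {
    Rem = (Rem << 1) | ((LHS >> Bit) & 1); // shift in the next dividend bit
    if (Rem >= RHS) {                      // divisor fits: record a quotient bit
      Rem -= RHS;
      Div |= 1ULL << Bit;
    }
  }
}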
2082
2083SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2084 SelectionDAG &DAG) const {
2085 SDLoc DL(Op);
2086 EVT VT = Op.getValueType();
2087
2088 if (VT == MVT::i64) {
2089 SmallVector<SDValue, 2> Results;
2090 LowerUDIVREM64(Op, DAG, Results);
2091 return DAG.getMergeValues(Results, DL);
2092 }
2093
2094 if (VT == MVT::i32) {
2095 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2096 return Res;
2097 }
2098
2099 SDValue X = Op.getOperand(0);
2100 SDValue Y = Op.getOperand(1);
2101
2102 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2103 // algorithm used here.
2104
2105 // Initial estimate of inv(y).
2106 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2107
2108 // One round of UNR.
2109 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2110 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2111 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2112 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2113
2114 // Quotient/remainder estimate.
2115 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2116 SDValue R =
2117 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2118
2119 // First quotient/remainder refinement.
2120 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2121 SDValue One = DAG.getConstant(1, DL, VT);
2122 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2123 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2124 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2125 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2126 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2127
2128 // Second quotient/remainder refinement.
2129 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2130 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2131 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2132 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2133 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2134
2135 return DAG.getMergeValues({Q, R}, DL);
2136}
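
A scalar sketch of the 32-bit sequence above, with an exact 2^32 / Y value standing in for the URECIP estimate; with an exact estimate the Newton-Raphson step is a no-op, but it is kept to mirror the DAG sequence (helper names are illustrative):

#include <algorithm>
#include <cstdint>

static uint32_t mulhu(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32);
}

static void udivrem32(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  uint32_t Z = (uint32_t)std::min<uint64_t>((1ULL << 32) / Y, 0xffffffffULL);
  Z += mulhu(Z, (uint32_t)(0u - Y) * Z);  // one Newton-Raphson refinement
  Q = mulhu(X, Z);                        // quotient estimate
  R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }            // first correction
  if (R >= Y) { ++Q; R -= Y; }            // second correction
}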
2137
2138SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2139 SelectionDAG &DAG) const {
2140 SDLoc DL(Op);
2141 EVT VT = Op.getValueType();
2142
2143 SDValue LHS = Op.getOperand(0);
2144 SDValue RHS = Op.getOperand(1);
2145
2146 SDValue Zero = DAG.getConstant(0, DL, VT);
2147 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2148
2149 if (VT == MVT::i32) {
2150 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2151 return Res;
2152 }
2153
2154 if (VT == MVT::i64 &&
2155 DAG.ComputeNumSignBits(LHS) > 32 &&
2156 DAG.ComputeNumSignBits(RHS) > 32) {
2157 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2158
2159 //HiLo split
2160 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2161 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2162 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2163 LHS_Lo, RHS_Lo);
2164 SDValue Res[2] = {
2165 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2166 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2167 };
2168 return DAG.getMergeValues(Res, DL);
2169 }
2170
2171 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2172 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2173 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2174 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2175
2176 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2177 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2178
2179 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2180 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2181
2182 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2183 SDValue Rem = Div.getValue(1);
2184
2185 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2186 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2187
2188 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2189 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2190
2191 SDValue Res[2] = {
2192 Div,
2193 Rem
2194 };
2195 return DAG.getMergeValues(Res, DL);
2196}
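
A scalar sketch of the generic signed path above: strip the signs with the add/xor trick, divide unsigned, then restore them, giving the quotient the sign of LHS^RHS and the remainder the sign of LHS (helper name is illustrative):

#include <cstdint>

static void sdivrem32(int32_t LHS, int32_t RHS, int32_t &Quot, int32_t &Rem) {
  uint32_t LSign = LHS < 0 ? 0xffffffffu : 0u;
  uint32_t RSign = RHS < 0 ? 0xffffffffu : 0u;
  uint32_t DSign = LSign ^ RSign;                // sign of the quotient
  uint32_t UA = ((uint32_t)LHS + LSign) ^ LSign; // |LHS|
  uint32_t UB = ((uint32_t)RHS + RSign) ^ RSign; // |RHS|
  uint32_t Q = UA / UB, R = UA % UB;
  Quot = (int32_t)((Q ^ DSign) - DSign);
  Rem = (int32_t)((R ^ LSign) - LSign);
}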
2197
2198// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2199SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2200 SDLoc SL(Op);
2201 EVT VT = Op.getValueType();
2202 auto Flags = Op->getFlags();
2203 SDValue X = Op.getOperand(0);
2204 SDValue Y = Op.getOperand(1);
2205
2206 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2207 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2208 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2209 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2210 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2211}
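
A one-line scalar form of the expansion above (helper name is illustrative):

#include <cmath>

static double frem_expanded(double X, double Y) {
  return std::fma(-std::trunc(X / Y), Y, X); // X - trunc(X / Y) * Y
}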
2212
2213SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2214 SDLoc SL(Op);
2215 SDValue Src = Op.getOperand(0);
2216
2217 // result = trunc(src)
2218 // if (src > 0.0 && src != result)
2219 // result += 1.0
2220
2221 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2222
2223 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2224 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2225
2226 EVT SetCCVT =
2227 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2228
2229 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2230 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2231 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2232
2233 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2234 // TODO: Should this propagate fast-math-flags?
2235 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2236}
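
A scalar sketch of the f64 ceil expansion above; LowerFFLOOR further down is the mirror image, testing src < 0.0 and adding -1.0 (helper name is illustrative):

#include <cmath>

static double fceil_expanded(double Src) {
  double R = std::trunc(Src);
  if (Src > 0.0 && Src != R) // positive values with a fractional part round up
    R += 1.0;
  return R;
}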
2237
2238static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2239 SelectionDAG &DAG) {
2240 const unsigned FractBits = 52;
2241 const unsigned ExpBits = 11;
2242
2243 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2244 Hi,
2245 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2246 DAG.getConstant(ExpBits, SL, MVT::i32));
2247 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2248 DAG.getConstant(1023, SL, MVT::i32));
2249
2250 return Exp;
2251}
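
In integer terms, the BFE_U32(Hi, 20, 11) above pulls the 11 exponent bits out of the top half of the f64 and removes the bias; a sketch (helper name is illustrative):

#include <cstdint>

static int32_t extractF64ExponentScalar(uint32_t Hi) {
  return (int32_t)((Hi >> (52 - 32)) & 0x7ff) - 1023; // unbiased exponent
}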
2252
2253SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2254 SDLoc SL(Op);
2255 SDValue Src = Op.getOperand(0);
2256
2257    assert(Op.getValueType() == MVT::f64);
2258
2259 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2260
2261 // Extract the upper half, since this is where we will find the sign and
2262 // exponent.
2263 SDValue Hi = getHiHalf64(Src, DAG);
2264
2265 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2266
2267 const unsigned FractBits = 52;
2268
2269 // Extract the sign bit.
2270    const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2271 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2272
2273 // Extend back to 64-bits.
2274 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2275 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2276
2277 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2278 const SDValue FractMask
2279        = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2280
2281 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2282 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2283 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2284
2285 EVT SetCCVT =
2286 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2287
2288 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2289
2290 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2291 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2292
2293 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2294 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2295
2296 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2297}
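
A bit-level scalar model of the trunc expansion above, with branches standing in for the two selects (helper name is illustrative):

#include <cstdint>
#include <cstring>

static double ftrunc_expanded(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  int32_t Exp = (int32_t)((Bits >> 52) & 0x7ff) - 1023;
  uint64_t Result;
  if (Exp < 0)
    Result = Bits & (1ULL << 63);                 // |X| < 1: keep only the sign
  else if (Exp > 51)
    Result = Bits;                                // already integral (or inf/nan)
  else
    Result = Bits & ~(((1ULL << 52) - 1) >> Exp); // clear the fractional bits
  std::memcpy(&X, &Result, sizeof(X));
  return X;
}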
2298
2299SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2300 SDLoc SL(Op);
2301 SDValue Src = Op.getOperand(0);
2302
2303    assert(Op.getValueType() == MVT::f64);
2304
2305 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2306 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2307 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2308
2309 // TODO: Should this propagate fast-math-flags?
2310
2311 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2312 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2313
2314 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2315
2316 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2317 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2318
2319 EVT SetCCVT =
2320 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2321 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2322
2323 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2324}
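
A scalar sketch of the rint expansion above: adding and subtracting copysign(2^52, x) forces rounding at integer granularity, and values already at least 2^52 in magnitude pass through (helper name is illustrative):

#include <cmath>

static double frint_expanded(double X) {
  double C = std::copysign(0x1.0p+52, X);
  double R = (X + C) - C;
  return std::fabs(X) > 0x1.fffffffffffffp+51 ? X : R;
}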
2325
2326SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2327 // FNEARBYINT and FRINT are the same, except in their handling of FP
2328 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2329 // rint, so just treat them as equivalent.
2330 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2331}
2332
2333SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2334 SelectionDAG &DAG) const {
2335 auto VT = Op.getValueType();
2336 auto Arg = Op.getOperand(0u);
2337 return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
2338}
2339
2340// XXX - May require not supporting f32 denormals?
2341
2342// Don't handle v2f16. The extra instructions to scalarize and repack around the
2343// compare and vselect end up producing worse code than scalarizing the whole
2344// operation.
2345SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2346 SDLoc SL(Op);
2347 SDValue X = Op.getOperand(0);
2348 EVT VT = Op.getValueType();
2349
2350 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2351
2352 // TODO: Should this propagate fast-math-flags?
2353
2354 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2355
2356 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2357
2358 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2359 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2360 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2361
2362 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2363
2364 EVT SetCCVT =
2365 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2366
2367 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2368
2369 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2370
2371 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2372}
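
A scalar sketch of the round-half-away-from-zero expansion above (helper name is illustrative):

#include <cmath>

static float fround_expanded(float X) {
  float T = std::trunc(X);
  float Sel = std::fabs(X - T) >= 0.5f ? std::copysign(1.0f, X) : 0.0f;
  return T + Sel;
}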
2373
2374SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2375 SDLoc SL(Op);
2376 SDValue Src = Op.getOperand(0);
2377
2378 // result = trunc(src);
2379 // if (src < 0.0 && src != result)
2380 // result += -1.0.
2381
2382 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2383
2384 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2385 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2386
2387 EVT SetCCVT =
2388 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2389
2390 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2391 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2392 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2393
2394 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2395 // TODO: Should this propagate fast-math-flags?
2396 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2397}
2398
2399SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2400 double Log2BaseInverted) const {
2401 EVT VT = Op.getValueType();
2402
2403 SDLoc SL(Op);
2404 SDValue Operand = Op.getOperand(0);
2405 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2406 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2407
2408 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2409}
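
In scalar terms, log_b(x) = log2(x) * (ln(2) / ln(b)); for example, the FLOG10 case becomes the following sketch, with the constant written out (helper name is illustrative):

#include <cmath>

static float flog10_expanded(float X) {
  return std::log2(X) * 0.30102999566398120f; // ln(2) / ln(10)
}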
2410
2411// exp2(M_LOG2E_F * f);
2412SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2413 EVT VT = Op.getValueType();
2414 SDLoc SL(Op);
2415 SDValue Src = Op.getOperand(0);
2416
2417 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2418 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2419 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2420}
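
The scalar equivalent of the expansion above (helper name is illustrative):

#include <cmath>

static float fexp_expanded(float X) {
  return std::exp2(X * 1.44269504088896341f); // log2(e)
}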
2421
2422static bool isCtlzOpc(unsigned Opc) {
2423 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2424}
2425
2426static bool isCttzOpc(unsigned Opc) {
2427 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2428}
2429
2430SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2431 SDLoc SL(Op);
2432 SDValue Src = Op.getOperand(0);
2433
2434    assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2435 bool Ctlz = isCtlzOpc(Op.getOpcode());
2436 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2437
2438 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2439 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2440
2441 if (Src.getValueType() == MVT::i32) {
2442 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2443 // (cttz hi:lo) -> (umin (ffbl src), 32)
2444 // (ctlz_zero_undef src) -> (ffbh src)
2445 // (cttz_zero_undef src) -> (ffbl src)
2446 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2447 if (!ZeroUndef) {
2448 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2449 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2450 }
2451 return NewOpr;
2452 }
2453
2454 SDValue Lo, Hi;
2455 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2456
2457 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2458 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2459
2460 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2461 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2462 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2463 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2464
2465 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2466 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2467 if (Ctlz)
2468 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2469 else
2470 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2471
2472 SDValue NewOpr;
2473 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2474 if (!ZeroUndef) {
2475 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2476 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2477 }
2478
2479 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2480}
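
A scalar model of the 64-bit ctlz case, following the (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64) formula in the comment; __builtin_clz stands in for FFBH_U32 on nonzero inputs, and the helpers are illustrative:

#include <algorithm>
#include <cstdint>

static uint32_t ffbh(uint32_t X) {  // FFBH_U32 returns -1 for a zero input
  return X ? (uint32_t)__builtin_clz(X) : 0xffffffffu;
}

static uint32_t uaddsat(uint32_t A, uint32_t B) {
  uint32_t S = A + B;
  return S < A ? 0xffffffffu : S;   // saturate on overflow
}

static uint32_t ctlz64(uint64_t V) {
  uint32_t Hi = (uint32_t)(V >> 32), Lo = (uint32_t)V;
  return std::min<uint32_t>({ffbh(Hi), uaddsat(ffbh(Lo), 32u), 64u});
}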
2481
2482SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2483 bool Signed) const {
2484 // The regular method converting a 64-bit integer to float roughly consists of
2485 // 2 steps: normalization and rounding. In fact, after normalization, the
2486 // conversion from a 64-bit integer to a float is essentially the same as the
2487 // one from a 32-bit integer. The only difference is that it has more
2488 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2489 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2490 // converted into the correct float number. The basic steps for the unsigned
2491 // conversion are illustrated in the following pseudo code:
2492 //
2493 // f32 uitofp(i64 u) {
2494 // i32 hi, lo = split(u);
2495 // // Only count the leading zeros in hi as we have native support of the
2496 // // conversion from i32 to f32. If hi is all 0s, the conversion is
2497 // // reduced to a 32-bit one automatically.
2498 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2499 // u <<= shamt;
2500 // hi, lo = split(u);
2501 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2502 // // convert it as a 32-bit integer and scale the result back.
2503 // return uitofp(hi) * 2^(32 - shamt);
2504 // }
2505 //
2506 // The signed one follows the same principle but uses 'ffbh_i32' to count its
2507 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2508    // converted instead, followed by negation based on its sign bit.
2509
2510 SDLoc SL(Op);
2511 SDValue Src = Op.getOperand(0);
2512
2513 SDValue Lo, Hi;
2514 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2515 SDValue Sign;
2516 SDValue ShAmt;
2517 if (Signed && Subtarget->isGCN()) {
2518 // We also need to consider the sign bit in Lo if Hi has just sign bits,
2519 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2520 // account. That is, the maximal shift is
2521 // - 32 if Lo and Hi have opposite signs;
2522 // - 33 if Lo and Hi have the same sign.
2523 //
2524 // Or, MaxShAmt = 33 + OppositeSign, where
2525 //
2526 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2527 // - -1 if Lo and Hi have opposite signs; and
2528 // - 0 otherwise.
2529 //
2530 // All in all, ShAmt is calculated as
2531 //
2532 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2533 //
2534 // or
2535 //
2536 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2537 //
2538 // to reduce the critical path.
2539 SDValue OppositeSign = DAG.getNode(
2540 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2541 DAG.getConstant(31, SL, MVT::i32));
2542 SDValue MaxShAmt =
2543 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2544 OppositeSign);
2545 // Count the leading sign bits.
2546 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2547 // Different from unsigned conversion, the shift should be one bit less to
2548 // preserve the sign bit.
2549 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2550 DAG.getConstant(1, SL, MVT::i32));
2551 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2552 } else {
2553 if (Signed) {
2554 // Without 'ffbh_i32', only leading zeros could be counted. Take the
2555 // absolute value first.
2556 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2557 DAG.getConstant(63, SL, MVT::i64));
2558 SDValue Abs =
2559 DAG.getNode(ISD::XOR, SL, MVT::i64,
2560 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2561 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2562 }
2563 // Count the leading zeros.
2564 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2565 // The shift amount for signed integers is [0, 32].
2566 }
2567 // Normalize the given 64-bit integer.
2568 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2569 // Split it again.
2570 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2571 // Calculate the adjust bit for rounding.
2572 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2573 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2574 DAG.getConstant(1, SL, MVT::i32), Lo);
2575 // Get the 32-bit normalized integer.
2576 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2577 // Convert the normalized 32-bit integer into f32.
2578 unsigned Opc =
2579 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2580 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2581
2582 // Finally, need to scale back the converted floating number as the original
2583 // 64-bit integer is converted as a 32-bit one.
2584 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2585 ShAmt);
2586 // On GCN, use LDEXP directly.
2587 if (Subtarget->isGCN())
2588 return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2589
2590 // Otherwise, align 'ShAmt' to the exponent field and add it into the
2591 // exponent directly to emulate the multiplication by 2^ShAmt. The 8-bit
2592 // exponent field is wide enough that the addition cannot overflow into the sign bit.
2593 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2594 DAG.getConstant(23, SL, MVT::i32));
2595 SDValue IVal =
2596 DAG.getNode(ISD::ADD, SL, MVT::i32,
2597 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2598 if (Signed) {
2599 // Set the sign bit.
2600 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2601 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2602 DAG.getConstant(31, SL, MVT::i32));
2603 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2604 }
2605 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2606}
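// For reference, a scalar C sketch of the unsigned path above (illustrative
// only; the helper name is made up, and <stdint.h>/<math.h> plus the
// __builtin_clz intrinsic are assumed):
//
//   float u64_to_f32_model(uint64_t x) {
//     uint32_t hi = (uint32_t)(x >> 32);
//     unsigned shamt = hi ? __builtin_clz(hi) : 32;  // ctlz of the high word
//     uint64_t norm = x << shamt;                    // MSB of x moves to bit 63
//     uint32_t sticky = (uint32_t)norm ? 1u : 0u;    // umin(1, lo)
//     float f = (float)((uint32_t)(norm >> 32) | sticky);
//     return ldexpf(f, 32 - (int)shamt);             // scale back by 2^(32-shamt)
//   }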
2607
2608SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2609 bool Signed) const {
2610 SDLoc SL(Op);
2611 SDValue Src = Op.getOperand(0);
2612
2613 SDValue Lo, Hi;
2614 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2615
2616 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2617 SL, MVT::f64, Hi);
2618
2619 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2620
2621 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2622 DAG.getConstant(32, SL, MVT::i32));
2623 // TODO: Should this propagate fast-math-flags?
2624 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2625}
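// For reference, a scalar sketch of the lowering above (illustrative only;
// the helper name is made up, and an arithmetic right shift is assumed).
// Both halves convert exactly, so the single FADD performs the only rounding.
//
//   double i64_to_f64_model(int64_t x) {        // unsigned case is analogous
//     double hi = (double)(int32_t)(x >> 32);   // signed high half
//     double lo = (double)(uint32_t)x;          // low half is always unsigned
//     return ldexp(hi, 32) + lo;                // hi * 2^32 + lo
//   }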
2626
2627SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2628 SelectionDAG &DAG) const {
2629 // TODO: Factor out code common with LowerSINT_TO_FP.
2630 EVT DestVT = Op.getValueType();
2631 SDValue Src = Op.getOperand(0);
2632 EVT SrcVT = Src.getValueType();
2633
2634 if (SrcVT == MVT::i16) {
2635 if (DestVT == MVT::f16)
2636 return Op;
2637 SDLoc DL(Op);
2638
2639 // Promote src to i32
2640 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2641 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2642 }
2643
2644 assert(SrcVT == MVT::i64 && "operation should be legal");
2645
2646 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2647 SDLoc DL(Op);
2648
2649 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2650 SDValue FPRoundFlag =
2651 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2652 SDValue FPRound =
2653 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2654
2655 return FPRound;
2656 }
2657
2658 if (DestVT == MVT::f32)
2659 return LowerINT_TO_FP32(Op, DAG, false);
2660
2661 assert(DestVT == MVT::f64);
2662 return LowerINT_TO_FP64(Op, DAG, false);
2663}
2664
2665SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2666 SelectionDAG &DAG) const {
2667 EVT DestVT = Op.getValueType();
2668
2669 SDValue Src = Op.getOperand(0);
2670 EVT SrcVT = Src.getValueType();
2671
2672 if (SrcVT == MVT::i16) {
2673 if (DestVT == MVT::f16)
2674 return Op;
2675
2676 SDLoc DL(Op);
2677 // Promote src to i32
2678 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2679 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2680 }
2681
2682 assert(SrcVT == MVT::i64 && "operation should be legal");
2683
2684 // TODO: Factor out code common with LowerUINT_TO_FP.
2685
2686 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2687 SDLoc DL(Op);
2688 SDValue Src = Op.getOperand(0);
2689
2690 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2691 SDValue FPRoundFlag =
2692 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2693 SDValue FPRound =
2694 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2695
2696 return FPRound;
2697 }
2698
2699 if (DestVT == MVT::f32)
2700 return LowerINT_TO_FP32(Op, DAG, true);
2701
2702 assert(DestVT == MVT::f64);
2703 return LowerINT_TO_FP64(Op, DAG, true);
2704}
2705
2706SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2707 bool Signed) const {
2708 SDLoc SL(Op);
2709
2710 SDValue Src = Op.getOperand(0);
2711 EVT SrcVT = Src.getValueType();
2712
2713 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2714
2715 // The basic idea of converting a floating point number into a pair of 32-bit
2716 // integers is illustrated as follows:
2717 //
2718 // tf := trunc(val);
2719 // hif := floor(tf * 2^-32);
2720 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2721 // hi := fptoi(hif);
2722 // lo := fptoi(lof);
2723 //
2724 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2725 SDValue Sign;
2726 if (Signed && SrcVT == MVT::f32) {
2727 // However, a 32-bit floating point number has only a 23-bit mantissa,
2728 // which is not enough to hold all the significant bits of `lof` if val is
2729 // negative. To avoid the loss of precision, we take the absolute value
2730 // after truncating and flip the result back based on the original
2731 // signedness.
2732 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2733 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2734 DAG.getConstant(31, SL, MVT::i32));
2735 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2736 }
2737
2738 SDValue K0, K1;
2739 if (SrcVT == MVT::f64) {
2740 K0 = DAG.getConstantFP(
2741 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
2742 SrcVT);
2743 K1 = DAG.getConstantFP(
2744 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
2745 SrcVT);
2746 } else {
2747 K0 = DAG.getConstantFP(
2748 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
2749 K1 = DAG.getConstantFP(
2750 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
2751 }
2752 // TODO: Should this propagate fast-math-flags?
2753 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2754
2755 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2756
2757 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2758
2759 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2760 : ISD::FP_TO_UINT,
2761 SL, MVT::i32, FloorMul);
2762 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2763
2764 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2765 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2766
2767 if (Signed && SrcVT == MVT::f32) {
2768 assert(Sign);
2769 // Flip the result based on the signedness, which is either all 0s or 1s.
2770 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2771 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2772 // r := xor(r, sign) - sign;
2773 Result =
2774 DAG.getNode(ISD::SUB, SL, MVT::i64,
2775 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2776 }
2777
2778 return Result;
2779}
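// For reference, a scalar sketch of the signed f64 case above (illustrative
// only; the helper name is made up and it mirrors the pseudocode, not the
// exact node sequence):
//
//   int64_t f64_to_i64_model(double val) {
//     double tf  = trunc(val);
//     double hif = floor(tf * 0x1p-32);       // high word as a float
//     double lof = fma(hif, -0x1p32, tf);     // tf - hif * 2^32, always >= 0
//     uint32_t hi = (uint32_t)(int32_t)hif;   // FP_TO_SINT on the high part
//     uint32_t lo = (uint32_t)lof;            // FP_TO_UINT on the low part
//     return (int64_t)(((uint64_t)hi << 32) | lo);
//   }
//
// e.g. val = -1.5: tf = -1, hif = -1, lof = 2^32 - 1, and the result is
// 0xFFFFFFFFFFFFFFFF, i.e. -1, matching fptosi.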
2780
2781SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2782 SDLoc DL(Op);
2783 SDValue N0 = Op.getOperand(0);
2784
2785 // Convert to target node to get known bits
2786 if (N0.getValueType() == MVT::f32)
2787 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2788
2789 if (getTargetMachine().Options.UnsafeFPMath) {
2790 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2791 return SDValue();
2792 }
2793
2794 assert(N0.getSimpleValueType() == MVT::f64);
2795
2796 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2797 const unsigned ExpMask = 0x7ff;
2798 const unsigned ExpBiasf64 = 1023;
2799 const unsigned ExpBiasf16 = 15;
2800 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2801 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2802 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2803 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2804 DAG.getConstant(32, DL, MVT::i64));
2805 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2806 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2807 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2808 DAG.getConstant(20, DL, MVT::i64));
2809 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2810 DAG.getConstant(ExpMask, DL, MVT::i32));
2811 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2812 // add the f16 bias (15) to get the biased exponent for the f16 format.
2813 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2814 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2815
2816 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2817 DAG.getConstant(8, DL, MVT::i32));
2818 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2819 DAG.getConstant(0xffe, DL, MVT::i32));
2820
2821 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2822 DAG.getConstant(0x1ff, DL, MVT::i32));
2823 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2824
2825 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2826 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2827
2828 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2829 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2830 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2831 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2832
2833 // N = M | (E << 12);
2834 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2835 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2836 DAG.getConstant(12, DL, MVT::i32)));
2837
2838 // B = clamp(1-E, 0, 13);
2839 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2840 One, E);
2841 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2842 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2843 DAG.getConstant(13, DL, MVT::i32));
2844
2845 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2846 DAG.getConstant(0x1000, DL, MVT::i32));
2847
2848 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2849 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2850 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2851 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2852
2853 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2854 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2855 DAG.getConstant(0x7, DL, MVT::i32));
2856 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2857 DAG.getConstant(2, DL, MVT::i32));
2858 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2859 One, Zero, ISD::SETEQ);
2860 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2861 One, Zero, ISD::SETGT);
2862 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2863 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2864
2865 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2866 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2867 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2868 I, V, ISD::SETEQ);
2869
2870 // Extract the sign bit.
2871 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2872 DAG.getConstant(16, DL, MVT::i32));
2873 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2874 DAG.getConstant(0x8000, DL, MVT::i32));
2875
2876 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2877 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2878}
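// A worked example of the expansion above (illustrative only): for N0 = 1.0,
// UH = 0x3FF00000, so E = ((UH >> 20) & 0x7ff) - 1023 + 15 = 15 and M = 0.
// Then N = M | (E << 12) = 0xF000, the E >= 1 path selects N, and the final
// rounding shift V >>= 2 (with all round bits zero) yields 0x3C00, the f16
// encoding of 1.0.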
2879
2880SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2881 SelectionDAG &DAG) const {
2882 SDValue Src = Op.getOperand(0);
2883 unsigned OpOpcode = Op.getOpcode();
2884 EVT SrcVT = Src.getValueType();
2885 EVT DestVT = Op.getValueType();
2886
2887 // Will be selected natively
2888 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2889 return Op;
2890
2891 // Promote i16 to i32
2892 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2893 SDLoc DL(Op);
2894
2895 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2896 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2897 }
2898
2899 if (SrcVT == MVT::f16 ||
2900 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2901 SDLoc DL(Op);
2902
2903 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2904 unsigned Ext =
2905 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2906 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2907 }
2908
2909 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2910 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2911
2912 return SDValue();
2913}
2914
2915SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2916 SelectionDAG &DAG) const {
2917 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2918 MVT VT = Op.getSimpleValueType();
2919 MVT ScalarVT = VT.getScalarType();
2920
2921 assert(VT.isVector());
2922
2923 SDValue Src = Op.getOperand(0);
2924 SDLoc DL(Op);
2925
2926 // TODO: Don't scalarize on Evergreen?
2927 unsigned NElts = VT.getVectorNumElements();
2928 SmallVector<SDValue, 8> Args;
2929 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2930
2931 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2932 for (unsigned I = 0; I < NElts; ++I)
2933 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2934
2935 return DAG.getBuildVector(VT, DL, Args);
2936}
2937
2938//===----------------------------------------------------------------------===//
2939// Custom DAG optimizations
2940//===----------------------------------------------------------------------===//
2941
2942static bool isU24(SDValue Op, SelectionDAG &DAG) {
2943 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2944}
2945
2946static bool isI24(SDValue Op, SelectionDAG &DAG) {
2947 EVT VT = Op.getValueType();
2948 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2949 // as unsigned 24-bit values.
2950 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2951}
2952
2953static SDValue simplifyMul24(SDNode *Node24,
2954 TargetLowering::DAGCombinerInfo &DCI) {
2955 SelectionDAG &DAG = DCI.DAG;
2956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2957 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2958
2959 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2960 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2961 unsigned NewOpcode = Node24->getOpcode();
2962 if (IsIntrin) {
2963 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2964 switch (IID) {
2965 case Intrinsic::amdgcn_mul_i24:
2966 NewOpcode = AMDGPUISD::MUL_I24;
2967 break;
2968 case Intrinsic::amdgcn_mul_u24:
2969 NewOpcode = AMDGPUISD::MUL_U24;
2970 break;
2971 case Intrinsic::amdgcn_mulhi_i24:
2972 NewOpcode = AMDGPUISD::MULHI_I24;
2973 break;
2974 case Intrinsic::amdgcn_mulhi_u24:
2975 NewOpcode = AMDGPUISD::MULHI_U24;
2976 break;
2977 default:
2978 llvm_unreachable("Expected 24-bit mul intrinsic")::llvm::llvm_unreachable_internal("Expected 24-bit mul intrinsic"
, "llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp", 2978)
;
2979 }
2980 }
2981
2982 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2983
2984 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2985 // the operands to have other uses, but will only perform simplifications that
2986 // involve bypassing some nodes for this user.
2987 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2988 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2989 if (DemandedLHS || DemandedRHS)
2990 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2991 DemandedLHS ? DemandedLHS : LHS,
2992 DemandedRHS ? DemandedRHS : RHS);
2993
2994 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2995 // operands if this node is the only user.
2996 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2997 return SDValue(Node24, 0);
2998 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2999 return SDValue(Node24, 0);
3000
3001 return SDValue();
3002}
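// For example (illustrative), given
//   mul_u24 (and x, 0xffffff), y
// only the low 24 bits of each operand are demanded, so the AND can be looked
// through: SimplifyMultipleUseDemandedBits bypasses it just for this user even
// if it has other uses, while SimplifyDemandedBits may rewrite or remove the
// AND itself when this multiply is its only user.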
3003
3004template <typename IntTy>
3005static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3006 uint32_t Width, const SDLoc &DL) {
3007 if (Width + Offset < 32) {
3008 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3009 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3010 return DAG.getConstant(Result, DL, MVT::i32);
3011 }
3012
3013 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3014}
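// For example (illustrative), Src0 = 0xABCD, Offset = 4, Width = 8 extracts
// the field 0xBC: Shl = 0xABCD << 20 = 0xBCD00000, and Shl >> 24 gives 0xBC
// for the unsigned BFE or 0xFFFFFFBC for the signed BFE, since the field's
// top bit is set and the arithmetic shift sign-extends it.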
3015
3016static bool hasVolatileUser(SDNode *Val) {
3017 for (SDNode *U : Val->uses()) {
3018 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3019 if (M->isVolatile())
3020 return true;
3021 }
3022 }
3023
3024 return false;
3025}
3026
3027bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3028 // i32 vectors are the canonical memory type.
3029 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3030 return false;
3031
3032 if (!VT.isByteSized())
3033 return false;
3034
3035 unsigned Size = VT.getStoreSize();
3036
3037 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3038 return false;
3039
3040 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3041 return false;
3042
3043 return true;
3044}
3045
3046// Replace load of an illegal type with a store of a bitcast to a friendlier
3047// type.
3048SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3049 DAGCombinerInfo &DCI) const {
3050 if (!DCI.isBeforeLegalize())
3051 return SDValue();
3052
3053 LoadSDNode *LN = cast<LoadSDNode>(N);
3054 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3055 return SDValue();
3056
3057 SDLoc SL(N);
3058 SelectionDAG &DAG = DCI.DAG;
3059 EVT VT = LN->getMemoryVT();
3060
3061 unsigned Size = VT.getStoreSize();
3062 Align Alignment = LN->getAlign();
3063 if (Alignment < Size && isTypeLegal(VT)) {
3064 unsigned IsFast;
3065 unsigned AS = LN->getAddressSpace();
3066
3067 // Expand unaligned loads earlier than legalization. Due to visitation order
3068 // problems during legalization, the emitted instructions to pack and unpack
3069 // the bytes again are not eliminated in the case of an unaligned copy.
3070 if (!allowsMisalignedMemoryAccesses(
3071 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3072 if (VT.isVector())
3073 return SplitVectorLoad(SDValue(LN, 0), DAG);
3074
3075 SDValue Ops[2];
3076 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3077
3078 return DAG.getMergeValues(Ops, SDLoc(N));
3079 }
3080
3081 if (!IsFast)
3082 return SDValue();
3083 }
3084
3085 if (!shouldCombineMemoryType(VT))
3086 return SDValue();
3087
3088 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3089
3090 SDValue NewLoad
3091 = DAG.getLoad(NewVT, SL, LN->getChain(),
3092 LN->getBasePtr(), LN->getMemOperand());
3093
3094 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3095 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3096 return SDValue(N, 0);
3097}
3098
3099// Replace store of an illegal type with a store of a bitcast to a friendlier
3100// type.
3101SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3102 DAGCombinerInfo &DCI) const {
3103 if (!DCI.isBeforeLegalize())
3104 return SDValue();
3105
3106 StoreSDNode *SN = cast<StoreSDNode>(N);
3107 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3108 return SDValue();
3109
3110 EVT VT = SN->getMemoryVT();
3111 unsigned Size = VT.getStoreSize();
3112
3113 SDLoc SL(N);
3114 SelectionDAG &DAG = DCI.DAG;
3115 Align Alignment = SN->getAlign();
3116 if (Alignment < Size && isTypeLegal(VT)) {
3117 unsigned IsFast;
3118 unsigned AS = SN->getAddressSpace();
3119
3120 // Expand unaligned stores earlier than legalization. Due to visitation
3121 // order problems during legalization, the emitted instructions to pack and
3122 // unpack the bytes again are not eliminated in the case of an unaligned
3123 // copy.
3124 if (!allowsMisalignedMemoryAccesses(
3125 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3126 if (VT.isVector())
3127 return SplitVectorStore(SDValue(SN, 0), DAG);
3128
3129 return expandUnalignedStore(SN, DAG);
3130 }
3131
3132 if (!IsFast)
3133 return SDValue();
3134 }
3135
3136 if (!shouldCombineMemoryType(VT))
3137 return SDValue();
3138
3139 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3140 SDValue Val = SN->getValue();
3141
3142 //DCI.AddToWorklist(Val.getNode());
3143
3144 bool OtherUses = !Val.hasOneUse();
3145 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3146 if (OtherUses) {
3147 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3148 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3149 }
3150
3151 return DAG.getStore(SN->getChain(), SL, CastVal,
3152 SN->getBasePtr(), SN->getMemOperand());
3153}
3154
3155// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3156// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3157// issues.
3158SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3159 DAGCombinerInfo &DCI) const {
3160 SelectionDAG &DAG = DCI.DAG;
3161 SDValue N0 = N->getOperand(0);
3162
3163 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3164 // (vt2 (truncate (assertzext vt0:x, vt1)))
3165 if (N0.getOpcode() == ISD::TRUNCATE) {
3166 SDValue N1 = N->getOperand(1);
3167 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3168 SDLoc SL(N);
3169
3170 SDValue Src = N0.getOperand(0);
3171 EVT SrcVT = Src.getValueType();
3172 if (SrcVT.bitsGE(ExtVT)) {
3173 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3174 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3175 }
3176 }
3177
3178 return SDValue();
3179}
3180
3181SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3182 SDNode *N, DAGCombinerInfo &DCI) const {
3183 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3184 switch (IID) {
3185 case Intrinsic::amdgcn_mul_i24:
3186 case Intrinsic::amdgcn_mul_u24:
3187 case Intrinsic::amdgcn_mulhi_i24:
3188 case Intrinsic::amdgcn_mulhi_u24:
3189 return simplifyMul24(N, DCI);
3190 case Intrinsic::amdgcn_fract:
3191 case Intrinsic::amdgcn_rsq:
3192 case Intrinsic::amdgcn_rcp_legacy:
3193 case Intrinsic::amdgcn_rsq_legacy:
3194 case Intrinsic::amdgcn_rsq_clamp:
3195 case Intrinsic::amdgcn_ldexp: {
3196 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3197 SDValue Src = N->getOperand(1);
3198 return Src.isUndef() ? Src : SDValue();
3199 }
3200 default:
3201 return SDValue();
3202 }
3203}
3204
3205/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3206/// binary operation \p Opc to it with the corresponding constant operands.
3207SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3208 DAGCombinerInfo &DCI, const SDLoc &SL,
3209 unsigned Opc, SDValue LHS,
3210 uint32_t ValLo, uint32_t ValHi) const {
3211 SelectionDAG &DAG = DCI.DAG;
3212 SDValue Lo, Hi;
3213 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3214
3215 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3216 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3217
3218 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3219 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3220
3221 // Re-visit the ands. It's possible we eliminated one of them and it could
3222 // simplify the vector.
3223 DCI.AddToWorklist(Lo.getNode());
3224 DCI.AddToWorklist(Hi.getNode());
3225
3226 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3227 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3228}
3229
3230SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3231 DAGCombinerInfo &DCI) const {
3232 EVT VT = N->getValueType(0);
3233
3234 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3235 if (!RHS)
3236 return SDValue();
3237
3238 SDValue LHS = N->getOperand(0);
3239 unsigned RHSVal = RHS->getZExtValue();
3240 if (!RHSVal)
3241 return LHS;
3242
3243 SDLoc SL(N);
3244 SelectionDAG &DAG = DCI.DAG;
3245
3246 switch (LHS->getOpcode()) {
3247 default:
3248 break;
3249 case ISD::ZERO_EXTEND:
3250 case ISD::SIGN_EXTEND:
3251 case ISD::ANY_EXTEND: {
3252 SDValue X = LHS->getOperand(0);
3253
3254 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3255 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3256 // Prefer build_vector as the canonical form if packed types are legal.
3257 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3258 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3259 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3260 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3261 }
3262
3263 // shl (ext x) => zext (shl x), if shift does not overflow int
3264 if (VT != MVT::i64)
3265 break;
3266 KnownBits Known = DAG.computeKnownBits(X);
3267 unsigned LZ = Known.countMinLeadingZeros();
3268 if (LZ < RHSVal)
3269 break;
3270 EVT XVT = X.getValueType();
3271 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3272 return DAG.getZExtOrTrunc(Shl, SL, VT);
3273 }
3274 }
3275
3276 if (VT != MVT::i64)
3277 return SDValue();
3278
3279 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3280
3281 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3282 // common case, splitting this into a move and a 32-bit shift is faster and
3283 // the same code size.
3284 if (RHSVal < 32)
3285 return SDValue();
3286
3287 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3288
3289 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3290 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3291
3292 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3293
3294 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3295 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3296}
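// For example (illustrative), i64 (shl x, 40) becomes
//   build_pair 0, (shl (i32 (trunc x)), 8)
// the low 32 result bits are known zero and the high half only needs the low
// bits of x shifted by 40 - 32 = 8, so no 64-bit shift is required.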
3297
3298SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3299 DAGCombinerInfo &DCI) const {
3300 if (N->getValueType(0) != MVT::i64)
3301 return SDValue();
3302
3303 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3304 if (!RHS)
3305 return SDValue();
3306
3307 SelectionDAG &DAG = DCI.DAG;
3308 SDLoc SL(N);
3309 unsigned RHSVal = RHS->getZExtValue();
3310
3311 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3312 if (RHSVal == 32) {
3313 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3314 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3315 DAG.getConstant(31, SL, MVT::i32));
3316
3317 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3318 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3319 }
3320
3321 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3322 if (RHSVal == 63) {
3323 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3324 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3325 DAG.getConstant(31, SL, MVT::i32));
3326 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3327 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3328 }
3329
3330 return SDValue();
3331}
3332
3333SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3334 DAGCombinerInfo &DCI) const {
3335 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3336 if (!RHS)
3337 return SDValue();
3338
3339 EVT VT = N->getValueType(0);
3340 SDValue LHS = N->getOperand(0);
3341 unsigned ShiftAmt = RHS->getZExtValue();
3342 SelectionDAG &DAG = DCI.DAG;
3343 SDLoc SL(N);
3344
3345 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3346 // this improves the ability to match BFE patterns in isel.
3347 if (LHS.getOpcode() == ISD::AND) {
3348 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3349 unsigned MaskIdx, MaskLen;
3350 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3351 MaskIdx == ShiftAmt) {
3352 return DAG.getNode(
3353 ISD::AND, SL, VT,
3354 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3355 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3356 }
3357 }
3358 }
3359
3360 if (VT != MVT::i64)
3361 return SDValue();
3362
3363 if (ShiftAmt < 32)
3364 return SDValue();
3365
3366 // srl i64:x, C for C >= 32
3367 // =>
3368 // build_pair (srl hi_32(x), C - 32), 0
3369 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3370
3371 SDValue Hi = getHiHalf64(LHS, DAG);
3372
3373 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3374 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3375
3376 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3377
3378 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3379}
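// For example (illustrative), the mask fold turns
//   (srl (and x, 0xff0), 4) into (and (srl x, 4), 0xff),
// exposing a BFE pattern, and the 64-bit fold turns i64 (srl x, 36) into
//   build_pair (srl hi_32(x), 4), 0.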
3380
3381SDValue AMDGPUTargetLowering::performTruncateCombine(
3382 SDNode *N, DAGCombinerInfo &DCI) const {
3383 SDLoc SL(N);
3384 SelectionDAG &DAG = DCI.DAG;
3385 EVT VT = N->getValueType(0);
3386 SDValue Src = N->getOperand(0);
3387
3388 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3389 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3390 SDValue Vec = Src.getOperand(0);
3391 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3392 SDValue Elt0 = Vec.getOperand(0);
3393 EVT EltVT = Elt0.getValueType();
3394 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3395 if (EltVT.isFloatingPoint()) {
3396 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3397 EltVT.changeTypeToInteger(), Elt0);
3398 }
3399
3400 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3401 }
3402 }
3403 }
3404
3405 // Equivalent of above for accessing the high element of a vector as an
3406 // integer operation.
3407 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3408 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3409 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3410 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3411 SDValue BV = stripBitcast(Src.getOperand(0));
3412 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3413 BV.getValueType().getVectorNumElements() == 2) {
3414 SDValue SrcElt = BV.getOperand(1);
3415 EVT SrcEltVT = SrcElt.getValueType();
3416 if (SrcEltVT.isFloatingPoint()) {
3417 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3418 SrcEltVT.changeTypeToInteger(), SrcElt);
3419 }
3420
3421 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3422 }
3423 }
3424 }
3425 }
3426
3427 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3428 //
3429 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3430 // i16 (trunc (srl (i32 (trunc x), K)))
3431 if (VT.getScalarSizeInBits() < 32) {
3432 EVT SrcVT = Src.getValueType();
3433 if (SrcVT.getScalarSizeInBits() > 32 &&
3434 (Src.getOpcode() == ISD::SRL ||
3435 Src.getOpcode() == ISD::SRA ||
3436 Src.getOpcode() == ISD::SHL)) {
3437 SDValue Amt = Src.getOperand(1);
3438 KnownBits Known = DAG.computeKnownBits(Amt);
3439
3440 // - For left shifts, do the transform as long as the shift
3441 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
3442 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
3443 // losing information stored in the high bits when truncating.
3444 const unsigned MaxCstSize =
3445 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
3446 if (Known.getMaxValue().ule(MaxCstSize)) {
3447 EVT MidVT = VT.isVector() ?
3448 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3449 VT.getVectorNumElements()) : MVT::i32;
3450
3451 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3452 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3453 Src.getOperand(0));
3454 DCI.AddToWorklist(Trunc.getNode());
3455
3456 if (Amt.getValueType() != NewShiftVT) {
3457 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3458 DCI.AddToWorklist(Amt.getNode());
3459 }
3460
3461 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3462 Trunc, Amt);
3463 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3464 }
3465 }
3466 }
3467
3468 return SDValue();
3469}
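// For example (illustrative), i16 (trunc (srl i64:x, 8)) becomes
//   i16 (trunc (srl (i32 (trunc x)), 8))
// because the shift amount 8 is <= 32 - 16, so every bit that survives the
// final truncate comes from the low 32 bits of x and the 64-bit shift can be
// shrunk to 32 bits.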
3470
3471// We need to specifically handle i64 mul here to avoid unnecessary conversion
3472// instructions. If we only match on the legalized i64 mul expansion,
3473// SimplifyDemandedBits will be unable to remove them because there will be
3474// multiple uses due to the separate mul + mulh[su].
3475static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3476 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3477 if (Size <= 32) {
3478 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3479 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3480 }
3481
3482 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3483 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3484
3485 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3486 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3487
3488 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3489}
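// For example (illustrative), when both operands of a 64-bit multiply are
// known to fit in 24 bits, the full product fits in 48 bits: MUL_U24 gives
// result bits [31:0], MULHI_U24 gives bits [63:32] (only [47:32] can be
// nonzero), and the build_pair reassembles the i64 without a 64-bit multiply.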
3490
3491SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3492 DAGCombinerInfo &DCI) const {
3493 EVT VT = N->getValueType(0);
3494
3495 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3496 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3497 // unnecessarily). isDivergent() is used as an approximation of whether the
3498 // value is in an SGPR.
3499 if (!N->isDivergent())
3500 return SDValue();
3501
3502 unsigned Size = VT.getSizeInBits();
3503 if (VT.isVector() || Size > 64)
3504 return SDValue();
3505
3506 // There are i16 integer mul/mad.
3507 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3508 return SDValue();
3509
3510 SelectionDAG &DAG = DCI.DAG;
3511 SDLoc DL(N);
3512
3513 SDValue N0 = N->getOperand(0);
3514 SDValue N1 = N->getOperand(1);
3515
3516 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3517 // in the source into any_extends if the result of the mul is truncated. Since
3518 // we can assume the high bits are whatever we want, use the underlying value
3519 // to keep the unknown high bits from interfering.
3520 if (N0.getOpcode() == ISD::ANY_EXTEND)
3521 N0 = N0.getOperand(0);
3522
3523 if (N1.getOpcode() == ISD::ANY_EXTEND)
3524 N1 = N1.getOperand(0);
3525
3526 SDValue Mul;
3527
3528 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3529 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3530 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3531 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3532 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3533 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3534 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3535 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3536 } else {
3537 return SDValue();
3538 }
3539
3540 // We need to use sext even for MUL_U24, because MUL_U24 is used
3541 // for signed multiply of 8 and 16-bit types.
3542 return DAG.getSExtOrTrunc(Mul, DL, VT);
3543}
3544
3545SDValue
3546AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3547 DAGCombinerInfo &DCI) const {
3548 if (N->getValueType(0) != MVT::i32)
3549 return SDValue();
3550
3551 SelectionDAG &DAG = DCI.DAG;
3552 SDLoc DL(N);
3553
3554 SDValue N0 = N->getOperand(0);
3555 SDValue N1 = N->getOperand(1);
3556
3557 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3558 // in the source into any_extends if the result of the mul is truncated. Since
3559 // we can assume the high bits are whatever we want, use the underlying value
3560 // to keep the unknown high bits from interfering.
3561 if (N0.getOpcode() == ISD::ANY_EXTEND)
3562 N0 = N0.getOperand(0);
3563 if (N1.getOpcode() == ISD::ANY_EXTEND)
3564 N1 = N1.getOperand(0);
3565
3566 // Try to use two fast 24-bit multiplies (one for each half of the result)
3567 // instead of one slow extending multiply.
3568 unsigned LoOpcode, HiOpcode;
3569 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3570 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3571 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3572 LoOpcode = AMDGPUISD::MUL_U24;
3573 HiOpcode = AMDGPUISD::MULHI_U24;
3574 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3575 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3576 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3577 LoOpcode = AMDGPUISD::MUL_I24;
3578 HiOpcode = AMDGPUISD::MULHI_I24;
3579 } else {
3580 return SDValue();
3581 }
3582
3583 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3584 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3585 DCI.CombineTo(N, Lo, Hi);
3586 return SDValue(N, 0);
3587}
3588
3589SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3590 DAGCombinerInfo &DCI) const {
3591 EVT VT = N->getValueType(0);
3592
3593 if (!Subtarget->hasMulI24() || VT.isVector())
3594 return SDValue();
3595
3596 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3597 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3598 // unnecessarily). isDivergent() is used as an approximation of whether the
3599 // value is in an SGPR.
3600 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3601 // valu op anyway)
3602 if (Subtarget->hasSMulHi() && !N->isDivergent())
3603 return SDValue();
3604
3605 SelectionDAG &DAG = DCI.DAG;
3606 SDLoc DL(N);
3607
3608 SDValue N0 = N->getOperand(0);
3609 SDValue N1 = N->getOperand(1);
3610
3611 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3612 return SDValue();
3613
3614 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3615 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3616
3617 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3618 DCI.AddToWorklist(Mulhi.getNode());
3619 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3620}
3621
3622SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3623 DAGCombinerInfo &DCI) const {
3624 EVT VT = N->getValueType(0);
3625
3626 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3627 return SDValue();
3628
3629 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3630 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3631 // unnecessarily). isDivergent() is used as an approximation of whether the
3632 // value is in an SGPR.
3633 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3634 // valu op anyway)
3635 if (Subtarget->hasSMulHi() && !N->isDivergent())
3636 return SDValue();
3637
3638 SelectionDAG &DAG = DCI.DAG;
3639 SDLoc DL(N);
3640
3641 SDValue N0 = N->getOperand(0);
3642 SDValue N1 = N->getOperand(1);
3643
3644 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3645 return SDValue();
3646
3647 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3648 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3649
3650 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3651 DCI.AddToWorklist(Mulhi.getNode());
3652 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3653}
3654
3655static bool isNegativeOne(SDValue Val) {
3656 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3657 return C->isAllOnes();
3658 return false;
3659}
3660
3661SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3662 SDValue Op,
3663 const SDLoc &DL,
3664 unsigned Opc) const {
3665 EVT VT = Op.getValueType();
3666 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3667 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3668 LegalVT != MVT::i16))
3669 return SDValue();
3670
3671 if (VT != MVT::i32)
3672 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3673
3674 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3675 if (VT != MVT::i32)
3676 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3677
3678 return FFBX;
3679}
3680
3681// The native instructions return -1 on 0 input. Optimize out a select that
3682// produces -1 on 0.
3683//
3684// TODO: If zero is not undef, we could also do this if the output is compared
3685// against the bitwidth.
3686//
3687// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3688SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3689 SDValue LHS, SDValue RHS,
3690 DAGCombinerInfo &DCI) const {
3691 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3692 if (!CmpRhs || !CmpRhs->isZero())
3693 return SDValue();
3694
3695 SelectionDAG &DAG = DCI.DAG;
3696 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3697 SDValue CmpLHS = Cond.getOperand(0);
3698
3699 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3700 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3701 if (CCOpcode == ISD::SETEQ &&
3702 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3703 RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3704 unsigned Opc =
3705 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3706 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3707 }
3708
3709 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3710 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3711 if (CCOpcode == ISD::SETNE &&
3712 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3713 LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3714 unsigned Opc =
3715 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3716
3717 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3718 }
3719
3720 return SDValue();
3721}
3722
3723static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3724 unsigned Op,
3725 const SDLoc &SL,
3726 SDValue Cond,
3727 SDValue N1,
3728 SDValue N2) {
3729 SelectionDAG &DAG = DCI.DAG;
3730 EVT VT = N1.getValueType();
3731
3732 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3733 N1.getOperand(0), N2.getOperand(0));
3734 DCI.AddToWorklist(NewSelect.getNode());
3735 return DAG.getNode(Op, SL, VT, NewSelect);
3736}
3737
3738// Pull a free FP operation out of a select so it may fold into uses.
3739//
3740// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3741// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3742//
3743// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3744// select c, (fabs x), +k -> fabs (select c, x, k)
3745SDValue
3746AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3747 SDValue N) const {
3748 SelectionDAG &DAG = DCI.DAG;
3749 SDValue Cond = N.getOperand(0);
3750 SDValue LHS = N.getOperand(1);
3751 SDValue RHS = N.getOperand(2);
3752
3753 EVT VT = N.getValueType();
3754 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3755 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3756 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
3757 return SDValue();
3758
3759 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3760 SDLoc(N), Cond, LHS, RHS);
3761 }
3762
3763 bool Inv = false;
3764 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3765 std::swap(LHS, RHS);
3766 Inv = true;
3767 }
3768
3769 // TODO: Support vector constants.
3770 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3771 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
3772 !selectSupportsSourceMods(N.getNode())) {
3773 SDLoc SL(N);
3774 // If one side is an fneg/fabs and the other is a constant, we can push the
3775 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3776 SDValue NewLHS = LHS.getOperand(0);
3777 SDValue NewRHS = RHS;
3778
3779 // Careful: if the neg can be folded up, don't try to pull it back down.
3780 bool ShouldFoldNeg = true;
3781
3782 if (NewLHS.hasOneUse()) {
3783 unsigned Opc = NewLHS.getOpcode();
3784 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3785 ShouldFoldNeg = false;
3786 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3787 ShouldFoldNeg = false;
3788 }
3789
3790 if (ShouldFoldNeg) {
3791 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
3792 return SDValue();
3793
3794 // We're going to be forced to use a source modifier anyway, there's no
3795 // point to pulling the negate out unless we can get a size reduction by
3796 // negating the constant.
3797 //
3798 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
3799 // about cheaper constants.
3800 if (NewLHS.getOpcode() == ISD::FABS &&
3801 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
3802 return SDValue();
3803
3804 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
3805 return SDValue();
3806
3807 if (LHS.getOpcode() == ISD::FNEG)
3808 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3809
3810 if (Inv)
3811 std::swap(NewLHS, NewRHS);
3812
3813 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3814 Cond, NewLHS, NewRHS);
3815 DCI.AddToWorklist(NewSelect.getNode());
3816 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3817 }
3818 }
3819
3820 return SDValue();
3821}
3822
3823SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3824 DAGCombinerInfo &DCI) const {
3825 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3826 return Folded;
3827
3828 SDValue Cond = N->getOperand(0);
3829 if (Cond.getOpcode() != ISD::SETCC)
3830 return SDValue();
3831
3832 EVT VT = N->getValueType(0);
3833 SDValue LHS = Cond.getOperand(0);
3834 SDValue RHS = Cond.getOperand(1);
3835 SDValue CC = Cond.getOperand(2);
3836
3837 SDValue True = N->getOperand(1);
3838 SDValue False = N->getOperand(2);
3839
3840 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3841 SelectionDAG &DAG = DCI.DAG;
3842 if (DAG.isConstantValueOfAnyType(True) &&
3843 !DAG.isConstantValueOfAnyType(False)) {
3844 // Swap cmp + select pair to move constant to false input.
3845 // This will allow using VOPC cndmasks more often.
3846 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3847
3848 SDLoc SL(N);
3849 ISD::CondCode NewCC =
3850 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3851
3852 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3853 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3854 }
3855
3856 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3857 SDValue MinMax
3858 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3859 // Revisit this node so we can catch min3/max3/med3 patterns.
3860 //DCI.AddToWorklist(MinMax.getNode());
3861 return MinMax;
3862 }
3863 }
3864
3865 // There's no reason to not do this if the condition has other uses.
3866 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3867}
3868
3869static bool isInv2Pi(const APFloat &APF) {
3870 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3871 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3872 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3873
3874 return APF.bitwiseIsEqual(KF16) ||
3875 APF.bitwiseIsEqual(KF32) ||
3876 APF.bitwiseIsEqual(KF64);
3877}
3878
3879 // 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
3880// additional cost to negate them.
3881TargetLowering::NegatibleCost
3882AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
3883 if (C->isZero())
3884 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
3885
3886 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3887 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
3888
3889 return NegatibleCost::Neutral;
3890}
3891
3892bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3893 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3894 return getConstantNegateCost(C) == NegatibleCost::Expensive;
3895 return false;
3896}
3897
3898bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
3899 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3900 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
3901 return false;
3902}
3903
3904static unsigned inverseMinMax(unsigned Opc) {
3905 switch (Opc) {
3906 case ISD::FMAXNUM:
3907 return ISD::FMINNUM;
3908 case ISD::FMINNUM:
3909 return ISD::FMAXNUM;
3910 case ISD::FMAXNUM_IEEE:
3911 return ISD::FMINNUM_IEEE;
3912 case ISD::FMINNUM_IEEE:
3913 return ISD::FMAXNUM_IEEE;
3914 case AMDGPUISD::FMAX_LEGACY:
3915 return AMDGPUISD::FMIN_LEGACY;
3916 case AMDGPUISD::FMIN_LEGACY:
3917 return AMDGPUISD::FMAX_LEGACY;
3918 default:
3919 llvm_unreachable("invalid min/max opcode")::llvm::llvm_unreachable_internal("invalid min/max opcode", "llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp"
, 3919)
;
3920 }
3921}
3922
3923/// \return true if it's profitable to try to push an fneg into its source
3924/// instruction.
3925bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
3926 unsigned Opc = N0.getOpcode();
3927
3928 // If the input has multiple uses and we can either fold the negate down, or
3929 // the other uses cannot, give up. This both prevents unprofitable
3930 // transformations and infinite loops: we won't repeatedly try to fold around
3931 // a negate that has no 'good' form.
3932 if (N0.hasOneUse()) {
3933 // This may be able to fold into the source, but at a code size cost. Don't
3934 // fold if the fold into the user is free.
3935 if (allUsesHaveSourceMods(N, 0))
3936 return false;
3937 } else {
3938 if (fnegFoldsIntoOp(Opc) &&
3939 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3940 return false;
3941 }
3942
3943 return true;
3944}
3945
3946SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3947 DAGCombinerInfo &DCI) const {
3948 SelectionDAG &DAG = DCI.DAG;
3949 SDValue N0 = N->getOperand(0);
3950 EVT VT = N->getValueType(0);
3951
3952 unsigned Opc = N0.getOpcode();
3953
3954 if (!shouldFoldFNegIntoSrc(N, N0))
3955 return SDValue();
3956
3957 SDLoc SL(N);
3958 switch (Opc) {
3959 case ISD::FADD: {
3960 if (!mayIgnoreSignedZero(N0))
3961 return SDValue();
3962
3963 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3964 SDValue LHS = N0.getOperand(0);
3965 SDValue RHS = N0.getOperand(1);
3966
3967 if (LHS.getOpcode() != ISD::FNEG)
3968 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3969 else
3970 LHS = LHS.getOperand(0);
3971
3972 if (RHS.getOpcode() != ISD::FNEG)
3973 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3974 else
3975 RHS = RHS.getOperand(0);
3976
3977 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3978 if (Res.getOpcode() != ISD::FADD)
3979 return SDValue(); // Op got folded away.
3980 if (!N0.hasOneUse())
3981 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3982 return Res;
3983 }
3984 case ISD::FMUL:
3985 case AMDGPUISD::FMUL_LEGACY: {
3986 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3987 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3988 SDValue LHS = N0.getOperand(0);
3989 SDValue RHS = N0.getOperand(1);
3990
3991 if (LHS.getOpcode() == ISD::FNEG)
3992 LHS = LHS.getOperand(0);
3993 else if (RHS.getOpcode() == ISD::FNEG)
3994 RHS = RHS.getOperand(0);
3995 else
3996 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3997
3998 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3999 if (Res.getOpcode() != Opc)
4000 return SDValue(); // Op got folded away.
4001 if (!N0.hasOneUse())
4002 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4003 return Res;
4004 }
4005 case ISD::FMA:
4006 case ISD::FMAD: {
4007 // TODO: handle llvm.amdgcn.fma.legacy
4008 if (!mayIgnoreSignedZero(N0))
4009 return SDValue();
4010
4011 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4012 SDValue LHS = N0.getOperand(0);
4013 SDValue MHS = N0.getOperand(1);
4014 SDValue RHS = N0.getOperand(2);
4015
4016 if (LHS.getOpcode() == ISD::FNEG)
4017 LHS = LHS.getOperand(0);
4018 else if (MHS.getOpcode() == ISD::FNEG)
4019 MHS = MHS.getOperand(0);
4020 else
4021 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4022
4023 if (RHS.getOpcode() != ISD::FNEG)
4024 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4025 else
4026 RHS = RHS.getOperand(0);
4027
4028 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4029 if (Res.getOpcode() != Opc)
4030 return SDValue(); // Op got folded away.
4031 if (!N0.hasOneUse())
4032 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4033 return Res;
4034 }
4035 case ISD::FMAXNUM:
4036 case ISD::FMINNUM:
4037 case ISD::FMAXNUM_IEEE:
4038 case ISD::FMINNUM_IEEE:
4039 case AMDGPUISD::FMAX_LEGACY:
4040 case AMDGPUISD::FMIN_LEGACY: {
4041 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4042 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4043 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4044 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4045
4046 SDValue LHS = N0.getOperand(0);
4047 SDValue RHS = N0.getOperand(1);
4048
4049 // 0 doesn't have a negated inline immediate.
4050 // TODO: This constant check should be generalized to other operations.
4051 if (isConstantCostlierToNegate(RHS))
4052 return SDValue();
4053
4054 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4055 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4056 unsigned Opposite = inverseMinMax(Opc);
4057
4058 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4059 if (Res.getOpcode() != Opposite)
4060 return SDValue(); // Op got folded away.
4061 if (!N0.hasOneUse())
4062 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4063 return Res;
4064 }
4065 case AMDGPUISD::FMED3: {
4066 SDValue Ops[3];
4067 for (unsigned I = 0; I < 3; ++I)
4068 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4069
4070 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4071 if (Res.getOpcode() != AMDGPUISD::FMED3)
4072 return SDValue(); // Op got folded away.
4073
4074 if (!N0.hasOneUse()) {
4075 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4076 DAG.ReplaceAllUsesWith(N0, Neg);
4077
4078 for (SDNode *U : Neg->uses())
4079 DCI.AddToWorklist(U);
4080 }
4081
4082 return Res;
4083 }
4084 case ISD::FP_EXTEND:
4085 case ISD::FTRUNC:
4086 case ISD::FRINT:
4087 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4088 case ISD::FSIN:
4089 case ISD::FCANONICALIZE:
4090 case AMDGPUISD::RCP:
4091 case AMDGPUISD::RCP_LEGACY:
4092 case AMDGPUISD::RCP_IFLAG:
4093 case AMDGPUISD::SIN_HW: {
4094 SDValue CvtSrc = N0.getOperand(0);
4095 if (CvtSrc.getOpcode() == ISD::FNEG) {
4096 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4097 // (fneg (rcp (fneg x))) -> (rcp x)
4098 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4099 }
4100
4101 if (!N0.hasOneUse())
4102 return SDValue();
4103
4104 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4105 // (fneg (rcp x)) -> (rcp (fneg x))
4106 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4107 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4108 }
4109 case ISD::FP_ROUND: {
4110 SDValue CvtSrc = N0.getOperand(0);
4111
4112 if (CvtSrc.getOpcode() == ISD::FNEG) {
4113 // (fneg (fp_round (fneg x))) -> (fp_round x)
4114 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4115 CvtSrc.getOperand(0), N0.getOperand(1));
4116 }
4117
4118 if (!N0.hasOneUse())
4119 return SDValue();
4120
4121 // (fneg (fp_round x)) -> (fp_round (fneg x))
4122 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4123 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4124 }
4125 case ISD::FP16_TO_FP: {
4126 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4127 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4128 // Put the fneg back as a legal source operation that can be matched later.
4129 SDLoc SL(N);
4130
4131 SDValue Src = N0.getOperand(0);
4132 EVT SrcVT = Src.getValueType();
4133
4134 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4135 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4136 DAG.getConstant(0x8000, SL, SrcVT));
4137 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4138 }
4139 case ISD::SELECT: {
4140 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4141 // TODO: Invert conditions of foldFreeOpFromSelect
4142 return SDValue();
4143 }
4144 default:
4145 return SDValue();
4146 }
4147}
4148
4149SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4150 DAGCombinerInfo &DCI) const {
4151 SelectionDAG &DAG = DCI.DAG;
4152 SDValue N0 = N->getOperand(0);
4153
4154 if (!N0.hasOneUse())
4155 return SDValue();
4156
4157 switch (N0.getOpcode()) {
4158 case ISD::FP16_TO_FP: {
4159 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4160 SDLoc SL(N);
4161 SDValue Src = N0.getOperand(0);
4162 EVT SrcVT = Src.getValueType();
4163
4164 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4165 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4166 DAG.getConstant(0x7fff, SL, SrcVT));
4167 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4168 }
4169 default:
4170 return SDValue();
4171 }
4172}
4173
4174SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4175 DAGCombinerInfo &DCI) const {
4176 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4177 if (!CFP)
4178 return SDValue();
4179
4180 // XXX - Should this flush denormals?
4181 const APFloat &Val = CFP->getValueAPF();
4182 APFloat One(Val.getSemantics(), "1.0");
4183 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4184}
4185
4186SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4187 DAGCombinerInfo &DCI) const {
4188 SelectionDAG &DAG = DCI.DAG;
4189 SDLoc DL(N);
4190
4191 switch(N->getOpcode()) {
4192 default:
4193 break;
4194 case ISD::BITCAST: {
4195 EVT DestVT = N->getValueType(0);
4196
4197 // Push casts through vector builds. This helps avoid emitting a large
4198 // number of copies when materializing floating point vector constants.
4199 //
4200 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4201 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4202 if (DestVT.isVector()) {
4203 SDValue Src = N->getOperand(0);
4204 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4205 EVT SrcVT = Src.getValueType();
4206 unsigned NElts = DestVT.getVectorNumElements();
4207
4208 if (SrcVT.getVectorNumElements() == NElts) {
4209 EVT DestEltVT = DestVT.getVectorElementType();
4210
4211 SmallVector<SDValue, 8> CastedElts;
4212 SDLoc SL(N);
4213 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4214 SDValue Elt = Src.getOperand(I);
4215 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4216 }
4217
4218 return DAG.getBuildVector(DestVT, SL, CastedElts);
4219 }
4220 }
4221 }
4222
4223 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4224 break;
4225
4226 // Fold bitcasts of constants.
4227 //
4228 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4229 // TODO: Generalize and move to DAGCombiner
4230 SDValue Src = N->getOperand(0);
4231 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4232 SDLoc SL(N);
4233 uint64_t CVal = C->getZExtValue();
4234 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4235 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4236 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4237 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4238 }
4239
4240 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4241 const APInt &Val = C->getValueAPF().bitcastToAPInt();
4242 SDLoc SL(N);
4243 uint64_t CVal = Val.getZExtValue();
4244 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4245 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4246 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4247
4248 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4249 }
4250
4251 break;
4252 }
4253 case ISD::SHL: {
4254 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4255 break;
4256
4257 return performShlCombine(N, DCI);
4258 }
4259 case ISD::SRL: {
4260 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4261 break;
4262
4263 return performSrlCombine(N, DCI);
4264 }
4265 case ISD::SRA: {
4266 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4267 break;
4268
4269 return performSraCombine(N, DCI);
4270 }
4271 case ISD::TRUNCATE:
4272 return performTruncateCombine(N, DCI);
4273 case ISD::MUL:
4274 return performMulCombine(N, DCI);
4275 case ISD::SMUL_LOHI:
4276 case ISD::UMUL_LOHI:
4277 return performMulLoHiCombine(N, DCI);
4278 case ISD::MULHS:
4279 return performMulhsCombine(N, DCI);
4280 case ISD::MULHU:
4281 return performMulhuCombine(N, DCI);
4282 case AMDGPUISD::MUL_I24:
4283 case AMDGPUISD::MUL_U24:
4284 case AMDGPUISD::MULHI_I24:
4285 case AMDGPUISD::MULHI_U24:
4286 return simplifyMul24(N, DCI);
4287 case ISD::SELECT:
4288 return performSelectCombine(N, DCI);
4289 case ISD::FNEG:
4290 return performFNegCombine(N, DCI);
4291 case ISD::FABS:
4292 return performFAbsCombine(N, DCI);
4293 case AMDGPUISD::BFE_I32:
4294 case AMDGPUISD::BFE_U32: {
4295 assert(!N->getValueType(0).isVector() &&
4296 "Vector handling of BFE not implemented");
4297 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4298 if (!Width)
4299 break;
4300
4301 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4302 if (WidthVal == 0)
4303 return DAG.getConstant(0, DL, MVT::i32);
4304
4305 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4306 if (!Offset)
4307 break;
4308
4309 SDValue BitsFrom = N->getOperand(0);
4310 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4311
4312 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4313
4314 if (OffsetVal == 0) {
4315 // This is already sign / zero extended, so try to fold away extra BFEs.
4316 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4317
4318 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4319 if (OpSignBits >= SignBits)
4320 return BitsFrom;
4321
4322 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4323 if (Signed) {
4324 // This is a sign_extend_inreg. Replace it to take advantage of existing
4325 // DAG Combines. If not eliminated, we will match back to BFE during
4326 // selection.
4327
4328 // TODO: The sext_inreg of extended types ends, although we could
4329 // handle them in a single BFE.
4330 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4331 DAG.getValueType(SmallVT));
4332 }
4333
4334 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4335 }
4336
4337 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4338 if (Signed) {
4339 return constantFoldBFE<int32_t>(DAG,
4340 CVal->getSExtValue(),
4341 OffsetVal,
4342 WidthVal,
4343 DL);
4344 }
4345
4346 return constantFoldBFE<uint32_t>(DAG,
4347 CVal->getZExtValue(),
4348 OffsetVal,
4349 WidthVal,
4350 DL);
4351 }
4352
4353 if ((OffsetVal + WidthVal) >= 32 &&
4354 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4355 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4356 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4357 BitsFrom, ShiftVal);
4358 }
4359
4360 if (BitsFrom.hasOneUse()) {
4361 APInt Demanded = APInt::getBitsSet(32,
4362 OffsetVal,
4363 OffsetVal + WidthVal);
4364
4365 KnownBits Known;
4366 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4367 !DCI.isBeforeLegalizeOps());
4368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4369 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4370 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4371 DCI.CommitTargetLoweringOpt(TLO);
4372 }
4373 }
4374
4375 break;
4376 }
4377 case ISD::LOAD:
4378 return performLoadCombine(N, DCI);
4379 case ISD::STORE:
4380 return performStoreCombine(N, DCI);
4381 case AMDGPUISD::RCP:
4382 case AMDGPUISD::RCP_IFLAG:
4383 return performRcpCombine(N, DCI);
4384 case ISD::AssertZext:
4385 case ISD::AssertSext:
4386 return performAssertSZExtCombine(N, DCI);
4387 case ISD::INTRINSIC_WO_CHAIN:
4388 return performIntrinsicWOChainCombine(N, DCI);
4389 }
4390 return SDValue();
4391}
4392
4393//===----------------------------------------------------------------------===//
4394// Helper functions
4395//===----------------------------------------------------------------------===//
4396
4397SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4398 const TargetRegisterClass *RC,
4399 Register Reg, EVT VT,
4400 const SDLoc &SL,
4401 bool RawReg) const {
4402 MachineFunction &MF = DAG.getMachineFunction();
4403 MachineRegisterInfo &MRI = MF.getRegInfo();
4404 Register VReg;
4405
4406 if (!MRI.isLiveIn(Reg)) {
4407 VReg = MRI.createVirtualRegister(RC);
4408 MRI.addLiveIn(Reg, VReg);
4409 } else {
4410 VReg = MRI.getLiveInVirtReg(Reg);
4411 }
4412
4413 if (RawReg)
4414 return DAG.getRegister(VReg, VT);
4415
4416 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4417}
4418
4419// This may be called multiple times, and nothing prevents creating multiple
4420// objects at the same offset. See if we already defined this object.
4421static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4422 int64_t Offset) {
4423 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4424 if (MFI.getObjectOffset(I) == Offset) {
4425 assert(MFI.getObjectSize(I) == Size);
4426 return I;
4427 }
4428 }
4429
4430 return MFI.CreateFixedObject(Size, Offset, true);
4431}
4432
4433SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4434 EVT VT,
4435 const SDLoc &SL,
4436 int64_t Offset) const {
4437 MachineFunction &MF = DAG.getMachineFunction();
4438 MachineFrameInfo &MFI = MF.getFrameInfo();
4439 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4440
4441 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4442 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4443
4444 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4445 MachineMemOperand::MODereferenceable |
4446 MachineMemOperand::MOInvariant);
4447}
4448
4449SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4450 const SDLoc &SL,
4451 SDValue Chain,
4452 SDValue ArgVal,
4453 int64_t Offset) const {
4454 MachineFunction &MF = DAG.getMachineFunction();
4455 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4456 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4457
4458 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4459 // Stores to the argument stack area are relative to the stack pointer.
4460 SDValue SP =
4461 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4462 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4463 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4464 MachineMemOperand::MODereferenceable);
4465 return Store;
4466}
4467
4468SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4469 const TargetRegisterClass *RC,
4470 EVT VT, const SDLoc &SL,
4471 const ArgDescriptor &Arg) const {
4472 assert(Arg && "Attempting to load missing argument");
1. Assuming the condition is true
2. '?' condition is true
4473
4474 SDValue V = Arg.isRegister() ?
3. '?' condition is true
4475 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4476 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4477
4478 if (!Arg.isMasked())
4. Taking false branch
4479 return V;
4480
4481 unsigned Mask = Arg.getMask();
4482 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5. Calling 'countr_zero<unsigned int>'
12. Returning from 'countr_zero<unsigned int>'
13. 'Shift' initialized to 32
4483 V = DAG.getNode(ISD::SRL, SL, VT, V,
4484 DAG.getShiftAmountConstant(Shift, VT, SL));
4485 return DAG.getNode(ISD::AND, SL, VT, V,
4486 DAG.getConstant(Mask >> Shift, SL, VT));
14. The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4487}
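
The path above reaches line 4486 under two assumptions: Arg.isMasked() is true (step 4) and the mask value passed to countr_zero is 0 (step 7), so countr_zero<unsigned>(0) returns 32 (step 13) and 'Mask >> Shift' shifts a 32-bit unsigned by its full width, which is undefined behavior. Below is a minimal standalone sketch of the same shift-then-mask extraction with a hypothetical guard that states the precondition the analyzer cannot prove on this path; extractMaskedField and its assert are illustrative only and not part of the LLVM sources.

#include <cassert>

// Illustrative helper (not LLVM code): extract a packed bitfield using the
// same shift-then-mask formula as loadInputValue above.
static unsigned extractMaskedField(unsigned Value, unsigned Mask) {
  // A zero mask would make Shift == 32 and both shifts below undefined.
  assert(Mask != 0 && "masked argument must carry a non-zero mask");
  unsigned Shift = __builtin_ctz(Mask);      // trailing zeros of a non-zero mask
  return (Value >> Shift) & (Mask >> Shift); // well defined because Shift < 32
}

Whether an ArgDescriptor with its mask flag set can in practice carry a zero mask is not visible here; the report only shows that nothing between the assert at line 4472 and line 4486 rules it out.
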
4488
4489uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4490 const MachineFunction &MF, const ImplicitParameter Param) const {
4491 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4492 const AMDGPUSubtarget &ST =
4493 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4494 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4495 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4496 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4497 ExplicitArgOffset;
4498 switch (Param) {
4499 case FIRST_IMPLICIT:
4500 return ArgOffset;
4501 case PRIVATE_BASE:
4502 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
4503 case SHARED_BASE:
4504 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
4505 case QUEUE_PTR:
4506 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
4507 }
4508 llvm_unreachable("unexpected implicit parameter type");
4509}
4510
4511 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4512
4513const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4514 switch ((AMDGPUISD::NodeType)Opcode) {
4515 case AMDGPUISD::FIRST_NUMBER: break;
4516 // AMDIL DAG nodes
4517 NODE_NAME_CASE(UMUL);
4518 NODE_NAME_CASE(BRANCH_COND);
4519
4520 // AMDGPU DAG nodes
4521 NODE_NAME_CASE(IF)
4522 NODE_NAME_CASE(ELSE)
4523 NODE_NAME_CASE(LOOP)
4524 NODE_NAME_CASE(CALL)
4525 NODE_NAME_CASE(TC_RETURN)
4526 NODE_NAME_CASE(TRAP)
4527 NODE_NAME_CASE(RET_FLAG)
4528 NODE_NAME_CASE(RETURN_TO_EPILOG)
4529 NODE_NAME_CASE(ENDPGM)
4530 NODE_NAME_CASE(DWORDADDR)
4531 NODE_NAME_CASE(FRACT)
4532 NODE_NAME_CASE(SETCC)
4533 NODE_NAME_CASE(SETREG)
4534 NODE_NAME_CASE(DENORM_MODE)
4535 NODE_NAME_CASE(FMA_W_CHAIN)
4536 NODE_NAME_CASE(FMUL_W_CHAIN)
4537 NODE_NAME_CASE(CLAMP)
4538 NODE_NAME_CASE(COS_HW)
4539 NODE_NAME_CASE(SIN_HW)
4540 NODE_NAME_CASE(FMAX_LEGACY)
4541 NODE_NAME_CASE(FMIN_LEGACY)
4542 NODE_NAME_CASE(FMAX3)
4543 NODE_NAME_CASE(SMAX3)
4544 NODE_NAME_CASE(UMAX3)
4545 NODE_NAME_CASE(FMIN3)
4546 NODE_NAME_CASE(SMIN3)
4547 NODE_NAME_CASE(UMIN3)
4548 NODE_NAME_CASE(FMED3)
4549 NODE_NAME_CASE(SMED3)
4550 NODE_NAME_CASE(UMED3)
4551 NODE_NAME_CASE(FDOT2)
4552 NODE_NAME_CASE(URECIP)
4553 NODE_NAME_CASE(DIV_SCALE)
4554 NODE_NAME_CASE(DIV_FMAS)
4555 NODE_NAME_CASE(DIV_FIXUP)
4556 NODE_NAME_CASE(FMAD_FTZ)
4557 NODE_NAME_CASE(RCP)
4558 NODE_NAME_CASE(RSQ)
4559 NODE_NAME_CASE(RCP_LEGACY)
4560 NODE_NAME_CASE(RCP_IFLAG)
4561 NODE_NAME_CASE(FMUL_LEGACY)
4562 NODE_NAME_CASE(RSQ_CLAMP)
4563 NODE_NAME_CASE(LDEXP)
4564 NODE_NAME_CASE(FP_CLASS)
4565 NODE_NAME_CASE(DOT4)
4566 NODE_NAME_CASE(CARRY)
4567 NODE_NAME_CASE(BORROW)
4568 NODE_NAME_CASE(BFE_U32)
4569 NODE_NAME_CASE(BFE_I32)
4570 NODE_NAME_CASE(BFI)
4571 NODE_NAME_CASE(BFM)
4572 NODE_NAME_CASE(FFBH_U32)
4573 NODE_NAME_CASE(FFBH_I32)
4574 NODE_NAME_CASE(FFBL_B32)
4575 NODE_NAME_CASE(MUL_U24)
4576 NODE_NAME_CASE(MUL_I24)
4577 NODE_NAME_CASE(MULHI_U24)
4578 NODE_NAME_CASE(MULHI_I24)
4579 NODE_NAME_CASE(MAD_U24)
4580 NODE_NAME_CASE(MAD_I24)
4581 NODE_NAME_CASE(MAD_I64_I32)
4582 NODE_NAME_CASE(MAD_U64_U32)
4583 NODE_NAME_CASE(PERM)
4584 NODE_NAME_CASE(TEXTURE_FETCH)
4585 NODE_NAME_CASE(R600_EXPORT)
4586 NODE_NAME_CASE(CONST_ADDRESS)
4587 NODE_NAME_CASE(REGISTER_LOAD)
4588 NODE_NAME_CASE(REGISTER_STORE)
4589 NODE_NAME_CASE(SAMPLE)
4590 NODE_NAME_CASE(SAMPLEB)
4591 NODE_NAME_CASE(SAMPLED)
4592 NODE_NAME_CASE(SAMPLEL)
4593 NODE_NAME_CASE(CVT_F32_UBYTE0)
4594 NODE_NAME_CASE(CVT_F32_UBYTE1)
4595 NODE_NAME_CASE(CVT_F32_UBYTE2)
4596 NODE_NAME_CASE(CVT_F32_UBYTE3)
4597 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4598 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4599 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4600 NODE_NAME_CASE(CVT_PK_I16_I32)
4601 NODE_NAME_CASE(CVT_PK_U16_U32)
4602 NODE_NAME_CASE(FP_TO_FP16)
4603 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4604 NODE_NAME_CASE(CONST_DATA_PTR)
4605 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4606 NODE_NAME_CASE(LDS)
4607 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
4608 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
4609 NODE_NAME_CASE(DUMMY_CHAIN)
4610 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4611 NODE_NAME_CASE(LOAD_D16_HI)
4612 NODE_NAME_CASE(LOAD_D16_LO)
4613 NODE_NAME_CASE(LOAD_D16_HI_I8)
4614 NODE_NAME_CASE(LOAD_D16_HI_U8)
4615 NODE_NAME_CASE(LOAD_D16_LO_I8)
4616 NODE_NAME_CASE(LOAD_D16_LO_U8)
4617 NODE_NAME_CASE(STORE_MSKOR)
4618 NODE_NAME_CASE(LOAD_CONSTANT)
4619 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4620 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4621 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4622 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4623 NODE_NAME_CASE(DS_ORDERED_COUNT)
4624 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4625 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4626 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4627 NODE_NAME_CASE(BUFFER_LOAD)
4628 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4629 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4630 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4631 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4632 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4633 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
4634 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4635 NODE_NAME_CASE(SBUFFER_LOAD)
4636 NODE_NAME_CASE(BUFFER_STORE)
4637 NODE_NAME_CASE(BUFFER_STORE_BYTE)
4638 NODE_NAME_CASE(BUFFER_STORE_SHORT)
4639 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4640 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4641 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4642 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4643 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4644 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4645 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4646 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4647 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4648 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4649 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4650 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4651 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4652 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4653 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4654 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4655 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4656 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4657 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4658
4659 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4660 }
4661 return nullptr;
4662}
4663
4664SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4665 SelectionDAG &DAG, int Enabled,
4666 int &RefinementSteps,
4667 bool &UseOneConstNR,
4668 bool Reciprocal) const {
4669 EVT VT = Operand.getValueType();
4670
4671 if (VT == MVT::f32) {
4672 RefinementSteps = 0;
4673 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4674 }
4675
4676 // TODO: There is also an f64 rsq instruction, but the documentation is less
4677 // clear on its precision.
4678
4679 return SDValue();
4680}
4681
4682SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4683 SelectionDAG &DAG, int Enabled,
4684 int &RefinementSteps) const {
4685 EVT VT = Operand.getValueType();
4686
4687 if (VT == MVT::f32) {
4688 // Reciprocal, < 1 ulp error.
4689 //
4690 // This reciprocal approximation converges to < 0.5 ulp error with one
4691 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4692
4693 RefinementSteps = 0;
4694 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4695 }
4696
4697 // TODO: There is also an f64 rcp instruction, but the documentation is less
4698 // clear on its precision.
4699
4700 return SDValue();
4701}
4702
4703static unsigned workitemIntrinsicDim(unsigned ID) {
4704 switch (ID) {
4705 case Intrinsic::amdgcn_workitem_id_x:
4706 return 0;
4707 case Intrinsic::amdgcn_workitem_id_y:
4708 return 1;
4709 case Intrinsic::amdgcn_workitem_id_z:
4710 return 2;
4711 default:
4712 llvm_unreachable("not a workitem intrinsic");
4713 }
4714}
4715
4716void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4717 const SDValue Op, KnownBits &Known,
4718 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4719
4720 Known.resetAll(); // Don't know anything.
4721
4722 unsigned Opc = Op.getOpcode();
4723
4724 switch (Opc) {
4725 default:
4726 break;
4727 case AMDGPUISD::CARRY:
4728 case AMDGPUISD::BORROW: {
4729 Known.Zero = APInt::getHighBitsSet(32, 31);
4730 break;
4731 }
4732
4733 case AMDGPUISD::BFE_I32:
4734 case AMDGPUISD::BFE_U32: {
4735 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4736 if (!CWidth)
4737 return;
4738
4739 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4740
4741 if (Opc == AMDGPUISD::BFE_U32)
4742 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4743
4744 break;
4745 }
4746 case AMDGPUISD::FP_TO_FP16: {
4747 unsigned BitWidth = Known.getBitWidth();
4748
4749 // High bits are zero.
4750 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4751 break;
4752 }
4753 case AMDGPUISD::MUL_U24:
4754 case AMDGPUISD::MUL_I24: {
4755 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4756 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4757 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4758 RHSKnown.countMinTrailingZeros();
4759 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4760 // Skip extra check if all bits are known zeros.
4761 if (TrailZ >= 32)
4762 break;
4763
4764 // Truncate to 24 bits.
4765 LHSKnown = LHSKnown.trunc(24);
4766 RHSKnown = RHSKnown.trunc(24);
4767
4768 if (Opc == AMDGPUISD::MUL_I24) {
4769 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4770 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4771 unsigned MaxValBits = LHSValBits + RHSValBits;
4772 if (MaxValBits > 32)
4773 break;
4774 unsigned SignBits = 32 - MaxValBits + 1;
4775 bool LHSNegative = LHSKnown.isNegative();
4776 bool LHSNonNegative = LHSKnown.isNonNegative();
4777 bool LHSPositive = LHSKnown.isStrictlyPositive();
4778 bool RHSNegative = RHSKnown.isNegative();
4779 bool RHSNonNegative = RHSKnown.isNonNegative();
4780 bool RHSPositive = RHSKnown.isStrictlyPositive();
4781
4782 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4783 Known.Zero.setHighBits(SignBits);
4784 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4785 Known.One.setHighBits(SignBits);
4786 } else {
4787 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4788 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4789 unsigned MaxValBits = LHSValBits + RHSValBits;
4790 if (MaxValBits >= 32)
4791 break;
4792 Known.Zero.setBitsFrom(MaxValBits);
4793 }
4794 break;
4795 }
4796 case AMDGPUISD::PERM: {
4797 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4798 if (!CMask)
4799 return;
4800
4801 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4802 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4803 unsigned Sel = CMask->getZExtValue();
4804
4805 for (unsigned I = 0; I < 32; I += 8) {
4806 unsigned SelBits = Sel & 0xff;
4807 if (SelBits < 4) {
4808 SelBits *= 8;
4809 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4810 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4811 } else if (SelBits < 7) {
4812 SelBits = (SelBits & 3) * 8;
4813 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4814 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4815 } else if (SelBits == 0x0c) {
4816 Known.Zero |= 0xFFull << I;
4817 } else if (SelBits > 0x0c) {
4818 Known.One |= 0xFFull << I;
4819 }
4820 Sel >>= 8;
4821 }
4822 break;
4823 }
4824 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4825 Known.Zero.setHighBits(24);
4826 break;
4827 }
4828 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4829 Known.Zero.setHighBits(16);
4830 break;
4831 }
4832 case AMDGPUISD::LDS: {
4833 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4834 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4835
4836 Known.Zero.setHighBits(16);
4837 Known.Zero.setLowBits(Log2(Alignment));
4838 break;
4839 }
4840 case ISD::INTRINSIC_WO_CHAIN: {
4841 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4842 switch (IID) {
4843 case Intrinsic::amdgcn_mbcnt_lo:
4844 case Intrinsic::amdgcn_mbcnt_hi: {
4845 const GCNSubtarget &ST =
4846 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4847 // These return at most the (wavefront size - 1) + src1
4848 // As long as src1 is an immediate we can calc known bits
4849 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
4850 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
4851 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
4852 // Cater for potential carry
4853 MaxActiveBits += Src1ValBits ? 1 : 0;
4854 unsigned Size = Op.getValueType().getSizeInBits();
4855 if (MaxActiveBits < Size)
4856 Known.Zero.setHighBits(Size - MaxActiveBits);
4857 break;
4858 }
4859 case Intrinsic::amdgcn_workitem_id_x:
4860 case Intrinsic::amdgcn_workitem_id_y:
4861 case Intrinsic::amdgcn_workitem_id_z: {
4862 unsigned MaxValue = Subtarget->getMaxWorkitemID(
4863 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
4864 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
4865 break;
4866 }
4867 default:
4868 break;
4869 }
4870 }
4871 }
4872}
4873
4874unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4875 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4876 unsigned Depth) const {
4877 switch (Op.getOpcode()) {
4878 case AMDGPUISD::BFE_I32: {
4879 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4880 if (!Width)
4881 return 1;
4882
4883 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4884 if (!isNullConstant(Op.getOperand(1)))
4885 return SignBits;
4886
4887 // TODO: Could probably figure something out with non-0 offsets.
4888 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4889 return std::max(SignBits, Op0SignBits);
4890 }
4891
4892 case AMDGPUISD::BFE_U32: {
4893 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4894 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4895 }
4896
4897 case AMDGPUISD::CARRY:
4898 case AMDGPUISD::BORROW:
4899 return 31;
4900 case AMDGPUISD::BUFFER_LOAD_BYTE:
4901 return 25;
4902 case AMDGPUISD::BUFFER_LOAD_SHORT:
4903 return 17;
4904 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4905 return 24;
4906 case AMDGPUISD::BUFFER_LOAD_USHORT:
4907 return 16;
4908 case AMDGPUISD::FP_TO_FP16:
4909 return 16;
4910 default:
4911 return 1;
4912 }
4913}
4914
4915unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4916 GISelKnownBits &Analysis, Register R,
4917 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4918 unsigned Depth) const {
4919 const MachineInstr *MI = MRI.getVRegDef(R);
4920 if (!MI)
4921 return 1;
4922
4923 // TODO: Check range metadata on MMO.
4924 switch (MI->getOpcode()) {
4925 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4926 return 25;
4927 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4928 return 17;
4929 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4930 return 24;
4931 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4932 return 16;
4933 default:
4934 return 1;
4935 }
4936}
4937
4938bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4939 const SelectionDAG &DAG,
4940 bool SNaN,
4941 unsigned Depth) const {
4942 unsigned Opcode = Op.getOpcode();
4943 switch (Opcode) {
4944 case AMDGPUISD::FMIN_LEGACY:
4945 case AMDGPUISD::FMAX_LEGACY: {
4946 if (SNaN)
4947 return true;
4948
4949 // TODO: Can check no nans on one of the operands for each one, but which
4950 // one?
4951 return false;
4952 }
4953 case AMDGPUISD::FMUL_LEGACY:
4954 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4955 if (SNaN)
4956 return true;
4957 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4958 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4959 }
4960 case AMDGPUISD::FMED3:
4961 case AMDGPUISD::FMIN3:
4962 case AMDGPUISD::FMAX3:
4963 case AMDGPUISD::FMAD_FTZ: {
4964 if (SNaN)
4965 return true;
4966 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4967 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4968 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4969 }
4970 case AMDGPUISD::CVT_F32_UBYTE0:
4971 case AMDGPUISD::CVT_F32_UBYTE1:
4972 case AMDGPUISD::CVT_F32_UBYTE2:
4973 case AMDGPUISD::CVT_F32_UBYTE3:
4974 return true;
4975
4976 case AMDGPUISD::RCP:
4977 case AMDGPUISD::RSQ:
4978 case AMDGPUISD::RCP_LEGACY:
4979 case AMDGPUISD::RSQ_CLAMP: {
4980 if (SNaN)
4981 return true;
4982
4983 // TODO: Need an is-known-positive check.
4984 return false;
4985 }
4986 case AMDGPUISD::LDEXP:
4987 case AMDGPUISD::FRACT: {
4988 if (SNaN)
4989 return true;
4990 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4991 }
4992 case AMDGPUISD::DIV_SCALE:
4993 case AMDGPUISD::DIV_FMAS:
4994 case AMDGPUISD::DIV_FIXUP:
4995 // TODO: Refine on operands.
4996 return SNaN;
4997 case AMDGPUISD::SIN_HW:
4998 case AMDGPUISD::COS_HW: {
4999 // TODO: Need check for infinity
5000 return SNaN;
5001 }
5002 case ISD::INTRINSIC_WO_CHAIN: {
5003 unsigned IntrinsicID
5004 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5005 // TODO: Handle more intrinsics
5006 switch (IntrinsicID) {
5007 case Intrinsic::amdgcn_cubeid:
5008 return true;
5009
5010 case Intrinsic::amdgcn_frexp_mant: {
5011 if (SNaN)
5012 return true;
5013 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5014 }
5015 case Intrinsic::amdgcn_cvt_pkrtz: {
5016 if (SNaN)
5017 return true;
5018 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5019 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5020 }
5021 case Intrinsic::amdgcn_rcp:
5022 case Intrinsic::amdgcn_rsq:
5023 case Intrinsic::amdgcn_rcp_legacy:
5024 case Intrinsic::amdgcn_rsq_legacy:
5025 case Intrinsic::amdgcn_rsq_clamp: {
5026 if (SNaN)
5027 return true;
5028
5029 // TODO: Need an is-known-positive check.
5030 return false;
5031 }
5032 case Intrinsic::amdgcn_trig_preop:
5033 case Intrinsic::amdgcn_fdot2:
5034 // TODO: Refine on operand
5035 return SNaN;
5036 case Intrinsic::amdgcn_fma_legacy:
5037 if (SNaN)
5038 return true;
5039 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5040 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5041 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5042 default:
5043 return false;
5044 }
5045 }
5046 default:
5047 return false;
5048 }
5049}
5050
5051TargetLowering::AtomicExpansionKind
5052AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
5053 switch (RMW->getOperation()) {
5054 case AtomicRMWInst::Nand:
5055 case AtomicRMWInst::FAdd:
5056 case AtomicRMWInst::FSub:
5057 case AtomicRMWInst::FMax:
5058 case AtomicRMWInst::FMin:
5059 return AtomicExpansionKind::CmpXChg;
5060 default: {
5061 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
5062 unsigned Size = IntTy->getBitWidth();
5063 if (Size == 32 || Size == 64)
5064 return AtomicExpansionKind::None;
5065 }
5066
5067 return AtomicExpansionKind::CmpXChg;
5068 }
5069 }
5070}
5071
5072bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
5073 unsigned Opc, LLT Ty1, LLT Ty2) const {
5074 return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
5075 Ty2 == LLT::scalar(32);
5076}

/build/source/llvm/include/llvm/ADT/bit.h

1//===-- llvm/ADT/bit.h - C++20 <bit> ----------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements the C++20 <bit> header.
11///
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_ADT_BIT_H
15#define LLVM_ADT_BIT_H
16
17#include "llvm/Support/Compiler.h"
18#include <cstdint>
19#include <limits>
20#include <type_traits>
21
22#if !__has_builtin(__builtin_bit_cast)
23#include <cstring>
24#endif
25
26#if defined(_MSC_VER) && !defined(_DEBUG)
27#include <cstdlib> // for _byteswap_{ushort,ulong,uint64}
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and bit.h is popular via MathExtras.h.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44// This implementation of bit_cast is different from the C++20 one in two ways:
45// - It isn't constexpr because that requires compiler support.
46// - It requires trivially-constructible To, to avoid UB in the implementation.
47template <
48 typename To, typename From,
49 typename = std::enable_if_t<sizeof(To) == sizeof(From)>,
50 typename = std::enable_if_t<std::is_trivially_constructible<To>::value>,
51 typename = std::enable_if_t<std::is_trivially_copyable<To>::value>,
52 typename = std::enable_if_t<std::is_trivially_copyable<From>::value>>
53[[nodiscard]] inline To bit_cast(const From &from) noexcept {
54#if __has_builtin(__builtin_bit_cast)
55 return __builtin_bit_cast(To, from);
56#else
57 To to;
58 std::memcpy(&to, &from, sizeof(To));
59 return to;
60#endif
61}
62
63/// Reverses the bytes in the given integer value V.
64template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
65[[nodiscard]] constexpr T byteswap(T V) noexcept {
66 if constexpr (sizeof(T) == 1) {
67 return V;
68 } else if constexpr (sizeof(T) == 2) {
69 uint16_t UV = V;
70#if defined(_MSC_VER) && !defined(_DEBUG)
71 // The DLL version of the runtime lacks these functions (bug!?), but in a
72 // release build they're replaced with BSWAP instructions anyway.
73 return _byteswap_ushort(UV);
74#else
75 uint16_t Hi = UV << 8;
76 uint16_t Lo = UV >> 8;
77 return Hi | Lo;
78#endif
79 } else if constexpr (sizeof(T) == 4) {
80 uint32_t UV = V;
81#if __has_builtin(__builtin_bswap32)
82 return __builtin_bswap32(UV);
83#elif defined(_MSC_VER) && !defined(_DEBUG)
84 return _byteswap_ulong(UV);
85#else
86 uint32_t Byte0 = UV & 0x000000FF;
87 uint32_t Byte1 = UV & 0x0000FF00;
88 uint32_t Byte2 = UV & 0x00FF0000;
89 uint32_t Byte3 = UV & 0xFF000000;
90 return (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24);
91#endif
92 } else if constexpr (sizeof(T) == 8) {
93 uint64_t UV = V;
94#if __has_builtin(__builtin_bswap64)
95 return __builtin_bswap64(UV);
96#elif defined(_MSC_VER) && !defined(_DEBUG)
97 return _byteswap_uint64(UV);
98#else
99 uint64_t Hi = llvm::byteswap<uint32_t>(UV);
100 uint32_t Lo = llvm::byteswap<uint32_t>(UV >> 32);
101 return (Hi << 32) | Lo;
102#endif
103 } else {
104 static_assert(!sizeof(T *), "Don't know how to handle the given type.");
105 return 0;
106 }
107}
108
109template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
110[[nodiscard]] constexpr inline bool has_single_bit(T Value) noexcept {
111 return (Value != 0) && ((Value & (Value - 1)) == 0);
112}
113
114namespace detail {
115template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
116 static unsigned count(T Val) {
117 if (!Val)
118 return std::numeric_limits<T>::digits;
119 if (Val & 0x1)
120 return 0;
121
122 // Bisection method.
123 unsigned ZeroBits = 0;
124 T Shift = std::numeric_limits<T>::digits >> 1;
125 T Mask = std::numeric_limits<T>::max() >> Shift;
126 while (Shift) {
127 if ((Val & Mask) == 0) {
128 Val >>= Shift;
129 ZeroBits |= Shift;
130 }
131 Shift >>= 1;
132 Mask >>= Shift;
133 }
134 return ZeroBits;
135 }
136};
137
138#if defined(__GNUC__) || defined(_MSC_VER)
139template <typename T> struct TrailingZerosCounter<T, 4> {
140 static unsigned count(T Val) {
141 if (Val == 0)
7. Assuming 'Val' is equal to 0
8. Taking true branch
142 return 32;
9. Returning the value 32
143
144#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
145 return __builtin_ctz(Val);
146#elif defined(_MSC_VER)
147 unsigned long Index;
148 _BitScanForward(&Index, Val);
149 return Index;
150#endif
151 }
152};
153
154#if !defined(_MSC_VER) || defined(_M_X64)
155template <typename T> struct TrailingZerosCounter<T, 8> {
156 static unsigned count(T Val) {
157 if (Val == 0)
158 return 64;
159
160#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
161 return __builtin_ctzll(Val);
162#elif defined(_MSC_VER)
163 unsigned long Index;
164 _BitScanForward64(&Index, Val);
165 return Index;
166#endif
167 }
168};
169#endif
170#endif
171} // namespace detail
172
173/// Count number of 0's from the least significant bit to the most
174/// stopping at the first 1.
175///
176/// Only unsigned integral types are allowed.
177///
178/// Returns std::numeric_limits<T>::digits on an input of 0.
179template <typename T> [[nodiscard]] int countr_zero(T Val) {
180 static_assert(std::is_unsigned_v<T>,
181 "Only unsigned integral types are allowed.");
182 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val);
6. Calling 'TrailingZerosCounter::count'
10. Returning from 'TrailingZerosCounter::count'
11. Returning the value 32
183}
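
As documented above, countr_zero returns std::numeric_limits<T>::digits on an input of 0, which is 32 for unsigned int and is exactly the value recorded for 'Shift' at step 13 in loadInputValue. A tiny illustrative check of that contract follows; demoCountrZeroOfZero is a made-up name, not part of bit.h or its tests.

#include "llvm/ADT/bit.h"
#include <cassert>

// Illustrative only: the zero-input case that produces the shift amount
// flagged in loadInputValue. Callers that feed the result directly into a
// 32-bit shift must exclude the zero case first.
void demoCountrZeroOfZero() {
  assert(llvm::countr_zero(0u) == 32); // == std::numeric_limits<unsigned>::digits
}
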
184
185namespace detail {
186template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
187 static unsigned count(T Val) {
188 if (!Val)
189 return std::numeric_limits<T>::digits;
190
191 // Bisection method.
192 unsigned ZeroBits = 0;
193 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
194 T Tmp = Val >> Shift;
195 if (Tmp)
196 Val = Tmp;
197 else
198 ZeroBits |= Shift;
199 }
200 return ZeroBits;
201 }
202};
203
204#if defined(__GNUC__) || defined(_MSC_VER)
205template <typename T> struct LeadingZerosCounter<T, 4> {
206 static unsigned count(T Val) {
207 if (Val == 0)
208 return 32;
209
210#if __has_builtin(__builtin_clz) || defined(__GNUC__)
211 return __builtin_clz(Val);
212#elif defined(_MSC_VER)
213 unsigned long Index;
214 _BitScanReverse(&Index, Val);
215 return Index ^ 31;
216#endif
217 }
218};
219
220#if !defined(_MSC_VER) || defined(_M_X64)
221template <typename T> struct LeadingZerosCounter<T, 8> {
222 static unsigned count(T Val) {
223 if (Val == 0)
224 return 64;
225
226#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
227 return __builtin_clzll(Val);
228#elif defined(_MSC_VER)
229 unsigned long Index;
230 _BitScanReverse64(&Index, Val);
231 return Index ^ 63;
232#endif
233 }
234};
235#endif
236#endif
237} // namespace detail
238
239/// Count number of 0's from the most significant bit to the least
240/// stopping at the first 1.
241///
242/// Only unsigned integral types are allowed.
243///
244/// Returns std::numeric_limits<T>::digits on an input of 0.
245template <typename T> [[nodiscard]] int countl_zero(T Val) {
246 static_assert(std::is_unsigned_v<T>,
247 "Only unsigned integral types are allowed.");
248 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val);
249}
250
251/// Count the number of ones from the most significant bit to the first
252/// zero bit.
253///
254/// Ex. countl_one(0xFF0FFF00) == 8.
255/// Only unsigned integral types are allowed.
256///
257/// Returns std::numeric_limits<T>::digits on an input of all ones.
258template <typename T> [[nodiscard]] int countl_one(T Value) {
259 static_assert(std::is_unsigned_v<T>,
260 "Only unsigned integral types are allowed.");
261 return llvm::countl_zero<T>(~Value);
262}
263
264/// Count the number of ones from the least significant bit to the first
265/// zero bit.
266///
267/// Ex. countr_one(0x00FF00FF) == 8.
268/// Only unsigned integral types are allowed.
269///
270/// Returns std::numeric_limits<T>::digits on an input of all ones.
271template <typename T> [[nodiscard]] int countr_one(T Value) {
272 static_assert(std::is_unsigned_v<T>,
273 "Only unsigned integral types are allowed.");
274 return llvm::countr_zero<T>(~Value);
275}
276
277/// Returns the number of bits needed to represent Value if Value is nonzero.
278/// Returns 0 otherwise.
279///
280/// Ex. bit_width(5) == 3.
281template <typename T> [[nodiscard]] int bit_width(T Value) {
282 static_assert(std::is_unsigned_v<T>,
283 "Only unsigned integral types are allowed.");
284 return std::numeric_limits<T>::digits - llvm::countl_zero(Value);
285}
286
287/// Returns the largest integral power of two no greater than Value if Value is
288/// nonzero. Returns 0 otherwise.
289///
290/// Ex. bit_floor(5) == 4.
291template <typename T> [[nodiscard]] T bit_floor(T Value) {
292 static_assert(std::is_unsigned_v<T>,
293 "Only unsigned integral types are allowed.");
294 if (!Value)
295 return 0;
296 return T(1) << (llvm::bit_width(Value) - 1);
297}
298
299/// Returns the smallest integral power of two no smaller than Value if Value is
300/// nonzero. Returns 1 otherwise.
301///
302/// Ex. bit_ceil(5) == 8.
303///
304/// The return value is undefined if the input is larger than the largest power
305/// of two representable in T.
306template <typename T> [[nodiscard]] T bit_ceil(T Value) {
307 static_assert(std::is_unsigned_v<T>,
308 "Only unsigned integral types are allowed.");
309 if (Value < 2)
310 return 1;
311 return T(1) << llvm::bit_width<T>(Value - 1u);
312}
313
314namespace detail {
315template <typename T, std::size_t SizeOfT> struct PopulationCounter {
316 static int count(T Value) {
317 // Generic version, forward to 32 bits.
318 static_assert(SizeOfT <= 4, "Not implemented!");
319#if defined(__GNUC__)
320 return (int)__builtin_popcount(Value);
321#else
322 uint32_t v = Value;
323 v = v - ((v >> 1) & 0x55555555);
324 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
325 return int(((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24);
326#endif
327 }
328};
329
330template <typename T> struct PopulationCounter<T, 8> {
331 static int count(T Value) {
332#if defined(__GNUC__)
333 return (int)__builtin_popcountll(Value);
334#else
335 uint64_t v = Value;
336 v = v - ((v >> 1) & 0x5555555555555555ULL);
337 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
338 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
339 return int((uint64_t)(v * 0x0101010101010101ULL) >> 56);
340#endif
341 }
342};
343} // namespace detail
344
345/// Count the number of set bits in a value.
346/// Ex. popcount(0xF000F000) = 8
347/// Returns 0 if the word is zero.
348template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
349[[nodiscard]] inline int popcount(T Value) noexcept {
350 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
351}
352
353// Forward-declare rotr so that rotl can use it.
354template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
355[[nodiscard]] constexpr T rotr(T V, int R);
356
357template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
358[[nodiscard]] constexpr T rotl(T V, int R) {
359 unsigned N = std::numeric_limits<T>::digits;
360
361 R = R % N;
362 if (!R)
363 return V;
364
365 if (R < 0)
366 return llvm::rotr(V, -R);
367
368 return (V << R) | (V >> (N - R));
369}
370
371template <typename T, typename> [[nodiscard]] constexpr T rotr(T V, int R) {
372 unsigned N = std::numeric_limits<T>::digits;
373
374 R = R % N;
375 if (!R)
376 return V;
377
378 if (R < 0)
379 return llvm::rotl(V, -R);
380
381 return (V >> R) | (V << (N - R));
382}
383
384} // namespace llvm
385
386#endif