Bug Summary

File: build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4242, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
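
The flagged expression at line 4242 is not part of the excerpt reproduced below. As a minimal, illustrative sketch of the defect class only (not the actual code in AMDGPUISelLowering.cpp), shifting a 32-bit unsigned value by 32 or more is undefined behavior in C++, and this checker fires when it can prove the shift amount reaches the type width:

// Illustrative example only; these function names are hypothetical and do not
// appear in AMDGPUISelLowering.cpp.
#include <cstdint>

uint32_t shiftUnguarded(uint32_t Value, unsigned ShiftAmt) {
  // Undefined when ShiftAmt >= 32 (the width of uint32_t); this is the
  // pattern the analyzer reports.
  return Value >> ShiftAmt;
}

uint32_t shiftGuarded(uint32_t Value, unsigned ShiftAmt) {
  // One common remedy: handle the full-width case explicitly, so the amount
  // passed to '>>' is always strictly less than the bit width.
  if (ShiftAmt >= 32)
    return 0;
  return Value >> ShiftAmt;
}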

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/tmp/scan-build-2022-09-04-125545-48738-1 -x c++ /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/CodeGen/MachineFrameInfo.h"
23#include "llvm/IR/DiagnosticInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
25#include "llvm/Support/CommandLine.h"
26#include "llvm/Support/KnownBits.h"
27#include "llvm/Target/TargetMachine.h"
28
29using namespace llvm;
30
31#include "AMDGPUGenCallingConv.inc"
32
33static cl::opt<bool> AMDGPUBypassSlowDiv(
34 "amdgpu-bypass-slow-div",
35 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
36 cl::init(true));
37
38// Find a larger type to do a load / store of a vector with.
39EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
40 unsigned StoreSize = VT.getStoreSizeInBits();
41 if (StoreSize <= 32)
42 return EVT::getIntegerVT(Ctx, StoreSize);
43
44 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
45 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
46}
47
48unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
49 return DAG.computeKnownBits(Op).countMaxActiveBits();
50}
51
52unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
53 // In order for this to be a signed 24-bit value, bit 23 must
54 // be a sign bit.
55 return DAG.ComputeMaxSignificantBits(Op);
56}
57
58AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
59 const AMDGPUSubtarget &STI)
60 : TargetLowering(TM), Subtarget(&STI) {
61 // Lower floating point store/load to integer store/load to reduce the number
62 // of patterns in tablegen.
63 setOperationAction(ISD::LOAD, MVT::f32, Promote);
64 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
65
66 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
67 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
68
69 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
70 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
71
72 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
73 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
74
75 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
77
78 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
80
81 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
83
84 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
86
87 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
89
90 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
92
93 setOperationAction(ISD::LOAD, MVT::i64, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
95
96 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
98
99 setOperationAction(ISD::LOAD, MVT::f64, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
101
102 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
104
105 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
107
108 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
110
111 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
113
114 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
116
117 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
119
120 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
122
123 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
125
126 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
128
129 // There are no 64-bit extloads. These should be done as a 32-bit extload and
130 // an extension to 64-bit.
131 for (MVT VT : MVT::integer_valuetypes())
132 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
133 Expand);
134
135 for (MVT VT : MVT::integer_valuetypes()) {
136 if (VT == MVT::i64)
137 continue;
138
139 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
140 setLoadExtAction(Op, VT, MVT::i1, Promote);
141 setLoadExtAction(Op, VT, MVT::i8, Legal);
142 setLoadExtAction(Op, VT, MVT::i16, Legal);
143 setLoadExtAction(Op, VT, MVT::i32, Expand);
144 }
145 }
146
147 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
148 for (auto MemVT :
149 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
150 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
151 Expand);
152
153 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
154 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
155 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
156 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
157 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
158 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
159 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
160
161 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
162 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
163 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
164 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
165 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
166 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
167
168 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
169 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
171 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
172 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
173 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
174
175 setOperationAction(ISD::STORE, MVT::f32, Promote);
176 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
177
178 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
179 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
180
181 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
182 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
183
184 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
185 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
186
187 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
188 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
189
190 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
191 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
192
193 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
194 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
195
196 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
197 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
198
199 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
200 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
201
202 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
203 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
204
205 setOperationAction(ISD::STORE, MVT::i64, Promote);
206 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
207
208 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
209 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
210
211 setOperationAction(ISD::STORE, MVT::f64, Promote);
212 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
213
214 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
215 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
216
217 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
218 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
219
220 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
221 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
222
223 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
224 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
225
226 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
227 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
228
229 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
230 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
231
232 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
233 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
234
235 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
236 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
237
238 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
239 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
240
241 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
242 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
243 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
244 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
245
246 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
247 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
248 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
249 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
250
251 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
252 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
253 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
254 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
255 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
256 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
257 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
258
259 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
260 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
261
262 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
263 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
264
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
267 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
268 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
269
270 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
271 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
272 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
273 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
274
275 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
276 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
277
278 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
279 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
280 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
281 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
282 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
283 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
284 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
285
286 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
287 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
288
289 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
290
291 // This is totally unsupported, just custom lower to produce an error.
292 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
293
294 // Library functions. These default to Expand, but we have instructions
295 // for them.
296 setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
297 ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
298 ISD::FMAXNUM},
299 MVT::f32, Legal);
300
301 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
302
303 setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
304
305 setOperationAction(ISD::FNEARBYINT, {MVT::f32, MVT::f64}, Custom);
306
307 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
308
309 // Expand to fneg + fadd.
310 setOperationAction(ISD::FSUB, MVT::f64, Expand);
311
312 setOperationAction(ISD::CONCAT_VECTORS,
313 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
314 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
315 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
316 Custom);
317 setOperationAction(
318 ISD::EXTRACT_SUBVECTOR,
319 {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32,
320 MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32,
321 MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32,
322 MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16,
323 MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64,
324 MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64,
325 MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
326 Custom);
327
328 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
329 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
330
331 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
332 for (MVT VT : ScalarIntVTs) {
333 // These should use [SU]DIVREM, so set them to expand
334 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
335 Expand);
336
337 // GPU does not have divrem function for signed or unsigned.
338 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
339
340 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
341 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
342
343 setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
344
345 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
346 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
347 }
348
349 // The hardware supports 32-bit FSHR, but not FSHL.
350 setOperationAction(ISD::FSHR, MVT::i32, Legal);
351
352 // The hardware supports 32-bit ROTR, but not ROTL.
353 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
354 setOperationAction(ISD::ROTR, MVT::i64, Expand);
355
356 setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
357
358 setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
359 setOperationAction(
360 {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
361 MVT::i64, Custom);
362 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
363
364 setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
365 Legal);
366
367 setOperationAction(
368 {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
369 MVT::i64, Custom);
370
371 static const MVT::SimpleValueType VectorIntTypes[] = {
372 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
373
374 for (MVT VT : VectorIntTypes) {
375 // Expand the following operations for the current type by default.
376 setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
377 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
378 ISD::MULHS, ISD::OR, ISD::SHL,
379 ISD::SRA, ISD::SRL, ISD::ROTL,
380 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
381 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
382 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
383 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
384 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
385 ISD::XOR, ISD::BSWAP, ISD::CTPOP,
386 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
387 ISD::SETCC},
388 VT, Expand);
389 }
390
391 static const MVT::SimpleValueType FloatVectorTypes[] = {
392 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
393
394 for (MVT VT : FloatVectorTypes) {
395 setOperationAction(
396 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD,
397 ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2,
398 ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG,
399 ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC,
400 ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
401 ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG,
402 ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
403 ISD::SETCC, ISD::FCANONICALIZE},
404 VT, Expand);
405 }
406
407 // This causes using an unrolled select operation rather than expansion with
408 // bit operations. This is in general better, but the alternative using BFI
409 // instructions may be better if the select sources are SGPRs.
410 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
411 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
412
413 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
414 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
415
416 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
417 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
418
419 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
420 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
421
422 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
423 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
424
425 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
426 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
427
428 // There are no libcalls of any kind.
429 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
430 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
431
432 setSchedulingPreference(Sched::RegPressure);
433 setJumpIsExpensive(true);
434
435 // FIXME: This is only partially true. If we have to do vector compares, any
436 // SGPR pair can be a condition register. If we have a uniform condition, we
437 // are better off doing SALU operations, where there is only one SCC. For now,
438 // we don't have a way of knowing during instruction selection if a condition
439 // will be uniform and we always use vector compares. Assume we are using
440 // vector compares until that is fixed.
441 setHasMultipleConditionRegisters(true);
442
443 setMinCmpXchgSizeInBits(32);
444 setSupportsUnalignedAtomics(false);
445
446 PredictableSelectIsExpensive = false;
447
448 // We want to find all load dependencies for long chains of stores to enable
449 // merging into very wide vectors. The problem is with vectors with > 4
450 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
451 // vectors are a legal type, even though we have to split the loads
452 // usually. When we can more precisely specify load legality per address
453 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
454 // smarter so that they can figure out what to do in 2 iterations without all
455 // N > 4 stores on the same chain.
456 GatherAllAliasesMaxDepth = 16;
457
458 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
459 // about these during lowering.
460 MaxStoresPerMemcpy = 0xffffffff;
461 MaxStoresPerMemmove = 0xffffffff;
462 MaxStoresPerMemset = 0xffffffff;
463
464 // The expansion for 64-bit division is enormous.
465 if (AMDGPUBypassSlowDiv)
466 addBypassSlowDiv(64, 32);
467
468 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
469 ISD::SRA, ISD::SRL,
470 ISD::TRUNCATE, ISD::MUL,
471 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
472 ISD::MULHU, ISD::MULHS,
473 ISD::SELECT, ISD::SELECT_CC,
474 ISD::STORE, ISD::FADD,
475 ISD::FSUB, ISD::FNEG,
476 ISD::FABS, ISD::AssertZext,
477 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
478}
479
480bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
481 if (getTargetMachine().Options.NoSignedZerosFPMath)
482 return true;
483
484 const auto Flags = Op.getNode()->getFlags();
485 if (Flags.hasNoSignedZeros())
486 return true;
487
488 return false;
489}
490
491//===----------------------------------------------------------------------===//
492// Target Information
493//===----------------------------------------------------------------------===//
494
495 LLVM_READNONE
496static bool fnegFoldsIntoOp(unsigned Opc) {
497 switch (Opc) {
498 case ISD::FADD:
499 case ISD::FSUB:
500 case ISD::FMUL:
501 case ISD::FMA:
502 case ISD::FMAD:
503 case ISD::FMINNUM:
504 case ISD::FMAXNUM:
505 case ISD::FMINNUM_IEEE:
506 case ISD::FMAXNUM_IEEE:
507 case ISD::FSIN:
508 case ISD::FTRUNC:
509 case ISD::FRINT:
510 case ISD::FNEARBYINT:
511 case ISD::FCANONICALIZE:
512 case AMDGPUISD::RCP:
513 case AMDGPUISD::RCP_LEGACY:
514 case AMDGPUISD::RCP_IFLAG:
515 case AMDGPUISD::SIN_HW:
516 case AMDGPUISD::FMUL_LEGACY:
517 case AMDGPUISD::FMIN_LEGACY:
518 case AMDGPUISD::FMAX_LEGACY:
519 case AMDGPUISD::FMED3:
520 // TODO: handle llvm.amdgcn.fma.legacy
521 return true;
522 default:
523 return false;
524 }
525}
526
527/// \p returns true if the operation will definitely need to use a 64-bit
528/// encoding, and thus will use a VOP3 encoding regardless of the source
529/// modifiers.
530 LLVM_READONLY
531static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
532 return N->getNumOperands() > 2 || VT == MVT::f64;
533}
534
535// Most FP instructions support source modifiers, but this could be refined
536// slightly.
537 LLVM_READONLY
538static bool hasSourceMods(const SDNode *N) {
539 if (isa<MemSDNode>(N))
540 return false;
541
542 switch (N->getOpcode()) {
543 case ISD::CopyToReg:
544 case ISD::SELECT:
545 case ISD::FDIV:
546 case ISD::FREM:
547 case ISD::INLINEASM:
548 case ISD::INLINEASM_BR:
549 case AMDGPUISD::DIV_SCALE:
550 case ISD::INTRINSIC_W_CHAIN:
551
552 // TODO: Should really be looking at the users of the bitcast. These are
553 // problematic because bitcasts are used to legalize all stores to integer
554 // types.
555 case ISD::BITCAST:
556 return false;
557 case ISD::INTRINSIC_WO_CHAIN: {
558 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
559 case Intrinsic::amdgcn_interp_p1:
560 case Intrinsic::amdgcn_interp_p2:
561 case Intrinsic::amdgcn_interp_mov:
562 case Intrinsic::amdgcn_interp_p1_f16:
563 case Intrinsic::amdgcn_interp_p2_f16:
564 return false;
565 default:
566 return true;
567 }
568 }
569 default:
570 return true;
571 }
572}
573
574bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
575 unsigned CostThreshold) {
576 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
577 // it is truly free to use a source modifier in all cases. If there are
578 // multiple users, and each one will necessitate using VOP3, there will be
579 // a code size increase. Try to avoid increasing code size unless we know it
580 // will save on the instruction count.
581 unsigned NumMayIncreaseSize = 0;
582 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
583
584 // XXX - Should this limit number of uses to check?
585 for (const SDNode *U : N->uses()) {
586 if (!hasSourceMods(U))
587 return false;
588
589 if (!opMustUseVOP3Encoding(U, VT)) {
590 if (++NumMayIncreaseSize > CostThreshold)
591 return false;
592 }
593 }
594
595 return true;
596}
597
598EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
599 ISD::NodeType ExtendKind) const {
600 assert(!VT.isVector() && "only scalar expected");
601
602 // Round to the next multiple of 32-bits.
603 unsigned Size = VT.getSizeInBits();
604 if (Size <= 32)
605 return MVT::i32;
606 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
607}
608
609MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
610 return MVT::i32;
611}
612
613bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
614 return true;
615}
616
617// The backend supports 32 and 64 bit floating point immediates.
618// FIXME: Why are we reporting vectors of FP immediates as legal?
619bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
620 bool ForCodeSize) const {
621 EVT ScalarVT = VT.getScalarType();
622 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
623 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
624}
625
626// We don't want to shrink f64 / f32 constants.
627bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
628 EVT ScalarVT = VT.getScalarType();
629 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
630}
631
632bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
633 ISD::LoadExtType ExtTy,
634 EVT NewVT) const {
635 // TODO: This may be worth removing. Check regression tests for diffs.
636 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
637 return false;
638
639 unsigned NewSize = NewVT.getStoreSizeInBits();
640
641 // If we are reducing to a 32-bit load or a smaller multi-dword load,
642 // this is always better.
643 if (NewSize >= 32)
644 return true;
645
646 EVT OldVT = N->getValueType(0);
647 unsigned OldSize = OldVT.getStoreSizeInBits();
648
649 MemSDNode *MN = cast<MemSDNode>(N);
650 unsigned AS = MN->getAddressSpace();
651 // Do not shrink an aligned scalar load to sub-dword.
652 // Scalar engine cannot do sub-dword loads.
653 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
654 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
655 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
656 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
657 MN->isInvariant())) &&
658 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
659 return false;
660
661 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
662 // extloads, so doing one requires using a buffer_load. In cases where we
663 // still couldn't use a scalar load, using the wider load shouldn't really
664 // hurt anything.
665
666 // If the old size already had to be an extload, there's no harm in continuing
667 // to reduce the width.
668 return (OldSize < 32);
669}
670
671bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
672 const SelectionDAG &DAG,
673 const MachineMemOperand &MMO) const {
674
675 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
676
677 if (LoadTy.getScalarType() == MVT::i32)
678 return false;
679
680 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
681 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
682
683 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
684 return false;
685
686 bool Fast = false;
687 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
688 CastTy, MMO, &Fast) &&
689 Fast;
690}
691
692// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
693// profitable with the expansion for 64-bit since it's generally good to
694// speculate things.
695bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
696 return true;
697}
698
699bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
700 return true;
701}
702
703bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
704 switch (N->getOpcode()) {
705 case ISD::EntryToken:
706 case ISD::TokenFactor:
707 return true;
708 case ISD::INTRINSIC_WO_CHAIN: {
709 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
710 switch (IntrID) {
711 case Intrinsic::amdgcn_readfirstlane:
712 case Intrinsic::amdgcn_readlane:
713 return true;
714 }
715 return false;
716 }
717 case ISD::LOAD:
718 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
719 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
720 return true;
721 return false;
722 case AMDGPUISD::SETCC: // ballot-style instruction
723 return true;
724 }
725 return false;
726}
727
728SDValue AMDGPUTargetLowering::getNegatedExpression(
729 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
730 NegatibleCost &Cost, unsigned Depth) const {
731
732 switch (Op.getOpcode()) {
733 case ISD::FMA:
734 case ISD::FMAD: {
735 // Negating a fma is not free if it has users without source mods.
736 if (!allUsesHaveSourceMods(Op.getNode()))
737 return SDValue();
738 break;
739 }
740 default:
741 break;
742 }
743
744 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
745 ForCodeSize, Cost, Depth);
746}
747
748//===---------------------------------------------------------------------===//
749// Target Properties
750//===---------------------------------------------------------------------===//
751
752bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
753 assert(VT.isFloatingPoint());
754
755 // Packed operations do not have a fabs modifier.
756 return VT == MVT::f32 || VT == MVT::f64 ||
757 (Subtarget->has16BitInsts() && VT == MVT::f16);
758}
759
760bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
761 assert(VT.isFloatingPoint());
762 // Report this based on the end legalized type.
763 VT = VT.getScalarType();
764 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
765}
766
767bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
768 unsigned NumElem,
769 unsigned AS) const {
770 return true;
771}
772
773bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
774 // There are few operations which truly have vector input operands. Any vector
775 // operation is going to involve operations on each component, and a
776 // build_vector will be a copy per element, so it always makes sense to use a
777 // build_vector input in place of the extracted element to avoid a copy into a
778 // super register.
779 //
780 // We should probably only do this if all users are extracts only, but this
781 // should be the common case.
782 return true;
783}
784
785bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
786 // Truncate is just accessing a subregister.
787
788 unsigned SrcSize = Source.getSizeInBits();
789 unsigned DestSize = Dest.getSizeInBits();
790
791 return DestSize < SrcSize && DestSize % 32 == 0;
792}
793
794bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
795 // Truncate is just accessing a subregister.
796
797 unsigned SrcSize = Source->getScalarSizeInBits();
798 unsigned DestSize = Dest->getScalarSizeInBits();
799
800 if (DestSize == 16 && Subtarget->has16BitInsts())
801 return SrcSize >= 32;
802
803 return DestSize < SrcSize && DestSize % 32 == 0;
804}
805
806bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
807 unsigned SrcSize = Src->getScalarSizeInBits();
808 unsigned DestSize = Dest->getScalarSizeInBits();
809
810 if (SrcSize == 16 && Subtarget->has16BitInsts())
811 return DestSize >= 32;
812
813 return SrcSize == 32 && DestSize == 64;
814}
815
816bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
817 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
818 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
819 // this will enable reducing 64-bit operations to 32-bit, which is always
820 // good.
821
822 if (Src == MVT::i16)
823 return Dest == MVT::i32 || Dest == MVT::i64;
824
825 return Src == MVT::i32 && Dest == MVT::i64;
826}
827
828bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
829 return isZExtFree(Val.getValueType(), VT2);
830}
831
832bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
833 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
834 // limited number of native 64-bit operations. Shrinking an operation to fit
835 // in a single 32-bit register should always be helpful. As currently used,
836 // this is much less general than the name suggests, and is only used in
837 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
838 // not profitable, and may actually be harmful.
839 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
840}
841
842//===---------------------------------------------------------------------===//
843// TargetLowering Callbacks
844//===---------------------------------------------------------------------===//
845
846CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
847 bool IsVarArg) {
848 switch (CC) {
849 case CallingConv::AMDGPU_VS:
850 case CallingConv::AMDGPU_GS:
851 case CallingConv::AMDGPU_PS:
852 case CallingConv::AMDGPU_CS:
853 case CallingConv::AMDGPU_HS:
854 case CallingConv::AMDGPU_ES:
855 case CallingConv::AMDGPU_LS:
856 return CC_AMDGPU;
857 case CallingConv::C:
858 case CallingConv::Fast:
859 case CallingConv::Cold:
860 return CC_AMDGPU_Func;
861 case CallingConv::AMDGPU_Gfx:
862 return CC_SI_Gfx;
863 case CallingConv::AMDGPU_KERNEL:
864 case CallingConv::SPIR_KERNEL:
865 default:
866 report_fatal_error("Unsupported calling convention for call");
867 }
868}
869
870CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
871 bool IsVarArg) {
872 switch (CC) {
873 case CallingConv::AMDGPU_KERNEL:
874 case CallingConv::SPIR_KERNEL:
875 llvm_unreachable("kernels should not be handled here");
876 case CallingConv::AMDGPU_VS:
877 case CallingConv::AMDGPU_GS:
878 case CallingConv::AMDGPU_PS:
879 case CallingConv::AMDGPU_CS:
880 case CallingConv::AMDGPU_HS:
881 case CallingConv::AMDGPU_ES:
882 case CallingConv::AMDGPU_LS:
883 return RetCC_SI_Shader;
884 case CallingConv::AMDGPU_Gfx:
885 return RetCC_SI_Gfx;
886 case CallingConv::C:
887 case CallingConv::Fast:
888 case CallingConv::Cold:
889 return RetCC_AMDGPU_Func;
890 default:
891 report_fatal_error("Unsupported calling convention.");
892 }
893}
894
895/// The SelectionDAGBuilder will automatically promote function arguments
896/// with illegal types. However, this does not work for the AMDGPU targets
897/// since the function arguments are stored in memory as these illegal types.
898/// In order to handle this properly we need to get the original types sizes
899/// from the LLVM IR Function and fixup the ISD:InputArg values before
900/// passing them to AnalyzeFormalArguments()
901
902/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
903/// input values across multiple registers. Each item in the Ins array
904/// represents a single value that will be stored in registers. Ins[x].VT is
905/// the value type of the value that will be stored in the register, so
906/// whatever SDNode we lower the argument to needs to be this type.
907///
908/// In order to correctly lower the arguments we need to know the size of each
909/// argument. Since Ins[x].VT gives us the size of the register that will
910/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
911/// for the original function argument so that we can deduce the correct memory
912/// type to use for Ins[x]. In most cases the correct memory type will be
913/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
914/// we have a kernel argument of type v8i8, this argument will be split into
915/// 8 parts and each part will be represented by its own item in the Ins array.
916/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
917/// the argument before it was split. From this, we deduce that the memory type
918/// for each individual part is i8. We pass the memory type as LocVT to the
919/// calling convention analysis function and the register type (Ins[x].VT) as
920/// the ValVT.
921void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
922 CCState &State,
923 const SmallVectorImpl<ISD::InputArg> &Ins) const {
924 const MachineFunction &MF = State.getMachineFunction();
925 const Function &Fn = MF.getFunction();
926 LLVMContext &Ctx = Fn.getParent()->getContext();
927 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
928 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
929 CallingConv::ID CC = Fn.getCallingConv();
930
931 Align MaxAlign = Align(1);
932 uint64_t ExplicitArgOffset = 0;
933 const DataLayout &DL = Fn.getParent()->getDataLayout();
934
935 unsigned InIndex = 0;
936
937 for (const Argument &Arg : Fn.args()) {
938 const bool IsByRef = Arg.hasByRefAttr();
939 Type *BaseArgTy = Arg.getType();
940 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
941 Align Alignment = DL.getValueOrABITypeAlignment(
942 IsByRef ? Arg.getParamAlign() : None, MemArgTy);
943 MaxAlign = std::max(Alignment, MaxAlign);
944 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
945
946 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
947 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
948
949 // We're basically throwing away everything passed into us and starting over
950 // to get accurate in-memory offsets. The "PartOffset" is completely useless
951 // to us as computed in Ins.
952 //
953 // We also need to figure out what type legalization is trying to do to get
954 // the correct memory offsets.
955
956 SmallVector<EVT, 16> ValueVTs;
957 SmallVector<uint64_t, 16> Offsets;
958 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
959
960 for (unsigned Value = 0, NumValues = ValueVTs.size();
961 Value != NumValues; ++Value) {
962 uint64_t BasePartOffset = Offsets[Value];
963
964 EVT ArgVT = ValueVTs[Value];
965 EVT MemVT = ArgVT;
966 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
967 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
968
969 if (NumRegs == 1) {
970 // This argument is not split, so the IR type is the memory type.
971 if (ArgVT.isExtended()) {
972 // We have an extended type, like i24, so we should just use the
973 // register type.
974 MemVT = RegisterVT;
975 } else {
976 MemVT = ArgVT;
977 }
978 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
979 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
980 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
981 // We have a vector value which has been split into a vector with
982 // the same scalar type, but fewer elements. This should handle
983 // all the floating-point vector types.
984 MemVT = RegisterVT;
985 } else if (ArgVT.isVector() &&
986 ArgVT.getVectorNumElements() == NumRegs) {
987 // This arg has been split so that each element is stored in a separate
988 // register.
989 MemVT = ArgVT.getScalarType();
990 } else if (ArgVT.isExtended()) {
991 // We have an extended type, like i65.
992 MemVT = RegisterVT;
993 } else {
994 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
995 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
996 if (RegisterVT.isInteger()) {
997 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
998 } else if (RegisterVT.isVector()) {
999 assert(!RegisterVT.getScalarType().isFloatingPoint());
1000 unsigned NumElements = RegisterVT.getVectorNumElements();
1001 assert(MemoryBits % NumElements == 0);
1002 // This vector type has been split into another vector type with
1003 // a different elements size.
1004 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1005 MemoryBits / NumElements);
1006 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1007 } else {
1008 llvm_unreachable("cannot deduce memory type.");
1009 }
1010 }
1011
1012 // Convert one element vectors to scalar.
1013 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1014 MemVT = MemVT.getScalarType();
1015
1016 // Round up vec3/vec5 argument.
1017 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1018 assert(MemVT.getVectorNumElements() == 3 ||
1019 MemVT.getVectorNumElements() == 5);
1020 MemVT = MemVT.getPow2VectorType(State.getContext());
1021 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1022 MemVT = MemVT.getRoundIntegerType(State.getContext());
1023 }
1024
1025 unsigned PartOffset = 0;
1026 for (unsigned i = 0; i != NumRegs; ++i) {
1027 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1028 BasePartOffset + PartOffset,
1029 MemVT.getSimpleVT(),
1030 CCValAssign::Full));
1031 PartOffset += MemVT.getStoreSize();
1032 }
1033 }
1034 }
1035}
1036
1037SDValue AMDGPUTargetLowering::LowerReturn(
1038 SDValue Chain, CallingConv::ID CallConv,
1039 bool isVarArg,
1040 const SmallVectorImpl<ISD::OutputArg> &Outs,
1041 const SmallVectorImpl<SDValue> &OutVals,
1042 const SDLoc &DL, SelectionDAG &DAG) const {
1043 // FIXME: Fails for r600 tests
1044 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1045 // "wave terminate should not have return values");
1046 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1047}
1048
1049//===---------------------------------------------------------------------===//
1050// Target specific lowering
1051//===---------------------------------------------------------------------===//
1052
1053/// Selects the correct CCAssignFn for a given CallingConvention value.
1054CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1055 bool IsVarArg) {
1056 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1057}
1058
1059CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1060 bool IsVarArg) {
1061 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1062}
1063
1064SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1065 SelectionDAG &DAG,
1066 MachineFrameInfo &MFI,
1067 int ClobberedFI) const {
1068 SmallVector<SDValue, 8> ArgChains;
1069 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1070 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1071
1072 // Include the original chain at the beginning of the list. When this is
1073 // used by target LowerCall hooks, this helps legalize find the
1074 // CALLSEQ_BEGIN node.
1075 ArgChains.push_back(Chain);
1076
1077 // Add a chain value for each stack argument corresponding
1078 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1079 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1080 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1081 if (FI->getIndex() < 0) {
1082 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1083 int64_t InLastByte = InFirstByte;
1084 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1085
1086 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1087 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1088 ArgChains.push_back(SDValue(L, 1));
1089 }
1090 }
1091 }
1092 }
1093
1094 // Build a tokenfactor for all the chains.
1095 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1096}
1097
1098SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1099 SmallVectorImpl<SDValue> &InVals,
1100 StringRef Reason) const {
1101 SDValue Callee = CLI.Callee;
1102 SelectionDAG &DAG = CLI.DAG;
1103
1104 const Function &Fn = DAG.getMachineFunction().getFunction();
1105
1106 StringRef FuncName("<unknown>");
1107
1108 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1109 FuncName = G->getSymbol();
1110 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1111 FuncName = G->getGlobal()->getName();
1112
1113 DiagnosticInfoUnsupported NoCalls(
1114 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1115 DAG.getContext()->diagnose(NoCalls);
1116
1117 if (!CLI.IsTailCall) {
1118 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1119 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1120 }
1121
1122 return DAG.getEntryNode();
1123}
1124
1125SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1126 SmallVectorImpl<SDValue> &InVals) const {
1127 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1128}
1129
1130SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1131 SelectionDAG &DAG) const {
1132 const Function &Fn = DAG.getMachineFunction().getFunction();
1133
1134 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1135 SDLoc(Op).getDebugLoc());
1136 DAG.getContext()->diagnose(NoDynamicAlloca);
1137 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1138 return DAG.getMergeValues(Ops, SDLoc());
1139}
1140
1141SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1142 SelectionDAG &DAG) const {
1143 switch (Op.getOpcode()) {
1144 default:
1145 Op->print(errs(), &DAG);
1146 llvm_unreachable("Custom lowering code for this "
1147 "instruction is not implemented yet!");
1148 break;
1149 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1150 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1151 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1152 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1153 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1154 case ISD::FREM: return LowerFREM(Op, DAG);
1155 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1156 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1157 case ISD::FRINT: return LowerFRINT(Op, DAG);
1158 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1159 case ISD::FROUND: return LowerFROUND(Op, DAG);
1160 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1161 case ISD::FLOG:
1162 return LowerFLOG(Op, DAG, numbers::ln2f);
1163 case ISD::FLOG10:
1164 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1165 case ISD::FEXP:
1166 return lowerFEXP(Op, DAG);
1167 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1168 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1169 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1170 case ISD::FP_TO_SINT:
1171 case ISD::FP_TO_UINT:
1172 return LowerFP_TO_INT(Op, DAG);
1173 case ISD::CTTZ:
1174 case ISD::CTTZ_ZERO_UNDEF:
1175 case ISD::CTLZ:
1176 case ISD::CTLZ_ZERO_UNDEF:
1177 return LowerCTLZ_CTTZ(Op, DAG);
1178 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1179 }
1180 return Op;
1181}
1182
1183void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1184 SmallVectorImpl<SDValue> &Results,
1185 SelectionDAG &DAG) const {
1186 switch (N->getOpcode()) {
1187 case ISD::SIGN_EXTEND_INREG:
1188 // Different parts of legalization seem to interpret which type of
1189 // sign_extend_inreg is the one to check for custom lowering. The extended
1190 // from type is what really matters, but some places check for custom
1191 // lowering of the result type. This results in trying to use
1192 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1193 // nothing here and let the illegal result integer be handled normally.
1194 return;
1195 default:
1196 return;
1197 }
1198}
1199
1200SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1201 SDValue Op,
1202 SelectionDAG &DAG) const {
1203
1204 const DataLayout &DL = DAG.getDataLayout();
1205 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1206 const GlobalValue *GV = G->getGlobal();
1207
1208 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1209 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1210 if (!MFI->isModuleEntryFunction() &&
1211 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1212 SDLoc DL(Op);
1213 const Function &Fn = DAG.getMachineFunction().getFunction();
1214 DiagnosticInfoUnsupported BadLDSDecl(
1215 Fn, "local memory global used by non-kernel function",
1216 DL.getDebugLoc(), DS_Warning);
1217 DAG.getContext()->diagnose(BadLDSDecl);
1218
1219 // We currently don't have a way to correctly allocate LDS objects that
1220 // aren't directly associated with a kernel. We do force inlining of
1221 // functions that use local objects. However, if these dead functions are
1222 // not eliminated, we don't want a compile time error. Just emit a warning
1223 // and a trap, since there should be no callable path here.
1224 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1225 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1226 Trap, DAG.getRoot());
1227 DAG.setRoot(OutputChain);
1228 return DAG.getUNDEF(Op.getValueType());
1229 }
1230
1231 // XXX: What does the value of G->getOffset() mean?
1232 assert(G->getOffset() == 0 &&
1233 "Do not know what to do with an non-zero offset");
1234
1235 // TODO: We could emit code to handle the initialization somewhere.
1236 // We ignore the initializer for now and legalize it to allow selection.
1237 // The initializer will anyway get errored out during assembly emission.
1238 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1239 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1240 }
1241 return SDValue();
1242}
1243
1244SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1245 SelectionDAG &DAG) const {
1246 SmallVector<SDValue, 8> Args;
1247
1248 EVT VT = Op.getValueType();
1249 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1250 SDLoc SL(Op);
1251 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1252 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1253
1254 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1255 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1256 }
1257
1258 for (const SDUse &U : Op->ops())
1259 DAG.ExtractVectorElements(U.get(), Args);
1260
1261 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1262}
1263
1264SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1265 SelectionDAG &DAG) const {
1266
1267 SmallVector<SDValue, 8> Args;
1268 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1269 EVT VT = Op.getValueType();
1270 EVT SrcVT = Op.getOperand(0).getValueType();
1271
1272 // For these types, we have some TableGen patterns except if the index is 1
1273 if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1274 (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1275 Start != 1)
1276 return Op;
1277
1278 if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1279 (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1280 (Start == 0 || Start == 4))
1281 return Op;
1282
1283 if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
1284 (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
1285 (Start == 0 || Start == 8))
1286 return Op;
1287
1288 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1289 VT.getVectorNumElements());
1290
1291 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1292}
1293
1294/// Generate Min/Max node
1295SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1296 SDValue LHS, SDValue RHS,
1297 SDValue True, SDValue False,
1298 SDValue CC,
1299 DAGCombinerInfo &DCI) const {
1300 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1301 return SDValue();
1302
1303 SelectionDAG &DAG = DCI.DAG;
1304 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1305 switch (CCOpcode) {
1306 case ISD::SETOEQ:
1307 case ISD::SETONE:
1308 case ISD::SETUNE:
1309 case ISD::SETNE:
1310 case ISD::SETUEQ:
1311 case ISD::SETEQ:
1312 case ISD::SETFALSE:
1313 case ISD::SETFALSE2:
1314 case ISD::SETTRUE:
1315 case ISD::SETTRUE2:
1316 case ISD::SETUO:
1317 case ISD::SETO:
1318 break;
1319 case ISD::SETULE:
1320 case ISD::SETULT: {
1321 if (LHS == True)
1322 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1323 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1324 }
1325 case ISD::SETOLE:
1326 case ISD::SETOLT:
1327 case ISD::SETLE:
1328 case ISD::SETLT: {
1329 // Ordered. Assume ordered for undefined.
1330
1331 // Only do this after legalization to avoid interfering with other combines
1332 // which might occur.
1333 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1334 !DCI.isCalledByLegalizer())
1335 return SDValue();
1336
1337 // We need to permute the operands to get the correct NaN behavior. The
1338 // selected operand is the second one based on the failing compare with NaN,
1339 // so permute it based on the compare type the hardware uses.
1340 if (LHS == True)
1341 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1342 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1343 }
1344 case ISD::SETUGE:
1345 case ISD::SETUGT: {
1346 if (LHS == True)
1347 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1348 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1349 }
1350 case ISD::SETGT:
1351 case ISD::SETGE:
1352 case ISD::SETOGE:
1353 case ISD::SETOGT: {
1354 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1355 !DCI.isCalledByLegalizer())
1356 return SDValue();
1357
1358 if (LHS == True)
1359 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1360 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1361 }
1362 case ISD::SETCC_INVALID:
1363 llvm_unreachable("Invalid setcc condcode!");
1364 }
1365 return SDValue();
1366}
1367
1368std::pair<SDValue, SDValue>
1369AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1370 SDLoc SL(Op);
1371
1372 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1373
1374 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1375 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1376
1377 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1378 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1379
1380 return std::make_pair(Lo, Hi);
1381}
1382
1383SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1384 SDLoc SL(Op);
1385
1386 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1387 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1388 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1389}
1390
1391SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1392 SDLoc SL(Op);
1393
1394 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1395 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1396 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1397}
1398
1399// Split a vector type into two parts. The first part is a power of two vector.
1400// The second part is whatever is left over, and is a scalar if it would
1401// otherwise be a 1-vector.
1402std::pair<EVT, EVT>
1403AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1404 EVT LoVT, HiVT;
1405 EVT EltVT = VT.getVectorElementType();
1406 unsigned NumElts = VT.getVectorNumElements();
1407 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1408 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1409 HiVT = NumElts - LoNumElts == 1
1410 ? EltVT
1411 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1412 return std::make_pair(LoVT, HiVT);
1413}
1414
1415// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1416// scalar.
1417std::pair<SDValue, SDValue>
1418AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1419 const EVT &LoVT, const EVT &HiVT,
1420 SelectionDAG &DAG) const {
1421 assert(LoVT.getVectorNumElements() +
1422 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1423 N.getValueType().getVectorNumElements() &&
1424 "More vector elements requested than available!");
1425 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1426 DAG.getVectorIdxConstant(0, DL));
1427 SDValue Hi = DAG.getNode(
1428 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1429 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1430 return std::make_pair(Lo, Hi);
1431}
1432
1433SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1434 SelectionDAG &DAG) const {
1435 LoadSDNode *Load = cast<LoadSDNode>(Op);
1436 EVT VT = Op.getValueType();
1437 SDLoc SL(Op);
1438
1439
1440 // If this is a 2 element vector, we really want to scalarize and not create
1441 // weird 1 element vectors.
1442 if (VT.getVectorNumElements() == 2) {
1443 SDValue Ops[2];
1444 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1445 return DAG.getMergeValues(Ops, SL);
1446 }
1447
1448 SDValue BasePtr = Load->getBasePtr();
1449 EVT MemVT = Load->getMemoryVT();
1450
1451 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1452
1453 EVT LoVT, HiVT;
1454 EVT LoMemVT, HiMemVT;
1455 SDValue Lo, Hi;
1456
1457 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1458 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1459 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1460
1461 unsigned Size = LoMemVT.getStoreSize();
1462 Align BaseAlign = Load->getAlign();
1463 Align HiAlign = commonAlignment(BaseAlign, Size);
1464
1465 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1466 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1467 BaseAlign, Load->getMemOperand()->getFlags());
1468 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1469 SDValue HiLoad =
1470 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1471 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1472 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1473
1474 SDValue Join;
1475 if (LoVT == HiVT) {
1476 // This is the case that the vector is power of two so was evenly split.
1477 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1478 } else {
1479 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1480 DAG.getVectorIdxConstant(0, SL));
1481 Join = DAG.getNode(
1482 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1483 VT, Join, HiLoad,
1484 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1485 }
1486
1487 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1488 LoLoad.getValue(1), HiLoad.getValue(1))};
1489
1490 return DAG.getMergeValues(Ops, SL);
1491}
1492
1493SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1494 SelectionDAG &DAG) const {
1495 LoadSDNode *Load = cast<LoadSDNode>(Op);
1496 EVT VT = Op.getValueType();
1497 SDValue BasePtr = Load->getBasePtr();
1498 EVT MemVT = Load->getMemoryVT();
1499 SDLoc SL(Op);
1500 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1501 Align BaseAlign = Load->getAlign();
1502 unsigned NumElements = MemVT.getVectorNumElements();
1503
1504 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1505 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1506 if (NumElements != 3 ||
1507 (BaseAlign < Align(8) &&
1508 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1509 return SplitVectorLoad(Op, DAG);
1510
1511 assert(NumElements == 3);
1512
1513 EVT WideVT =
1514 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1515 EVT WideMemVT =
1516 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1517 SDValue WideLoad = DAG.getExtLoad(
1518 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1519 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1520 return DAG.getMergeValues(
1521 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1522 DAG.getVectorIdxConstant(0, SL)),
1523 WideLoad.getValue(1)},
1524 SL);
1525}
1526
1527SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1528 SelectionDAG &DAG) const {
1529 StoreSDNode *Store = cast<StoreSDNode>(Op);
1530 SDValue Val = Store->getValue();
1531 EVT VT = Val.getValueType();
1532
1533 // If this is a 2 element vector, we really want to scalarize and not create
1534 // weird 1 element vectors.
1535 if (VT.getVectorNumElements() == 2)
1536 return scalarizeVectorStore(Store, DAG);
1537
1538 EVT MemVT = Store->getMemoryVT();
1539 SDValue Chain = Store->getChain();
1540 SDValue BasePtr = Store->getBasePtr();
1541 SDLoc SL(Op);
1542
1543 EVT LoVT, HiVT;
1544 EVT LoMemVT, HiMemVT;
1545 SDValue Lo, Hi;
1546
1547 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1548 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1549 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1550
1551 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1552
1553 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1554 Align BaseAlign = Store->getAlign();
1555 unsigned Size = LoMemVT.getStoreSize();
1556 Align HiAlign = commonAlignment(BaseAlign, Size);
1557
1558 SDValue LoStore =
1559 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1560 Store->getMemOperand()->getFlags());
1561 SDValue HiStore =
1562 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1563 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1564
1565 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1566}
1567
1568// This is a shortcut for integer division because we have fast i32<->f32
1569// conversions, and fast f32 reciprocal instructions. The fractional part of a
1570// float is enough to accurately represent up to a 24-bit signed integer.
1571SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1572 bool Sign) const {
1573 SDLoc DL(Op);
1574 EVT VT = Op.getValueType();
1575 SDValue LHS = Op.getOperand(0);
1576 SDValue RHS = Op.getOperand(1);
1577 MVT IntVT = MVT::i32;
1578 MVT FltVT = MVT::f32;
1579
1580 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1581 if (LHSSignBits < 9)
1582 return SDValue();
1583
1584 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1585 if (RHSSignBits < 9)
1586 return SDValue();
1587
1588 unsigned BitSize = VT.getSizeInBits();
1589 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1590 unsigned DivBits = BitSize - SignBits;
1591 if (Sign)
1592 ++DivBits;
1593
1594 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1595 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1596
1597 SDValue jq = DAG.getConstant(1, DL, IntVT);
1598
1599 if (Sign) {
1600 // char|short jq = ia ^ ib;
1601 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1602
1603 // jq = jq >> (bitsize - 2)
1604 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1605 DAG.getConstant(BitSize - 2, DL, VT));
1606
1607 // jq = jq | 0x1
1608 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1609 }
1610
1611 // int ia = (int)LHS;
1612 SDValue ia = LHS;
1613
1614 // int ib = (int)RHS;
1615 SDValue ib = RHS;
1616
1617 // float fa = (float)ia;
1618 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1619
1620 // float fb = (float)ib;
1621 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1622
1623 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1624 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1625
1626 // fq = trunc(fq);
1627 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1628
1629 // float fqneg = -fq;
1630 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1631
1632 MachineFunction &MF = DAG.getMachineFunction();
1633 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1634
1635 // float fr = mad(fqneg, fb, fa);
1636 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1637 (unsigned)ISD::FMA :
1638 !MFI->getMode().allFP32Denormals() ?
1639 (unsigned)ISD::FMAD :
1640 (unsigned)AMDGPUISD::FMAD_FTZ;
1641 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1642
1643 // int iq = (int)fq;
1644 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1645
1646 // fr = fabs(fr);
1647 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1648
1649 // fb = fabs(fb);
1650 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1651
1652 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1653
1654 // int cv = fr >= fb;
1655 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1656
1657 // jq = (cv ? jq : 0);
1658 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1659
1660 // dst = iq + jq;
1661 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1662
1663 // Rem needs compensation, it's easier to recompute it
1664 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1665 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1666
1667 // Truncate to number of bits this divide really is.
1668 if (Sign) {
1669 SDValue InRegSize
1670 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1671 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1672 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1673 } else {
1674 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1675 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1676 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1677 }
1678
1679 return DAG.getMergeValues({ Div, Rem }, DL);
1680}
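
// A minimal scalar sketch of the 24-bit shortcut built above, unsigned case
// only. It assumes IEEE-754 f32 arithmetic; rcpf() is a stand-in for the
// hardware RCP node (plain 1.0f/x here, ignoring its reduced precision), so
// this illustrates the structure of the lowering rather than modeling it
// bit-for-bit.
#include <cmath>
#include <cstdint>

static float rcpf(float x) { return 1.0f / x; } // stand-in for AMDGPUISD::RCP

static void udivrem24(uint32_t a, uint32_t b, uint32_t &q, uint32_t &r) {
  // Both operands are assumed to fit in 24 bits, as the sign-bit checks above
  // guarantee before this path is taken.
  float fa = (float)a;
  float fb = (float)b;
  float fq = std::trunc(fa * rcpf(fb));           // candidate quotient
  float fr = std::fabs(std::fma(-fq, fb, fa));    // |a - fq * b|
  uint32_t iq = (uint32_t)fq;
  uint32_t jq = (fr >= std::fabs(fb)) ? 1u : 0u;  // off-by-one fixup (cv ? jq : 0)
  q = iq + jq;
  r = a - q * b;                                  // remainder is simply recomputed
}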
1681
1682void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1683 SelectionDAG &DAG,
1684 SmallVectorImpl<SDValue> &Results) const {
1685 SDLoc DL(Op);
1686 EVT VT = Op.getValueType();
1687
1688 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1689
1690 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1691
1692 SDValue One = DAG.getConstant(1, DL, HalfVT);
1693 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1694
1695 //HiLo split
1696 SDValue LHS = Op.getOperand(0);
1697 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1698 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1699
1700 SDValue RHS = Op.getOperand(1);
1701 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1702 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1703
1704 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1705 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1706
1707 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1708 LHS_Lo, RHS_Lo);
1709
1710 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1711 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1712
1713 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1714 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1715 return;
1716 }
1717
1718 if (isTypeLegal(MVT::i64)) {
1719 // The algorithm here is based on ideas from "Software Integer Division",
1720 // Tom Rodeheffer, August 2008.
1721
1722 MachineFunction &MF = DAG.getMachineFunction();
1723 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1724
1725 // Compute denominator reciprocal.
1726 unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1727 (unsigned)ISD::FMA :
1728 !MFI->getMode().allFP32Denormals() ?
1729 (unsigned)ISD::FMAD :
1730 (unsigned)AMDGPUISD::FMAD_FTZ;
1731
1732 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1733 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1734 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1735 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1736 Cvt_Lo);
1737 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1738 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1739 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1740 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1741 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1742 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1743 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1744 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1745 Mul1);
1746 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1747 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1748 SDValue Rcp64 = DAG.getBitcast(VT,
1749 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1750
1751 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1752 SDValue One64 = DAG.getConstant(1, DL, VT);
1753 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1754 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1755
1756 // First round of UNR (Unsigned integer Newton-Raphson).
1757 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1758 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1759 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1760 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1761 Zero);
1762 SDValue Mulhi1_Hi =
1763 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1764 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1765 Mulhi1_Lo, Zero1);
1766 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1767 Mulhi1_Hi, Add1_Lo.getValue(1));
1768 SDValue Add1 = DAG.getBitcast(VT,
1769 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1770
1771 // Second round of UNR.
1772 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1773 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1774 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1775 Zero);
1776 SDValue Mulhi2_Hi =
1777 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1778 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1779 Mulhi2_Lo, Zero1);
1780 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1781 Mulhi2_Hi, Add2_Lo.getValue(1));
1782 SDValue Add2 = DAG.getBitcast(VT,
1783 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1784
1785 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1786
1787 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1788
1789 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1790 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1791 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1792 Mul3_Lo, Zero1);
1793 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1794 Mul3_Hi, Sub1_Lo.getValue(1));
1795 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1796 SDValue Sub1 = DAG.getBitcast(VT,
1797 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1798
1799 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1800 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1801 ISD::SETUGE);
1802 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1803 ISD::SETUGE);
1804 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1805
1806 // TODO: Here and below portions of the code can be enclosed into if/endif.
1807 // Currently control flow is unconditional and we have 4 selects after
1808 // potential endif to substitute PHIs.
1809
1810 // if C3 != 0 ...
1811 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1812 RHS_Lo, Zero1);
1813 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1814 RHS_Hi, Sub1_Lo.getValue(1));
1815 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1816 Zero, Sub2_Lo.getValue(1));
1817 SDValue Sub2 = DAG.getBitcast(VT,
1818 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1819
1820 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1821
1822 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1823 ISD::SETUGE);
1824 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1825 ISD::SETUGE);
1826 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1827
1828 // if (C6 != 0)
1829 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1830
1831 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1832 RHS_Lo, Zero1);
1833 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1834 RHS_Hi, Sub2_Lo.getValue(1));
1835 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1836 Zero, Sub3_Lo.getValue(1));
1837 SDValue Sub3 = DAG.getBitcast(VT,
1838 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1839
1840 // endif C6
1841 // endif C3
1842
1843 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1844 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1845
1846 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1847 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1848
1849 Results.push_back(Div);
1850 Results.push_back(Rem);
1851
1852 return;
1853 }
1854
1855 // r600 expansion.
1856 // Get Speculative values
1857 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1858 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1859
1860 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1861 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1862 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1863
1864 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1865 SDValue DIV_Lo = Zero;
1866
1867 const unsigned halfBitWidth = HalfVT.getSizeInBits();
1868
1869 for (unsigned i = 0; i < halfBitWidth; ++i) {
1870 const unsigned bitPos = halfBitWidth - i - 1;
1871 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1872 // Get value of high bit
1873 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1874 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1875 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1876
1877 // Shift
1878 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1879 // Add LHS high bit
1880 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1881
1882 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1883 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1884
1885 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1886
1887 // Update REM
1888 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1889 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1890 }
1891
1892 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1893 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1894 Results.push_back(DIV);
1895 Results.push_back(REM);
1896}
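
// The r600 path above is a shift-and-subtract (restoring) division over the
// low 32 dividend bits, seeded with the speculative high-word results. A
// generic 64-bit restoring division, shown only to illustrate the idea behind
// that loop; it is not an exact model of the DAG that gets built, and it
// assumes d != 0 just as the real lowering does.
#include <cstdint>

static void udivrem64_restoring(uint64_t n, uint64_t d, uint64_t &q,
                                uint64_t &r) {
  q = 0;
  r = 0;
  for (int i = 63; i >= 0; --i) {
    r = (r << 1) | ((n >> i) & 1); // shift in the next dividend bit
    if (r >= d) {                  // same compare/select as the SETUGE above
      r -= d;
      q |= (uint64_t)1 << i;
    }
  }
}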
1897
1898SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1899 SelectionDAG &DAG) const {
1900 SDLoc DL(Op);
1901 EVT VT = Op.getValueType();
1902
1903 if (VT == MVT::i64) {
1904 SmallVector<SDValue, 2> Results;
1905 LowerUDIVREM64(Op, DAG, Results);
1906 return DAG.getMergeValues(Results, DL);
1907 }
1908
1909 if (VT == MVT::i32) {
1910 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1911 return Res;
1912 }
1913
1914 SDValue X = Op.getOperand(0);
1915 SDValue Y = Op.getOperand(1);
1916
1917 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
1918 // algorithm used here.
1919
1920 // Initial estimate of inv(y).
1921 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
1922
1923 // One round of UNR.
1924 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
1925 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
1926 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
1927 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
1928
1929 // Quotient/remainder estimate.
1930 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
1931 SDValue R =
1932 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
1933
1934 // First quotient/remainder refinement.
1935 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1936 SDValue One = DAG.getConstant(1, DL, VT);
1937 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
1938 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1939 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
1940 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1941 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
1942
1943 // Second quotient/remainder refinement.
1944 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
1945 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1946 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
1947 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1948 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
1949
1950 return DAG.getMergeValues({Q, R}, DL);
1951}
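
// The tail of the 32-bit expansion above relies on the reciprocal-based
// estimate undershooting the true quotient by at most 2, which is why exactly
// two compare/select refinement rounds follow (see the reference above to
// AMDGPUCodeGenPrepare::expandDivRem32). A scalar sketch of just that
// refinement structure, taking the initial estimate q0 as given; the at-most-2
// undershoot is an assumption here, not something this sketch establishes.
#include <cstdint>

static void refine_udivrem32(uint32_t x, uint32_t y, uint32_t q0, uint32_t &q,
                             uint32_t &r) {
  q = q0;
  r = x - q * y;            // remainder implied by the estimate
  for (int round = 0; round < 2; ++round) {
    if (r >= y) {           // the ISD::SETUGE select above
      q += 1;
      r -= y;
    }
  }
}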
1952
1953SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1954 SelectionDAG &DAG) const {
1955 SDLoc DL(Op);
1956 EVT VT = Op.getValueType();
1957
1958 SDValue LHS = Op.getOperand(0);
1959 SDValue RHS = Op.getOperand(1);
1960
1961 SDValue Zero = DAG.getConstant(0, DL, VT);
1962 SDValue NegOne = DAG.getConstant(-1, DL, VT);
1963
1964 if (VT == MVT::i32) {
1965 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1966 return Res;
1967 }
1968
1969 if (VT == MVT::i64 &&
1970 DAG.ComputeNumSignBits(LHS) > 32 &&
1971 DAG.ComputeNumSignBits(RHS) > 32) {
1972 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1973
1974 //HiLo split
1975 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1976 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1977 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1978 LHS_Lo, RHS_Lo);
1979 SDValue Res[2] = {
1980 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1981 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1982 };
1983 return DAG.getMergeValues(Res, DL);
1984 }
1985
1986 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1987 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1988 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1989 SDValue RSign = LHSign; // Remainder sign is the same as LHS
1990
1991 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1992 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1993
1994 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1995 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1996
1997 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1998 SDValue Rem = Div.getValue(1);
1999
2000 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2001 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2002
2003 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2004 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2005
2006 SDValue Res[2] = {
2007 Div,
2008 Rem
2009 };
2010 return DAG.getMergeValues(Res, DL);
2011}
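
// Scalar sketch of the branch-free sign handling above: fold signed div/rem
// onto an unsigned divide using the xor/add absolute-value trick, then flip
// the results back. The '/' and '%' below stand in for the ISD::UDIVREM node,
// and an arithmetic right shift is assumed (it mirrors the SETLT selects).
#include <cstdint>

static void sdivrem32(int32_t lhs, int32_t rhs, int32_t &div, int32_t &rem) {
  uint32_t lsign = (uint32_t)(lhs >> 31);        // 0 or 0xffffffff
  uint32_t rsign = (uint32_t)(rhs >> 31);
  uint32_t dsign = lsign ^ rsign;                // sign of the quotient
  uint32_t ul = ((uint32_t)lhs + lsign) ^ lsign; // |lhs|
  uint32_t ur = ((uint32_t)rhs + rsign) ^ rsign; // |rhs|
  uint32_t uq = ul / ur;
  uint32_t urem = ul % ur;
  div = (int32_t)((uq ^ dsign) - dsign);
  rem = (int32_t)((urem ^ lsign) - lsign);       // remainder takes LHS's sign
}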
2012
2013// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2014SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2015 SDLoc SL(Op);
2016 EVT VT = Op.getValueType();
2017 auto Flags = Op->getFlags();
2018 SDValue X = Op.getOperand(0);
2019 SDValue Y = Op.getOperand(1);
2020
2021 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2022 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2023 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2024 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2025 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2026}
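
// A tiny scalar check of the identity used above,
// frem(x, y) == fma(-trunc(x / y), y, x); float is shown here, and the same
// form applies to the other float types this lowering handles.
#include <cmath>

static float frem_expanded(float x, float y) {
  return std::fma(-std::trunc(x / y), y, x);
}
// e.g. frem_expanded(5.5f, 2.0f) == 1.5f, matching std::fmod(5.5f, 2.0f).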
2027
2028SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2029 SDLoc SL(Op);
2030 SDValue Src = Op.getOperand(0);
2031
2032 // result = trunc(src)
2033 // if (src > 0.0 && src != result)
2034 // result += 1.0
2035
2036 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2037
2038 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2039 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2040
2041 EVT SetCCVT =
2042 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2043
2044 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2045 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2046 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2047
2048 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2049 // TODO: Should this propagate fast-math-flags?
2050 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2051}
2052
2053static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2054 SelectionDAG &DAG) {
2055 const unsigned FractBits = 52;
2056 const unsigned ExpBits = 11;
2057
2058 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2059 Hi,
2060 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2061 DAG.getConstant(ExpBits, SL, MVT::i32));
2062 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2063 DAG.getConstant(1023, SL, MVT::i32));
2064
2065 return Exp;
2066}
2067
2068SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2069 SDLoc SL(Op);
2070 SDValue Src = Op.getOperand(0);
2071
2072 assert(Op.getValueType() == MVT::f64);
2073
2074 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2075
2076 // Extract the upper half, since this is where we will find the sign and
2077 // exponent.
2078 SDValue Hi = getHiHalf64(Src, DAG);
2079
2080 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2081
2082 const unsigned FractBits = 52;
2083
2084 // Extract the sign bit.
2085 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2086 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2087
2088 // Extend back to 64-bits.
2089 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2090 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2091
2092 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2093 const SDValue FractMask
2094 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2095
2096 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2097 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2098 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2099
2100 EVT SetCCVT =
2101 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2102
2103 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2104
2105 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2106 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2107
2108 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2109 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2110
2111 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2112}
2113
2114SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2115 SDLoc SL(Op);
2116 SDValue Src = Op.getOperand(0);
2117
2118 assert(Op.getValueType() == MVT::f64);
2119
2120 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2121 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2122 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2123
2124 // TODO: Should this propagate fast-math-flags?
2125
2126 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2127 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2128
2129 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2130
2131 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2132 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2133
2134 EVT SetCCVT =
2135 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2136 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2137
2138 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2139}
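
// Scalar sketch of the rint expansion above: adding and then subtracting 2^52
// (carrying the sign of the input) forces a round at the integer position
// under the current rounding mode, and inputs whose magnitude exceeds
// 0x1.fffffffffffffp+51 are returned unchanged. Assumes strict IEEE double
// arithmetic (no fast-math reassociation, no extended precision).
#include <cmath>

static double frint_expanded(double x) {
  const double c1 = 0x1.0p+52;             // 2^52
  const double c2 = 0x1.fffffffffffffp+51; // largest double below 2^52
  double magic = std::copysign(c1, x);
  double rounded = (x + magic) - magic;
  return std::fabs(x) > c2 ? x : rounded;
}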
2140
2141SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2142 // FNEARBYINT and FRINT are the same, except in their handling of FP
2143 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2144 // rint, so just treat them as equivalent.
2145 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2146}
2147
2148// XXX - May require not supporting f32 denormals?
2149
2150// Don't handle v2f16. The extra instructions to scalarize and repack around the
2151// compare and vselect end up producing worse code than scalarizing the whole
2152// operation.
2153SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2154 SDLoc SL(Op);
2155 SDValue X = Op.getOperand(0);
2156 EVT VT = Op.getValueType();
2157
2158 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2159
2160 // TODO: Should this propagate fast-math-flags?
2161
2162 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2163
2164 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2165
2166 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2167 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2168 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2169
2170 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2171
2172 EVT SetCCVT =
2173 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2174
2175 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2176
2177 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2178
2179 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2180}
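
// Scalar sketch of the round() expansion above (round half away from zero):
// truncate, then add +/-1 whenever the discarded fraction is at least 0.5.
#include <cmath>

static float fround_expanded(float x) {
  float t = std::trunc(x);
  float adj = (std::fabs(x - t) >= 0.5f) ? std::copysign(1.0f, x) : 0.0f;
  return t + adj;
}
// e.g. fround_expanded(2.5f) == 3.0f and fround_expanded(-2.5f) == -3.0f.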
2181
2182SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2183 SDLoc SL(Op);
2184 SDValue Src = Op.getOperand(0);
2185
2186 // result = trunc(src);
2187 // if (src < 0.0 && src != result)
2188 // result += -1.0.
2189
2190 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2191
2192 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2193 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2194
2195 EVT SetCCVT =
2196 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2197
2198 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2199 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2200 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2201
2202 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2203 // TODO: Should this propagate fast-math-flags?
2204 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2205}
2206
2207SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2208 double Log2BaseInverted) const {
2209 EVT VT = Op.getValueType();
2210
2211 SDLoc SL(Op);
2212 SDValue Operand = Op.getOperand(0);
2213 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2214 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2215
2216 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2217}
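
// The reduction above is log_b(x) = log2(x) * (1 / log2(b)); the callers pass
// the precomputed constant, e.g. ln(2) for the natural log and log10(2) for
// log10. A scalar check of the identity:
#include <cmath>

static double log_via_log2(double x, double log2BaseInverted) {
  return std::log2(x) * log2BaseInverted;
}
// log_via_log2(x, 0.6931471805599453) ~= log(x)
// log_via_log2(x, 0.3010299956639812) ~= log10(x)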
2218
2219// exp2(M_LOG2E_F * f);
2220SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2221 EVT VT = Op.getValueType();
2222 SDLoc SL(Op);
2223 SDValue Src = Op.getOperand(0);
2224
2225 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2226 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2227 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2228}
2229
2230static bool isCtlzOpc(unsigned Opc) {
2231 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2232}
2233
2234static bool isCttzOpc(unsigned Opc) {
2235 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2236}
2237
2238SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2239 SDLoc SL(Op);
2240 SDValue Src = Op.getOperand(0);
2241
2242 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2243 bool Ctlz = isCtlzOpc(Op.getOpcode());
2244 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2245
2246 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2247 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2248
2249 if (Src.getValueType() == MVT::i32) {
2250 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2251 // (cttz hi:lo) -> (umin (ffbl src), 32)
2252 // (ctlz_zero_undef src) -> (ffbh src)
2253 // (cttz_zero_undef src) -> (ffbl src)
2254 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2255 if (!ZeroUndef) {
2256 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2257 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2258 }
2259 return NewOpr;
2260 }
2261
2262 SDValue Lo, Hi;
2263 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2264
2265 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2266 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2267
2268 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2269 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2270 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2271 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2272
2273 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2274 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2275 if (Ctlz)
2276 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2277 else
2278 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2279
2280 SDValue NewOpr;
2281 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2282 if (!ZeroUndef) {
2283 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2284 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2285 }
2286
2287 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2288}
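
// Scalar sketch of the 64-bit formulas in the comment above. ffbh32/ffbl32
// model the hardware count instructions, which return all-ones for a zero
// input; that is what makes the saturating add (rather than a plain add)
// necessary in the non-zero-undef case. __builtin_clz/__builtin_ctz are a
// GCC/Clang assumption used only to keep the sketch short.
#include <algorithm>
#include <cstdint>

static uint32_t ffbh32(uint32_t x) { return x ? (uint32_t)__builtin_clz(x) : ~0u; }
static uint32_t ffbl32(uint32_t x) { return x ? (uint32_t)__builtin_ctz(x) : ~0u; }

static uint32_t uaddsat32(uint32_t a, uint32_t b) {
  uint32_t s = a + b;
  return s < a ? ~0u : s; // saturate on wrap-around
}

static uint32_t ctlz64(uint64_t v) {
  uint32_t lo = (uint32_t)v, hi = (uint32_t)(v >> 32);
  return std::min({ffbh32(hi), uaddsat32(ffbh32(lo), 32u), 64u});
}

static uint32_t cttz64(uint64_t v) {
  uint32_t lo = (uint32_t)v, hi = (uint32_t)(v >> 32);
  return std::min({uaddsat32(ffbl32(hi), 32u), ffbl32(lo), 64u});
}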
2289
2290SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2291 bool Signed) const {
2292 // The regular method converting a 64-bit integer to float roughly consists of
2293 // 2 steps: normalization and rounding. In fact, after normalization, the
2294 // conversion from a 64-bit integer to a float is essentially the same as the
2295 // one from a 32-bit integer. The only difference is that it has more
2296 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2297 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2298 // converted into the correct float number. The basic steps for the unsigned
2299 // conversion are illustrated in the following pseudo code:
2300 //
2301 // f32 uitofp(i64 u) {
2302 // i32 hi, lo = split(u);
2303 // // Only count the leading zeros in hi as we have native support of the
2304 // // conversion from i32 to f32. If hi is all 0s, the conversion is
2305 // // reduced to a 32-bit one automatically.
2306 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2307 // u <<= shamt;
2308 // hi, lo = split(u);
2309 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2310 // // convert it as a 32-bit integer and scale the result back.
2311 // return uitofp(hi) * 2^(32 - shamt);
2312 // }
2313 //
2314 // The signed one follows the same principle but uses 'ffbh_i32' to count its
2315 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2316 // converted instead, followed by negation based on its sign bit.
2317
2318 SDLoc SL(Op);
2319 SDValue Src = Op.getOperand(0);
2320
2321 SDValue Lo, Hi;
2322 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2323 SDValue Sign;
2324 SDValue ShAmt;
2325 if (Signed && Subtarget->isGCN()) {
2326 // We also need to consider the sign bit in Lo if Hi has just sign bits,
2327 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2328 // account. That is, the maximal shift is
2329 // - 32 if Lo and Hi have opposite signs;
2330 // - 33 if Lo and Hi have the same sign.
2331 //
2332 // Or, MaxShAmt = 33 + OppositeSign, where
2333 //
2334 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2335 // - -1 if Lo and Hi have opposite signs; and
2336 // - 0 otherwise.
2337 //
2338 // All in all, ShAmt is calculated as
2339 //
2340 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2341 //
2342 // or
2343 //
2344 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2345 //
2346 // to reduce the critical path.
2347 SDValue OppositeSign = DAG.getNode(
2348 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2349 DAG.getConstant(31, SL, MVT::i32));
2350 SDValue MaxShAmt =
2351 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2352 OppositeSign);
2353 // Count the leading sign bits.
2354 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2355 // Different from unsigned conversion, the shift should be one bit less to
2356 // preserve the sign bit.
2357 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2358 DAG.getConstant(1, SL, MVT::i32));
2359 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2360 } else {
2361 if (Signed) {
2362 // Without 'ffbh_i32', only leading zeros could be counted. Take the
2363 // absolute value first.
2364 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2365 DAG.getConstant(63, SL, MVT::i64));
2366 SDValue Abs =
2367 DAG.getNode(ISD::XOR, SL, MVT::i64,
2368 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2369 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2370 }
2371 // Count the leading zeros.
2372 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2373 // The shift amount for signed integers is [0, 32].
2374 }
2375 // Normalize the given 64-bit integer.
2376 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2377 // Split it again.
2378 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2379 // Calculate the adjust bit for rounding.
2380 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2381 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2382 DAG.getConstant(1, SL, MVT::i32), Lo);
2383 // Get the 32-bit normalized integer.
2384 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2385 // Convert the normalized 32-bit integer into f32.
2386 unsigned Opc =
2387 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2388 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2389
2390 // Finally, need to scale back the converted floating number as the original
2391 // 64-bit integer is converted as a 32-bit one.
2392 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2393 ShAmt);
2394 // On GCN, use LDEXP directly.
2395 if (Subtarget->isGCN())
2396 return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2397
2398 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2399 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2400 // exponent is enough to avoid overflowing into the sign bit.
2401 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2402 DAG.getConstant(23, SL, MVT::i32));
2403 SDValue IVal =
2404 DAG.getNode(ISD::ADD, SL, MVT::i32,
2405 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2406 if (Signed) {
2407 // Set the sign bit.
2408 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2409 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2410 DAG.getConstant(31, SL, MVT::i32));
2411 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2412 }
2413 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2414}
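
// Scalar sketch of the unsigned pseudo code in the comment at the top of this
// function; std::ldexp models the final LDEXP scale-back used on GCN, and
// __builtin_clz is a GCC/Clang assumption standing in for the ffbh node. The
// signed variant differs only in how the shift amount is derived, as the code
// above shows.
#include <cmath>
#include <cstdint>

static float uitofp64_expanded(uint64_t u) {
  uint32_t hi = (uint32_t)(u >> 32);
  uint32_t shamt = hi ? (uint32_t)__builtin_clz(hi) : 32u; // clz of the high half
  u <<= shamt;                                             // normalize
  hi = (uint32_t)(u >> 32);
  uint32_t lo = (uint32_t)u;
  hi |= (lo != 0) ? 1u : 0u;          // sticky bit so rounding stays correct
  float f = (float)hi;                // native 32-bit conversion
  return std::ldexp(f, 32 - (int)shamt);
}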
2415
2416SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2417 bool Signed) const {
2418 SDLoc SL(Op);
2419 SDValue Src = Op.getOperand(0);
2420
2421 SDValue Lo, Hi;
2422 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2423
2424 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2425 SL, MVT::f64, Hi);
2426
2427 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2428
2429 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2430 DAG.getConstant(32, SL, MVT::i32));
2431 // TODO: Should this propagate fast-math-flags?
2432 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2433}
2434
2435SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2436 SelectionDAG &DAG) const {
2437 // TODO: Factor out code common with LowerSINT_TO_FP.
2438 EVT DestVT = Op.getValueType();
2439 SDValue Src = Op.getOperand(0);
2440 EVT SrcVT = Src.getValueType();
2441
2442 if (SrcVT == MVT::i16) {
2443 if (DestVT == MVT::f16)
2444 return Op;
2445 SDLoc DL(Op);
2446
2447 // Promote src to i32
2448 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2449 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2450 }
2451
2452 assert(SrcVT == MVT::i64 && "operation should be legal");
2453
2454 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2455 SDLoc DL(Op);
2456
2457 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2458 SDValue FPRoundFlag =
2459 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2460 SDValue FPRound =
2461 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2462
2463 return FPRound;
2464 }
2465
2466 if (DestVT == MVT::f32)
2467 return LowerINT_TO_FP32(Op, DAG, false);
2468
2469 assert(DestVT == MVT::f64);
2470 return LowerINT_TO_FP64(Op, DAG, false);
2471}
2472
2473SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2474 SelectionDAG &DAG) const {
2475 EVT DestVT = Op.getValueType();
2476
2477 SDValue Src = Op.getOperand(0);
2478 EVT SrcVT = Src.getValueType();
2479
2480 if (SrcVT == MVT::i16) {
2481 if (DestVT == MVT::f16)
2482 return Op;
2483
2484 SDLoc DL(Op);
2485 // Promote src to i32
2486 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2487 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2488 }
2489
2490 assert(SrcVT == MVT::i64 && "operation should be legal");
2491
2492 // TODO: Factor out code common with LowerUINT_TO_FP.
2493
2494 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2495 SDLoc DL(Op);
2496 SDValue Src = Op.getOperand(0);
2497
2498 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2499 SDValue FPRoundFlag =
2500 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2501 SDValue FPRound =
2502 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2503
2504 return FPRound;
2505 }
2506
2507 if (DestVT == MVT::f32)
2508 return LowerINT_TO_FP32(Op, DAG, true);
2509
2510 assert(DestVT == MVT::f64);
2511 return LowerINT_TO_FP64(Op, DAG, true);
2512}
2513
2514SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2515 bool Signed) const {
2516 SDLoc SL(Op);
2517
2518 SDValue Src = Op.getOperand(0);
2519 EVT SrcVT = Src.getValueType();
2520
2521 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2522
2523 // The basic idea of converting a floating point number into a pair of 32-bit
2524 // integers is illustrated as follows:
2525 //
2526 // tf := trunc(val);
2527 // hif := floor(tf * 2^-32);
2528 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2529 // hi := fptoi(hif);
2530 // lo := fptoi(lof);
2531 //
2532 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2533 SDValue Sign;
2534 if (Signed && SrcVT == MVT::f32) {
2535 // However, a 32-bit floating point number has only 23 bits mantissa and
2536 // it's not enough to hold all the significant bits of `lof` if val is
2537 // negative. To avoid the loss of precision, we need to take the absolute
2538 // value after truncating and flip the result back based on the original
2539 // signedness.
2540 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2541 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2542 DAG.getConstant(31, SL, MVT::i32));
2543 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2544 }
2545
2546 SDValue K0, K1;
2547 if (SrcVT == MVT::f64) {
2548 K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2549 SL, SrcVT);
2550 K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2551 SL, SrcVT);
2552 } else {
2553 K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2554 SrcVT);
2555 K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2556 SrcVT);
2557 }
2558 // TODO: Should this propagate fast-math-flags?
2559 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2560
2561 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2562
2563 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2564
2565 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2566 : ISD::FP_TO_UINT,
2567 SL, MVT::i32, FloorMul);
2568 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2569
2570 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2571 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2572
2573 if (Signed && SrcVT == MVT::f32) {
2574 assert(Sign);
2575 // Flip the result based on the signedness, which is either all 0s or 1s.
2576 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2577 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2578 // r := xor(r, sign) - sign;
2579 Result =
2580 DAG.getNode(ISD::SUB, SL, MVT::i64,
2581 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2582 }
2583
2584 return Result;
2585}
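
// Scalar sketch of the unsigned f64 path described in the comment above: the
// truncated value is split into a high and a low 32-bit part, each of which is
// converted natively. The signed f32 path additionally takes the absolute
// value first and flips the result by the sign word, as the code shows.
// Assumes the input is non-negative and in range for a u64.
#include <cmath>
#include <cstdint>

static uint64_t fptoui64_expanded(double x) {
  const double k0 = 0x1.0p-32;   // 2^-32
  const double k1 = -0x1.0p+32;  // -2^32
  double tf  = std::trunc(x);
  double hif = std::floor(tf * k0);   // high 32 bits, still as a float
  double lof = std::fma(hif, k1, tf); // tf - hif * 2^32, non-negative by construction
  uint32_t hi = (uint32_t)hif;        // FP_TO_UINT of each half
  uint32_t lo = (uint32_t)lof;
  return ((uint64_t)hi << 32) | lo;
}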
2586
2587SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2588 SDLoc DL(Op);
2589 SDValue N0 = Op.getOperand(0);
2590
2591 // Convert to target node to get known bits
2592 if (N0.getValueType() == MVT::f32)
2593 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2594
2595 if (getTargetMachine().Options.UnsafeFPMath) {
2596 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2597 return SDValue();
2598 }
2599
2600 assert(N0.getSimpleValueType() == MVT::f64);
2601
2602 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2603 const unsigned ExpMask = 0x7ff;
2604 const unsigned ExpBiasf64 = 1023;
2605 const unsigned ExpBiasf16 = 15;
2606 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2607 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2608 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2609 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2610 DAG.getConstant(32, DL, MVT::i64));
2611 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2612 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2613 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2614 DAG.getConstant(20, DL, MVT::i64));
2615 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2616 DAG.getConstant(ExpMask, DL, MVT::i32));
2617 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2618 // add the f16 bias (15) to get the biased exponent for the f16 format.
2619 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2620 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2621
2622 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2623 DAG.getConstant(8, DL, MVT::i32));
2624 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2625 DAG.getConstant(0xffe, DL, MVT::i32));
2626
2627 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2628 DAG.getConstant(0x1ff, DL, MVT::i32));
2629 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2630
2631 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2632 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2633
2634 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2635 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2636 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2637 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2638
2639 // N = M | (E << 12);
2640 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2641 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2642 DAG.getConstant(12, DL, MVT::i32)));
2643
2644 // B = clamp(1-E, 0, 13);
2645 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2646 One, E);
2647 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2648 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2649 DAG.getConstant(13, DL, MVT::i32));
2650
2651 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2652 DAG.getConstant(0x1000, DL, MVT::i32));
2653
2654 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2655 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2656 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2657 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2658
2659 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2660 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2661 DAG.getConstant(0x7, DL, MVT::i32));
2662 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2663 DAG.getConstant(2, DL, MVT::i32));
2664 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2665 One, Zero, ISD::SETEQ);
2666 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2667 One, Zero, ISD::SETGT);
2668 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2669 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2670
2671 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2672 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2673 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2674 I, V, ISD::SETEQ);
2675
2676 // Extract the sign bit.
2677 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2678 DAG.getConstant(16, DL, MVT::i32));
2679 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2680 DAG.getConstant(0x8000, DL, MVT::i32));
2681
2682 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2683 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2684}
2685
2686SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2687 SelectionDAG &DAG) const {
2688 SDValue Src = Op.getOperand(0);
2689 unsigned OpOpcode = Op.getOpcode();
2690 EVT SrcVT = Src.getValueType();
2691 EVT DestVT = Op.getValueType();
2692
2693 // Will be selected natively
2694 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2695 return Op;
2696
2697 // Promote i16 to i32
2698 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2699 SDLoc DL(Op);
2700
2701 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2702 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2703 }
2704
2705 if (SrcVT == MVT::f16 ||
2706 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2707 SDLoc DL(Op);
2708
2709 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2710 unsigned Ext =
2711 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2712 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2713 }
2714
2715 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2716 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2717
2718 return SDValue();
2719}
2720
2721SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2722 SelectionDAG &DAG) const {
2723 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2724 MVT VT = Op.getSimpleValueType();
2725 MVT ScalarVT = VT.getScalarType();
2726
2727 assert(VT.isVector());
2728
2729 SDValue Src = Op.getOperand(0);
2730 SDLoc DL(Op);
2731
2732 // TODO: Don't scalarize on Evergreen?
2733 unsigned NElts = VT.getVectorNumElements();
2734 SmallVector<SDValue, 8> Args;
2735 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2736
2737 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2738 for (unsigned I = 0; I < NElts; ++I)
2739 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2740
2741 return DAG.getBuildVector(VT, DL, Args);
2742}
2743
2744//===----------------------------------------------------------------------===//
2745// Custom DAG optimizations
2746//===----------------------------------------------------------------------===//
2747
2748static bool isU24(SDValue Op, SelectionDAG &DAG) {
2749 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2750}
2751
2752static bool isI24(SDValue Op, SelectionDAG &DAG) {
2753 EVT VT = Op.getValueType();
2754 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2755 // as unsigned 24-bit values.
2756 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2757}
2758
2759static SDValue simplifyMul24(SDNode *Node24,
2760 TargetLowering::DAGCombinerInfo &DCI) {
2761 SelectionDAG &DAG = DCI.DAG;
2762 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2763 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2764
2765 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2766 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2767 unsigned NewOpcode = Node24->getOpcode();
2768 if (IsIntrin) {
2769 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2770 switch (IID) {
2771 case Intrinsic::amdgcn_mul_i24:
2772 NewOpcode = AMDGPUISD::MUL_I24;
2773 break;
2774 case Intrinsic::amdgcn_mul_u24:
2775 NewOpcode = AMDGPUISD::MUL_U24;
2776 break;
2777 case Intrinsic::amdgcn_mulhi_i24:
2778 NewOpcode = AMDGPUISD::MULHI_I24;
2779 break;
2780 case Intrinsic::amdgcn_mulhi_u24:
2781 NewOpcode = AMDGPUISD::MULHI_U24;
2782 break;
2783 default:
2784 llvm_unreachable("Expected 24-bit mul intrinsic");
2785 }
2786 }
2787
2788 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2789
2790 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2791 // the operands to have other uses, but will only perform simplifications that
2792 // involve bypassing some nodes for this user.
2793 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2794 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2795 if (DemandedLHS || DemandedRHS)
2796 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2797 DemandedLHS ? DemandedLHS : LHS,
2798 DemandedRHS ? DemandedRHS : RHS);
2799
2800 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2801 // operands if this node is the only user.
2802 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2803 return SDValue(Node24, 0);
2804 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2805 return SDValue(Node24, 0);
2806
2807 return SDValue();
2808}
2809
2810template <typename IntTy>
2811static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2812 uint32_t Width, const SDLoc &DL) {
2813 if (Width + Offset < 32) {
2814 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2815 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2816 return DAG.getConstant(Result, DL, MVT::i32);
2817 }
2818
2819 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2820}
2821
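constantFoldBFE uses the standard shift trick for a bitfield extract: move the field to the top of the 32-bit word, then shift it back down, letting the signedness of IntTy decide between an arithmetic (sign-extending) and a logical (zero-extending) right shift. A minimal standalone illustration, with a hypothetical helper name not taken from this file:

    #include <cassert>
    #include <cstdint>

    // Extract Width bits starting at Offset, assuming Width + Offset < 32
    // (the same guard the function above checks before taking this path).
    template <typename IntTy>
    static IntTy extractBitfield(IntTy Src, uint32_t Offset, uint32_t Width) {
      uint32_t Shl = static_cast<uint32_t>(Src) << (32 - Offset - Width);
      return static_cast<IntTy>(Shl) >> (32 - Width); // signed => sign-extend
    }

    int main() {
      // Bits [11:4] of 0x00000F50 are 0xF5: -11 as signed, 245 as unsigned.
      assert(extractBitfield<int32_t>(0x00000F50, 4, 8) == -11);
      assert(extractBitfield<uint32_t>(0x00000F50, 4, 8) == 245u);
    }
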
2822static bool hasVolatileUser(SDNode *Val) {
2823 for (SDNode *U : Val->uses()) {
2824 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2825 if (M->isVolatile())
2826 return true;
2827 }
2828 }
2829
2830 return false;
2831}
2832
2833bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2834 // i32 vectors are the canonical memory type.
2835 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2836 return false;
2837
2838 if (!VT.isByteSized())
2839 return false;
2840
2841 unsigned Size = VT.getStoreSize();
2842
2843 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2844 return false;
2845
2846 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2847 return false;
2848
2849 return true;
2850}
2851
2852// Replace load of an illegal type with a store of a bitcast to a friendlier
2853// type.
2854SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2855 DAGCombinerInfo &DCI) const {
2856 if (!DCI.isBeforeLegalize())
2857 return SDValue();
2858
2859 LoadSDNode *LN = cast<LoadSDNode>(N);
2860 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2861 return SDValue();
2862
2863 SDLoc SL(N);
2864 SelectionDAG &DAG = DCI.DAG;
2865 EVT VT = LN->getMemoryVT();
2866
2867 unsigned Size = VT.getStoreSize();
2868 Align Alignment = LN->getAlign();
2869 if (Alignment < Size && isTypeLegal(VT)) {
2870 bool IsFast;
2871 unsigned AS = LN->getAddressSpace();
2872
2873 // Expand unaligned loads earlier than legalization. Due to visitation order
2874 // problems during legalization, the emitted instructions to pack and unpack
2875 // the bytes again are not eliminated in the case of an unaligned copy.
2876 if (!allowsMisalignedMemoryAccesses(
2877 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2878 if (VT.isVector())
2879 return SplitVectorLoad(SDValue(LN, 0), DAG);
2880
2881 SDValue Ops[2];
2882 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2883
2884 return DAG.getMergeValues(Ops, SDLoc(N));
2885 }
2886
2887 if (!IsFast)
2888 return SDValue();
2889 }
2890
2891 if (!shouldCombineMemoryType(VT))
2892 return SDValue();
2893
2894 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2895
2896 SDValue NewLoad
2897 = DAG.getLoad(NewVT, SL, LN->getChain(),
2898 LN->getBasePtr(), LN->getMemOperand());
2899
2900 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2901 DCI.CombineTo(N, BC, NewLoad.getValue(1));
2902 return SDValue(N, 0);
2903}
2904
2905// Replace store of an illegal type with a store of a bitcast to a friendlier
2906// type.
2907SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2908 DAGCombinerInfo &DCI) const {
2909 if (!DCI.isBeforeLegalize())
2910 return SDValue();
2911
2912 StoreSDNode *SN = cast<StoreSDNode>(N);
2913 if (!SN->isSimple() || !ISD::isNormalStore(SN))
2914 return SDValue();
2915
2916 EVT VT = SN->getMemoryVT();
2917 unsigned Size = VT.getStoreSize();
2918
2919 SDLoc SL(N);
2920 SelectionDAG &DAG = DCI.DAG;
2921 Align Alignment = SN->getAlign();
2922 if (Alignment < Size && isTypeLegal(VT)) {
2923 bool IsFast;
2924 unsigned AS = SN->getAddressSpace();
2925
2926 // Expand unaligned stores earlier than legalization. Due to visitation
2927 // order problems during legalization, the emitted instructions to pack and
2928 // unpack the bytes again are not eliminated in the case of an unaligned
2929 // copy.
2930 if (!allowsMisalignedMemoryAccesses(
2931 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
2932 if (VT.isVector())
2933 return SplitVectorStore(SDValue(SN, 0), DAG);
2934
2935 return expandUnalignedStore(SN, DAG);
2936 }
2937
2938 if (!IsFast)
2939 return SDValue();
2940 }
2941
2942 if (!shouldCombineMemoryType(VT))
2943 return SDValue();
2944
2945 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2946 SDValue Val = SN->getValue();
2947
2948 //DCI.AddToWorklist(Val.getNode());
2949
2950 bool OtherUses = !Val.hasOneUse();
2951 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2952 if (OtherUses) {
2953 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2954 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2955 }
2956
2957 return DAG.getStore(SN->getChain(), SL, CastVal,
2958 SN->getBasePtr(), SN->getMemOperand());
2959}
2960
2961// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2962// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2963// issues.
2964SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2965 DAGCombinerInfo &DCI) const {
2966 SelectionDAG &DAG = DCI.DAG;
2967 SDValue N0 = N->getOperand(0);
2968
2969 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2970 // (vt2 (truncate (assertzext vt0:x, vt1)))
2971 if (N0.getOpcode() == ISD::TRUNCATE) {
2972 SDValue N1 = N->getOperand(1);
2973 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2974 SDLoc SL(N);
2975
2976 SDValue Src = N0.getOperand(0);
2977 EVT SrcVT = Src.getValueType();
2978 if (SrcVT.bitsGE(ExtVT)) {
2979 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2980 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2981 }
2982 }
2983
2984 return SDValue();
2985}
2986
2987SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
2988 SDNode *N, DAGCombinerInfo &DCI) const {
2989 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2990 switch (IID) {
2991 case Intrinsic::amdgcn_mul_i24:
2992 case Intrinsic::amdgcn_mul_u24:
2993 case Intrinsic::amdgcn_mulhi_i24:
2994 case Intrinsic::amdgcn_mulhi_u24:
2995 return simplifyMul24(N, DCI);
2996 case Intrinsic::amdgcn_fract:
2997 case Intrinsic::amdgcn_rsq:
2998 case Intrinsic::amdgcn_rcp_legacy:
2999 case Intrinsic::amdgcn_rsq_legacy:
3000 case Intrinsic::amdgcn_rsq_clamp:
3001 case Intrinsic::amdgcn_ldexp: {
3002 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3003 SDValue Src = N->getOperand(1);
3004 return Src.isUndef() ? Src : SDValue();
3005 }
3006 default:
3007 return SDValue();
3008 }
3009}
3010
3011/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3012/// binary operation \p Opc to it with the corresponding constant operands.
3013SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3014 DAGCombinerInfo &DCI, const SDLoc &SL,
3015 unsigned Opc, SDValue LHS,
3016 uint32_t ValLo, uint32_t ValHi) const {
3017 SelectionDAG &DAG = DCI.DAG;
3018 SDValue Lo, Hi;
3019 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3020
3021 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3022 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3023
3024 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3025 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3026
3027 // Re-visit the ands. It's possible we eliminated one of them and it could
3028 // simplify the vector.
3029 DCI.AddToWorklist(Lo.getNode());
3030 DCI.AddToWorklist(Hi.getNode());
3031
3032 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3033 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3034}
3035
3036SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3037 DAGCombinerInfo &DCI) const {
3038 EVT VT = N->getValueType(0);
3039
3040 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3041 if (!RHS)
3042 return SDValue();
3043
3044 SDValue LHS = N->getOperand(0);
3045 unsigned RHSVal = RHS->getZExtValue();
3046 if (!RHSVal)
3047 return LHS;
3048
3049 SDLoc SL(N);
3050 SelectionDAG &DAG = DCI.DAG;
3051
3052 switch (LHS->getOpcode()) {
3053 default:
3054 break;
3055 case ISD::ZERO_EXTEND:
3056 case ISD::SIGN_EXTEND:
3057 case ISD::ANY_EXTEND: {
3058 SDValue X = LHS->getOperand(0);
3059
3060 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3061 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3062 // Prefer build_vector as the canonical form if packed types are legal.
3063 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3064 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3065 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3066 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3067 }
3068
3069 // shl (ext x) => zext (shl x), if shift does not overflow int
3070 if (VT != MVT::i64)
3071 break;
3072 KnownBits Known = DAG.computeKnownBits(X);
3073 unsigned LZ = Known.countMinLeadingZeros();
3074 if (LZ < RHSVal)
3075 break;
3076 EVT XVT = X.getValueType();
3077 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3078 return DAG.getZExtOrTrunc(Shl, SL, VT);
3079 }
3080 }
3081
3082 if (VT != MVT::i64)
3083 return SDValue();
3084
3085 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3086
3087 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3088 // common case, splitting this into a move and a 32-bit shift is faster and
3089 // the same code size.
3090 if (RHSVal < 32)
3091 return SDValue();
3092
3093 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3094
3095 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3096 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3097
3098 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3099
3100 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3101 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3102}
3103
3104SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3105 DAGCombinerInfo &DCI) const {
3106 if (N->getValueType(0) != MVT::i64)
3107 return SDValue();
3108
3109 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3110 if (!RHS)
3111 return SDValue();
3112
3113 SelectionDAG &DAG = DCI.DAG;
3114 SDLoc SL(N);
3115 unsigned RHSVal = RHS->getZExtValue();
3116
3117 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3118 if (RHSVal == 32) {
3119 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3120 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3121 DAG.getConstant(31, SL, MVT::i32));
3122
3123 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3124 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3125 }
3126
3127 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3128 if (RHSVal == 63) {
3129 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3130 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3131 DAG.getConstant(31, SL, MVT::i32));
3132 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3133 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3134 }
3135
3136 return SDValue();
3137}
3138
3139SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3140 DAGCombinerInfo &DCI) const {
3141 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3142 if (!RHS)
3143 return SDValue();
3144
3145 EVT VT = N->getValueType(0);
3146 SDValue LHS = N->getOperand(0);
3147 unsigned ShiftAmt = RHS->getZExtValue();
3148 SelectionDAG &DAG = DCI.DAG;
3149 SDLoc SL(N);
3150
3151 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3152 // this improves the ability to match BFE patterns in isel.
3153 if (LHS.getOpcode() == ISD::AND) {
3154 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3155 unsigned MaskIdx, MaskLen;
3156 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3157 MaskIdx == ShiftAmt) {
3158 return DAG.getNode(
3159 ISD::AND, SL, VT,
3160 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3161 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3162 }
3163 }
3164 }
3165
3166 if (VT != MVT::i64)
3167 return SDValue();
3168
3169 if (ShiftAmt < 32)
3170 return SDValue();
3171
3172 // srl i64:x, C for C >= 32
3173 // =>
3174 // build_pair (srl hi_32(x), C - 32), 0
3175 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3176
3177 SDValue Hi = getHiHalf64(LHS, DAG);
3178
3179 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3180 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3181
3182 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3183
3184 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3185}
3186
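The tail of the srl combine relies on the identity that, for shift amounts of at least 32, only the high half of the 64-bit value contributes: the result is build_pair (srl hi_32(x), C - 32), 0. A scalar check of that identity (illustrative code, not from this file):

    #include <cassert>
    #include <cstdint>

    // For 32 <= C < 64: (x >> C) equals zext(hi_32(x) >> (C - 32)).
    static uint64_t srl64ViaHiWord(uint64_t X, unsigned C) {
      uint32_t Hi = (uint32_t)(X >> 32);
      return (uint64_t)(Hi >> (C - 32)); // low word of the pair; high word is 0
    }

    int main() {
      assert(srl64ViaHiWord(0xDEADBEEF12345678ULL, 36) ==
             (0xDEADBEEF12345678ULL >> 36));
    }
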
3187SDValue AMDGPUTargetLowering::performTruncateCombine(
3188 SDNode *N, DAGCombinerInfo &DCI) const {
3189 SDLoc SL(N);
3190 SelectionDAG &DAG = DCI.DAG;
3191 EVT VT = N->getValueType(0);
3192 SDValue Src = N->getOperand(0);
3193
3194 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3195 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3196 SDValue Vec = Src.getOperand(0);
3197 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3198 SDValue Elt0 = Vec.getOperand(0);
3199 EVT EltVT = Elt0.getValueType();
3200 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3201 if (EltVT.isFloatingPoint()) {
3202 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3203 EltVT.changeTypeToInteger(), Elt0);
3204 }
3205
3206 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3207 }
3208 }
3209 }
3210
3211 // Equivalent of above for accessing the high element of a vector as an
3212 // integer operation.
3213 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3214 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3215 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3216 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3217 SDValue BV = stripBitcast(Src.getOperand(0));
3218 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3219 BV.getValueType().getVectorNumElements() == 2) {
3220 SDValue SrcElt = BV.getOperand(1);
3221 EVT SrcEltVT = SrcElt.getValueType();
3222 if (SrcEltVT.isFloatingPoint()) {
3223 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3224 SrcEltVT.changeTypeToInteger(), SrcElt);
3225 }
3226
3227 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3228 }
3229 }
3230 }
3231 }
3232
3233 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3234 //
3235 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3236 // i16 (trunc (srl (i32 (trunc x), K)))
3237 if (VT.getScalarSizeInBits() < 32) {
3238 EVT SrcVT = Src.getValueType();
3239 if (SrcVT.getScalarSizeInBits() > 32 &&
3240 (Src.getOpcode() == ISD::SRL ||
3241 Src.getOpcode() == ISD::SRA ||
3242 Src.getOpcode() == ISD::SHL)) {
3243 SDValue Amt = Src.getOperand(1);
3244 KnownBits Known = DAG.computeKnownBits(Amt);
3245 unsigned Size = VT.getScalarSizeInBits();
3246 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3247 (Known.countMaxActiveBits() <= Log2_32(Size))) {
3248 EVT MidVT = VT.isVector() ?
3249 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3250 VT.getVectorNumElements()) : MVT::i32;
3251
3252 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3253 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3254 Src.getOperand(0));
3255 DCI.AddToWorklist(Trunc.getNode());
3256
3257 if (Amt.getValueType() != NewShiftVT) {
3258 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3259 DCI.AddToWorklist(Amt.getNode());
3260 }
3261
3262 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3263 Trunc, Amt);
3264 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3265 }
3266 }
3267 }
3268
3269 return SDValue();
3270}
3271
3272// We need to specifically handle i64 mul here to avoid unnecessary conversion
3273// instructions. If we only match on the legalized i64 mul expansion,
3274// SimplifyDemandedBits will be unable to remove them because there will be
3275// multiple uses due to the separate mul + mulh[su].
3276static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3277 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3278 if (Size <= 32) {
3279 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3280 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3281 }
3282
3283 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3284 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3285
3286 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3287 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3288
3289 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3290}
3291
3292SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3293 DAGCombinerInfo &DCI) const {
3294 EVT VT = N->getValueType(0);
3295
3296 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3297 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3298 // unnecessarily). isDivergent() is used as an approximation of whether the
3299 // value is in an SGPR.
3300 if (!N->isDivergent())
3301 return SDValue();
3302
3303 unsigned Size = VT.getSizeInBits();
3304 if (VT.isVector() || Size > 64)
3305 return SDValue();
3306
3307 // There are i16 integer mul/mad.
3308 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3309 return SDValue();
3310
3311 SelectionDAG &DAG = DCI.DAG;
3312 SDLoc DL(N);
3313
3314 SDValue N0 = N->getOperand(0);
3315 SDValue N1 = N->getOperand(1);
3316
3317 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3318 // in the source into any_extends if the result of the mul is truncated. Since
3319 // we can assume the high bits are whatever we want, use the underlying value
3320 // to avoid the unknown high bits from interfering.
3321 if (N0.getOpcode() == ISD::ANY_EXTEND)
3322 N0 = N0.getOperand(0);
3323
3324 if (N1.getOpcode() == ISD::ANY_EXTEND)
3325 N1 = N1.getOperand(0);
3326
3327 SDValue Mul;
3328
3329 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3330 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3331 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3332 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3333 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3334 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3335 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3336 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3337 } else {
3338 return SDValue();
3339 }
3340
3341 // We need to use sext even for MUL_U24, because MUL_U24 is used
3342 // for signed multiply of 8 and 16-bit types.
3343 return DAG.getSExtOrTrunc(Mul, DL, VT);
3344}
3345
3346SDValue
3347AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3348 DAGCombinerInfo &DCI) const {
3349 if (N->getValueType(0) != MVT::i32)
3350 return SDValue();
3351
3352 SelectionDAG &DAG = DCI.DAG;
3353 SDLoc DL(N);
3354
3355 SDValue N0 = N->getOperand(0);
3356 SDValue N1 = N->getOperand(1);
3357
3358 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3359 // in the source into any_extends if the result of the mul is truncated. Since
3360 // we can assume the high bits are whatever we want, use the underlying value
3361 // to avoid the unknown high bits from interfering.
3362 if (N0.getOpcode() == ISD::ANY_EXTEND)
3363 N0 = N0.getOperand(0);
3364 if (N1.getOpcode() == ISD::ANY_EXTEND)
3365 N1 = N1.getOperand(0);
3366
3367 // Try to use two fast 24-bit multiplies (one for each half of the result)
3368 // instead of one slow extending multiply.
3369 unsigned LoOpcode, HiOpcode;
3370 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3371 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3372 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3373 LoOpcode = AMDGPUISD::MUL_U24;
3374 HiOpcode = AMDGPUISD::MULHI_U24;
3375 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3376 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3377 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3378 LoOpcode = AMDGPUISD::MUL_I24;
3379 HiOpcode = AMDGPUISD::MULHI_I24;
3380 } else {
3381 return SDValue();
3382 }
3383
3384 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3385 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3386 DCI.CombineTo(N, Lo, Hi);
3387 return SDValue(N, 0);
3388}
3389
3390SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3391 DAGCombinerInfo &DCI) const {
3392 EVT VT = N->getValueType(0);
3393
3394 if (!Subtarget->hasMulI24() || VT.isVector())
3395 return SDValue();
3396
3397 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3398 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3399 // unnecessarily). isDivergent() is used as an approximation of whether the
3400 // value is in an SGPR.
3401 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3402 // valu op anyway)
3403 if (Subtarget->hasSMulHi() && !N->isDivergent())
3404 return SDValue();
3405
3406 SelectionDAG &DAG = DCI.DAG;
3407 SDLoc DL(N);
3408
3409 SDValue N0 = N->getOperand(0);
3410 SDValue N1 = N->getOperand(1);
3411
3412 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3413 return SDValue();
3414
3415 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3416 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3417
3418 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3419 DCI.AddToWorklist(Mulhi.getNode());
3420 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3421}
3422
3423SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3424 DAGCombinerInfo &DCI) const {
3425 EVT VT = N->getValueType(0);
3426
3427 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3428 return SDValue();
3429
3430 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3431 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3432 // unnecessarily). isDivergent() is used as an approximation of whether the
3433 // value is in an SGPR.
3434 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3435 // valu op anyway)
3436 if (Subtarget->hasSMulHi() && !N->isDivergent())
3437 return SDValue();
3438
3439 SelectionDAG &DAG = DCI.DAG;
3440 SDLoc DL(N);
3441
3442 SDValue N0 = N->getOperand(0);
3443 SDValue N1 = N->getOperand(1);
3444
3445 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3446 return SDValue();
3447
3448 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3449 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3450
3451 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3452 DCI.AddToWorklist(Mulhi.getNode());
3453 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3454}
3455
3456static bool isNegativeOne(SDValue Val) {
3457 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3458 return C->isAllOnes();
3459 return false;
3460}
3461
3462SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3463 SDValue Op,
3464 const SDLoc &DL,
3465 unsigned Opc) const {
3466 EVT VT = Op.getValueType();
3467 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3468 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3469 LegalVT != MVT::i16))
3470 return SDValue();
3471
3472 if (VT != MVT::i32)
3473 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3474
3475 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3476 if (VT != MVT::i32)
3477 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3478
3479 return FFBX;
3480}
3481
3482// The native instructions return -1 on 0 input. Optimize out a select that
3483// produces -1 on 0.
3484//
3485// TODO: If zero is not undef, we could also do this if the output is compared
3486// against the bitwidth.
3487//
3488// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3489SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3490 SDValue LHS, SDValue RHS,
3491 DAGCombinerInfo &DCI) const {
3492 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3493 if (!CmpRhs || !CmpRhs->isZero())
3494 return SDValue();
3495
3496 SelectionDAG &DAG = DCI.DAG;
3497 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3498 SDValue CmpLHS = Cond.getOperand(0);
3499
3500 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3501 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3502 if (CCOpcode == ISD::SETEQ &&
3503 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3504 RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3505 unsigned Opc =
3506 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3507 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3508 }
3509
3510 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3511 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3512 if (CCOpcode == ISD::SETNE &&
3513 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3514 LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3515 unsigned Opc =
3516 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3517
3518 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3519 }
3520
3521 return SDValue();
3522}
3523
3524static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3525 unsigned Op,
3526 const SDLoc &SL,
3527 SDValue Cond,
3528 SDValue N1,
3529 SDValue N2) {
3530 SelectionDAG &DAG = DCI.DAG;
3531 EVT VT = N1.getValueType();
3532
3533 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3534 N1.getOperand(0), N2.getOperand(0));
3535 DCI.AddToWorklist(NewSelect.getNode());
3536 return DAG.getNode(Op, SL, VT, NewSelect);
3537}
3538
3539// Pull a free FP operation out of a select so it may fold into uses.
3540//
3541// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3542// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3543//
3544// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3545// select c, (fabs x), +k -> fabs (select c, x, k)
3546static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3547 SDValue N) {
3548 SelectionDAG &DAG = DCI.DAG;
3549 SDValue Cond = N.getOperand(0);
3550 SDValue LHS = N.getOperand(1);
3551 SDValue RHS = N.getOperand(2);
3552
3553 EVT VT = N.getValueType();
3554 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3555 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3556 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3557 SDLoc(N), Cond, LHS, RHS);
3558 }
3559
3560 bool Inv = false;
3561 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3562 std::swap(LHS, RHS);
3563 Inv = true;
3564 }
3565
3566 // TODO: Support vector constants.
3567 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3568 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3569 SDLoc SL(N);
3570 // If one side is an fneg/fabs and the other is a constant, we can push the
3571 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3572 SDValue NewLHS = LHS.getOperand(0);
3573 SDValue NewRHS = RHS;
3574
3575 // Careful: if the neg can be folded up, don't try to pull it back down.
3576 bool ShouldFoldNeg = true;
3577
3578 if (NewLHS.hasOneUse()) {
3579 unsigned Opc = NewLHS.getOpcode();
3580 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3581 ShouldFoldNeg = false;
3582 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3583 ShouldFoldNeg = false;
3584 }
3585
3586 if (ShouldFoldNeg) {
3587 if (LHS.getOpcode() == ISD::FNEG)
3588 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3589 else if (CRHS->isNegative())
3590 return SDValue();
3591
3592 if (Inv)
3593 std::swap(NewLHS, NewRHS);
3594
3595 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3596 Cond, NewLHS, NewRHS);
3597 DCI.AddToWorklist(NewSelect.getNode());
3598 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3599 }
3600 }
3601
3602 return SDValue();
3603}
3604
3605
3606SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3607 DAGCombinerInfo &DCI) const {
3608 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3609 return Folded;
3610
3611 SDValue Cond = N->getOperand(0);
3612 if (Cond.getOpcode() != ISD::SETCC)
3613 return SDValue();
3614
3615 EVT VT = N->getValueType(0);
3616 SDValue LHS = Cond.getOperand(0);
3617 SDValue RHS = Cond.getOperand(1);
3618 SDValue CC = Cond.getOperand(2);
3619
3620 SDValue True = N->getOperand(1);
3621 SDValue False = N->getOperand(2);
3622
3623 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3624 SelectionDAG &DAG = DCI.DAG;
3625 if (DAG.isConstantValueOfAnyType(True) &&
3626 !DAG.isConstantValueOfAnyType(False)) {
3627 // Swap cmp + select pair to move constant to false input.
3628 // This will allow using VOPC cndmasks more often.
3629 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3630
3631 SDLoc SL(N);
3632 ISD::CondCode NewCC =
3633 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3634
3635 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3636 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3637 }
3638
3639 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3640 SDValue MinMax
3641 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3642 // Revisit this node so we can catch min3/max3/med3 patterns.
3643 //DCI.AddToWorklist(MinMax.getNode());
3644 return MinMax;
3645 }
3646 }
3647
3648 // There's no reason to not do this if the condition has other uses.
3649 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3650}
3651
3652static bool isInv2Pi(const APFloat &APF) {
3653 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3654 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3655 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3656
3657 return APF.bitwiseIsEqual(KF16) ||
3658 APF.bitwiseIsEqual(KF32) ||
3659 APF.bitwiseIsEqual(KF64);
3660}
3661
3662 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3663// additional cost to negate them.
3664bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3665 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3666 if (C->isZero() && !C->isNegative())
3667 return true;
3668
3669 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3670 return true;
3671 }
3672
3673 return false;
3674}
3675
3676static unsigned inverseMinMax(unsigned Opc) {
3677 switch (Opc) {
3678 case ISD::FMAXNUM:
3679 return ISD::FMINNUM;
3680 case ISD::FMINNUM:
3681 return ISD::FMAXNUM;
3682 case ISD::FMAXNUM_IEEE:
3683 return ISD::FMINNUM_IEEE;
3684 case ISD::FMINNUM_IEEE:
3685 return ISD::FMAXNUM_IEEE;
3686 case AMDGPUISD::FMAX_LEGACY:
3687 return AMDGPUISD::FMIN_LEGACY;
3688 case AMDGPUISD::FMIN_LEGACY:
3689 return AMDGPUISD::FMAX_LEGACY;
3690 default:
3691 llvm_unreachable("invalid min/max opcode");
3692 }
3693}
3694
3695SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3696 DAGCombinerInfo &DCI) const {
3697 SelectionDAG &DAG = DCI.DAG;
3698 SDValue N0 = N->getOperand(0);
3699 EVT VT = N->getValueType(0);
3700
3701 unsigned Opc = N0.getOpcode();
3702
3703 // If the input has multiple uses and we can either fold the negate down, or
3704 // the other uses cannot, give up. This both prevents unprofitable
3705 // transformations and infinite loops: we won't repeatedly try to fold around
3706 // a negate that has no 'good' form.
3707 if (N0.hasOneUse()) {
3708 // This may be able to fold into the source, but at a code size cost. Don't
3709 // fold if the fold into the user is free.
3710 if (allUsesHaveSourceMods(N, 0))
3711 return SDValue();
3712 } else {
3713 if (fnegFoldsIntoOp(Opc) &&
3714 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3715 return SDValue();
3716 }
3717
3718 SDLoc SL(N);
3719 switch (Opc) {
3720 case ISD::FADD: {
3721 if (!mayIgnoreSignedZero(N0))
3722 return SDValue();
3723
3724 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3725 SDValue LHS = N0.getOperand(0);
3726 SDValue RHS = N0.getOperand(1);
3727
3728 if (LHS.getOpcode() != ISD::FNEG)
3729 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3730 else
3731 LHS = LHS.getOperand(0);
3732
3733 if (RHS.getOpcode() != ISD::FNEG)
3734 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3735 else
3736 RHS = RHS.getOperand(0);
3737
3738 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3739 if (Res.getOpcode() != ISD::FADD)
3740 return SDValue(); // Op got folded away.
3741 if (!N0.hasOneUse())
3742 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3743 return Res;
3744 }
3745 case ISD::FMUL:
3746 case AMDGPUISD::FMUL_LEGACY: {
3747 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3748 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3749 SDValue LHS = N0.getOperand(0);
3750 SDValue RHS = N0.getOperand(1);
3751
3752 if (LHS.getOpcode() == ISD::FNEG)
3753 LHS = LHS.getOperand(0);
3754 else if (RHS.getOpcode() == ISD::FNEG)
3755 RHS = RHS.getOperand(0);
3756 else
3757 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3758
3759 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3760 if (Res.getOpcode() != Opc)
3761 return SDValue(); // Op got folded away.
3762 if (!N0.hasOneUse())
3763 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3764 return Res;
3765 }
3766 case ISD::FMA:
3767 case ISD::FMAD: {
3768 // TODO: handle llvm.amdgcn.fma.legacy
3769 if (!mayIgnoreSignedZero(N0))
3770 return SDValue();
3771
3772 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3773 SDValue LHS = N0.getOperand(0);
3774 SDValue MHS = N0.getOperand(1);
3775 SDValue RHS = N0.getOperand(2);
3776
3777 if (LHS.getOpcode() == ISD::FNEG)
3778 LHS = LHS.getOperand(0);
3779 else if (MHS.getOpcode() == ISD::FNEG)
3780 MHS = MHS.getOperand(0);
3781 else
3782 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3783
3784 if (RHS.getOpcode() != ISD::FNEG)
3785 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3786 else
3787 RHS = RHS.getOperand(0);
3788
3789 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3790 if (Res.getOpcode() != Opc)
3791 return SDValue(); // Op got folded away.
3792 if (!N0.hasOneUse())
3793 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3794 return Res;
3795 }
3796 case ISD::FMAXNUM:
3797 case ISD::FMINNUM:
3798 case ISD::FMAXNUM_IEEE:
3799 case ISD::FMINNUM_IEEE:
3800 case AMDGPUISD::FMAX_LEGACY:
3801 case AMDGPUISD::FMIN_LEGACY: {
3802 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3803 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3804 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3805 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3806
3807 SDValue LHS = N0.getOperand(0);
3808 SDValue RHS = N0.getOperand(1);
3809
3810 // 0 doesn't have a negated inline immediate.
3811 // TODO: This constant check should be generalized to other operations.
3812 if (isConstantCostlierToNegate(RHS))
3813 return SDValue();
3814
3815 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3816 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3817 unsigned Opposite = inverseMinMax(Opc);
3818
3819 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3820 if (Res.getOpcode() != Opposite)
3821 return SDValue(); // Op got folded away.
3822 if (!N0.hasOneUse())
3823 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3824 return Res;
3825 }
3826 case AMDGPUISD::FMED3: {
3827 SDValue Ops[3];
3828 for (unsigned I = 0; I < 3; ++I)
3829 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3830
3831 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3832 if (Res.getOpcode() != AMDGPUISD::FMED3)
3833 return SDValue(); // Op got folded away.
3834
3835 if (!N0.hasOneUse()) {
3836 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3837 DAG.ReplaceAllUsesWith(N0, Neg);
3838
3839 for (SDNode *U : Neg->uses())
3840 DCI.AddToWorklist(U);
3841 }
3842
3843 return Res;
3844 }
3845 case ISD::FP_EXTEND:
3846 case ISD::FTRUNC:
3847 case ISD::FRINT:
3848 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3849 case ISD::FSIN:
3850 case ISD::FCANONICALIZE:
3851 case AMDGPUISD::RCP:
3852 case AMDGPUISD::RCP_LEGACY:
3853 case AMDGPUISD::RCP_IFLAG:
3854 case AMDGPUISD::SIN_HW: {
3855 SDValue CvtSrc = N0.getOperand(0);
3856 if (CvtSrc.getOpcode() == ISD::FNEG) {
3857 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3858 // (fneg (rcp (fneg x))) -> (rcp x)
3859 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3860 }
3861
3862 if (!N0.hasOneUse())
3863 return SDValue();
3864
3865 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3866 // (fneg (rcp x)) -> (rcp (fneg x))
3867 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3868 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3869 }
3870 case ISD::FP_ROUND: {
3871 SDValue CvtSrc = N0.getOperand(0);
3872
3873 if (CvtSrc.getOpcode() == ISD::FNEG) {
3874 // (fneg (fp_round (fneg x))) -> (fp_round x)
3875 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3876 CvtSrc.getOperand(0), N0.getOperand(1));
3877 }
3878
3879 if (!N0.hasOneUse())
3880 return SDValue();
3881
3882 // (fneg (fp_round x)) -> (fp_round (fneg x))
3883 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3884 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3885 }
3886 case ISD::FP16_TO_FP: {
3887 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3888 // f16, but legalization of f16 fneg ends up pulling it out of the source.
3889 // Put the fneg back as a legal source operation that can be matched later.
3890 SDLoc SL(N);
3891
3892 SDValue Src = N0.getOperand(0);
3893 EVT SrcVT = Src.getValueType();
3894
3895 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3896 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3897 DAG.getConstant(0x8000, SL, SrcVT));
3898 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3899 }
3900 default:
3901 return SDValue();
3902 }
3903}
3904
3905SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3906 DAGCombinerInfo &DCI) const {
3907 SelectionDAG &DAG = DCI.DAG;
3908 SDValue N0 = N->getOperand(0);
3909
3910 if (!N0.hasOneUse())
3911 return SDValue();
3912
3913 switch (N0.getOpcode()) {
3914 case ISD::FP16_TO_FP: {
3915 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3916 SDLoc SL(N);
3917 SDValue Src = N0.getOperand(0);
3918 EVT SrcVT = Src.getValueType();
3919
3920 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3921 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3922 DAG.getConstant(0x7fff, SL, SrcVT));
3923 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3924 }
3925 default:
3926 return SDValue();
3927 }
3928}
3929
3930SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3931 DAGCombinerInfo &DCI) const {
3932 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3933 if (!CFP)
3934 return SDValue();
3935
3936 // XXX - Should this flush denormals?
3937 const APFloat &Val = CFP->getValueAPF();
3938 APFloat One(Val.getSemantics(), "1.0");
3939 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3940}
3941
3942SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3943 DAGCombinerInfo &DCI) const {
3944 SelectionDAG &DAG = DCI.DAG;
3945 SDLoc DL(N);
3946
3947 switch(N->getOpcode()) {
3948 default:
3949 break;
3950 case ISD::BITCAST: {
3951 EVT DestVT = N->getValueType(0);
3952
3953 // Push casts through vector builds. This helps avoid emitting a large
3954 // number of copies when materializing floating point vector constants.
3955 //
3956 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3957 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3958 if (DestVT.isVector()) {
3959 SDValue Src = N->getOperand(0);
3960 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3961 EVT SrcVT = Src.getValueType();
3962 unsigned NElts = DestVT.getVectorNumElements();
3963
3964 if (SrcVT.getVectorNumElements() == NElts) {
3965 EVT DestEltVT = DestVT.getVectorElementType();
3966
3967 SmallVector<SDValue, 8> CastedElts;
3968 SDLoc SL(N);
3969 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3970 SDValue Elt = Src.getOperand(I);
3971 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3972 }
3973
3974 return DAG.getBuildVector(DestVT, SL, CastedElts);
3975 }
3976 }
3977 }
3978
3979 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
3980 break;
3981
3982 // Fold bitcasts of constants.
3983 //
3984 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3985 // TODO: Generalize and move to DAGCombiner
3986 SDValue Src = N->getOperand(0);
3987 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3988 SDLoc SL(N);
3989 uint64_t CVal = C->getZExtValue();
3990 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3991 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3992 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3993 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
3994 }
3995
3996 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3997 const APInt &Val = C->getValueAPF().bitcastToAPInt();
3998 SDLoc SL(N);
3999 uint64_t CVal = Val.getZExtValue();
4000 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4001 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4002 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4003
4004 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4005 }
4006
4007 break;
4008 }
4009 case ISD::SHL: {
4010 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4011 break;
4012
4013 return performShlCombine(N, DCI);
4014 }
4015 case ISD::SRL: {
4016 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4017 break;
4018
4019 return performSrlCombine(N, DCI);
4020 }
4021 case ISD::SRA: {
4022 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4023 break;
4024
4025 return performSraCombine(N, DCI);
4026 }
4027 case ISD::TRUNCATE:
4028 return performTruncateCombine(N, DCI);
4029 case ISD::MUL:
4030 return performMulCombine(N, DCI);
4031 case ISD::SMUL_LOHI:
4032 case ISD::UMUL_LOHI:
4033 return performMulLoHiCombine(N, DCI);
4034 case ISD::MULHS:
4035 return performMulhsCombine(N, DCI);
4036 case ISD::MULHU:
4037 return performMulhuCombine(N, DCI);
4038 case AMDGPUISD::MUL_I24:
4039 case AMDGPUISD::MUL_U24:
4040 case AMDGPUISD::MULHI_I24:
4041 case AMDGPUISD::MULHI_U24:
4042 return simplifyMul24(N, DCI);
4043 case ISD::SELECT:
4044 return performSelectCombine(N, DCI);
4045 case ISD::FNEG:
4046 return performFNegCombine(N, DCI);
4047 case ISD::FABS:
4048 return performFAbsCombine(N, DCI);
4049 case AMDGPUISD::BFE_I32:
4050 case AMDGPUISD::BFE_U32: {
4051 assert(!N->getValueType(0).isVector() &&
4052 "Vector handling of BFE not implemented");
4053 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4054 if (!Width)
4055 break;
4056
4057 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4058 if (WidthVal == 0)
4059 return DAG.getConstant(0, DL, MVT::i32);
4060
4061 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4062 if (!Offset)
4063 break;
4064
4065 SDValue BitsFrom = N->getOperand(0);
4066 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4067
4068 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4069
4070 if (OffsetVal == 0) {
4071 // This is already sign / zero extended, so try to fold away extra BFEs.
4072 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4073
4074 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4075 if (OpSignBits >= SignBits)
4076 return BitsFrom;
4077
4078 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4079 if (Signed) {
4080 // This is a sign_extend_inreg. Replace it to take advantage of existing
4081 // DAG Combines. If not eliminated, we will match back to BFE during
4082 // selection.
4083
4084 // TODO: The sext_inreg of extended types ends, although we can could
4085 // handle them in a single BFE.
4086 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4087 DAG.getValueType(SmallVT));
4088 }
4089
4090 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4091 }
4092
4093 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4094 if (Signed) {
4095 return constantFoldBFE<int32_t>(DAG,
4096 CVal->getSExtValue(),
4097 OffsetVal,
4098 WidthVal,
4099 DL);
4100 }
4101
4102 return constantFoldBFE<uint32_t>(DAG,
4103 CVal->getZExtValue(),
4104 OffsetVal,
4105 WidthVal,
4106 DL);
4107 }
4108
4109 if ((OffsetVal + WidthVal) >= 32 &&
4110 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4111 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4112 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4113 BitsFrom, ShiftVal);
4114 }
4115
4116 if (BitsFrom.hasOneUse()) {
4117 APInt Demanded = APInt::getBitsSet(32,
4118 OffsetVal,
4119 OffsetVal + WidthVal);
4120
4121 KnownBits Known;
4122 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4123 !DCI.isBeforeLegalizeOps());
4124 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4125 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4126 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4127 DCI.CommitTargetLoweringOpt(TLO);
4128 }
4129 }
4130
4131 break;
4132 }
4133 case ISD::LOAD:
4134 return performLoadCombine(N, DCI);
4135 case ISD::STORE:
4136 return performStoreCombine(N, DCI);
4137 case AMDGPUISD::RCP:
4138 case AMDGPUISD::RCP_IFLAG:
4139 return performRcpCombine(N, DCI);
4140 case ISD::AssertZext:
4141 case ISD::AssertSext:
4142 return performAssertSZExtCombine(N, DCI);
4143 case ISD::INTRINSIC_WO_CHAIN:
4144 return performIntrinsicWOChainCombine(N, DCI);
4145 }
4146 return SDValue();
4147}
4148
4149//===----------------------------------------------------------------------===//
4150// Helper functions
4151//===----------------------------------------------------------------------===//
4152
4153SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4154 const TargetRegisterClass *RC,
4155 Register Reg, EVT VT,
4156 const SDLoc &SL,
4157 bool RawReg) const {
4158 MachineFunction &MF = DAG.getMachineFunction();
4159 MachineRegisterInfo &MRI = MF.getRegInfo();
4160 Register VReg;
4161
4162 if (!MRI.isLiveIn(Reg)) {
4163 VReg = MRI.createVirtualRegister(RC);
4164 MRI.addLiveIn(Reg, VReg);
4165 } else {
4166 VReg = MRI.getLiveInVirtReg(Reg);
4167 }
4168
4169 if (RawReg)
4170 return DAG.getRegister(VReg, VT);
4171
4172 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4173}
4174
4175// This may be called multiple times, and nothing prevents creating multiple
4176// objects at the same offset. See if we already defined this object.
4177static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4178 int64_t Offset) {
4179 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4180 if (MFI.getObjectOffset(I) == Offset) {
4181 assert(MFI.getObjectSize(I) == Size);
4182 return I;
4183 }
4184 }
4185
4186 return MFI.CreateFixedObject(Size, Offset, true);
4187}
4188
4189SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4190 EVT VT,
4191 const SDLoc &SL,
4192 int64_t Offset) const {
4193 MachineFunction &MF = DAG.getMachineFunction();
4194 MachineFrameInfo &MFI = MF.getFrameInfo();
4195 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4196
4197 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4198 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4199
4200 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4201 MachineMemOperand::MODereferenceable |
4202 MachineMemOperand::MOInvariant);
4203}
4204
4205SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4206 const SDLoc &SL,
4207 SDValue Chain,
4208 SDValue ArgVal,
4209 int64_t Offset) const {
4210 MachineFunction &MF = DAG.getMachineFunction();
4211 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4212 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4213
4214 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4215 // Stores to the argument stack area are relative to the stack pointer.
4216 SDValue SP =
4217 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4218 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4219 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4220 MachineMemOperand::MODereferenceable);
4221 return Store;
4222}
4223
4224SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4225 const TargetRegisterClass *RC,
4226 EVT VT, const SDLoc &SL,
4227 const ArgDescriptor &Arg) const {
4228 assert(Arg && "Attempting to load missing argument");
1
Assuming the condition is true
2
'?' condition is true
4229
4230 SDValue V = Arg.isRegister() ?
3
'?' condition is true
4231 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4232 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4233
4234 if (!Arg.isMasked())
4
Taking false branch
4235 return V;
4236
4237 unsigned Mask = Arg.getMask();
4238 unsigned Shift = countTrailingZeros<unsigned>(Mask);
5
Calling 'countTrailingZeros<unsigned int>'
12
Returning from 'countTrailingZeros<unsigned int>'
13
'Shift' initialized to 32
4239 V = DAG.getNode(ISD::SRL, SL, VT, V,
4240 DAG.getShiftAmountConstant(Shift, VT, SL));
4241 return DAG.getNode(ISD::AND, SL, VT, V,
4242 DAG.getConstant(Mask >> Shift, SL, VT));
14
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4243}
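
The path above reaches line 4242 with Shift == 32 because Arg.getMask() is assumed to be 0 and countTrailingZeros, with its default ZB_Width behavior, returns the full type width for a zero input. A minimal sketch (not from the LLVM tree) of one way to keep that host-side shift in range, assuming a masked argument is really expected to carry a non-zero mask; the helper name is illustrative only:

#include <cassert>
#include <cstdint>

// Compute Mask >> Shift without ever shifting a 32-bit value by 32.
// Assumes a masked argument is meant to have a non-zero mask; the guard
// keeps the expression defined even if that assumption is violated.
static uint32_t shiftedMaskOrZero(uint32_t Mask, unsigned Shift) {
  assert(Mask != 0 && "masked argument should have a non-zero mask");
  return Shift >= 32 ? 0u : (Mask >> Shift);
}

In loadInputValue this would stand in for the bare Mask >> Shift at line 4242; an equivalent fix might be to return V early when Mask == 0, since shifting by 32 and then masking with 0 cannot produce a meaningful value anyway.
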
4244
4245uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4246 const MachineFunction &MF, const ImplicitParameter Param) const {
4247 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4248 const AMDGPUSubtarget &ST =
4249 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4250 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4251 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4252 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4253 ExplicitArgOffset;
4254 switch (Param) {
4255 case FIRST_IMPLICIT:
4256 return ArgOffset;
4257 case PRIVATE_BASE:
4258 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
4259 case SHARED_BASE:
4260 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
4261 case QUEUE_PTR:
4262 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
4263 }
4264 llvm_unreachable("unexpected implicit parameter type");
4265}
4266
4267#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4268
4269const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4270 switch ((AMDGPUISD::NodeType)Opcode) {
4271 case AMDGPUISD::FIRST_NUMBER: break;
4272 // AMDIL DAG nodes
4273 NODE_NAME_CASE(UMUL);
4274 NODE_NAME_CASE(BRANCH_COND);
4275
4276 // AMDGPU DAG nodes
4277 NODE_NAME_CASE(IF)
4278 NODE_NAME_CASE(ELSE)
4279 NODE_NAME_CASE(LOOP)
4280 NODE_NAME_CASE(CALL)
4281 NODE_NAME_CASE(TC_RETURN)
4282 NODE_NAME_CASE(TRAP)
4283 NODE_NAME_CASE(RET_FLAG)
4284 NODE_NAME_CASE(RETURN_TO_EPILOG)
4285 NODE_NAME_CASE(ENDPGM)
4286 NODE_NAME_CASE(DWORDADDR)
4287 NODE_NAME_CASE(FRACT)
4288 NODE_NAME_CASE(SETCC)
4289 NODE_NAME_CASE(SETREG)
4290 NODE_NAME_CASE(DENORM_MODE)
4291 NODE_NAME_CASE(FMA_W_CHAIN)
4292 NODE_NAME_CASE(FMUL_W_CHAIN)
4293 NODE_NAME_CASE(CLAMP)
4294 NODE_NAME_CASE(COS_HW)
4295 NODE_NAME_CASE(SIN_HW)
4296 NODE_NAME_CASE(FMAX_LEGACY)
4297 NODE_NAME_CASE(FMIN_LEGACY)
4298 NODE_NAME_CASE(FMAX3)
4299 NODE_NAME_CASE(SMAX3)
4300 NODE_NAME_CASE(UMAX3)
4301 NODE_NAME_CASE(FMIN3)
4302 NODE_NAME_CASE(SMIN3)
4303 NODE_NAME_CASE(UMIN3)
4304 NODE_NAME_CASE(FMED3)
4305 NODE_NAME_CASE(SMED3)
4306 NODE_NAME_CASE(UMED3)
4307 NODE_NAME_CASE(FDOT2)
4308 NODE_NAME_CASE(URECIP)
4309 NODE_NAME_CASE(DIV_SCALE)
4310 NODE_NAME_CASE(DIV_FMAS)
4311 NODE_NAME_CASE(DIV_FIXUP)
4312 NODE_NAME_CASE(FMAD_FTZ)
4313 NODE_NAME_CASE(RCP)
4314 NODE_NAME_CASE(RSQ)
4315 NODE_NAME_CASE(RCP_LEGACY)
4316 NODE_NAME_CASE(RCP_IFLAG)
4317 NODE_NAME_CASE(FMUL_LEGACY)
4318 NODE_NAME_CASE(RSQ_CLAMP)
4319 NODE_NAME_CASE(LDEXP)
4320 NODE_NAME_CASE(FP_CLASS)
4321 NODE_NAME_CASE(DOT4)
4322 NODE_NAME_CASE(CARRY)
4323 NODE_NAME_CASE(BORROW)
4324 NODE_NAME_CASE(BFE_U32)
4325 NODE_NAME_CASE(BFE_I32)
4326 NODE_NAME_CASE(BFI)
4327 NODE_NAME_CASE(BFM)
4328 NODE_NAME_CASE(FFBH_U32)
4329 NODE_NAME_CASE(FFBH_I32)
4330 NODE_NAME_CASE(FFBL_B32)
4331 NODE_NAME_CASE(MUL_U24)
4332 NODE_NAME_CASE(MUL_I24)
4333 NODE_NAME_CASE(MULHI_U24)
4334 NODE_NAME_CASE(MULHI_I24)
4335 NODE_NAME_CASE(MAD_U24)
4336 NODE_NAME_CASE(MAD_I24)
4337 NODE_NAME_CASE(MAD_I64_I32)
4338 NODE_NAME_CASE(MAD_U64_U32)
4339 NODE_NAME_CASE(PERM)
4340 NODE_NAME_CASE(TEXTURE_FETCH)
4341 NODE_NAME_CASE(R600_EXPORT)
4342 NODE_NAME_CASE(CONST_ADDRESS)
4343 NODE_NAME_CASE(REGISTER_LOAD)
4344 NODE_NAME_CASE(REGISTER_STORE)
4345 NODE_NAME_CASE(SAMPLE)
4346 NODE_NAME_CASE(SAMPLEB)
4347 NODE_NAME_CASE(SAMPLED)
4348 NODE_NAME_CASE(SAMPLEL)
4349 NODE_NAME_CASE(CVT_F32_UBYTE0)
4350 NODE_NAME_CASE(CVT_F32_UBYTE1)
4351 NODE_NAME_CASE(CVT_F32_UBYTE2)
4352 NODE_NAME_CASE(CVT_F32_UBYTE3)
4353 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4354 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4355 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4356 NODE_NAME_CASE(CVT_PK_I16_I32)
4357 NODE_NAME_CASE(CVT_PK_U16_U32)
4358 NODE_NAME_CASE(FP_TO_FP16)
4359 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4360 NODE_NAME_CASE(CONST_DATA_PTR)
4361 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4362 NODE_NAME_CASE(LDS)
4363 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
4364 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
4365 NODE_NAME_CASE(DUMMY_CHAIN)
4366 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4367 NODE_NAME_CASE(LOAD_D16_HI)
4368 NODE_NAME_CASE(LOAD_D16_LO)
4369 NODE_NAME_CASE(LOAD_D16_HI_I8)
4370 NODE_NAME_CASE(LOAD_D16_HI_U8)
4371 NODE_NAME_CASE(LOAD_D16_LO_I8)
4372 NODE_NAME_CASE(LOAD_D16_LO_U8)
4373 NODE_NAME_CASE(STORE_MSKOR)
4374 NODE_NAME_CASE(LOAD_CONSTANT)
4375 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4376 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4377 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4378 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4379 NODE_NAME_CASE(DS_ORDERED_COUNT)
4380 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4381 NODE_NAME_CASE(ATOMIC_INC)
4382 NODE_NAME_CASE(ATOMIC_DEC)
4383 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4384 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4385 NODE_NAME_CASE(BUFFER_LOAD)
4386 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4387 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4388 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4389 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4390 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4391 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4392 NODE_NAME_CASE(SBUFFER_LOAD)
4393 NODE_NAME_CASE(BUFFER_STORE)
4394 NODE_NAME_CASE(BUFFER_STORE_BYTE)
4395 NODE_NAME_CASE(BUFFER_STORE_SHORT)
4396 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4397 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4398 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4399 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4400 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4401 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4402 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4403 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4404 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4405 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4406 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4407 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4408 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4409 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4410 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4411 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4412 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4413 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4414 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4415
4416 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4417 }
4418 return nullptr;
4419}
4420
4421SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4422 SelectionDAG &DAG, int Enabled,
4423 int &RefinementSteps,
4424 bool &UseOneConstNR,
4425 bool Reciprocal) const {
4426 EVT VT = Operand.getValueType();
4427
4428 if (VT == MVT::f32) {
4429 RefinementSteps = 0;
4430 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4431 }
4432
4433 // TODO: There is also an f64 rsq instruction, but the documentation is less
4434 // clear about its precision.
4435
4436 return SDValue();
4437}
4438
4439SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4440 SelectionDAG &DAG, int Enabled,
4441 int &RefinementSteps) const {
4442 EVT VT = Operand.getValueType();
4443
4444 if (VT == MVT::f32) {
4445 // Reciprocal, < 1 ulp error.
4446 //
4447 // This reciprocal approximation converges to < 0.5 ulp error with one
4448 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4449
4450 RefinementSteps = 0;
4451 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4452 }
4453
4454 // TODO: There is also an f64 rcp instruction, but the documentation is less
4455 // clear about its precision.
4456
4457 return SDValue();
4458}
4459
4460static unsigned workitemIntrinsicDim(unsigned ID) {
4461 switch (ID) {
4462 case Intrinsic::amdgcn_workitem_id_x:
4463 return 0;
4464 case Intrinsic::amdgcn_workitem_id_y:
4465 return 1;
4466 case Intrinsic::amdgcn_workitem_id_z:
4467 return 2;
4468 default:
4469 llvm_unreachable("not a workitem intrinsic");
4470 }
4471}
4472
4473void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4474 const SDValue Op, KnownBits &Known,
4475 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4476
4477 Known.resetAll(); // Don't know anything.
4478
4479 unsigned Opc = Op.getOpcode();
4480
4481 switch (Opc) {
4482 default:
4483 break;
4484 case AMDGPUISD::CARRY:
4485 case AMDGPUISD::BORROW: {
4486 Known.Zero = APInt::getHighBitsSet(32, 31);
4487 break;
4488 }
4489
4490 case AMDGPUISD::BFE_I32:
4491 case AMDGPUISD::BFE_U32: {
4492 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4493 if (!CWidth)
4494 return;
4495
4496 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4497
4498 if (Opc == AMDGPUISD::BFE_U32)
4499 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4500
4501 break;
4502 }
4503 case AMDGPUISD::FP_TO_FP16: {
4504 unsigned BitWidth = Known.getBitWidth();
4505
4506 // High bits are zero.
4507 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4508 break;
4509 }
4510 case AMDGPUISD::MUL_U24:
4511 case AMDGPUISD::MUL_I24: {
4512 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4513 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4514 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4515 RHSKnown.countMinTrailingZeros();
4516 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4517 // Skip extra check if all bits are known zeros.
4518 if (TrailZ >= 32)
4519 break;
4520
4521 // Truncate to 24 bits.
4522 LHSKnown = LHSKnown.trunc(24);
4523 RHSKnown = RHSKnown.trunc(24);
4524
4525 if (Opc == AMDGPUISD::MUL_I24) {
4526 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4527 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4528 unsigned MaxValBits = LHSValBits + RHSValBits;
4529 if (MaxValBits > 32)
4530 break;
4531 unsigned SignBits = 32 - MaxValBits + 1;
4532 bool LHSNegative = LHSKnown.isNegative();
4533 bool LHSNonNegative = LHSKnown.isNonNegative();
4534 bool LHSPositive = LHSKnown.isStrictlyPositive();
4535 bool RHSNegative = RHSKnown.isNegative();
4536 bool RHSNonNegative = RHSKnown.isNonNegative();
4537 bool RHSPositive = RHSKnown.isStrictlyPositive();
4538
4539 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4540 Known.Zero.setHighBits(SignBits);
4541 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4542 Known.One.setHighBits(SignBits);
4543 } else {
4544 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4545 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4546 unsigned MaxValBits = LHSValBits + RHSValBits;
4547 if (MaxValBits >= 32)
4548 break;
4549 Known.Zero.setBitsFrom(MaxValBits);
4550 }
4551 break;
4552 }
4553 case AMDGPUISD::PERM: {
4554 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4555 if (!CMask)
4556 return;
4557
4558 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4559 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4560 unsigned Sel = CMask->getZExtValue();
4561
4562 for (unsigned I = 0; I < 32; I += 8) {
4563 unsigned SelBits = Sel & 0xff;
4564 if (SelBits < 4) {
4565 SelBits *= 8;
4566 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4567 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4568 } else if (SelBits < 7) {
4569 SelBits = (SelBits & 3) * 8;
4570 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4571 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4572 } else if (SelBits == 0x0c) {
4573 Known.Zero |= 0xFFull << I;
4574 } else if (SelBits > 0x0c) {
4575 Known.One |= 0xFFull << I;
4576 }
4577 Sel >>= 8;
4578 }
4579 break;
4580 }
4581 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4582 Known.Zero.setHighBits(24);
4583 break;
4584 }
4585 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4586 Known.Zero.setHighBits(16);
4587 break;
4588 }
4589 case AMDGPUISD::LDS: {
4590 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4591 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4592
4593 Known.Zero.setHighBits(16);
4594 Known.Zero.setLowBits(Log2(Alignment));
4595 break;
4596 }
4597 case ISD::INTRINSIC_WO_CHAIN: {
4598 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4599 switch (IID) {
4600 case Intrinsic::amdgcn_mbcnt_lo:
4601 case Intrinsic::amdgcn_mbcnt_hi: {
4602 const GCNSubtarget &ST =
4603 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4604 // These return at most the (wavefront size - 1) + src1
4605 // As long as src1 is an immediate we can calc known bits
4606 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
4607 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
4608 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
4609 // Cater for potential carry
4610 MaxActiveBits += Src1ValBits ? 1 : 0;
4611 unsigned Size = Op.getValueType().getSizeInBits();
4612 if (MaxActiveBits < Size)
4613 Known.Zero.setHighBits(Size - MaxActiveBits);
4614 break;
4615 }
4616 case Intrinsic::amdgcn_workitem_id_x:
4617 case Intrinsic::amdgcn_workitem_id_y:
4618 case Intrinsic::amdgcn_workitem_id_z: {
4619 unsigned MaxValue = Subtarget->getMaxWorkitemID(
4620 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
4621 Known.Zero.setHighBits(countLeadingZeros(MaxValue));
4622 break;
4623 }
4624 default:
4625 break;
4626 }
4627 }
4628 }
4629}
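
The AMDGPUISD::PERM case above models a per-byte selector: each 8-bit field of the constant mask picks byte 0-3 of the second operand (selector values 0-3), byte 0-2 of the first operand (values 4-6), a constant 0x00 (value 0x0c), or a constant 0xff (values above 0x0c). The standalone sketch below applies the same selection rule to fully known 32-bit constants, purely to illustrate the semantics the known-bits loop is tracking; the function name is made up, and selector encodings the loop does not handle are left unmodeled.

#include <cstdint>

// Illustrative only: mirrors the byte-select cases handled by the PERM
// known-bits code above, for inputs whose bits are all known.
static uint32_t permBytesModel(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 32; I += 8) {
    unsigned SelBits = Sel & 0xff;
    uint32_t Byte = 0;
    if (SelBits < 4)
      Byte = (Src1 >> (SelBits * 8)) & 0xff;       // bytes 0-3 of src1
    else if (SelBits < 7)
      Byte = (Src0 >> ((SelBits & 3) * 8)) & 0xff; // bytes 0-2 of src0
    else if (SelBits == 0x0c)
      Byte = 0x00;                                 // forced zero byte
    else if (SelBits > 0x0c)
      Byte = 0xff;                                 // forced 0xff byte
    // Selectors 7..0x0b are not modeled here (the loop above leaves those
    // bytes unknown); this sketch simply yields 0 for them.
    Result |= Byte << I;
    Sel >>= 8;
  }
  return Result;
}
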
4630
4631unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4632 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4633 unsigned Depth) const {
4634 switch (Op.getOpcode()) {
4635 case AMDGPUISD::BFE_I32: {
4636 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4637 if (!Width)
4638 return 1;
4639
4640 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4641 if (!isNullConstant(Op.getOperand(1)))
4642 return SignBits;
4643
4644 // TODO: Could probably figure something out with non-0 offsets.
4645 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4646 return std::max(SignBits, Op0SignBits);
4647 }
4648
4649 case AMDGPUISD::BFE_U32: {
4650 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4651 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4652 }
4653
4654 case AMDGPUISD::CARRY:
4655 case AMDGPUISD::BORROW:
4656 return 31;
4657 case AMDGPUISD::BUFFER_LOAD_BYTE:
4658 return 25;
4659 case AMDGPUISD::BUFFER_LOAD_SHORT:
4660 return 17;
4661 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4662 return 24;
4663 case AMDGPUISD::BUFFER_LOAD_USHORT:
4664 return 16;
4665 case AMDGPUISD::FP_TO_FP16:
4666 return 16;
4667 default:
4668 return 1;
4669 }
4670}
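
For BFE_I32 with a constant width W and zero offset, the extracted field is sign-extended from bit W - 1, so the top 32 - W + 1 result bits are all copies of the field's sign bit; that is where SignBits = 32 - Width + 1 above comes from. A small self-contained check of that count (the helpers below are illustrative, not LLVM APIs):

#include <cstdint>

// Sign-extend the low W bits of X to 32 bits, using the same two-shift
// trick as SignExtend32 in MathExtras.h further down in this report.
static int32_t signExtendField(uint32_t X, unsigned W) {
  return int32_t(X << (32 - W)) >> (32 - W);
}

// Count how many leading bits of V (including bit 31) equal the sign bit.
static unsigned numSignBits(int32_t V) {
  unsigned N = 1;
  while (N < 32 && (((V >> (31 - N)) & 1) == ((V >> 31) & 1)))
    ++N;
  return N;
}

// e.g. numSignBits(signExtendField(0x80, 8)) == 25 == 32 - 8 + 1, and the
// same count holds for the non-negative case signExtendField(0x7f, 8).
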
4671
4672unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4673 GISelKnownBits &Analysis, Register R,
4674 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4675 unsigned Depth) const {
4676 const MachineInstr *MI = MRI.getVRegDef(R);
4677 if (!MI)
4678 return 1;
4679
4680 // TODO: Check range metadata on MMO.
4681 switch (MI->getOpcode()) {
4682 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4683 return 25;
4684 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4685 return 17;
4686 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4687 return 24;
4688 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4689 return 16;
4690 default:
4691 return 1;
4692 }
4693}
4694
4695bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4696 const SelectionDAG &DAG,
4697 bool SNaN,
4698 unsigned Depth) const {
4699 unsigned Opcode = Op.getOpcode();
4700 switch (Opcode) {
4701 case AMDGPUISD::FMIN_LEGACY:
4702 case AMDGPUISD::FMAX_LEGACY: {
4703 if (SNaN)
4704 return true;
4705
4706 // TODO: Can check no nans on one of the operands for each one, but which
4707 // one?
4708 return false;
4709 }
4710 case AMDGPUISD::FMUL_LEGACY:
4711 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4712 if (SNaN)
4713 return true;
4714 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4715 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4716 }
4717 case AMDGPUISD::FMED3:
4718 case AMDGPUISD::FMIN3:
4719 case AMDGPUISD::FMAX3:
4720 case AMDGPUISD::FMAD_FTZ: {
4721 if (SNaN)
4722 return true;
4723 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4724 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4725 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4726 }
4727 case AMDGPUISD::CVT_F32_UBYTE0:
4728 case AMDGPUISD::CVT_F32_UBYTE1:
4729 case AMDGPUISD::CVT_F32_UBYTE2:
4730 case AMDGPUISD::CVT_F32_UBYTE3:
4731 return true;
4732
4733 case AMDGPUISD::RCP:
4734 case AMDGPUISD::RSQ:
4735 case AMDGPUISD::RCP_LEGACY:
4736 case AMDGPUISD::RSQ_CLAMP: {
4737 if (SNaN)
4738 return true;
4739
4740 // TODO: Need is known positive check.
4741 return false;
4742 }
4743 case AMDGPUISD::LDEXP:
4744 case AMDGPUISD::FRACT: {
4745 if (SNaN)
4746 return true;
4747 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4748 }
4749 case AMDGPUISD::DIV_SCALE:
4750 case AMDGPUISD::DIV_FMAS:
4751 case AMDGPUISD::DIV_FIXUP:
4752 // TODO: Refine on operands.
4753 return SNaN;
4754 case AMDGPUISD::SIN_HW:
4755 case AMDGPUISD::COS_HW: {
4756 // TODO: Need check for infinity
4757 return SNaN;
4758 }
4759 case ISD::INTRINSIC_WO_CHAIN: {
4760 unsigned IntrinsicID
4761 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4762 // TODO: Handle more intrinsics
4763 switch (IntrinsicID) {
4764 case Intrinsic::amdgcn_cubeid:
4765 return true;
4766
4767 case Intrinsic::amdgcn_frexp_mant: {
4768 if (SNaN)
4769 return true;
4770 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4771 }
4772 case Intrinsic::amdgcn_cvt_pkrtz: {
4773 if (SNaN)
4774 return true;
4775 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4776 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4777 }
4778 case Intrinsic::amdgcn_rcp:
4779 case Intrinsic::amdgcn_rsq:
4780 case Intrinsic::amdgcn_rcp_legacy:
4781 case Intrinsic::amdgcn_rsq_legacy:
4782 case Intrinsic::amdgcn_rsq_clamp: {
4783 if (SNaN)
4784 return true;
4785
4786 // TODO: Need is known positive check.
4787 return false;
4788 }
4789 case Intrinsic::amdgcn_trig_preop:
4790 case Intrinsic::amdgcn_fdot2:
4791 // TODO: Refine on operand
4792 return SNaN;
4793 case Intrinsic::amdgcn_fma_legacy:
4794 if (SNaN)
4795 return true;
4796 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4797 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4798 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4799 default:
4800 return false;
4801 }
4802 }
4803 default:
4804 return false;
4805 }
4806}
4807
4808TargetLowering::AtomicExpansionKind
4809AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4810 switch (RMW->getOperation()) {
4811 case AtomicRMWInst::Nand:
4812 case AtomicRMWInst::FAdd:
4813 case AtomicRMWInst::FSub:
4814 case AtomicRMWInst::FMax:
4815 case AtomicRMWInst::FMin:
4816 return AtomicExpansionKind::CmpXChg;
4817 default:
4818 return AtomicExpansionKind::None;
4819 }
4820}
4821
4822bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
4823 unsigned Opc, LLT Ty1, LLT Ty2) const {
4824 return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
4825 Ty2 == LLT::scalar(32);
4826}

/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/ADT/bit.h"
17#include "llvm/Support/Compiler.h"
18#include <cassert>
19#include <climits>
20#include <cmath>
21#include <cstdint>
22#include <cstring>
23#include <limits>
24#include <type_traits>
25
26#ifdef _MSC_VER
27// Declare these intrinsics manually rather including intrin.h. It's very
28// expensive, and MathExtras.h is popular.
29// #include <intrin.h>
30extern "C" {
31unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
32unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
33unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
34unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
35}
36#endif
37
38namespace llvm {
39
40/// The behavior an operation has on an input of 0.
41enum ZeroBehavior {
42 /// The returned value is undefined.
43 ZB_Undefined,
44 /// The returned value is numeric_limits<T>::max()
45 ZB_Max,
46 /// The returned value is numeric_limits<T>::digits
47 ZB_Width
48};
49
50/// Mathematical constants.
51namespace numbers {
52// TODO: Track C++20 std::numbers.
53// TODO: Favor using the hexadecimal FP constants (requires C++17).
54constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
55 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
56 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
57 ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392
58 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
59 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
60 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
61 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
62 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
63 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
62 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
65 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
66 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
67 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
68 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
69constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
70 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
71 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
72 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
73 log2ef = 1.44269504F, // (0x1.715476P+0)
74 log10ef = .434294482F, // (0x1.bcb7b2P-2)
75 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
76 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
77 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
78 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
79 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
80 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
81 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
82 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
83 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
84} // namespace numbers
85
86namespace detail {
87template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
88 static unsigned count(T Val, ZeroBehavior) {
89 if (!Val)
90 return std::numeric_limits<T>::digits;
91 if (Val & 0x1)
92 return 0;
93
94 // Bisection method.
95 unsigned ZeroBits = 0;
96 T Shift = std::numeric_limits<T>::digits >> 1;
97 T Mask = std::numeric_limits<T>::max() >> Shift;
98 while (Shift) {
99 if ((Val & Mask) == 0) {
100 Val >>= Shift;
101 ZeroBits |= Shift;
102 }
103 Shift >>= 1;
104 Mask >>= Shift;
105 }
106 return ZeroBits;
107 }
108};
109
110#if defined(__GNUC__) || defined(_MSC_VER)
111template <typename T> struct TrailingZerosCounter<T, 4> {
112 static unsigned count(T Val, ZeroBehavior ZB) {
113 if (ZB != ZB_Undefined && Val == 0)
6.1
'ZB' is not equal to ZB_Undefined
7
Assuming 'Val' is equal to 0
8
Taking true branch
114 return 32;
9
Returning the value 32
115
116#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
117 return __builtin_ctz(Val);
118#elif defined(_MSC_VER)
119 unsigned long Index;
120 _BitScanForward(&Index, Val);
121 return Index;
122#endif
123 }
124};
125
126#if !defined(_MSC_VER) || defined(_M_X64)
127template <typename T> struct TrailingZerosCounter<T, 8> {
128 static unsigned count(T Val, ZeroBehavior ZB) {
129 if (ZB != ZB_Undefined && Val == 0)
130 return 64;
131
132#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
133 return __builtin_ctzll(Val);
134#elif defined(_MSC_VER)
135 unsigned long Index;
136 _BitScanForward64(&Index, Val);
137 return Index;
138#endif
139 }
140};
141#endif
142#endif
143} // namespace detail
144
145/// Count number of 0's from the least significant bit to the most
146/// stopping at the first 1.
147///
148/// Only unsigned integral types are allowed.
149///
150/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
151/// valid arguments.
152template <typename T>
153unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
154 static_assert(std::is_unsigned_v<T>,
155 "Only unsigned integral types are allowed.");
156 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
6
Calling 'TrailingZerosCounter::count'
10
Returning from 'TrailingZerosCounter::count'
11
Returning the value 32
157}
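
The ZB_Width default above is what produces the value 32 on the analyzer path earlier in this report: a 32-bit zero input returns the type width rather than hitting the undefined-input case of __builtin_ctz. A minimal usage check (standalone snippet, assuming the usual LLVM include path):

#include <cassert>
#include "llvm/Support/MathExtras.h"

int main() {
  // Zero input: returns the full width of the type (32 here), which is why
  // any caller that later shifts by this result must handle the == 32 case.
  assert(llvm::countTrailingZeros(0u) == 32);
  // Non-zero input: index of the lowest set bit.
  assert(llvm::countTrailingZeros(0x8u) == 3);
  return 0;
}
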
158
159namespace detail {
160template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
161 static unsigned count(T Val, ZeroBehavior) {
162 if (!Val)
163 return std::numeric_limits<T>::digits;
164
165 // Bisection method.
166 unsigned ZeroBits = 0;
167 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
168 T Tmp = Val >> Shift;
169 if (Tmp)
170 Val = Tmp;
171 else
172 ZeroBits |= Shift;
173 }
174 return ZeroBits;
175 }
176};
177
178#if defined(__GNUC__) || defined(_MSC_VER)
179template <typename T> struct LeadingZerosCounter<T, 4> {
180 static unsigned count(T Val, ZeroBehavior ZB) {
181 if (ZB != ZB_Undefined && Val == 0)
182 return 32;
183
184#if __has_builtin(__builtin_clz) || defined(__GNUC__)
185 return __builtin_clz(Val);
186#elif defined(_MSC_VER)
187 unsigned long Index;
188 _BitScanReverse(&Index, Val);
189 return Index ^ 31;
190#endif
191 }
192};
193
194#if !defined(_MSC_VER) || defined(_M_X64)
195template <typename T> struct LeadingZerosCounter<T, 8> {
196 static unsigned count(T Val, ZeroBehavior ZB) {
197 if (ZB != ZB_Undefined && Val == 0)
198 return 64;
199
200#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
201 return __builtin_clzll(Val);
202#elif defined(_MSC_VER)
203 unsigned long Index;
204 _BitScanReverse64(&Index, Val);
205 return Index ^ 63;
206#endif
207 }
208};
209#endif
210#endif
211} // namespace detail
212
213/// Count number of 0's from the most significant bit to the least
214/// stopping at the first 1.
215///
216/// Only unsigned integral types are allowed.
217///
218/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
219/// valid arguments.
220template <typename T>
221unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
222 static_assert(std::is_unsigned_v<T>,
223 "Only unsigned integral types are allowed.");
224 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
225}
226
227/// Get the index of the first set bit starting from the least
228/// significant bit.
229///
230/// Only unsigned integral types are allowed.
231///
232/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
233/// valid arguments.
234template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
235 if (ZB == ZB_Max && Val == 0)
236 return std::numeric_limits<T>::max();
237
238 return countTrailingZeros(Val, ZB_Undefined);
239}
240
241/// Create a bitmask with the N right-most bits set to 1, and all other
242/// bits set to 0. Only unsigned types are allowed.
243template <typename T> T maskTrailingOnes(unsigned N) {
244 static_assert(std::is_unsigned<T>::value, "Invalid type!");
245 const unsigned Bits = CHAR_BIT * sizeof(T);
246 assert(N <= Bits && "Invalid bit index");
247 return N == 0 ? 0 : (T(-1) >> (Bits - N));
248}
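
Note how this helper dodges the same class of undefined behavior as the one reported above: for N == 0 the shift amount Bits - N would equal the type width, so the N == 0 special case returns 0 directly instead of performing an out-of-range shift. A short usage check (standalone snippet, assuming the LLVM headers are available):

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::maskTrailingOnes<uint32_t>(0) == 0u);           // N == 0: no shift performed
  assert(llvm::maskTrailingOnes<uint32_t>(8) == 0xFFu);        // low 8 bits set
  assert(llvm::maskTrailingOnes<uint32_t>(32) == 0xFFFFFFFFu); // full-width mask
  return 0;
}
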
249
250/// Create a bitmask with the N left-most bits set to 1, and all other
251/// bits set to 0. Only unsigned types are allowed.
252template <typename T> T maskLeadingOnes(unsigned N) {
253 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
254}
255
256/// Create a bitmask with the N right-most bits set to 0, and all other
257/// bits set to 1. Only unsigned types are allowed.
258template <typename T> T maskTrailingZeros(unsigned N) {
259 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
260}
261
262/// Create a bitmask with the N left-most bits set to 0, and all other
263/// bits set to 1. Only unsigned types are allowed.
264template <typename T> T maskLeadingZeros(unsigned N) {
265 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
266}
267
268/// Get the index of the last set bit starting from the least
269/// significant bit.
270///
271/// Only unsigned integral types are allowed.
272///
273/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
274/// valid arguments.
275template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
276 if (ZB == ZB_Max && Val == 0)
277 return std::numeric_limits<T>::max();
278
279 // Use ^ instead of - because both gcc and llvm can remove the associated ^
280 // in the __builtin_clz intrinsic on x86.
281 return countLeadingZeros(Val, ZB_Undefined) ^
282 (std::numeric_limits<T>::digits - 1);
283}
284
285/// Macro compressed bit reversal table for 256 bits.
286///
287/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
288static const unsigned char BitReverseTable256[256] = {
289#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
290#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
291#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
292 R6(0), R6(2), R6(1), R6(3)
293#undef R2
294#undef R4
295#undef R6
296};
297
298/// Reverse the bits in \p Val.
299template <typename T> T reverseBits(T Val) {
300#if __has_builtin(__builtin_bitreverse8)
301 if constexpr (std::is_same_v<T, uint8_t>)
302 return __builtin_bitreverse8(Val);
303#endif
304#if __has_builtin(__builtin_bitreverse16)
305 if constexpr (std::is_same_v<T, uint16_t>)
306 return __builtin_bitreverse16(Val);
307#endif
308#if __has_builtin(__builtin_bitreverse32)
309 if constexpr (std::is_same_v<T, uint32_t>)
310 return __builtin_bitreverse32(Val);
311#endif
312#if __has_builtin(__builtin_bitreverse64)
313 if constexpr (std::is_same_v<T, uint64_t>)
314 return __builtin_bitreverse64(Val);
315#endif
316
317 unsigned char in[sizeof(Val)];
318 unsigned char out[sizeof(Val)];
319 std::memcpy(in, &Val, sizeof(Val));
320 for (unsigned i = 0; i < sizeof(Val); ++i)
321 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
322 std::memcpy(&Val, out, sizeof(Val));
323 return Val;
324}
325
326// NOTE: The following support functions use the _32/_64 extensions instead of
327// type overloading so that signed and unsigned integers can be used without
328// ambiguity.
329
330/// Return the high 32 bits of a 64 bit value.
331constexpr inline uint32_t Hi_32(uint64_t Value) {
332 return static_cast<uint32_t>(Value >> 32);
333}
334
335/// Return the low 32 bits of a 64 bit value.
336constexpr inline uint32_t Lo_32(uint64_t Value) {
337 return static_cast<uint32_t>(Value);
338}
339
340/// Make a 64-bit integer from a high / low pair of 32-bit integers.
341constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
342 return ((uint64_t)High << 32) | (uint64_t)Low;
343}
344
345/// Checks if an integer fits into the given bit width.
346template <unsigned N> constexpr inline bool isInt(int64_t x) {
347 if constexpr (N == 8)
348 return static_cast<int8_t>(x) == x;
349 if constexpr (N == 16)
350 return static_cast<int16_t>(x) == x;
351 if constexpr (N == 32)
352 return static_cast<int32_t>(x) == x;
353 if constexpr (N < 64)
354 return -(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1));
355 (void)x; // MSVC v19.25 warns that x is unused.
356 return true;
357}
358
359/// Checks if a signed integer is an N bit number shifted left by S.
360template <unsigned N, unsigned S>
361constexpr inline bool isShiftedInt(int64_t x) {
362 static_assert(
363 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
364 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
365 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
366}
367
368/// Checks if an unsigned integer fits into the given bit width.
369template <unsigned N> constexpr inline bool isUInt(uint64_t x) {
370 static_assert(N > 0, "isUInt<0> doesn't make sense");
371 if constexpr (N == 8)
372 return static_cast<uint8_t>(x) == x;
373 if constexpr (N == 16)
374 return static_cast<uint16_t>(x) == x;
375 if constexpr (N == 32)
376 return static_cast<uint32_t>(x) == x;
377 if constexpr (N < 64)
378 return x < (UINT64_C(1) << (N));
379 (void)x; // MSVC v19.25 warns that x is unused.
380 return true;
381}
382
383/// Checks if a unsigned integer is an N bit number shifted left by S.
384template <unsigned N, unsigned S>
385constexpr inline bool isShiftedUInt(uint64_t x) {
386 static_assert(
387 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
388 static_assert(N + S <= 64,
389 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
390 // Per the two static_asserts above, S must be strictly less than 64. So
391 // 1 << S is not undefined behavior.
392 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
393}
394
395/// Gets the maximum value for a N-bit unsigned integer.
396inline uint64_t maxUIntN(uint64_t N) {
397 assert(N > 0 && N <= 64 && "integer width out of range");
398
399 // uint64_t(1) << 64 is undefined behavior, so we can't do
400 // (uint64_t(1) << N) - 1
401 // without checking first that N != 64. But this works and doesn't have a
402 // branch.
403 return UINT64_MAX >> (64 - N);
404}
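
The comment above describes another instance of the same shift-width pitfall flagged in this report: uint64_t(1) << 64 would be undefined for N == 64, so the implementation shifts UINT64_MAX right instead, which stays in range for every valid N. A quick check (standalone snippet):

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::maxUIntN(8) == 255);
  assert(llvm::maxUIntN(64) == UINT64_MAX); // widest case handled without 1 << 64
  return 0;
}
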
405
406/// Gets the minimum value for a N-bit signed integer.
407inline int64_t minIntN(int64_t N) {
408 assert(N > 0 && N <= 64 && "integer width out of range");
409
410 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
411}
412
413/// Gets the maximum value for a N-bit signed integer.
414inline int64_t maxIntN(int64_t N) {
415 assert(N > 0 && N <= 64 && "integer width out of range");
416
417 // This relies on two's complement wraparound when N == 64, so we convert to
418 // int64_t only at the very end to avoid UB.
419 return (UINT64_C(1) << (N - 1)) - 1;
420}
421
422/// Checks if an unsigned integer fits into the given (dynamic) bit width.
423inline bool isUIntN(unsigned N, uint64_t x) {
424 return N >= 64 || x <= maxUIntN(N);
425}
426
427/// Checks if an signed integer fits into the given (dynamic) bit width.
428inline bool isIntN(unsigned N, int64_t x) {
429 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
430}
431
432/// Return true if the argument is a non-empty sequence of ones starting at the
433/// least significant bit with the remainder zero (32 bit version).
434/// Ex. isMask_32(0x0000FFFFU) == true.
435constexpr inline bool isMask_32(uint32_t Value) {
436 return Value && ((Value + 1) & Value) == 0;
437}
438
439/// Return true if the argument is a non-empty sequence of ones starting at the
440/// least significant bit with the remainder zero (64 bit version).
441constexpr inline bool isMask_64(uint64_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument contains a non-empty sequence of ones with the
446/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
447constexpr inline bool isShiftedMask_32(uint32_t Value) {
448 return Value && isMask_32((Value - 1) | Value);
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (64 bit version.)
453constexpr inline bool isShiftedMask_64(uint64_t Value) {
454 return Value && isMask_64((Value - 1) | Value);
455}
456
457/// Return true if the argument is a power of two > 0.
458/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
459constexpr inline bool isPowerOf2_32(uint32_t Value) {
460 return llvm::has_single_bit(Value);
461}
462
463/// Return true if the argument is a power of two > 0 (64 bit edition.)
464constexpr inline bool isPowerOf2_64(uint64_t Value) {
465 return llvm::has_single_bit(Value);
466}
467
468/// Count the number of ones from the most significant bit to the first
469/// zero bit.
470///
471/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
472/// Only unsigned integral types are allowed.
473///
474/// \param ZB the behavior on an input of all ones. Only ZB_Width and
475/// ZB_Undefined are valid arguments.
476template <typename T>
477unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
478 static_assert(std::is_unsigned_v<T>,
479 "Only unsigned integral types are allowed.");
480 return countLeadingZeros<T>(~Value, ZB);
481}
482
483/// Count the number of ones from the least significant bit to the first
484/// zero bit.
485///
486/// Ex. countTrailingOnes(0x00FF00FF) == 8.
487/// Only unsigned integral types are allowed.
488///
489/// \param ZB the behavior on an input of all ones. Only ZB_Width and
490/// ZB_Undefined are valid arguments.
491template <typename T>
492unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
493 static_assert(std::is_unsigned_v<T>,
494 "Only unsigned integral types are allowed.");
495 return countTrailingZeros<T>(~Value, ZB);
496}
497
498/// Count the number of set bits in a value.
499/// Ex. countPopulation(0xF000F000) = 8
500/// Returns 0 if the word is zero.
501template <typename T>
502inline unsigned countPopulation(T Value) {
503 static_assert(std::is_unsigned_v<T>,
504 "Only unsigned integral types are allowed.");
505 return (unsigned)llvm::popcount(Value);
506}
507
508/// Return true if the argument contains a non-empty sequence of ones with the
509/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
510/// If true, \p MaskIdx will specify the index of the lowest set bit and \p
511/// MaskLen is updated to specify the length of the mask, else neither are
512/// updated.
513inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx,
514 unsigned &MaskLen) {
515 if (!isShiftedMask_32(Value))
516 return false;
517 MaskIdx = countTrailingZeros(Value);
518 MaskLen = countPopulation(Value);
519 return true;
520}
521
522/// Return true if the argument contains a non-empty sequence of ones with the
523/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index
524/// of the lowest set bit and \p MaskLen is updated to specify the length of the
525/// mask, else neither are updated.
526inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx,
527 unsigned &MaskLen) {
528 if (!isShiftedMask_64(Value))
529 return false;
530 MaskIdx = countTrailingZeros(Value);
531 MaskLen = countPopulation(Value);
532 return true;
533}
534
535/// Compile time Log2.
536/// Valid only for positive powers of two.
537template <size_t kValue> constexpr inline size_t CTLog2() {
538 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
539 "Value is not a valid power of 2");
540 return 1 + CTLog2<kValue / 2>();
541}
542
543template <> constexpr inline size_t CTLog2<1>() { return 0; }
544
545/// Return the floor log base 2 of the specified value, -1 if the value is zero.
546/// (32 bit edition.)
547/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
548inline unsigned Log2_32(uint32_t Value) {
549 return 31 - countLeadingZeros(Value);
550}
551
552/// Return the floor log base 2 of the specified value, -1 if the value is zero.
553/// (64 bit edition.)
554inline unsigned Log2_64(uint64_t Value) {
555 return 63 - countLeadingZeros(Value);
556}
557
558/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
559/// (32 bit edition).
560/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
561inline unsigned Log2_32_Ceil(uint32_t Value) {
562 return 32 - countLeadingZeros(Value - 1);
563}
564
565/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
566/// (64 bit edition.)
567inline unsigned Log2_64_Ceil(uint64_t Value) {
568 return 64 - countLeadingZeros(Value - 1);
569}
570
571/// This function takes a 64-bit integer and returns the bit equivalent double.
572inline double BitsToDouble(uint64_t Bits) {
573 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
574 return llvm::bit_cast<double>(Bits);
575}
576
577/// This function takes a 32-bit integer and returns the bit equivalent float.
578inline float BitsToFloat(uint32_t Bits) {
579 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
580 return llvm::bit_cast<float>(Bits);
581}
582
583/// This function takes a double and returns the bit equivalent 64-bit integer.
584/// Note that copying doubles around changes the bits of NaNs on some hosts,
585/// notably x86, so this routine cannot be used if these bits are needed.
586inline uint64_t DoubleToBits(double Double) {
587 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
588 return llvm::bit_cast<uint64_t>(Double);
589}
590
591/// This function takes a float and returns the bit equivalent 32-bit integer.
592/// Note that copying floats around changes the bits of NaNs on some hosts,
593/// notably x86, so this routine cannot be used if these bits are needed.
594inline uint32_t FloatToBits(float Float) {
595 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
596 return llvm::bit_cast<uint32_t>(Float);
597}
598
599/// A and B are either alignments or offsets. Return the minimum alignment that
600/// may be assumed after adding the two together.
601constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
602 // The largest power of 2 that divides both A and B.
603 //
604 // Replace "-Value" by "1+~Value" in the following commented code to avoid
605 // MSVC warning C4146
606 // return (A | B) & -(A | B);
607 return (A | B) & (1 + ~(A | B));
608}
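
(A | B) & -(A | B) isolates the lowest set bit of A | B, i.e. the largest power of two dividing both inputs, and the 1 + ~x spelling is just -x written to avoid MSVC's unary-minus-on-unsigned warning. A quick check (standalone snippet):

#include <cassert>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::MinAlign(8, 4) == 4);   // lowest set bit of 0b1100
  assert(llvm::MinAlign(16, 24) == 8); // lowest set bit of 0b11000
  return 0;
}
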
609
610/// Returns the next power of two (in 64-bits) that is strictly greater than A.
611/// Returns zero on overflow.
612constexpr inline uint64_t NextPowerOf2(uint64_t A) {
613 A |= (A >> 1);
614 A |= (A >> 2);
615 A |= (A >> 4);
616 A |= (A >> 8);
617 A |= (A >> 16);
618 A |= (A >> 32);
619 return A + 1;
620}
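
The cascade of or-shifts smears the highest set bit of A into every lower position, turning A into a mask of the form 2^k - 1, so A + 1 is the smallest power of two strictly greater than the input; when bit 63 is already set the increment wraps to zero, matching the documented overflow behavior. A quick check (standalone snippet):

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::NextPowerOf2(0) == 1);
  assert(llvm::NextPowerOf2(5) == 8);
  assert(llvm::NextPowerOf2(8) == 16);         // strictly greater, not >=
  assert(llvm::NextPowerOf2(UINT64_MAX) == 0); // wraps to zero on overflow
  return 0;
}
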
621
622/// Returns the power of two which is less than or equal to the given value.
623/// Essentially, it is a floor operation across the domain of powers of two.
624inline uint64_t PowerOf2Floor(uint64_t A) {
625 if (!A) return 0;
626 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
627}
628
629/// Returns the power of two which is greater than or equal to the given value.
630/// Essentially, it is a ceil operation across the domain of powers of two.
631inline uint64_t PowerOf2Ceil(uint64_t A) {
632 if (!A)
633 return 0;
634 return NextPowerOf2(A - 1);
635}
636
637/// Returns the next integer (mod 2**64) that is greater than or equal to
638/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
639///
640/// Examples:
641/// \code
642/// alignTo(5, 8) = 8
643/// alignTo(17, 8) = 24
644/// alignTo(~0LL, 8) = 0
645/// alignTo(321, 255) = 510
646/// \endcode
647inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
648 assert(Align != 0u && "Align can't be 0.");
649 return (Value + Align - 1) / Align * Align;
650}
651
652inline uint64_t alignToPowerOf2(uint64_t Value, uint64_t Align) {
653 assert(Align != 0 && (Align & (Align - 1)) == 0 &&
654 "Align must be a power of 2");
655 return (Value + Align - 1) & -Align;
656}
657
658/// If non-zero \p Skew is specified, the return value will be a minimal integer
659/// that is greater than or equal to \p Size and equal to \p A * N + \p Skew for
660/// some integer N. If \p Skew is larger than \p A, its value is adjusted to '\p
661/// Skew mod \p A'. \p Align must be non-zero.
662///
663/// Examples:
664/// \code
665/// alignTo(5, 8, 7) = 7
666/// alignTo(17, 8, 1) = 17
667/// alignTo(~0LL, 8, 3) = 3
668/// alignTo(321, 255, 42) = 552
669/// \endcode
670inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew) {
671 assert(Align != 0u && "Align can't be 0.");
672 Skew %= Align;
673 return alignTo(Value - Skew, Align) + Skew;
674}
675
676/// Returns the next integer (mod 2**64) that is greater than or equal to
677/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
678template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
679 static_assert(Align != 0u, "Align must be non-zero");
680 return (Value + Align - 1) / Align * Align;
681}
682
683/// Returns the integer ceil(Numerator / Denominator).
684inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
685 return alignTo(Numerator, Denominator) / Denominator;
686}
687
688/// Returns the integer nearest(Numerator / Denominator).
689inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
690 return (Numerator + (Denominator / 2)) / Denominator;
691}
692
693/// Returns the largest uint64_t less than or equal to \p Value and is
694/// \p Skew mod \p Align. \p Align must be non-zero
695inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
696 assert(Align != 0u && "Align can't be 0.");
697 Skew %= Align;
698 return (Value - Skew) / Align * Align + Skew;
699}
700
701/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
702/// Requires 0 < B <= 32.
703template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
704 static_assert(B > 0, "Bit width can't be 0.");
705 static_assert(B <= 32, "Bit width out of range.");
706 return int32_t(X << (32 - B)) >> (32 - B);
707}
708
709/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
710/// Requires 0 < B <= 32.
711inline int32_t SignExtend32(uint32_t X, unsigned B) {
712 assert(B > 0 && "Bit width can't be 0.");
713 assert(B <= 32 && "Bit width out of range.");
714 return int32_t(X << (32 - B)) >> (32 - B);
715}
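A short illustrative sketch of both SignExtend32 variants; the values are hand-checked, and a zero bit width is exactly what the guards above rule out, since it would make the shift count 32:

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  // The bottom 4 bits of 0xF are 1111b, i.e. -1 once sign-extended.
  assert(llvm::SignExtend32<4>(0xFu) == -1);
  assert(llvm::SignExtend32(0x7u, 4) == 7); // sign bit clear: value unchanged
  assert(llvm::SignExtend32(0x80u, 8) == -128);
  return 0;
}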
716
717/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
718/// Requires 0 < B <= 64.
719template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
720 static_assert(B > 0, "Bit width can't be 0.");
721 static_assert(B <= 64, "Bit width out of range.");
722 return int64_t(x << (64 - B)) >> (64 - B);
723}
724
725/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
726/// Requires 0 < B <= 64.
727inline int64_t SignExtend64(uint64_t X, unsigned B) {
728 assert(B > 0 && "Bit width can't be 0.");
729 assert(B <= 64 && "Bit width out of range.");
730 return int64_t(X << (64 - B)) >> (64 - B);
731}
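The 64-bit variants behave the same way; a quick hypothetical check:

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  // A 20-bit immediate with all bits set extends to -1 as int64_t.
  assert(llvm::SignExtend64<20>(0xFFFFFu) == -1);
  assert(llvm::SignExtend64(0x800u, 12) == -2048); // sign bit set
  assert(llvm::SignExtend64(0x7FFu, 12) == 2047);  // sign bit clear
  return 0;
}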
732
733/// Subtract two unsigned integers, X and Y, of type T and return the absolute
734/// value of the result.
735template <typename T>
736std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
737 return X > Y ? (X - Y) : (Y - X);
738}
739
740/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
741/// maximum representable value of T on overflow. ResultOverflowed indicates if
742/// the result is larger than the maximum representable value of type T.
743template <typename T>
744std::enable_if_t<std::is_unsigned<T>::value, T>
745SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
746 bool Dummy;
747 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
748 // Hacker's Delight, p. 29
749 T Z = X + Y;
750 Overflowed = (Z < X || Z < Y);
751 if (Overflowed)
752 return std::numeric_limits<T>::max();
753 else
754 return Z;
755}
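A minimal sketch of SaturatingAdd using uint8_t so the clamp is easy to see (illustrative, not from the report):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  bool Overflowed = false;
  // An in-range sum passes through untouched and leaves the flag false.
  assert(llvm::SaturatingAdd<uint8_t>(100, 27, &Overflowed) == 127 && !Overflowed);
  // 200 + 100 exceeds 255, so the result is clamped and the flag is set.
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) ==
         std::numeric_limits<uint8_t>::max());
  assert(Overflowed);
  return 0;
}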
756
757/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
758/// maximum representable value of T on overflow. ResultOverflowed indicates if
759/// the result is larger than the maximum representable value of type T.
760template <typename T>
761std::enable_if_t<std::is_unsigned<T>::value, T>
762SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
763 bool Dummy;
764 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
765
766 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
767 // because it fails for uint16_t (where multiplication can have undefined
768 // behavior due to promotion to int), and requires a division in addition
769 // to the multiplication.
770
771 Overflowed = false;
772
773 // Log2(Z) would be either Log2Z or Log2Z + 1.
774 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
775 // will necessarily be less than Log2Max as desired.
776 int Log2Z = Log2_64(X) + Log2_64(Y);
777 const T Max = std::numeric_limits<T>::max();
778 int Log2Max = Log2_64(Max);
779 if (Log2Z < Log2Max) {
780 return X * Y;
781 }
782 if (Log2Z > Log2Max) {
783 Overflowed = true;
784 return Max;
785 }
786
787 // We're going to use the top bit, and maybe overflow one
788 // bit past it. Multiply all but the bottom bit then add
789 // that on at the end.
790 T Z = (X >> 1) * Y;
791 if (Z & ~(Max >> 1)) {
792 Overflowed = true;
793 return Max;
794 }
795 Z <<= 1;
796 if (X & 1)
797 return SaturatingAdd(Z, Y, ResultOverflowed);
798
799 return Z;
800}
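The following hand-checked sketch exercises both the early Log2 comparison and the top-bit path described in the comments above:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  bool Overflowed = false;
  // 16 * 15 = 240 fits in uint8_t even though it uses the top bit,
  // so the Log2Z == Log2Max path returns the exact product.
  assert(llvm::SaturatingMultiply<uint8_t>(16, 15, &Overflowed) == 240);
  assert(!Overflowed);
  // 16 * 16 = 256 does not fit, so the result saturates to 255.
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255);
  assert(Overflowed);
  return 0;
}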
801
802/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
803/// the product. Clamp the result to the maximum representable value of T on
804/// overflow. ResultOverflowed indicates if the result is larger than the
805/// maximum representable value of type T.
806template <typename T>
807std::enable_if_t<std::is_unsigned<T>::value, T>
808SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
809 bool Dummy;
810 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
811
812 T Product = SaturatingMultiply(X, Y, &Overflowed);
813 if (Overflowed)
814 return Product;
815
816 return SaturatingAdd(A, Product, &Overflowed);
817}
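And a small illustrative sketch of the combined multiply-add helper:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  bool Overflowed = false;
  // 10 * 20 + 30 = 230 fits in uint8_t.
  assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 20, 30, &Overflowed) == 230);
  assert(!Overflowed);
  // 10 * 20 + 60 = 260 overflows, so the whole expression saturates.
  assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 20, 60, &Overflowed) == 255);
  assert(Overflowed);
  return 0;
}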
818
819/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
820extern const float huge_valf;
821
822
823/// Add two signed integers, computing the two's complement truncated result,
824/// returning true if overflow occurred.
825template <typename T>
826std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
827#if __has_builtin(__builtin_add_overflow)
828 return __builtin_add_overflow(X, Y, &Result);
829#else
830 // Perform the unsigned addition.
831 using U = std::make_unsigned_t<T>;
832 const U UX = static_cast<U>(X);
833 const U UY = static_cast<U>(Y);
834 const U UResult = UX + UY;
835
836 // Convert to signed.
837 Result = static_cast<T>(UResult);
838
839 // Adding two positive numbers should result in a positive number.
840 if (X > 0 && Y > 0)
841 return Result <= 0;
842 // Adding two negatives should result in a negative number.
843 if (X < 0 && Y < 0)
844 return Result >= 0;
845 return false;
846#endif
847}
848
849/// Subtract two signed integers, computing the two's complement truncated
850/// result, returning true if an overflow occurred.
851template <typename T>
852std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
853#if __has_builtin(__builtin_sub_overflow)
854 return __builtin_sub_overflow(X, Y, &Result);
855#else
856 // Perform the unsigned subtraction.
857 using U = std::make_unsigned_t<T>;
858 const U UX = static_cast<U>(X);
859 const U UY = static_cast<U>(Y);
860 const U UResult = UX - UY;
861
862 // Convert to signed.
863 Result = static_cast<T>(UResult);
864
865 // Subtracting a positive number from a negative results in a negative number.
866 if (X <= 0 && Y > 0)
867 return Result >= 0;
868 // Subtracting a negative number from a positive results in a positive number.
869 if (X >= 0 && Y < 0)
870 return Result <= 0;
871 return false;
872#endif
873}
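A brief hypothetical check of AddOverflow and SubOverflow together; the builtin path is taken when __builtin_add_overflow / __builtin_sub_overflow are available:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  int32_t Result;
  // In-range operations report no overflow and store the exact result.
  assert(!llvm::AddOverflow<int32_t>(2, 3, Result) && Result == 5);
  assert(!llvm::SubOverflow<int32_t>(2, 3, Result) && Result == -1);
  // INT32_MAX + 1 and INT32_MIN - 1 wrap; the truncated two's complement
  // value is stored and the overflow indicator is returned.
  assert(llvm::AddOverflow<int32_t>(std::numeric_limits<int32_t>::max(), 1, Result));
  assert(llvm::SubOverflow<int32_t>(std::numeric_limits<int32_t>::min(), 1, Result));
  return 0;
}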
874
875/// Multiply two signed integers, computing the two's complement truncated
876/// result, returning true if an overflow occurred.
877template <typename T>
878std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
879 // Perform the unsigned multiplication on absolute values.
880 using U = std::make_unsigned_t<T>;
881 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
882 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
883 const U UResult = UX * UY;
884
885 // Convert to signed.
886 const bool IsNegative = (X < 0) ^ (Y < 0);
887 Result = IsNegative ? (0 - UResult) : UResult;
888
889 // If any of the args was 0, result is 0 and no overflow occurs.
890 if (UX == 0 || UY == 0)
891 return false;
892
893 // UX and UY are in [1, 2^n], where n is the number of digits.
894 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
895 // positive) divided by an argument compares to the other.
896 if (IsNegative)
897 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
898 else
899 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
900}
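Finally, a hand-checked sketch of MulOverflow, including the asymmetric negative bound that the last comparison accounts for:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  int8_t Result;
  // 6 * 7 = 42 fits in int8_t: no overflow, exact result stored.
  assert(!llvm::MulOverflow<int8_t>(6, 7, Result) && Result == 42);
  // -64 * 2 = -128 is exactly INT8_MIN, which is still representable.
  assert(!llvm::MulOverflow<int8_t>(-64, 2, Result) && Result == -128);
  // 64 * 2 = 128 is one past INT8_MAX, so the overflow indicator is returned.
  assert(llvm::MulOverflow<int8_t>(64, 2, Result));
  return 0;
}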
901
902} // End llvm namespace
903
904#endif