Bug Summary

File: build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4363, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'

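The diagnostic above concerns a right shift of an 'unsigned int' whose shift amount can reach 32; C++ leaves any shift by an amount greater than or equal to the operand's width undefined. The flagged expression at line 4363 is not part of this excerpt, so the sketch below only reproduces the general pattern and one common guard — the helper name and constants are hypothetical, not taken from the AMDGPU sources.

// Minimal, self-contained sketch of the pattern behind the warning above.
// This is NOT the code at AMDGPUISelLowering.cpp:4363; it only illustrates
// shifting a 32-bit value by an amount that may equal its width.
#include <cstdint>
#include <iostream>

// Hypothetical helper: drop the low NumLowBits bits of a 32-bit value.
uint32_t highBits(uint32_t Value, unsigned NumLowBits) {
  // 'Value >> NumLowBits' is undefined behavior whenever NumLowBits >= 32,
  // because the shift amount must be strictly less than the operand width.
  // Guarding the full-width case keeps the shift well defined:
  if (NumLowBits >= 32)
    return 0;
  return Value >> NumLowBits;
}

int main() {
  std::cout << std::hex << highBits(0xDEADBEEFu, 16) << '\n'; // prints dead
  std::cout << std::hex << highBits(0xDEADBEEFu, 32) << '\n'; // prints 0 instead of invoking UB
}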
Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/tmp/scan-build-2022-04-20-140412-16051-1 -x c++ /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/CodeGen/MachineFrameInfo.h"
23#include "llvm/IR/DiagnosticInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
25#include "llvm/Support/CommandLine.h"
26#include "llvm/Support/KnownBits.h"
27#include "llvm/Target/TargetMachine.h"
28
29using namespace llvm;
30
31#include "AMDGPUGenCallingConv.inc"
32
33static cl::opt<bool> AMDGPUBypassSlowDiv(
34 "amdgpu-bypass-slow-div",
35 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
36 cl::init(true));
37
38// Find a larger type to do a load / store of a vector with.
39EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
40 unsigned StoreSize = VT.getStoreSizeInBits();
41 if (StoreSize <= 32)
42 return EVT::getIntegerVT(Ctx, StoreSize);
43
44 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
45 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
46}
47
48unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
49 return DAG.computeKnownBits(Op).countMaxActiveBits();
50}
51
52unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
53 // In order for this to be a signed 24-bit value, bit 23 must
54 // be a sign bit.
55 return DAG.ComputeMaxSignificantBits(Op);
56}
57
58AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
59 const AMDGPUSubtarget &STI)
60 : TargetLowering(TM), Subtarget(&STI) {
61 // Lower floating point store/load to integer store/load to reduce the number
62 // of patterns in tablegen.
63 setOperationAction(ISD::LOAD, MVT::f32, Promote);
64 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
65
66 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
67 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
68
69 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
70 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
71
72 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
73 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
74
75 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
77
78 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
80
81 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
83
84 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
86
87 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
89
90 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
92
93 setOperationAction(ISD::LOAD, MVT::i64, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
95
96 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
98
99 setOperationAction(ISD::LOAD, MVT::f64, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
101
102 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
104
105 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
107
108 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
110
111 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
113
114 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
116
117 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
119
120 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
122
123 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
125
126 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
128
129 // There are no 64-bit extloads. These should be done as a 32-bit extload and
130 // an extension to 64-bit.
131 for (MVT VT : MVT::integer_valuetypes()) {
132 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
133 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
134 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
135 }
136
137 for (MVT VT : MVT::integer_valuetypes()) {
138 if (VT == MVT::i64)
139 continue;
140
141 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
143 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
144 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
145
146 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
147 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
148 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
149 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
150
151 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
152 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
153 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
154 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
155 }
156
157 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
158 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
159 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
160 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
161 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
162 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
163 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
164 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
165 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
166 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
167 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
168 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
169 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
172 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
173 }
174
175 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
177 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
178 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
182
183 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
184 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
185 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
188 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
189
190 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
191 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
192 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
193 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
195 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
196
197 setOperationAction(ISD::STORE, MVT::f32, Promote);
198 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
199
200 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
201 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
202
203 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
204 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
205
206 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
207 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
208
209 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
210 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
211
212 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
213 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
214
215 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
216 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
217
218 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
219 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
220
221 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
222 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
223
224 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
225 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
226
227 setOperationAction(ISD::STORE, MVT::i64, Promote);
228 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
229
230 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
231 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
232
233 setOperationAction(ISD::STORE, MVT::f64, Promote);
234 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
235
236 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
237 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
238
239 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
240 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
241
242 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
243 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
244
245 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
247
248 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
250
251 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
253
254 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
256
257 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
259
260 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
262
263 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
264 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
265 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
266 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
267
268 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
269 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
270 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
271 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
272
273 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
274 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
275 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
276 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
277 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
278 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
279 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
280
281 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
282 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
283
284 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
285 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
286
287 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
288 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
289 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
290 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
291
292 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
293 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
294 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
295 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
296
297 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
298 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
299
300 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
301 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
302 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
303 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
304 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
305 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
306 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
307
308 setOperationAction(ISD::Constant, MVT::i32, Legal);
309 setOperationAction(ISD::Constant, MVT::i64, Legal);
310 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
311 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
312
313 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
314 setOperationAction(ISD::BRIND, MVT::Other, Expand);
315
316 // This is totally unsupported, just custom lower to produce an error.
317 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
318
319 // Library functions. These default to Expand, but we have instructions
320 // for them.
321 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
322 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
323 setOperationAction(ISD::FPOW, MVT::f32, Legal);
324 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
325 setOperationAction(ISD::FABS, MVT::f32, Legal);
326 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
327 setOperationAction(ISD::FRINT, MVT::f32, Legal);
328 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
329 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
330 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
331
332 setOperationAction(ISD::FROUND, MVT::f32, Custom);
333 setOperationAction(ISD::FROUND, MVT::f64, Custom);
334
335 setOperationAction(ISD::FLOG, MVT::f32, Custom);
336 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
337 setOperationAction(ISD::FEXP, MVT::f32, Custom);
338
339
340 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
341 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
342
343 setOperationAction(ISD::FREM, MVT::f16, Custom);
344 setOperationAction(ISD::FREM, MVT::f32, Custom);
345 setOperationAction(ISD::FREM, MVT::f64, Custom);
346
347 // Expand to fneg + fadd.
348 setOperationAction(ISD::FSUB, MVT::f64, Expand);
349
350 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
351 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
352 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
353 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
354 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
355 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
356 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
357 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
358 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
359 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
360 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
361 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
362 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
363 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
364 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
365 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
366 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
367 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
368 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
369 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
370 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
371 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
372 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
373 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
374 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
375 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
376 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
377 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
378 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
379 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
380 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
381 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
382 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
383 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
384 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
385 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
386 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
387 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
388 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
389 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
390 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
391 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
392 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
393 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
394
395 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
396 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
397 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
398
399 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
400 for (MVT VT : ScalarIntVTs) {
401 // These should use [SU]DIVREM, so set them to expand
402 setOperationAction(ISD::SDIV, VT, Expand);
403 setOperationAction(ISD::UDIV, VT, Expand);
404 setOperationAction(ISD::SREM, VT, Expand);
405 setOperationAction(ISD::UREM, VT, Expand);
406
407 // GPU does not have divrem function for signed or unsigned.
408 setOperationAction(ISD::SDIVREM, VT, Custom);
409 setOperationAction(ISD::UDIVREM, VT, Custom);
410
411 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
412 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
413 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
414
415 setOperationAction(ISD::BSWAP, VT, Expand);
416 setOperationAction(ISD::CTTZ, VT, Expand);
417 setOperationAction(ISD::CTLZ, VT, Expand);
418
419 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
420 setOperationAction(ISD::ADDC, VT, Legal);
421 setOperationAction(ISD::SUBC, VT, Legal);
422 setOperationAction(ISD::ADDE, VT, Legal);
423 setOperationAction(ISD::SUBE, VT, Legal);
424 }
425
426 // The hardware supports 32-bit FSHR, but not FSHL.
427 setOperationAction(ISD::FSHR, MVT::i32, Legal);
428
429 // The hardware supports 32-bit ROTR, but not ROTL.
430 setOperationAction(ISD::ROTL, MVT::i32, Expand);
431 setOperationAction(ISD::ROTL, MVT::i64, Expand);
432 setOperationAction(ISD::ROTR, MVT::i64, Expand);
433
434 setOperationAction(ISD::MULHU, MVT::i16, Expand);
435 setOperationAction(ISD::MULHS, MVT::i16, Expand);
436
437 setOperationAction(ISD::MUL, MVT::i64, Expand);
438 setOperationAction(ISD::MULHU, MVT::i64, Expand);
439 setOperationAction(ISD::MULHS, MVT::i64, Expand);
440 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
441 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
442 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
443 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
444 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
445
446 setOperationAction(ISD::SMIN, MVT::i32, Legal);
447 setOperationAction(ISD::UMIN, MVT::i32, Legal);
448 setOperationAction(ISD::SMAX, MVT::i32, Legal);
449 setOperationAction(ISD::UMAX, MVT::i32, Legal);
450
451 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
452 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
453 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
454 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
455
456 static const MVT::SimpleValueType VectorIntTypes[] = {
457 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
458
459 for (MVT VT : VectorIntTypes) {
460 // Expand the following operations for the current type by default.
461 setOperationAction(ISD::ADD, VT, Expand);
462 setOperationAction(ISD::AND, VT, Expand);
463 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
464 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
465 setOperationAction(ISD::MUL, VT, Expand);
466 setOperationAction(ISD::MULHU, VT, Expand);
467 setOperationAction(ISD::MULHS, VT, Expand);
468 setOperationAction(ISD::OR, VT, Expand);
469 setOperationAction(ISD::SHL, VT, Expand);
470 setOperationAction(ISD::SRA, VT, Expand);
471 setOperationAction(ISD::SRL, VT, Expand);
472 setOperationAction(ISD::ROTL, VT, Expand);
473 setOperationAction(ISD::ROTR, VT, Expand);
474 setOperationAction(ISD::SUB, VT, Expand);
475 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
476 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
477 setOperationAction(ISD::SDIV, VT, Expand);
478 setOperationAction(ISD::UDIV, VT, Expand);
479 setOperationAction(ISD::SREM, VT, Expand);
480 setOperationAction(ISD::UREM, VT, Expand);
481 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
482 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
483 setOperationAction(ISD::SDIVREM, VT, Expand);
484 setOperationAction(ISD::UDIVREM, VT, Expand);
485 setOperationAction(ISD::SELECT, VT, Expand);
486 setOperationAction(ISD::VSELECT, VT, Expand);
487 setOperationAction(ISD::SELECT_CC, VT, Expand);
488 setOperationAction(ISD::XOR, VT, Expand);
489 setOperationAction(ISD::BSWAP, VT, Expand);
490 setOperationAction(ISD::CTPOP, VT, Expand);
491 setOperationAction(ISD::CTTZ, VT, Expand);
492 setOperationAction(ISD::CTLZ, VT, Expand);
493 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
494 setOperationAction(ISD::SETCC, VT, Expand);
495 }
496
497 static const MVT::SimpleValueType FloatVectorTypes[] = {
498 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
499
500 for (MVT VT : FloatVectorTypes) {
501 setOperationAction(ISD::FABS, VT, Expand);
502 setOperationAction(ISD::FMINNUM, VT, Expand);
503 setOperationAction(ISD::FMAXNUM, VT, Expand);
504 setOperationAction(ISD::FADD, VT, Expand);
505 setOperationAction(ISD::FCEIL, VT, Expand);
506 setOperationAction(ISD::FCOS, VT, Expand);
507 setOperationAction(ISD::FDIV, VT, Expand);
508 setOperationAction(ISD::FEXP2, VT, Expand);
509 setOperationAction(ISD::FEXP, VT, Expand);
510 setOperationAction(ISD::FLOG2, VT, Expand);
511 setOperationAction(ISD::FREM, VT, Expand);
512 setOperationAction(ISD::FLOG, VT, Expand);
513 setOperationAction(ISD::FLOG10, VT, Expand);
514 setOperationAction(ISD::FPOW, VT, Expand);
515 setOperationAction(ISD::FFLOOR, VT, Expand);
516 setOperationAction(ISD::FTRUNC, VT, Expand);
517 setOperationAction(ISD::FMUL, VT, Expand);
518 setOperationAction(ISD::FMA, VT, Expand);
519 setOperationAction(ISD::FRINT, VT, Expand);
520 setOperationAction(ISD::FNEARBYINT, VT, Expand);
521 setOperationAction(ISD::FSQRT, VT, Expand);
522 setOperationAction(ISD::FSIN, VT, Expand);
523 setOperationAction(ISD::FSUB, VT, Expand);
524 setOperationAction(ISD::FNEG, VT, Expand);
525 setOperationAction(ISD::VSELECT, VT, Expand);
526 setOperationAction(ISD::SELECT_CC, VT, Expand);
527 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
528 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
529 setOperationAction(ISD::SETCC, VT, Expand);
530 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
531 }
532
533 // This causes using an unrolled select operation rather than expansion with
534 // bit operations. This is in general better, but the alternative using BFI
535 // instructions may be better if the select sources are SGPRs.
536 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
537 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
538
539 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
540 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
541
542 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
543 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
544
545 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
546 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
547
548 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
549 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
550
551 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
552 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
553
554 // There are no libcalls of any kind.
555 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
556 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
557
558 setSchedulingPreference(Sched::RegPressure);
559 setJumpIsExpensive(true);
560
561 // FIXME: This is only partially true. If we have to do vector compares, any
562 // SGPR pair can be a condition register. If we have a uniform condition, we
563 // are better off doing SALU operations, where there is only one SCC. For now,
564 // we don't have a way of knowing during instruction selection if a condition
565 // will be uniform and we always use vector compares. Assume we are using
566 // vector compares until that is fixed.
567 setHasMultipleConditionRegisters(true);
568
569 setMinCmpXchgSizeInBits(32);
570 setSupportsUnalignedAtomics(false);
571
572 PredictableSelectIsExpensive = false;
573
574 // We want to find all load dependencies for long chains of stores to enable
575 // merging into very wide vectors. The problem is with vectors with > 4
576 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
577 // vectors are a legal type, even though we have to split the loads
578 // usually. When we can more precisely specify load legality per address
579 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
580 // smarter so that they can figure out what to do in 2 iterations without all
581 // N > 4 stores on the same chain.
582 GatherAllAliasesMaxDepth = 16;
583
584 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
585 // about these during lowering.
586 MaxStoresPerMemcpy = 0xffffffff;
587 MaxStoresPerMemmove = 0xffffffff;
588 MaxStoresPerMemset = 0xffffffff;
589
590 // The expansion for 64-bit division is enormous.
591 if (AMDGPUBypassSlowDiv)
592 addBypassSlowDiv(64, 32);
593
594 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
595 ISD::SRA, ISD::SRL,
596 ISD::TRUNCATE, ISD::MUL,
597 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
598 ISD::MULHU, ISD::MULHS,
599 ISD::SELECT, ISD::SELECT_CC,
600 ISD::STORE, ISD::FADD,
601 ISD::FSUB, ISD::FNEG,
602 ISD::FABS, ISD::AssertZext,
603 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
604}
605
606bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
607 if (getTargetMachine().Options.NoSignedZerosFPMath)
608 return true;
609
610 const auto Flags = Op.getNode()->getFlags();
611 if (Flags.hasNoSignedZeros())
612 return true;
613
614 return false;
615}
616
617//===----------------------------------------------------------------------===//
618// Target Information
619//===----------------------------------------------------------------------===//
620
621LLVM_READNONE
622static bool fnegFoldsIntoOp(unsigned Opc) {
623 switch (Opc) {
624 case ISD::FADD:
625 case ISD::FSUB:
626 case ISD::FMUL:
627 case ISD::FMA:
628 case ISD::FMAD:
629 case ISD::FMINNUM:
630 case ISD::FMAXNUM:
631 case ISD::FMINNUM_IEEE:
632 case ISD::FMAXNUM_IEEE:
633 case ISD::FSIN:
634 case ISD::FTRUNC:
635 case ISD::FRINT:
636 case ISD::FNEARBYINT:
637 case ISD::FCANONICALIZE:
638 case AMDGPUISD::RCP:
639 case AMDGPUISD::RCP_LEGACY:
640 case AMDGPUISD::RCP_IFLAG:
641 case AMDGPUISD::SIN_HW:
642 case AMDGPUISD::FMUL_LEGACY:
643 case AMDGPUISD::FMIN_LEGACY:
644 case AMDGPUISD::FMAX_LEGACY:
645 case AMDGPUISD::FMED3:
646 // TODO: handle llvm.amdgcn.fma.legacy
647 return true;
648 default:
649 return false;
650 }
651}
652
653/// \p returns true if the operation will definitely need to use a 64-bit
654/// encoding, and thus will use a VOP3 encoding regardless of the source
655/// modifiers.
656LLVM_READONLY
657static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
658 return N->getNumOperands() > 2 || VT == MVT::f64;
659}
660
661// Most FP instructions support source modifiers, but this could be refined
662// slightly.
663LLVM_READONLY
664static bool hasSourceMods(const SDNode *N) {
665 if (isa<MemSDNode>(N))
666 return false;
667
668 switch (N->getOpcode()) {
669 case ISD::CopyToReg:
670 case ISD::SELECT:
671 case ISD::FDIV:
672 case ISD::FREM:
673 case ISD::INLINEASM:
674 case ISD::INLINEASM_BR:
675 case AMDGPUISD::DIV_SCALE:
676 case ISD::INTRINSIC_W_CHAIN:
677
678 // TODO: Should really be looking at the users of the bitcast. These are
679 // problematic because bitcasts are used to legalize all stores to integer
680 // types.
681 case ISD::BITCAST:
682 return false;
683 case ISD::INTRINSIC_WO_CHAIN: {
684 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
685 case Intrinsic::amdgcn_interp_p1:
686 case Intrinsic::amdgcn_interp_p2:
687 case Intrinsic::amdgcn_interp_mov:
688 case Intrinsic::amdgcn_interp_p1_f16:
689 case Intrinsic::amdgcn_interp_p2_f16:
690 return false;
691 default:
692 return true;
693 }
694 }
695 default:
696 return true;
697 }
698}
699
700bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
701 unsigned CostThreshold) {
702 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
703 // it is truly free to use a source modifier in all cases. If there are
704 // multiple users but for each one will necessitate using VOP3, there will be
705 // a code size increase. Try to avoid increasing code size unless we know it
706 // will save on the instruction count.
707 unsigned NumMayIncreaseSize = 0;
708 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
709
710 // XXX - Should this limit number of uses to check?
711 for (const SDNode *U : N->uses()) {
712 if (!hasSourceMods(U))
713 return false;
714
715 if (!opMustUseVOP3Encoding(U, VT)) {
716 if (++NumMayIncreaseSize > CostThreshold)
717 return false;
718 }
719 }
720
721 return true;
722}
723
724EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
725 ISD::NodeType ExtendKind) const {
726 assert(!VT.isVector() && "only scalar expected");
727
728 // Round to the next multiple of 32-bits.
729 unsigned Size = VT.getSizeInBits();
730 if (Size <= 32)
731 return MVT::i32;
732 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
733}
734
735MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
736 return MVT::i32;
737}
738
739bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
740 return true;
741}
742
743// The backend supports 32 and 64 bit floating point immediates.
744// FIXME: Why are we reporting vectors of FP immediates as legal?
745bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
746 bool ForCodeSize) const {
747 EVT ScalarVT = VT.getScalarType();
748 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
749 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
750}
751
752// We don't want to shrink f64 / f32 constants.
753bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
754 EVT ScalarVT = VT.getScalarType();
755 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
756}
757
758bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
759 ISD::LoadExtType ExtTy,
760 EVT NewVT) const {
761 // TODO: This may be worth removing. Check regression tests for diffs.
762 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
763 return false;
764
765 unsigned NewSize = NewVT.getStoreSizeInBits();
766
767 // If we are reducing to a 32-bit load or a smaller multi-dword load,
768 // this is always better.
769 if (NewSize >= 32)
770 return true;
771
772 EVT OldVT = N->getValueType(0);
773 unsigned OldSize = OldVT.getStoreSizeInBits();
774
775 MemSDNode *MN = cast<MemSDNode>(N);
776 unsigned AS = MN->getAddressSpace();
777 // Do not shrink an aligned scalar load to sub-dword.
778 // Scalar engine cannot do sub-dword loads.
779 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
780 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
781 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
782 (isa<LoadSDNode>(N) &&
783 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
784 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
785 return false;
786
787 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
788 // extloads, so doing one requires using a buffer_load. In cases where we
789 // still couldn't use a scalar load, using the wider load shouldn't really
790 // hurt anything.
791
792 // If the old size already had to be an extload, there's no harm in continuing
793 // to reduce the width.
794 return (OldSize < 32);
795}
796
797bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
798 const SelectionDAG &DAG,
799 const MachineMemOperand &MMO) const {
800
801 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
802
803 if (LoadTy.getScalarType() == MVT::i32)
804 return false;
805
806 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
807 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
808
809 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
810 return false;
811
812 bool Fast = false;
813 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
814 CastTy, MMO, &Fast) &&
815 Fast;
816}
817
818// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
819// profitable with the expansion for 64-bit since it's generally good to
820// speculate things.
821// FIXME: These should really have the size as a parameter.
822bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
823 return true;
824}
825
826bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
827 return true;
828}
829
830bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
831 switch (N->getOpcode()) {
832 case ISD::EntryToken:
833 case ISD::TokenFactor:
834 return true;
835 case ISD::INTRINSIC_WO_CHAIN: {
836 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
837 switch (IntrID) {
838 case Intrinsic::amdgcn_readfirstlane:
839 case Intrinsic::amdgcn_readlane:
840 return true;
841 }
842 return false;
843 }
844 case ISD::LOAD:
845 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
846 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
847 return true;
848 return false;
849 case AMDGPUISD::SETCC: // ballot-style instruction
850 return true;
851 }
852 return false;
853}
854
855SDValue AMDGPUTargetLowering::getNegatedExpression(
856 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
857 NegatibleCost &Cost, unsigned Depth) const {
858
859 switch (Op.getOpcode()) {
860 case ISD::FMA:
861 case ISD::FMAD: {
862 // Negating a fma is not free if it has users without source mods.
863 if (!allUsesHaveSourceMods(Op.getNode()))
864 return SDValue();
865 break;
866 }
867 default:
868 break;
869 }
870
871 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
872 ForCodeSize, Cost, Depth);
873}
874
875//===---------------------------------------------------------------------===//
876// Target Properties
877//===---------------------------------------------------------------------===//
878
879bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
880 assert(VT.isFloatingPoint());
881
882 // Packed operations do not have a fabs modifier.
883 return VT == MVT::f32 || VT == MVT::f64 ||
884 (Subtarget->has16BitInsts() && VT == MVT::f16);
885}
886
887bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
888 assert(VT.isFloatingPoint());
889 // Report this based on the end legalized type.
890 VT = VT.getScalarType();
891 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
892}
893
894bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
895 unsigned NumElem,
896 unsigned AS) const {
897 return true;
898}
899
900bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
901 // There are few operations which truly have vector input operands. Any vector
902 // operation is going to involve operations on each component, and a
903 // build_vector will be a copy per element, so it always makes sense to use a
904 // build_vector input in place of the extracted element to avoid a copy into a
905 // super register.
906 //
907 // We should probably only do this if all users are extracts only, but this
908 // should be the common case.
909 return true;
910}
911
912bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
913 // Truncate is just accessing a subregister.
914
915 unsigned SrcSize = Source.getSizeInBits();
916 unsigned DestSize = Dest.getSizeInBits();
917
918 return DestSize < SrcSize && DestSize % 32 == 0 ;
919}
920
921bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
922 // Truncate is just accessing a subregister.
923
924 unsigned SrcSize = Source->getScalarSizeInBits();
925 unsigned DestSize = Dest->getScalarSizeInBits();
926
927 if (DestSize== 16 && Subtarget->has16BitInsts())
928 return SrcSize >= 32;
929
930 return DestSize < SrcSize && DestSize % 32 == 0;
931}
932
933bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
934 unsigned SrcSize = Src->getScalarSizeInBits();
935 unsigned DestSize = Dest->getScalarSizeInBits();
936
937 if (SrcSize == 16 && Subtarget->has16BitInsts())
938 return DestSize >= 32;
939
940 return SrcSize == 32 && DestSize == 64;
941}
942
943bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
944 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
945 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
946 // this will enable reducing 64-bit operations to 32-bit, which is always
947 // good.
948
949 if (Src == MVT::i16)
950 return Dest == MVT::i32 ||Dest == MVT::i64 ;
951
952 return Src == MVT::i32 && Dest == MVT::i64;
953}
954
955bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
956 return isZExtFree(Val.getValueType(), VT2);
957}
958
959bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
960 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
961 // limited number of native 64-bit operations. Shrinking an operation to fit
962 // in a single 32-bit register should always be helpful. As currently used,
963 // this is much less general than the name suggests, and is only used in
964 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
965 // not profitable, and may actually be harmful.
966 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
967}
968
969//===---------------------------------------------------------------------===//
970// TargetLowering Callbacks
971//===---------------------------------------------------------------------===//
972
973CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
974 bool IsVarArg) {
975 switch (CC) {
976 case CallingConv::AMDGPU_VS:
977 case CallingConv::AMDGPU_GS:
978 case CallingConv::AMDGPU_PS:
979 case CallingConv::AMDGPU_CS:
980 case CallingConv::AMDGPU_HS:
981 case CallingConv::AMDGPU_ES:
982 case CallingConv::AMDGPU_LS:
983 return CC_AMDGPU;
984 case CallingConv::C:
985 case CallingConv::Fast:
986 case CallingConv::Cold:
987 return CC_AMDGPU_Func;
988 case CallingConv::AMDGPU_Gfx:
989 return CC_SI_Gfx;
990 case CallingConv::AMDGPU_KERNEL:
991 case CallingConv::SPIR_KERNEL:
992 default:
993 report_fatal_error("Unsupported calling convention for call");
994 }
995}
996
997CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
998 bool IsVarArg) {
999 switch (CC) {
1000 case CallingConv::AMDGPU_KERNEL:
1001 case CallingConv::SPIR_KERNEL:
1002 llvm_unreachable("kernels should not be handled here")::llvm::llvm_unreachable_internal("kernels should not be handled here"
, "llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp", 1002)
;
1003 case CallingConv::AMDGPU_VS:
1004 case CallingConv::AMDGPU_GS:
1005 case CallingConv::AMDGPU_PS:
1006 case CallingConv::AMDGPU_CS:
1007 case CallingConv::AMDGPU_HS:
1008 case CallingConv::AMDGPU_ES:
1009 case CallingConv::AMDGPU_LS:
1010 return RetCC_SI_Shader;
1011 case CallingConv::AMDGPU_Gfx:
1012 return RetCC_SI_Gfx;
1013 case CallingConv::C:
1014 case CallingConv::Fast:
1015 case CallingConv::Cold:
1016 return RetCC_AMDGPU_Func;
1017 default:
1018 report_fatal_error("Unsupported calling convention.");
1019 }
1020}
1021
1022/// The SelectionDAGBuilder will automatically promote function arguments
1023/// with illegal types. However, this does not work for the AMDGPU targets
1024/// since the function arguments are stored in memory as these illegal types.
1025/// In order to handle this properly we need to get the original types sizes
1026/// from the LLVM IR Function and fixup the ISD:InputArg values before
1027/// passing them to AnalyzeFormalArguments()
1028
1029/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1030/// input values across multiple registers. Each item in the Ins array
1031/// represents a single value that will be stored in registers. Ins[x].VT is
1032/// the value type of the value that will be stored in the register, so
1033/// whatever SDNode we lower the argument to needs to be this type.
1034///
1035/// In order to correctly lower the arguments we need to know the size of each
1036/// argument. Since Ins[x].VT gives us the size of the register that will
1037/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1038/// for the original function argument so that we can deduce the correct memory
1039/// type to use for Ins[x]. In most cases the correct memory type will be
1040/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1041/// we have a kernel argument of type v8i8, this argument will be split into
1042/// 8 parts and each part will be represented by its own item in the Ins array.
1043/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1044/// the argument before it was split. From this, we deduce that the memory type
1045/// for each individual part is i8. We pass the memory type as LocVT to the
1046/// calling convention analysis function and the register type (Ins[x].VT) as
1047/// the ValVT.
1048void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1049 CCState &State,
1050 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1051 const MachineFunction &MF = State.getMachineFunction();
1052 const Function &Fn = MF.getFunction();
1053 LLVMContext &Ctx = Fn.getParent()->getContext();
1054 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1055 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1056 CallingConv::ID CC = Fn.getCallingConv();
1057
1058 Align MaxAlign = Align(1);
1059 uint64_t ExplicitArgOffset = 0;
1060 const DataLayout &DL = Fn.getParent()->getDataLayout();
1061
1062 unsigned InIndex = 0;
1063
1064 for (const Argument &Arg : Fn.args()) {
1065 const bool IsByRef = Arg.hasByRefAttr();
1066 Type *BaseArgTy = Arg.getType();
1067 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1068 MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
1069 if (!Alignment)
1070 Alignment = DL.getABITypeAlign(MemArgTy);
1071 MaxAlign = max(Alignment, MaxAlign);
1072 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1073
1074 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1075 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1076
1077 // We're basically throwing away everything passed into us and starting over
1078 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1079 // to us as computed in Ins.
1080 //
1081 // We also need to figure out what type legalization is trying to do to get
1082 // the correct memory offsets.
1083
1084 SmallVector<EVT, 16> ValueVTs;
1085 SmallVector<uint64_t, 16> Offsets;
1086 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1087
1088 for (unsigned Value = 0, NumValues = ValueVTs.size();
1089 Value != NumValues; ++Value) {
1090 uint64_t BasePartOffset = Offsets[Value];
1091
1092 EVT ArgVT = ValueVTs[Value];
1093 EVT MemVT = ArgVT;
1094 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1095 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1096
1097 if (NumRegs == 1) {
1098 // This argument is not split, so the IR type is the memory type.
1099 if (ArgVT.isExtended()) {
1100 // We have an extended type, like i24, so we should just use the
1101 // register type.
1102 MemVT = RegisterVT;
1103 } else {
1104 MemVT = ArgVT;
1105 }
1106 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1107 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1108 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1109 // We have a vector value which has been split into a vector with
1110 // the same scalar type, but fewer elements. This should handle
1111 // all the floating-point vector types.
1112 MemVT = RegisterVT;
1113 } else if (ArgVT.isVector() &&
1114 ArgVT.getVectorNumElements() == NumRegs) {
1115 // This arg has been split so that each element is stored in a separate
1116 // register.
1117 MemVT = ArgVT.getScalarType();
1118 } else if (ArgVT.isExtended()) {
1119 // We have an extended type, like i65.
1120 MemVT = RegisterVT;
1121 } else {
1122 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1123 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1124 if (RegisterVT.isInteger()) {
1125 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1126 } else if (RegisterVT.isVector()) {
1127 assert(!RegisterVT.getScalarType().isFloatingPoint());
1128 unsigned NumElements = RegisterVT.getVectorNumElements();
1129 assert(MemoryBits % NumElements == 0);
1130 // This vector type has been split into another vector type with
1131 // a different elements size.
1132 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1133 MemoryBits / NumElements);
1134 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1135 } else {
1136 llvm_unreachable("cannot deduce memory type.")::llvm::llvm_unreachable_internal("cannot deduce memory type."
, "llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp", 1136)
;
1137 }
1138 }
1139
1140 // Convert one element vectors to scalar.
1141 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1142 MemVT = MemVT.getScalarType();
1143
1144 // Round up vec3/vec5 argument.
1145 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1146 assert(MemVT.getVectorNumElements() == 3 ||
1147 MemVT.getVectorNumElements() == 5);
1148 MemVT = MemVT.getPow2VectorType(State.getContext());
1149 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1150 MemVT = MemVT.getRoundIntegerType(State.getContext());
1151 }
1152
1153 unsigned PartOffset = 0;
1154 for (unsigned i = 0; i != NumRegs; ++i) {
1155 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1156 BasePartOffset + PartOffset,
1157 MemVT.getSimpleVT(),
1158 CCValAssign::Full));
1159 PartOffset += MemVT.getStoreSize();
1160 }
1161 }
1162 }
1163}
1164
1165SDValue AMDGPUTargetLowering::LowerReturn(
1166 SDValue Chain, CallingConv::ID CallConv,
1167 bool isVarArg,
1168 const SmallVectorImpl<ISD::OutputArg> &Outs,
1169 const SmallVectorImpl<SDValue> &OutVals,
1170 const SDLoc &DL, SelectionDAG &DAG) const {
1171 // FIXME: Fails for r600 tests
1172 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1173 // "wave terminate should not have return values");
1174 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1175}
1176
1177//===---------------------------------------------------------------------===//
1178// Target specific lowering
1179//===---------------------------------------------------------------------===//
1180
1181/// Selects the correct CCAssignFn for a given CallingConvention value.
1182CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1183 bool IsVarArg) {
1184 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1185}
1186
1187CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1188 bool IsVarArg) {
1189 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1190}
1191
1192SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1193 SelectionDAG &DAG,
1194 MachineFrameInfo &MFI,
1195 int ClobberedFI) const {
1196 SmallVector<SDValue, 8> ArgChains;
1197 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1198 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1199
1200 // Include the original chain at the beginning of the list. When this is
1201 // used by target LowerCall hooks, this helps legalize find the
1202 // CALLSEQ_BEGIN node.
1203 ArgChains.push_back(Chain);
1204
1205 // Add a chain value for each stack argument corresponding
1206 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1207 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1208 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1209 if (FI->getIndex() < 0) {
1210 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1211 int64_t InLastByte = InFirstByte;
1212 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1213
1214 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1215 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1216 ArgChains.push_back(SDValue(L, 1));
1217 }
1218 }
1219 }
1220 }
1221
1222 // Build a tokenfactor for all the chains.
1223 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1224}
1225
1226SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1227 SmallVectorImpl<SDValue> &InVals,
1228 StringRef Reason) const {
1229 SDValue Callee = CLI.Callee;
1230 SelectionDAG &DAG = CLI.DAG;
1231
1232 const Function &Fn = DAG.getMachineFunction().getFunction();
1233
1234 StringRef FuncName("<unknown>");
1235
1236 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1237 FuncName = G->getSymbol();
1238 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1239 FuncName = G->getGlobal()->getName();
1240
1241 DiagnosticInfoUnsupported NoCalls(
1242 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1243 DAG.getContext()->diagnose(NoCalls);
1244
1245 if (!CLI.IsTailCall) {
1246 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1247 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1248 }
1249
1250 return DAG.getEntryNode();
1251}
1252
1253SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1254 SmallVectorImpl<SDValue> &InVals) const {
1255 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1256}
1257
1258SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1259 SelectionDAG &DAG) const {
1260 const Function &Fn = DAG.getMachineFunction().getFunction();
1261
1262 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1263 SDLoc(Op).getDebugLoc());
1264 DAG.getContext()->diagnose(NoDynamicAlloca);
1265 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1266 return DAG.getMergeValues(Ops, SDLoc());
1267}
1268
1269SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1270 SelectionDAG &DAG) const {
1271 switch (Op.getOpcode()) {
1272 default:
1273 Op->print(errs(), &DAG);
1274    llvm_unreachable("Custom lowering code for this "
1275                     "instruction is not implemented yet!");
1276 break;
1277 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1278 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1279 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1280 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1281 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1282 case ISD::FREM: return LowerFREM(Op, DAG);
1283 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1284 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1285 case ISD::FRINT: return LowerFRINT(Op, DAG);
1286 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1287 case ISD::FROUND: return LowerFROUND(Op, DAG);
1288 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1289 case ISD::FLOG:
1290 return LowerFLOG(Op, DAG, numbers::ln2f);
1291 case ISD::FLOG10:
1292 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1293 case ISD::FEXP:
1294 return lowerFEXP(Op, DAG);
1295 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1296 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1297 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1298 case ISD::FP_TO_SINT:
1299 case ISD::FP_TO_UINT:
1300 return LowerFP_TO_INT(Op, DAG);
1301 case ISD::CTTZ:
1302 case ISD::CTTZ_ZERO_UNDEF:
1303 case ISD::CTLZ:
1304 case ISD::CTLZ_ZERO_UNDEF:
1305 return LowerCTLZ_CTTZ(Op, DAG);
1306 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1307 }
1308 return Op;
1309}
1310
1311void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1312 SmallVectorImpl<SDValue> &Results,
1313 SelectionDAG &DAG) const {
1314 switch (N->getOpcode()) {
1315 case ISD::SIGN_EXTEND_INREG:
1316 // Different parts of legalization seem to interpret which type of
1317 // sign_extend_inreg is the one to check for custom lowering. The extended
1318 // from type is what really matters, but some places check for custom
1319 // lowering of the result type. This results in trying to use
1320 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1321 // nothing here and let the illegal result integer be handled normally.
1322 return;
1323 default:
1324 return;
1325 }
1326}
1327
1328SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1329 SDValue Op,
1330 SelectionDAG &DAG) const {
1331
1332 const DataLayout &DL = DAG.getDataLayout();
1333 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1334 const GlobalValue *GV = G->getGlobal();
1335
1336 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1337 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1338 if (!MFI->isModuleEntryFunction() &&
1339 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1340 SDLoc DL(Op);
1341 const Function &Fn = DAG.getMachineFunction().getFunction();
1342 DiagnosticInfoUnsupported BadLDSDecl(
1343 Fn, "local memory global used by non-kernel function",
1344 DL.getDebugLoc(), DS_Warning);
1345 DAG.getContext()->diagnose(BadLDSDecl);
1346
1347 // We currently don't have a way to correctly allocate LDS objects that
1348 // aren't directly associated with a kernel. We do force inlining of
1349 // functions that use local objects. However, if these dead functions are
1350 // not eliminated, we don't want a compile time error. Just emit a warning
1351 // and a trap, since there should be no callable path here.
1352 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1353 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1354 Trap, DAG.getRoot());
1355 DAG.setRoot(OutputChain);
1356 return DAG.getUNDEF(Op.getValueType());
1357 }
1358
1359 // XXX: What does the value of G->getOffset() mean?
1360    assert(G->getOffset() == 0 &&
1361           "Do not know what to do with an non-zero offset");
1362
1363 // TODO: We could emit code to handle the initialization somewhere.
1364 // We ignore the initializer for now and legalize it to allow selection.
1365 // The initializer will anyway get errored out during assembly emission.
1366 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1367 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1368 }
1369 return SDValue();
1370}
1371
1372SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1373 SelectionDAG &DAG) const {
1374 SmallVector<SDValue, 8> Args;
1375
1376 EVT VT = Op.getValueType();
1377 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1378 SDLoc SL(Op);
1379 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1380 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1381
1382 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1383 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1384 }
1385
1386 for (const SDUse &U : Op->ops())
1387 DAG.ExtractVectorElements(U.get(), Args);
1388
1389 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1390}
1391
1392SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1393 SelectionDAG &DAG) const {
1394
1395 SmallVector<SDValue, 8> Args;
1396 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1397 EVT VT = Op.getValueType();
1398 EVT SrcVT = Op.getOperand(0).getValueType();
1399
1400  // For these types we have TableGen patterns, except when the index is 1.
1401 if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1402 (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1403 Start != 1)
1404 return Op;
1405
1406 if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1407 (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1408 (Start == 0 || Start == 4))
1409 return Op;
1410
1411 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1412 VT.getVectorNumElements());
1413
1414 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1415}
1416
1417/// Generate Min/Max node
1418SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1419 SDValue LHS, SDValue RHS,
1420 SDValue True, SDValue False,
1421 SDValue CC,
1422 DAGCombinerInfo &DCI) const {
1423 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1424 return SDValue();
1425
1426 SelectionDAG &DAG = DCI.DAG;
1427 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1428 switch (CCOpcode) {
1429 case ISD::SETOEQ:
1430 case ISD::SETONE:
1431 case ISD::SETUNE:
1432 case ISD::SETNE:
1433 case ISD::SETUEQ:
1434 case ISD::SETEQ:
1435 case ISD::SETFALSE:
1436 case ISD::SETFALSE2:
1437 case ISD::SETTRUE:
1438 case ISD::SETTRUE2:
1439 case ISD::SETUO:
1440 case ISD::SETO:
1441 break;
1442 case ISD::SETULE:
1443 case ISD::SETULT: {
1444 if (LHS == True)
1445 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1446 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1447 }
1448 case ISD::SETOLE:
1449 case ISD::SETOLT:
1450 case ISD::SETLE:
1451 case ISD::SETLT: {
1452 // Ordered. Assume ordered for undefined.
1453
1454 // Only do this after legalization to avoid interfering with other combines
1455 // which might occur.
1456 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1457 !DCI.isCalledByLegalizer())
1458 return SDValue();
1459
1460 // We need to permute the operands to get the correct NaN behavior. The
1461 // selected operand is the second one based on the failing compare with NaN,
1462 // so permute it based on the compare type the hardware uses.
1463 if (LHS == True)
1464 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1465 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1466 }
1467 case ISD::SETUGE:
1468 case ISD::SETUGT: {
1469 if (LHS == True)
1470 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1471 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1472 }
1473 case ISD::SETGT:
1474 case ISD::SETGE:
1475 case ISD::SETOGE:
1476 case ISD::SETOGT: {
1477 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1478 !DCI.isCalledByLegalizer())
1479 return SDValue();
1480
1481 if (LHS == True)
1482 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1483 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1484 }
1485 case ISD::SETCC_INVALID:
1486    llvm_unreachable("Invalid setcc condcode!");
1487 }
1488 return SDValue();
1489}
1490
1491std::pair<SDValue, SDValue>
1492AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1493 SDLoc SL(Op);
1494
1495 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1496
1497 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1498 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1499
1500 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1501 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1502
1503 return std::make_pair(Lo, Hi);
1504}
1505
1506SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1507 SDLoc SL(Op);
1508
1509 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1510 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1511 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1512}
1513
1514SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1515 SDLoc SL(Op);
1516
1517 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1518 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1519 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1520}
1521
1522// Split a vector type into two parts. The first part is a power of two vector.
1523// The second part is whatever is left over, and is a scalar if it would
1524// otherwise be a 1-vector.
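// For example: v3i32 -> (v2i32, i32), v5i32 -> (v4i32, i32),
// v6i32 -> (v4i32, v2i32) and v8i32 -> (v4i32, v4i32).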
1525std::pair<EVT, EVT>
1526AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1527 EVT LoVT, HiVT;
1528 EVT EltVT = VT.getVectorElementType();
1529 unsigned NumElts = VT.getVectorNumElements();
1530 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1531 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1532 HiVT = NumElts - LoNumElts == 1
1533 ? EltVT
1534 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1535 return std::make_pair(LoVT, HiVT);
1536}
1537
1538// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1539// scalar.
1540std::pair<SDValue, SDValue>
1541AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1542 const EVT &LoVT, const EVT &HiVT,
1543 SelectionDAG &DAG) const {
1544  assert(LoVT.getVectorNumElements() +
1545             (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1546             N.getValueType().getVectorNumElements() &&
1547         "More vector elements requested than available!");
1548 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1549 DAG.getVectorIdxConstant(0, DL));
1550 SDValue Hi = DAG.getNode(
1551 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1552 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1553 return std::make_pair(Lo, Hi);
1554}
1555
1556SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1557 SelectionDAG &DAG) const {
1558 LoadSDNode *Load = cast<LoadSDNode>(Op);
1559 EVT VT = Op.getValueType();
1560 SDLoc SL(Op);
1561
1562
1563 // If this is a 2 element vector, we really want to scalarize and not create
1564 // weird 1 element vectors.
1565 if (VT.getVectorNumElements() == 2) {
1566 SDValue Ops[2];
1567 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1568 return DAG.getMergeValues(Ops, SL);
1569 }
1570
1571 SDValue BasePtr = Load->getBasePtr();
1572 EVT MemVT = Load->getMemoryVT();
1573
1574 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1575
1576 EVT LoVT, HiVT;
1577 EVT LoMemVT, HiMemVT;
1578 SDValue Lo, Hi;
1579
1580 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1581 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1582 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1583
1584 unsigned Size = LoMemVT.getStoreSize();
1585 unsigned BaseAlign = Load->getAlignment();
1586 unsigned HiAlign = MinAlign(BaseAlign, Size);
1587
1588 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1589 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1590 BaseAlign, Load->getMemOperand()->getFlags());
1591 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1592 SDValue HiLoad =
1593 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1594 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1595 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1596
1597 SDValue Join;
1598 if (LoVT == HiVT) {
1599 // This is the case that the vector is power of two so was evenly split.
1600 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1601 } else {
1602 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1603 DAG.getVectorIdxConstant(0, SL));
1604 Join = DAG.getNode(
1605 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1606 VT, Join, HiLoad,
1607 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1608 }
1609
1610 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1611 LoLoad.getValue(1), HiLoad.getValue(1))};
1612
1613 return DAG.getMergeValues(Ops, SL);
1614}
1615
1616SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1617 SelectionDAG &DAG) const {
1618 LoadSDNode *Load = cast<LoadSDNode>(Op);
1619 EVT VT = Op.getValueType();
1620 SDValue BasePtr = Load->getBasePtr();
1621 EVT MemVT = Load->getMemoryVT();
1622 SDLoc SL(Op);
1623 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1624 unsigned BaseAlign = Load->getAlignment();
1625 unsigned NumElements = MemVT.getVectorNumElements();
1626
1627 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1628 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
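  // For example, a sufficiently aligned v3i32 load below becomes a v4i32 load
  // followed by an EXTRACT_SUBVECTOR of its first three elements.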
1629 if (NumElements != 3 ||
1630 (BaseAlign < 8 &&
1631 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1632 return SplitVectorLoad(Op, DAG);
1633
1634  assert(NumElements == 3);
1635
1636 EVT WideVT =
1637 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1638 EVT WideMemVT =
1639 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1640 SDValue WideLoad = DAG.getExtLoad(
1641 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1642 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1643 return DAG.getMergeValues(
1644 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1645 DAG.getVectorIdxConstant(0, SL)),
1646 WideLoad.getValue(1)},
1647 SL);
1648}
1649
1650SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1651 SelectionDAG &DAG) const {
1652 StoreSDNode *Store = cast<StoreSDNode>(Op);
1653 SDValue Val = Store->getValue();
1654 EVT VT = Val.getValueType();
1655
1656 // If this is a 2 element vector, we really want to scalarize and not create
1657 // weird 1 element vectors.
1658 if (VT.getVectorNumElements() == 2)
1659 return scalarizeVectorStore(Store, DAG);
1660
1661 EVT MemVT = Store->getMemoryVT();
1662 SDValue Chain = Store->getChain();
1663 SDValue BasePtr = Store->getBasePtr();
1664 SDLoc SL(Op);
1665
1666 EVT LoVT, HiVT;
1667 EVT LoMemVT, HiMemVT;
1668 SDValue Lo, Hi;
1669
1670 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1671 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1672 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1673
1674 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1675
1676 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1677 unsigned BaseAlign = Store->getAlignment();
1678 unsigned Size = LoMemVT.getStoreSize();
1679 unsigned HiAlign = MinAlign(BaseAlign, Size);
1680
1681 SDValue LoStore =
1682 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1683 Store->getMemOperand()->getFlags());
1684 SDValue HiStore =
1685 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1686 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1687
1688 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1689}
1690
1691// This is a shortcut for integer division because we have fast i32<->f32
1692// conversions, and fast f32 reciprocal instructions. The fractional part of a
1693// float is enough to accurately represent up to a 24-bit signed integer.
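// A rough scalar sketch of the unsigned path below, assuming both operands
// are small enough to be represented exactly in an f32:
//   float fa = (float)a, fb = (float)b;
//   float fq = trunc(fa * rcp(fb));          // quotient estimate
//   float fr = fabs(mad(-fq, fb, fa));       // |remainder| of that estimate
//   uint  q  = (uint)fq + (fr >= fabs(fb) ? 1 : 0);
//   uint  r  = a - q * b;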
1694SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1695 bool Sign) const {
1696 SDLoc DL(Op);
1697 EVT VT = Op.getValueType();
1698 SDValue LHS = Op.getOperand(0);
1699 SDValue RHS = Op.getOperand(1);
1700 MVT IntVT = MVT::i32;
1701 MVT FltVT = MVT::f32;
1702
1703 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1704 if (LHSSignBits < 9)
1705 return SDValue();
1706
1707 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1708 if (RHSSignBits < 9)
1709 return SDValue();
1710
1711 unsigned BitSize = VT.getSizeInBits();
1712 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1713 unsigned DivBits = BitSize - SignBits;
1714 if (Sign)
1715 ++DivBits;
1716
1717 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1718 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1719
1720 SDValue jq = DAG.getConstant(1, DL, IntVT);
1721
1722 if (Sign) {
1723 // char|short jq = ia ^ ib;
1724 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1725
1726 // jq = jq >> (bitsize - 2)
1727 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1728 DAG.getConstant(BitSize - 2, DL, VT));
1729
1730 // jq = jq | 0x1
1731 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1732 }
1733
1734 // int ia = (int)LHS;
1735 SDValue ia = LHS;
1736
1737  // int ib = (int)RHS;
1738 SDValue ib = RHS;
1739
1740 // float fa = (float)ia;
1741 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1742
1743 // float fb = (float)ib;
1744 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1745
1746 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1747 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1748
1749 // fq = trunc(fq);
1750 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1751
1752 // float fqneg = -fq;
1753 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1754
1755 MachineFunction &MF = DAG.getMachineFunction();
1756 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1757
1758 // float fr = mad(fqneg, fb, fa);
1759 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1760 (unsigned)ISD::FMA :
1761 !MFI->getMode().allFP32Denormals() ?
1762 (unsigned)ISD::FMAD :
1763 (unsigned)AMDGPUISD::FMAD_FTZ;
1764 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1765
1766 // int iq = (int)fq;
1767 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1768
1769 // fr = fabs(fr);
1770 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1771
1772 // fb = fabs(fb);
1773 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1774
1775 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1776
1777 // int cv = fr >= fb;
1778 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1779
1780 // jq = (cv ? jq : 0);
1781 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1782
1783 // dst = iq + jq;
1784 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1785
1786  // Rem needs compensation; it's easier to recompute it.
1787 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1788 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1789
1790 // Truncate to number of bits this divide really is.
1791 if (Sign) {
1792 SDValue InRegSize
1793 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1794 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1795 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1796 } else {
1797    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1798 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1799 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1800 }
1801
1802 return DAG.getMergeValues({ Div, Rem }, DL);
1803}
1804
1805void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1806 SelectionDAG &DAG,
1807 SmallVectorImpl<SDValue> &Results) const {
1808 SDLoc DL(Op);
1809 EVT VT = Op.getValueType();
1810
1811  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1812
1813 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1814
1815 SDValue One = DAG.getConstant(1, DL, HalfVT);
1816 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1817
1818 //HiLo split
1819 SDValue LHS = Op.getOperand(0);
1820 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1821 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1822
1823 SDValue RHS = Op.getOperand(1);
1824 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1825 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1826
1827 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1828 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1829
1830 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1831 LHS_Lo, RHS_Lo);
1832
1833 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1834 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1835
1836 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1837 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1838 return;
1839 }
1840
1841 if (isTypeLegal(MVT::i64)) {
1842 // The algorithm here is based on ideas from "Software Integer Division",
1843 // Tom Rodeheffer, August 2008.
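    // Roughly: build an f32-based estimate of 2^64 / RHS, refine it twice
    // with r' = r + mulhu(r, r * -RHS) (Newton-Raphson on the reciprocal),
    // take the quotient estimate as mulhu(LHS, r'), and then correct the
    // quotient (by at most +2) and the remainder (by subtracting RHS at most
    // twice).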
1844
1845 MachineFunction &MF = DAG.getMachineFunction();
1846 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1847
1848 // Compute denominator reciprocal.
1849 unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1850 (unsigned)ISD::FMA :
1851 !MFI->getMode().allFP32Denormals() ?
1852 (unsigned)ISD::FMAD :
1853 (unsigned)AMDGPUISD::FMAD_FTZ;
1854
1855 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1856 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1857 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1858 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1859 Cvt_Lo);
1860 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1861 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1862 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1863 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1864 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1865 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1866 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1867 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1868 Mul1);
1869 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1870 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1871 SDValue Rcp64 = DAG.getBitcast(VT,
1872 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1873
1874 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1875 SDValue One64 = DAG.getConstant(1, DL, VT);
1876 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1877 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1878
1879 // First round of UNR (Unsigned integer Newton-Raphson).
1880 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1881 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1882 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1883 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1884 Zero);
1885 SDValue Mulhi1_Hi =
1886 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1887 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1888 Mulhi1_Lo, Zero1);
1889 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1890 Mulhi1_Hi, Add1_Lo.getValue(1));
1891 SDValue Add1 = DAG.getBitcast(VT,
1892 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1893
1894 // Second round of UNR.
1895 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1896 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1897 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1898 Zero);
1899 SDValue Mulhi2_Hi =
1900 DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1901 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1902 Mulhi2_Lo, Zero1);
1903 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1904 Mulhi2_Hi, Add2_Lo.getValue(1));
1905 SDValue Add2 = DAG.getBitcast(VT,
1906 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1907
1908 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1909
1910 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1911
1912 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1913 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1914 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1915 Mul3_Lo, Zero1);
1916 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1917 Mul3_Hi, Sub1_Lo.getValue(1));
1918 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1919 SDValue Sub1 = DAG.getBitcast(VT,
1920 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1921
1922 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1923 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1924 ISD::SETUGE);
1925 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1926 ISD::SETUGE);
1927 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1928
1929 // TODO: Here and below portions of the code can be enclosed into if/endif.
1930 // Currently control flow is unconditional and we have 4 selects after
1931 // potential endif to substitute PHIs.
1932
1933 // if C3 != 0 ...
1934 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1935 RHS_Lo, Zero1);
1936 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1937 RHS_Hi, Sub1_Lo.getValue(1));
1938 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1939 Zero, Sub2_Lo.getValue(1));
1940 SDValue Sub2 = DAG.getBitcast(VT,
1941 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1942
1943 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1944
1945 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1946 ISD::SETUGE);
1947 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1948 ISD::SETUGE);
1949 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1950
1951 // if (C6 != 0)
1952 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1953
1954 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1955 RHS_Lo, Zero1);
1956 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1957 RHS_Hi, Sub2_Lo.getValue(1));
1958 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1959 Zero, Sub3_Lo.getValue(1));
1960 SDValue Sub3 = DAG.getBitcast(VT,
1961 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1962
1963 // endif C6
1964 // endif C3
1965
1966 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1967 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1968
1969 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1970 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1971
1972 Results.push_back(Div);
1973 Results.push_back(Rem);
1974
1975 return;
1976 }
1977
1978  // r600 expansion.
1979 // Get Speculative values
1980 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1981 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1982
1983 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1984 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1985 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1986
1987 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1988 SDValue DIV_Lo = Zero;
1989
1990 const unsigned halfBitWidth = HalfVT.getSizeInBits();
1991
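  // Restoring long division over the remaining 32 bits: shift REM left by one,
  // bring in the next bit of LHS_Lo, and whenever REM >= RHS subtract RHS and
  // set the corresponding bit of DIV_Lo.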
1992 for (unsigned i = 0; i < halfBitWidth; ++i) {
1993 const unsigned bitPos = halfBitWidth - i - 1;
1994 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1995 // Get value of high bit
1996 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1997 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1998 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1999
2000 // Shift
2001 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2002 // Add LHS high bit
2003 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2004
2005 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2006 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2007
2008 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2009
2010 // Update REM
2011 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2012 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2013 }
2014
2015 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2016 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2017 Results.push_back(DIV);
2018 Results.push_back(REM);
2019}
2020
2021SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2022 SelectionDAG &DAG) const {
2023 SDLoc DL(Op);
2024 EVT VT = Op.getValueType();
2025
2026 if (VT == MVT::i64) {
2027 SmallVector<SDValue, 2> Results;
2028 LowerUDIVREM64(Op, DAG, Results);
2029 return DAG.getMergeValues(Results, DL);
2030 }
2031
2032 if (VT == MVT::i32) {
2033 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2034 return Res;
2035 }
2036
2037 SDValue X = Op.getOperand(0);
2038 SDValue Y = Op.getOperand(1);
2039
2040 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2041 // algorithm used here.
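  // In short: z ~= 1/y from URECIP, one Newton-Raphson step
  // z += mulhu(z, z * -y), then q = mulhu(x, z) and r = x - q * y, followed by
  // two correction rounds of "if (r >= y) { q += 1; r -= y; }".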
2042
2043 // Initial estimate of inv(y).
2044 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2045
2046 // One round of UNR.
2047 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2048 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2049 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2050 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2051
2052 // Quotient/remainder estimate.
2053 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2054 SDValue R =
2055 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2056
2057 // First quotient/remainder refinement.
2058 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2059 SDValue One = DAG.getConstant(1, DL, VT);
2060 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2061 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2062 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2063 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2064 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2065
2066 // Second quotient/remainder refinement.
2067 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2068 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2069 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2070 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2071 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2072
2073 return DAG.getMergeValues({Q, R}, DL);
2074}
2075
2076SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2077 SelectionDAG &DAG) const {
2078 SDLoc DL(Op);
2079 EVT VT = Op.getValueType();
2080
2081 SDValue LHS = Op.getOperand(0);
2082 SDValue RHS = Op.getOperand(1);
2083
2084 SDValue Zero = DAG.getConstant(0, DL, VT);
2085 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2086
2087 if (VT == MVT::i32) {
2088 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2089 return Res;
2090 }
2091
2092 if (VT == MVT::i64 &&
2093 DAG.ComputeNumSignBits(LHS) > 32 &&
2094 DAG.ComputeNumSignBits(RHS) > 32) {
2095 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2096
2097 //HiLo split
2098 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2099 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2100 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2101 LHS_Lo, RHS_Lo);
2102 SDValue Res[2] = {
2103 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2104 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2105 };
2106 return DAG.getMergeValues(Res, DL);
2107 }
2108
2109 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2110 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2111 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2112 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2113
2114 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2115 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2116
2117 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2118 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2119
2120 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2121 SDValue Rem = Div.getValue(1);
2122
2123 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2124 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2125
2126 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2127 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2128
2129 SDValue Res[2] = {
2130 Div,
2131 Rem
2132 };
2133 return DAG.getMergeValues(Res, DL);
2134}
2135
2136// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
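// For example, frem(5.5, 2.0): fdiv gives 2.75, ftrunc gives 2.0, and
// fma(-2.0, 2.0, 5.5) = 1.5.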
2137SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2138 SDLoc SL(Op);
2139 EVT VT = Op.getValueType();
2140 auto Flags = Op->getFlags();
2141 SDValue X = Op.getOperand(0);
2142 SDValue Y = Op.getOperand(1);
2143
2144 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2145 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2146 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2147 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2148 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2149}
2150
2151SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2152 SDLoc SL(Op);
2153 SDValue Src = Op.getOperand(0);
2154
2155 // result = trunc(src)
2156 // if (src > 0.0 && src != result)
2157 // result += 1.0
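  // For example, ceil(2.25): trunc gives 2.0, and since 2.25 > 0.0 and
  // 2.25 != 2.0 the result is 2.0 + 1.0 = 3.0; ceil(-2.25) stays at -2.0.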
2158
2159 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2160
2161 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2162 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2163
2164 EVT SetCCVT =
2165 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2166
2167 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2168 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2169 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2170
2171 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2172 // TODO: Should this propagate fast-math-flags?
2173 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2174}
2175
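// Extracts the biased exponent field of an f64 (bits [62:52], i.e. bits
// [30:20] of the high word) with BFE_U32 and subtracts the bias of 1023.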
2176static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2177 SelectionDAG &DAG) {
2178 const unsigned FractBits = 52;
2179 const unsigned ExpBits = 11;
2180
2181 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2182 Hi,
2183 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2184 DAG.getConstant(ExpBits, SL, MVT::i32));
2185 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2186 DAG.getConstant(1023, SL, MVT::i32));
2187
2188 return Exp;
2189}
2190
2191SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2192 SDLoc SL(Op);
2193 SDValue Src = Op.getOperand(0);
2194
2195  assert(Op.getValueType() == MVT::f64);
2196
2197 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2198
2199 // Extract the upper half, since this is where we will find the sign and
2200 // exponent.
2201 SDValue Hi = getHiHalf64(Src, DAG);
2202
2203 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2204
2205 const unsigned FractBits = 52;
2206
2207 // Extract the sign bit.
2208  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2209 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2210
2211 // Extend back to 64-bits.
2212 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2213 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2214
2215 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2216 const SDValue FractMask
2217    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2218
2219 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2220 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2221 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2222
2223 EVT SetCCVT =
2224 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2225
2226 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2227
2228 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2229 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2230
2231 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2232 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2233
2234 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2235}
2236
2237SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2238 SDLoc SL(Op);
2239 SDValue Src = Op.getOperand(0);
2240
2241  assert(Op.getValueType() == MVT::f64);
2242
2243 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2244 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2245 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2246
2247 // TODO: Should this propagate fast-math-flags?
2248
2249 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2250 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2251
2252 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2253
2254 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2255 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2256
2257 EVT SetCCVT =
2258 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2259 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2260
2261 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2262}
2263
2264SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2265 // FNEARBYINT and FRINT are the same, except in their handling of FP
2266 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2267 // rint, so just treat them as equivalent.
2268 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2269}
2270
2271// XXX - May require not supporting f32 denormals?
2272
2273// Don't handle v2f16. The extra instructions to scalarize and repack around the
2274// compare and vselect end up producing worse code than scalarizing the whole
2275// operation.
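// round(x) is computed as trunc(x) + copysign(1.0, x) when
// |x - trunc(x)| >= 0.5, i.e. halfway cases round away from zero.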
2276SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2277 SDLoc SL(Op);
2278 SDValue X = Op.getOperand(0);
2279 EVT VT = Op.getValueType();
2280
2281 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2282
2283 // TODO: Should this propagate fast-math-flags?
2284
2285 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2286
2287 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2288
2289 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2290 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2291 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2292
2293 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2294
2295 EVT SetCCVT =
2296 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2297
2298 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2299
2300 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2301
2302 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2303}
2304
2305SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2306 SDLoc SL(Op);
2307 SDValue Src = Op.getOperand(0);
2308
2309 // result = trunc(src);
2310 // if (src < 0.0 && src != result)
2311 // result += -1.0.
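  // For example, floor(-2.25): trunc gives -2.0, and since -2.25 < 0.0 and
  // -2.25 != -2.0 the result is -2.0 + -1.0 = -3.0; floor(2.25) stays at 2.0.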
2312
2313 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2314
2315 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2316 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2317
2318 EVT SetCCVT =
2319 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2320
2321 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2322 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2323 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2324
2325 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2326 // TODO: Should this propagate fast-math-flags?
2327 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2328}
2329
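// log_b(x) is lowered as log2(x) * (1 / log2(b)); the callers pass
// ln(2) for FLOG and ln(2)/ln(10) for FLOG10 as Log2BaseInverted.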
2330SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2331 double Log2BaseInverted) const {
2332 EVT VT = Op.getValueType();
2333
2334 SDLoc SL(Op);
2335 SDValue Operand = Op.getOperand(0);
2336 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2337 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2338
2339 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2340}
2341
2342// exp2(M_LOG2E_F * f);
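// For example, exp(1.0f) becomes exp2(1.0f * 1.4426950f) ~= 2.7182817f.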
2343SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2344 EVT VT = Op.getValueType();
2345 SDLoc SL(Op);
2346 SDValue Src = Op.getOperand(0);
2347
2348 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2349 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2350 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2351}
2352
2353static bool isCtlzOpc(unsigned Opc) {
2354 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2355}
2356
2357static bool isCttzOpc(unsigned Opc) {
2358 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2359}
2360
2361SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2362 SDLoc SL(Op);
2363 SDValue Src = Op.getOperand(0);
2364
2365  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2366 bool Ctlz = isCtlzOpc(Op.getOpcode());
2367 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2368
2369 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2370 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2371
2372 if (Src.getValueType() == MVT::i32) {
2373 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2374 // (cttz hi:lo) -> (umin (ffbl src), 32)
2375 // (ctlz_zero_undef src) -> (ffbh src)
2376 // (cttz_zero_undef src) -> (ffbl src)
2377 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2378 if (!ZeroUndef) {
2379 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2380 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2381 }
2382 return NewOpr;
2383 }
2384
2385 SDValue Lo, Hi;
2386 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2387
2388 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2389 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2390
2391 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2392 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2393 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2394 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
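  // For example (ctlz, not zero-undef), src = 0x0000000000010000:
  // FFBH_U32(hi = 0) = 0xffffffff, uaddsat(FFBH_U32(lo) = 15, 32) = 47, and
  // umin(umin(47, 0xffffffff), 64) = 47 leading zeros.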
2395
2396 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2397 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2398 if (Ctlz)
2399 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2400 else
2401 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2402
2403 SDValue NewOpr;
2404 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2405 if (!ZeroUndef) {
2406 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2407 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2408 }
2409
2410 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2411}
2412
2413SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2414 bool Signed) const {
2415 // The regular method converting a 64-bit integer to float roughly consists of
2416 // 2 steps: normalization and rounding. In fact, after normalization, the
2417 // conversion from a 64-bit integer to a float is essentially the same as the
2418 // one from a 32-bit integer. The only difference is that it has more
2419 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2420 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2421 // converted into the correct float number. The basic steps for the unsigned
2422 // conversion are illustrated in the following pseudo code:
2423 //
2424 // f32 uitofp(i64 u) {
2425 // i32 hi, lo = split(u);
2426 // // Only count the leading zeros in hi as we have native support of the
2427 // // conversion from i32 to f32. If hi is all 0s, the conversion is
2428 // // reduced to a 32-bit one automatically.
2429 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2430 // u <<= shamt;
2431 // hi, lo = split(u);
2432 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2433 // // convert it as a 32-bit integer and scale the result back.
2434 // return uitofp(hi) * 2^(32 - shamt);
2435 // }
2436 //
2437 // The signed one follows the same principle but uses 'ffbh_i32' to count its
2438 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2439  // converted instead, followed by negation based on its sign bit.
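  // For example, u = 0x0000000180000000: hi = 1, lo = 0x80000000,
  // shamt = clz(1) = 31; after the shift hi = 0xC0000000 and lo = 0, so the
  // adjust bit stays 0 and the result is
  // uitofp(0xC0000000) * 2^(32 - 31) = 3221225472.0f * 2 = 6442450944.0f,
  // which is exactly the original value.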
2440
2441 SDLoc SL(Op);
2442 SDValue Src = Op.getOperand(0);
2443
2444 SDValue Lo, Hi;
2445 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2446 SDValue Sign;
2447 SDValue ShAmt;
2448 if (Signed && Subtarget->isGCN()) {
2449 // We also need to consider the sign bit in Lo if Hi has just sign bits,
2450 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2451 // account. That is, the maximal shift is
2452 // - 32 if Lo and Hi have opposite signs;
2453 // - 33 if Lo and Hi have the same sign.
2454 //
2455 // Or, MaxShAmt = 33 + OppositeSign, where
2456 //
2457 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2458 // - -1 if Lo and Hi have opposite signs; and
2459 // - 0 otherwise.
2460 //
2461 // All in all, ShAmt is calculated as
2462 //
2463 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2464 //
2465 // or
2466 //
2467 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2468 //
2469 // to reduce the critical path.
2470 SDValue OppositeSign = DAG.getNode(
2471 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2472 DAG.getConstant(31, SL, MVT::i32));
2473 SDValue MaxShAmt =
2474 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2475 OppositeSign);
2476 // Count the leading sign bits.
2477 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2478 // Different from unsigned conversion, the shift should be one bit less to
2479 // preserve the sign bit.
2480 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2481 DAG.getConstant(1, SL, MVT::i32));
2482 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2483 } else {
2484 if (Signed) {
2485 // Without 'ffbh_i32', only leading zeros could be counted. Take the
2486 // absolute value first.
2487 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2488 DAG.getConstant(63, SL, MVT::i64));
2489 SDValue Abs =
2490 DAG.getNode(ISD::XOR, SL, MVT::i64,
2491 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2492 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2493 }
2494 // Count the leading zeros.
2495 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2496 // The shift amount for signed integers is [0, 32].
2497 }
2498 // Normalize the given 64-bit integer.
2499 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2500 // Split it again.
2501 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2502 // Calculate the adjust bit for rounding.
2503 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2504 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2505 DAG.getConstant(1, SL, MVT::i32), Lo);
2506 // Get the 32-bit normalized integer.
2507 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2508 // Convert the normalized 32-bit integer into f32.
2509 unsigned Opc =
2510 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2511 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2512
2513 // Finally, need to scale back the converted floating number as the original
2514 // 64-bit integer is converted as a 32-bit one.
2515 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2516 ShAmt);
2517 // On GCN, use LDEXP directly.
2518 if (Subtarget->isGCN())
2519 return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2520
2521 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2522 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2523 // exponent is enough to avoid overflowing into the sign bit.
2524 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2525 DAG.getConstant(23, SL, MVT::i32));
2526 SDValue IVal =
2527 DAG.getNode(ISD::ADD, SL, MVT::i32,
2528 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2529 if (Signed) {
2530 // Set the sign bit.
2531 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2532 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2533 DAG.getConstant(31, SL, MVT::i32));
2534 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2535 }
2536 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2537}
2538
2539SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2540 bool Signed) const {
2541 SDLoc SL(Op);
2542 SDValue Src = Op.getOperand(0);
2543
2544 SDValue Lo, Hi;
2545 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2546
2547 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2548 SL, MVT::f64, Hi);
2549
2550 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2551
2552 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2553 DAG.getConstant(32, SL, MVT::i32));
2554 // TODO: Should this propagate fast-math-flags?
2555 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2556}
2557
2558SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2559 SelectionDAG &DAG) const {
2560 // TODO: Factor out code common with LowerSINT_TO_FP.
2561 EVT DestVT = Op.getValueType();
2562 SDValue Src = Op.getOperand(0);
2563 EVT SrcVT = Src.getValueType();
2564
2565 if (SrcVT == MVT::i16) {
2566 if (DestVT == MVT::f16)
2567 return Op;
2568 SDLoc DL(Op);
2569
2570 // Promote src to i32
2571 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2572 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2573 }
2574
2575  assert(SrcVT == MVT::i64 && "operation should be legal");
2576
2577 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2578 SDLoc DL(Op);
2579
2580 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2581 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2582 SDValue FPRound =
2583 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2584
2585 return FPRound;
2586 }
2587
2588 if (DestVT == MVT::f32)
2589 return LowerINT_TO_FP32(Op, DAG, false);
2590
2591 assert(DestVT == MVT::f64);
2592 return LowerINT_TO_FP64(Op, DAG, false);
2593}
2594
2595SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2596 SelectionDAG &DAG) const {
2597 EVT DestVT = Op.getValueType();
2598
2599 SDValue Src = Op.getOperand(0);
2600 EVT SrcVT = Src.getValueType();
2601
2602 if (SrcVT == MVT::i16) {
2603 if (DestVT == MVT::f16)
2604 return Op;
2605
2606 SDLoc DL(Op);
2607 // Promote src to i32
2608 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2609 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2610 }
2611
2612 assert(SrcVT == MVT::i64 && "operation should be legal");
2613
2614 // TODO: Factor out code common with LowerUINT_TO_FP.
2615
2616 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2617 SDLoc DL(Op);
2618 SDValue Src = Op.getOperand(0);
2619
2620 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2621 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2622 SDValue FPRound =
2623 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2624
2625 return FPRound;
2626 }
2627
2628 if (DestVT == MVT::f32)
2629 return LowerINT_TO_FP32(Op, DAG, true);
2630
2631 assert(DestVT == MVT::f64);
2632 return LowerINT_TO_FP64(Op, DAG, true);
2633}
2634
2635SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2636 bool Signed) const {
2637 SDLoc SL(Op);
2638
2639 SDValue Src = Op.getOperand(0);
2640 EVT SrcVT = Src.getValueType();
2641
2642 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2643
2644 // The basic idea of converting a floating point number into a pair of 32-bit
2645 // integers is illustrated as follows:
2646 //
2647 // tf := trunc(val);
2648 // hif := floor(tf * 2^-32);
2649 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2650 // hi := fptoi(hif);
2651 // lo := fptoi(lof);
2652 //
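// For example (f64, illustrative values): val = 2^40 + 7 gives
// hif = floor((2^40 + 7) * 2^-32) = 256 and lof = 7, so hi = 256, lo = 7,
// and repacking the pair yields (256 << 32) | 7 == val.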
2653 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2654 SDValue Sign;
2655 if (Signed && SrcVT == MVT::f32) {
2656 // However, a 32-bit floating-point number has only a 23-bit mantissa, which
2657 // is not enough to hold all the significant bits of `lof` if val is
2658 // negative. To avoid losing precision, we need to take the absolute
2659 // value after truncating and flip the result back based on the original
2660 // signedness.
2661 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2662 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2663 DAG.getConstant(31, SL, MVT::i32));
2664 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2665 }
2666
2667 SDValue K0, K1;
2668 if (SrcVT == MVT::f64) {
2669 K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2670 SL, SrcVT);
2671 K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2672 SL, SrcVT);
2673 } else {
2674 K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2675 SrcVT);
2676 K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2677 SrcVT);
2678 }
2679 // TODO: Should this propagate fast-math-flags?
2680 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2681
2682 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2683
2684 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2685
2686 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2687 : ISD::FP_TO_UINT,
2688 SL, MVT::i32, FloorMul);
2689 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2690
2691 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2692 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2693
2694 if (Signed && SrcVT == MVT::f32) {
2695 assert(Sign);
2696 // Flip the result based on the signedness, which is either all 0s or 1s.
2697 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2698 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2699 // r := xor(r, sign) - sign;
2700 Result =
2701 DAG.getNode(ISD::SUB, SL, MVT::i64,
2702 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2703 }
2704
2705 return Result;
2706}
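// A minimal scalar sketch of the unsigned path above (not part of this file;
// the helper name is hypothetical, and <cmath> and <cstdint> are assumed to be
// available):
static uint64_t F64ToU64Sketch(double Val) {
  const double TwoPow32 = 4294967296.0;               // 2^32
  double Trunc = std::trunc(Val);                     // FTRUNC
  double HiF = std::floor(Trunc * (1.0 / TwoPow32));  // FFLOOR(FMUL(Trunc, 2^-32))
  double LoF = std::fma(HiF, -TwoPow32, Trunc);       // FMA(HiF, -2^32, Trunc), >= 0
  uint32_t Hi = uint32_t(HiF);                        // FP_TO_UINT of the high half
  uint32_t Lo = uint32_t(LoF);                        // FP_TO_UINT of the low half
  return (uint64_t(Hi) << 32) | Lo;                   // repack as a 64-bit integer
}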
2707
2708SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2709 SDLoc DL(Op);
2710 SDValue N0 = Op.getOperand(0);
2711
2712 // Convert to target node to get known bits
2713 if (N0.getValueType() == MVT::f32)
2714 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2715
2716 if (getTargetMachine().Options.UnsafeFPMath) {
2717 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2718 return SDValue();
2719 }
2720
2721 assert(N0.getSimpleValueType() == MVT::f64);
2722
2723 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2724 const unsigned ExpMask = 0x7ff;
2725 const unsigned ExpBiasf64 = 1023;
2726 const unsigned ExpBiasf16 = 15;
2727 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2728 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2729 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2730 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2731 DAG.getConstant(32, DL, MVT::i64));
2732 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2733 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2734 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2735 DAG.getConstant(20, DL, MVT::i64));
2736 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2737 DAG.getConstant(ExpMask, DL, MVT::i32));
2738 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2739 // add the f16 bias (15) to get the biased exponent for the f16 format.
2740 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2741 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2742
2743 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2744 DAG.getConstant(8, DL, MVT::i32));
2745 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2746 DAG.getConstant(0xffe, DL, MVT::i32));
2747
2748 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2749 DAG.getConstant(0x1ff, DL, MVT::i32));
2750 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2751
2752 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2753 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2754
2755 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2756 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2757 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2758 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2759
2760 // N = M | (E << 12);
2761 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2762 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2763 DAG.getConstant(12, DL, MVT::i32)));
2764
2765 // B = clamp(1-E, 0, 13);
2766 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2767 One, E);
2768 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2769 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2770 DAG.getConstant(13, DL, MVT::i32));
2771
2772 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2773 DAG.getConstant(0x1000, DL, MVT::i32));
2774
2775 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2776 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2777 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2778 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2779
2780 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2781 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2782 DAG.getConstant(0x7, DL, MVT::i32));
2783 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2784 DAG.getConstant(2, DL, MVT::i32));
2785 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2786 One, Zero, ISD::SETEQ);
2787 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2788 One, Zero, ISD::SETGT);
2789 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2790 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2791
2792 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2793 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2794 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2795 I, V, ISD::SETEQ);
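// Note: E == 1039 corresponds to an f64 exponent field of 0x7ff
// (2047 - 1023 + 15 == 1039), i.e. the input was Inf or NaN, in which case I
// supplies the f16 Inf/NaN encoding instead.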
2796
2797 // Extract the sign bit.
2798 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2799 DAG.getConstant(16, DL, MVT::i32));
2800 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2801 DAG.getConstant(0x8000, DL, MVT::i32));
2802
2803 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2804 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2805}
2806
2807SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2808 SelectionDAG &DAG) const {
2809 SDValue Src = Op.getOperand(0);
2810 unsigned OpOpcode = Op.getOpcode();
2811 EVT SrcVT = Src.getValueType();
2812 EVT DestVT = Op.getValueType();
2813
2814 // Will be selected natively
2815 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2816 return Op;
2817
2818 // Promote i16 to i32
2819 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2820 SDLoc DL(Op);
2821
2822 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2823 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2824 }
2825
2826 if (SrcVT == MVT::f16 ||
2827 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2828 SDLoc DL(Op);
2829
2830 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2831 unsigned Ext =
2832 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2833 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2834 }
2835
2836 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2837 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2838
2839 return SDValue();
2840}
2841
2842SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2843 SelectionDAG &DAG) const {
2844 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2845 MVT VT = Op.getSimpleValueType();
2846 MVT ScalarVT = VT.getScalarType();
2847
2848 assert(VT.isVector());
2849
2850 SDValue Src = Op.getOperand(0);
2851 SDLoc DL(Op);
2852
2853 // TODO: Don't scalarize on Evergreen?
2854 unsigned NElts = VT.getVectorNumElements();
2855 SmallVector<SDValue, 8> Args;
2856 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2857
2858 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2859 for (unsigned I = 0; I < NElts; ++I)
2860 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2861
2862 return DAG.getBuildVector(VT, DL, Args);
2863}
2864
2865//===----------------------------------------------------------------------===//
2866// Custom DAG optimizations
2867//===----------------------------------------------------------------------===//
2868
2869static bool isU24(SDValue Op, SelectionDAG &DAG) {
2870 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2871}
2872
2873static bool isI24(SDValue Op, SelectionDAG &DAG) {
2874 EVT VT = Op.getValueType();
2875 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2876 // as unsigned 24-bit values.
2877 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2878}
2879
2880static SDValue simplifyMul24(SDNode *Node24,
2881 TargetLowering::DAGCombinerInfo &DCI) {
2882 SelectionDAG &DAG = DCI.DAG;
2883 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2884 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2885
2886 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2887 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2888 unsigned NewOpcode = Node24->getOpcode();
2889 if (IsIntrin) {
2890 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2891 switch (IID) {
2892 case Intrinsic::amdgcn_mul_i24:
2893 NewOpcode = AMDGPUISD::MUL_I24;
2894 break;
2895 case Intrinsic::amdgcn_mul_u24:
2896 NewOpcode = AMDGPUISD::MUL_U24;
2897 break;
2898 case Intrinsic::amdgcn_mulhi_i24:
2899 NewOpcode = AMDGPUISD::MULHI_I24;
2900 break;
2901 case Intrinsic::amdgcn_mulhi_u24:
2902 NewOpcode = AMDGPUISD::MULHI_U24;
2903 break;
2904 default:
2905 llvm_unreachable("Expected 24-bit mul intrinsic");
2906 }
2907 }
2908
2909 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2910
2911 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2912 // the operands to have other uses, but will only perform simplifications that
2913 // involve bypassing some nodes for this user.
2914 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2915 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2916 if (DemandedLHS || DemandedRHS)
2917 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2918 DemandedLHS ? DemandedLHS : LHS,
2919 DemandedRHS ? DemandedRHS : RHS);
2920
2921 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2922 // operands if this node is the only user.
2923 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2924 return SDValue(Node24, 0);
2925 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2926 return SDValue(Node24, 0);
2927
2928 return SDValue();
2929}
2930
2931template <typename IntTy>
2932static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2933 uint32_t Width, const SDLoc &DL) {
2934 if (Width + Offset < 32) {
2935 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2936 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2937 return DAG.getConstant(Result, DL, MVT::i32);
2938 }
2939
2940 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2941}
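// Worked example (illustrative values): constantFoldBFE<int32_t>(DAG, 0xff,
// /*Offset=*/4, /*Width=*/4, DL) shifts left by 24 to get 0xff000000, then
// arithmetic-shifts right by 28, yielding -1 (the 4-bit field 0b1111,
// sign-extended).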
2942
2943static bool hasVolatileUser(SDNode *Val) {
2944 for (SDNode *U : Val->uses()) {
2945 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2946 if (M->isVolatile())
2947 return true;
2948 }
2949 }
2950
2951 return false;
2952}
2953
2954bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2955 // i32 vectors are the canonical memory type.
2956 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2957 return false;
2958
2959 if (!VT.isByteSized())
2960 return false;
2961
2962 unsigned Size = VT.getStoreSize();
2963
2964 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2965 return false;
2966
2967 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2968 return false;
2969
2970 return true;
2971}
2972
2973 // Replace a load of an illegal type with a load of a friendlier type,
2974 // followed by a bitcast back to the original type.
2975SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2976 DAGCombinerInfo &DCI) const {
2977 if (!DCI.isBeforeLegalize())
2978 return SDValue();
2979
2980 LoadSDNode *LN = cast<LoadSDNode>(N);
2981 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2982 return SDValue();
2983
2984 SDLoc SL(N);
2985 SelectionDAG &DAG = DCI.DAG;
2986 EVT VT = LN->getMemoryVT();
2987
2988 unsigned Size = VT.getStoreSize();
2989 Align Alignment = LN->getAlign();
2990 if (Alignment < Size && isTypeLegal(VT)) {
2991 bool IsFast;
2992 unsigned AS = LN->getAddressSpace();
2993
2994 // Expand unaligned loads earlier than legalization. Due to visitation order
2995 // problems during legalization, the emitted instructions to pack and unpack
2996 // the bytes again are not eliminated in the case of an unaligned copy.
2997 if (!allowsMisalignedMemoryAccesses(
2998 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2999 if (VT.isVector())
3000 return SplitVectorLoad(SDValue(LN, 0), DAG);
3001
3002 SDValue Ops[2];
3003 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3004
3005 return DAG.getMergeValues(Ops, SDLoc(N));
3006 }
3007
3008 if (!IsFast)
3009 return SDValue();
3010 }
3011
3012 if (!shouldCombineMemoryType(VT))
3013 return SDValue();
3014
3015 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3016
3017 SDValue NewLoad
3018 = DAG.getLoad(NewVT, SL, LN->getChain(),
3019 LN->getBasePtr(), LN->getMemOperand());
3020
3021 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3022 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3023 return SDValue(N, 0);
3024}
3025
3026// Replace store of an illegal type with a store of a bitcast to a friendlier
3027// type.
3028SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3029 DAGCombinerInfo &DCI) const {
3030 if (!DCI.isBeforeLegalize())
3031 return SDValue();
3032
3033 StoreSDNode *SN = cast<StoreSDNode>(N);
3034 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3035 return SDValue();
3036
3037 EVT VT = SN->getMemoryVT();
3038 unsigned Size = VT.getStoreSize();
3039
3040 SDLoc SL(N);
3041 SelectionDAG &DAG = DCI.DAG;
3042 Align Alignment = SN->getAlign();
3043 if (Alignment < Size && isTypeLegal(VT)) {
3044 bool IsFast;
3045 unsigned AS = SN->getAddressSpace();
3046
3047 // Expand unaligned stores earlier than legalization. Due to visitation
3048 // order problems during legalization, the emitted instructions to pack and
3049 // unpack the bytes again are not eliminated in the case of an unaligned
3050 // copy.
3051 if (!allowsMisalignedMemoryAccesses(
3052 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3053 if (VT.isVector())
3054 return SplitVectorStore(SDValue(SN, 0), DAG);
3055
3056 return expandUnalignedStore(SN, DAG);
3057 }
3058
3059 if (!IsFast)
3060 return SDValue();
3061 }
3062
3063 if (!shouldCombineMemoryType(VT))
3064 return SDValue();
3065
3066 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3067 SDValue Val = SN->getValue();
3068
3069 //DCI.AddToWorklist(Val.getNode());
3070
3071 bool OtherUses = !Val.hasOneUse();
3072 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3073 if (OtherUses) {
3074 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3075 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3076 }
3077
3078 return DAG.getStore(SN->getChain(), SL, CastVal,
3079 SN->getBasePtr(), SN->getMemOperand());
3080}
3081
3082// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3083// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3084// issues.
3085SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3086 DAGCombinerInfo &DCI) const {
3087 SelectionDAG &DAG = DCI.DAG;
3088 SDValue N0 = N->getOperand(0);
3089
3090 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3091 // (vt2 (truncate (assertzext vt0:x, vt1)))
3092 if (N0.getOpcode() == ISD::TRUNCATE) {
3093 SDValue N1 = N->getOperand(1);
3094 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3095 SDLoc SL(N);
3096
3097 SDValue Src = N0.getOperand(0);
3098 EVT SrcVT = Src.getValueType();
3099 if (SrcVT.bitsGE(ExtVT)) {
3100 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3101 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3102 }
3103 }
3104
3105 return SDValue();
3106}
3107
3108SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3109 SDNode *N, DAGCombinerInfo &DCI) const {
3110 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3111 switch (IID) {
3112 case Intrinsic::amdgcn_mul_i24:
3113 case Intrinsic::amdgcn_mul_u24:
3114 case Intrinsic::amdgcn_mulhi_i24:
3115 case Intrinsic::amdgcn_mulhi_u24:
3116 return simplifyMul24(N, DCI);
3117 case Intrinsic::amdgcn_fract:
3118 case Intrinsic::amdgcn_rsq:
3119 case Intrinsic::amdgcn_rcp_legacy:
3120 case Intrinsic::amdgcn_rsq_legacy:
3121 case Intrinsic::amdgcn_rsq_clamp:
3122 case Intrinsic::amdgcn_ldexp: {
3123 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3124 SDValue Src = N->getOperand(1);
3125 return Src.isUndef() ? Src : SDValue();
3126 }
3127 default:
3128 return SDValue();
3129 }
3130}
3131
3132/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3133/// binary operation \p Opc to it with the corresponding constant operands.
3134SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3135 DAGCombinerInfo &DCI, const SDLoc &SL,
3136 unsigned Opc, SDValue LHS,
3137 uint32_t ValLo, uint32_t ValHi) const {
3138 SelectionDAG &DAG = DCI.DAG;
3139 SDValue Lo, Hi;
3140 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3141
3142 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3143 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3144
3145 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3146 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3147
3148 // Re-visit the ands. It's possible we eliminated one of them and it could
3149 // simplify the vector.
3150 DCI.AddToWorklist(Lo.getNode());
3151 DCI.AddToWorklist(Hi.getNode());
3152
3153 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3154 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3155}
3156
3157SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3158 DAGCombinerInfo &DCI) const {
3159 EVT VT = N->getValueType(0);
3160
3161 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3162 if (!RHS)
3163 return SDValue();
3164
3165 SDValue LHS = N->getOperand(0);
3166 unsigned RHSVal = RHS->getZExtValue();
3167 if (!RHSVal)
3168 return LHS;
3169
3170 SDLoc SL(N);
3171 SelectionDAG &DAG = DCI.DAG;
3172
3173 switch (LHS->getOpcode()) {
3174 default:
3175 break;
3176 case ISD::ZERO_EXTEND:
3177 case ISD::SIGN_EXTEND:
3178 case ISD::ANY_EXTEND: {
3179 SDValue X = LHS->getOperand(0);
3180
3181 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3182 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3183 // Prefer build_vector as the canonical form if packed types are legal.
3184 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3185 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3186 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3187 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3188 }
3189
3190 // shl (ext x) => zext (shl x), if shift does not overflow int
3191 if (VT != MVT::i64)
3192 break;
3193 KnownBits Known = DAG.computeKnownBits(X);
3194 unsigned LZ = Known.countMinLeadingZeros();
3195 if (LZ < RHSVal)
3196 break;
3197 EVT XVT = X.getValueType();
3198 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3199 return DAG.getZExtOrTrunc(Shl, SL, VT);
3200 }
3201 }
3202
3203 if (VT != MVT::i64)
3204 return SDValue();
3205
3206 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3207
3208 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3209 // common case, splitting this into a move and a 32-bit shift is faster and
3210 // the same code size.
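// For example (illustrative values): i64 (shl x, 40) becomes
// build_pair 0, (shl (trunc x to i32), 8): the low 32 result bits are zero and
// the high 32 bits come from the low word of x shifted left by 40 - 32.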
3211 if (RHSVal < 32)
3212 return SDValue();
3213
3214 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3215
3216 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3217 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3218
3219 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3220
3221 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3222 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3223}
3224
3225SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3226 DAGCombinerInfo &DCI) const {
3227 if (N->getValueType(0) != MVT::i64)
3228 return SDValue();
3229
3230 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3231 if (!RHS)
3232 return SDValue();
3233
3234 SelectionDAG &DAG = DCI.DAG;
3235 SDLoc SL(N);
3236 unsigned RHSVal = RHS->getZExtValue();
3237
3238 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3239 if (RHSVal == 32) {
3240 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3241 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3242 DAG.getConstant(31, SL, MVT::i32));
3243
3244 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3245 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3246 }
3247
3248 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3249 if (RHSVal == 63) {
3250 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3251 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3252 DAG.getConstant(31, SL, MVT::i32));
3253 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3254 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3255 }
3256
3257 return SDValue();
3258}
3259
3260SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3261 DAGCombinerInfo &DCI) const {
3262 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3263 if (!RHS)
3264 return SDValue();
3265
3266 EVT VT = N->getValueType(0);
3267 SDValue LHS = N->getOperand(0);
3268 unsigned ShiftAmt = RHS->getZExtValue();
3269 SelectionDAG &DAG = DCI.DAG;
3270 SDLoc SL(N);
3271
3272 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3273 // this improves the ability to match BFE patterns in isel.
3274 if (LHS.getOpcode() == ISD::AND) {
3275 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3276 unsigned MaskIdx, MaskLen;
3277 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3278 MaskIdx == ShiftAmt) {
3279 return DAG.getNode(
3280 ISD::AND, SL, VT,
3281 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3282 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3283 }
3284 }
3285 }
3286
3287 if (VT != MVT::i64)
3288 return SDValue();
3289
3290 if (ShiftAmt < 32)
3291 return SDValue();
3292
3293 // srl i64:x, C for C >= 32
3294 // =>
3295 // build_pair (srl hi_32(x), C - 32), 0
3296 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3297
3298 SDValue Hi = getHiHalf64(LHS, DAG);
3299
3300 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3301 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3302
3303 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3304
3305 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3306}
3307
3308SDValue AMDGPUTargetLowering::performTruncateCombine(
3309 SDNode *N, DAGCombinerInfo &DCI) const {
3310 SDLoc SL(N);
3311 SelectionDAG &DAG = DCI.DAG;
3312 EVT VT = N->getValueType(0);
3313 SDValue Src = N->getOperand(0);
3314
3315 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3316 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3317 SDValue Vec = Src.getOperand(0);
3318 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3319 SDValue Elt0 = Vec.getOperand(0);
3320 EVT EltVT = Elt0.getValueType();
3321 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3322 if (EltVT.isFloatingPoint()) {
3323 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3324 EltVT.changeTypeToInteger(), Elt0);
3325 }
3326
3327 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3328 }
3329 }
3330 }
3331
3332 // Equivalent of above for accessing the high element of a vector as an
3333 // integer operation.
3334 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3335 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3336 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3337 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3338 SDValue BV = stripBitcast(Src.getOperand(0));
3339 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3340 BV.getValueType().getVectorNumElements() == 2) {
3341 SDValue SrcElt = BV.getOperand(1);
3342 EVT SrcEltVT = SrcElt.getValueType();
3343 if (SrcEltVT.isFloatingPoint()) {
3344 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3345 SrcEltVT.changeTypeToInteger(), SrcElt);
3346 }
3347
3348 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3349 }
3350 }
3351 }
3352 }
3353
3354 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3355 //
3356 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3357 // i16 (trunc (srl (i32 (trunc x), K)))
3358 if (VT.getScalarSizeInBits() < 32) {
3359 EVT SrcVT = Src.getValueType();
3360 if (SrcVT.getScalarSizeInBits() > 32 &&
3361 (Src.getOpcode() == ISD::SRL ||
3362 Src.getOpcode() == ISD::SRA ||
3363 Src.getOpcode() == ISD::SHL)) {
3364 SDValue Amt = Src.getOperand(1);
3365 KnownBits Known = DAG.computeKnownBits(Amt);
3366 unsigned Size = VT.getScalarSizeInBits();
3367 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3368 (Known.countMaxActiveBits() <= Log2_32(Size))) {
3369 EVT MidVT = VT.isVector() ?
3370 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3371 VT.getVectorNumElements()) : MVT::i32;
3372
3373 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3374 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3375 Src.getOperand(0));
3376 DCI.AddToWorklist(Trunc.getNode());
3377
3378 if (Amt.getValueType() != NewShiftVT) {
3379 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3380 DCI.AddToWorklist(Amt.getNode());
3381 }
3382
3383 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3384 Trunc, Amt);
3385 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3386 }
3387 }
3388 }
3389
3390 return SDValue();
3391}
3392
3393// We need to specifically handle i64 mul here to avoid unnecessary conversion
3394// instructions. If we only match on the legalized i64 mul expansion,
3395// SimplifyDemandedBits will be unable to remove them because there will be
3396// multiple uses due to the separate mul + mulh[su].
3397static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3398 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3399 if (Size <= 32) {
3400 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3401 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3402 }
3403
3404 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3405 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3406
3407 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3408 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3409
3410 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3411}
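// Note: both operands are known to fit in 24 bits, so the full product fits in
// 48 bits; the MULHI_[IU]24 half carries the bits above 32 and the BUILD_PAIR
// above assembles the complete 64-bit result.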
3412
3413SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3414 DAGCombinerInfo &DCI) const {
3415 EVT VT = N->getValueType(0);
3416
3417 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3418 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3419 // unnecessarily). isDivergent() is used as an approximation of whether the
3420 // value is in an SGPR.
3421 if (!N->isDivergent())
3422 return SDValue();
3423
3424 unsigned Size = VT.getSizeInBits();
3425 if (VT.isVector() || Size > 64)
3426 return SDValue();
3427
3428 // There are i16 integer mul/mad.
3429 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3430 return SDValue();
3431
3432 SelectionDAG &DAG = DCI.DAG;
3433 SDLoc DL(N);
3434
3435 SDValue N0 = N->getOperand(0);
3436 SDValue N1 = N->getOperand(1);
3437
3438 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3439 // in the source into any_extends if the result of the mul is truncated. Since
3440 // we can assume the high bits are whatever we want, use the underlying value
3441 // to keep the unknown high bits from interfering.
3442 if (N0.getOpcode() == ISD::ANY_EXTEND)
3443 N0 = N0.getOperand(0);
3444
3445 if (N1.getOpcode() == ISD::ANY_EXTEND)
3446 N1 = N1.getOperand(0);
3447
3448 SDValue Mul;
3449
3450 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3451 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3452 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3453 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3454 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3455 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3456 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3457 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3458 } else {
3459 return SDValue();
3460 }
3461
3462 // We need to use sext even for MUL_U24, because MUL_U24 is used
3463 // for signed multiply of 8 and 16-bit types.
3464 return DAG.getSExtOrTrunc(Mul, DL, VT);
3465}
3466
3467SDValue
3468AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3469 DAGCombinerInfo &DCI) const {
3470 if (N->getValueType(0) != MVT::i32)
3471 return SDValue();
3472
3473 SelectionDAG &DAG = DCI.DAG;
3474 SDLoc DL(N);
3475
3476 SDValue N0 = N->getOperand(0);
3477 SDValue N1 = N->getOperand(1);
3478
3479 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3480 // in the source into any_extends if the result of the mul is truncated. Since
3481 // we can assume the high bits are whatever we want, use the underlying value
3482 // to keep the unknown high bits from interfering.
3483 if (N0.getOpcode() == ISD::ANY_EXTEND)
3484 N0 = N0.getOperand(0);
3485 if (N1.getOpcode() == ISD::ANY_EXTEND)
3486 N1 = N1.getOperand(0);
3487
3488 // Try to use two fast 24-bit multiplies (one for each half of the result)
3489 // instead of one slow extending multiply.
3490 unsigned LoOpcode, HiOpcode;
3491 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3492 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3493 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3494 LoOpcode = AMDGPUISD::MUL_U24;
3495 HiOpcode = AMDGPUISD::MULHI_U24;
3496 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3497 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3498 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3499 LoOpcode = AMDGPUISD::MUL_I24;
3500 HiOpcode = AMDGPUISD::MULHI_I24;
3501 } else {
3502 return SDValue();
3503 }
3504
3505 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3506 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3507 DCI.CombineTo(N, Lo, Hi);
3508 return SDValue(N, 0);
3509}
3510
3511SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3512 DAGCombinerInfo &DCI) const {
3513 EVT VT = N->getValueType(0);
3514
3515 if (!Subtarget->hasMulI24() || VT.isVector())
3516 return SDValue();
3517
3518 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3519 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3520 // unnecessarily). isDivergent() is used as an approximation of whether the
3521 // value is in an SGPR.
3522 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3523 // valu op anyway)
3524 if (Subtarget->hasSMulHi() && !N->isDivergent())
3525 return SDValue();
3526
3527 SelectionDAG &DAG = DCI.DAG;
3528 SDLoc DL(N);
3529
3530 SDValue N0 = N->getOperand(0);
3531 SDValue N1 = N->getOperand(1);
3532
3533 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3534 return SDValue();
3535
3536 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3537 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3538
3539 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3540 DCI.AddToWorklist(Mulhi.getNode());
3541 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3542}
3543
3544SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3545 DAGCombinerInfo &DCI) const {
3546 EVT VT = N->getValueType(0);
3547
3548 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3549 return SDValue();
3550
3551 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3552 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3553 // unnecessarily). isDivergent() is used as an approximation of whether the
3554 // value is in an SGPR.
3555 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3556 // valu op anyway)
3557 if (Subtarget->hasSMulHi() && !N->isDivergent())
3558 return SDValue();
3559
3560 SelectionDAG &DAG = DCI.DAG;
3561 SDLoc DL(N);
3562
3563 SDValue N0 = N->getOperand(0);
3564 SDValue N1 = N->getOperand(1);
3565
3566 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3567 return SDValue();
3568
3569 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3570 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3571
3572 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3573 DCI.AddToWorklist(Mulhi.getNode());
3574 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3575}
3576
3577static bool isNegativeOne(SDValue Val) {
3578 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3579 return C->isAllOnes();
3580 return false;
3581}
3582
3583SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3584 SDValue Op,
3585 const SDLoc &DL,
3586 unsigned Opc) const {
3587 EVT VT = Op.getValueType();
3588 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3589 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3590 LegalVT != MVT::i16))
3591 return SDValue();
3592
3593 if (VT != MVT::i32)
3594 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3595
3596 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3597 if (VT != MVT::i32)
3598 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3599
3600 return FFBX;
3601}
3602
3603// The native instructions return -1 on 0 input. Optimize out a select that
3604// produces -1 on 0.
3605//
3606// TODO: If zero is not undef, we could also do this if the output is compared
3607// against the bitwidth.
3608//
3609// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3610SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3611 SDValue LHS, SDValue RHS,
3612 DAGCombinerInfo &DCI) const {
3613 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3614 if (!CmpRhs || !CmpRhs->isZero())
3615 return SDValue();
3616
3617 SelectionDAG &DAG = DCI.DAG;
3618 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3619 SDValue CmpLHS = Cond.getOperand(0);
3620
3621 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3622 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3623 if (CCOpcode == ISD::SETEQ &&
3624 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3625 RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3626 unsigned Opc =
3627 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3628 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3629 }
3630
3631 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3632 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3633 if (CCOpcode == ISD::SETNE &&
3634 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3635 LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3636 unsigned Opc =
3637 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3638
3639 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3640 }
3641
3642 return SDValue();
3643}
3644
3645static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3646 unsigned Op,
3647 const SDLoc &SL,
3648 SDValue Cond,
3649 SDValue N1,
3650 SDValue N2) {
3651 SelectionDAG &DAG = DCI.DAG;
3652 EVT VT = N1.getValueType();
3653
3654 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3655 N1.getOperand(0), N2.getOperand(0));
3656 DCI.AddToWorklist(NewSelect.getNode());
3657 return DAG.getNode(Op, SL, VT, NewSelect);
3658}
3659
3660// Pull a free FP operation out of a select so it may fold into uses.
3661//
3662// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3663// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3664//
3665// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3666// select c, (fabs x), +k -> fabs (select c, x, k)
3667static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3668 SDValue N) {
3669 SelectionDAG &DAG = DCI.DAG;
3670 SDValue Cond = N.getOperand(0);
3671 SDValue LHS = N.getOperand(1);
3672 SDValue RHS = N.getOperand(2);
3673
3674 EVT VT = N.getValueType();
3675 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3676 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3677 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3678 SDLoc(N), Cond, LHS, RHS);
3679 }
3680
3681 bool Inv = false;
3682 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3683 std::swap(LHS, RHS);
3684 Inv = true;
3685 }
3686
3687 // TODO: Support vector constants.
3688 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3689 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3690 SDLoc SL(N);
3691 // If one side is an fneg/fabs and the other is a constant, we can push the
3692 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3693 SDValue NewLHS = LHS.getOperand(0);
3694 SDValue NewRHS = RHS;
3695
3696 // Careful: if the neg can be folded up, don't try to pull it back down.
3697 bool ShouldFoldNeg = true;
3698
3699 if (NewLHS.hasOneUse()) {
3700 unsigned Opc = NewLHS.getOpcode();
3701 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3702 ShouldFoldNeg = false;
3703 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3704 ShouldFoldNeg = false;
3705 }
3706
3707 if (ShouldFoldNeg) {
3708 if (LHS.getOpcode() == ISD::FNEG)
3709 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3710 else if (CRHS->isNegative())
3711 return SDValue();
3712
3713 if (Inv)
3714 std::swap(NewLHS, NewRHS);
3715
3716 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3717 Cond, NewLHS, NewRHS);
3718 DCI.AddToWorklist(NewSelect.getNode());
3719 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3720 }
3721 }
3722
3723 return SDValue();
3724}
3725
3726
3727SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3728 DAGCombinerInfo &DCI) const {
3729 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3730 return Folded;
3731
3732 SDValue Cond = N->getOperand(0);
3733 if (Cond.getOpcode() != ISD::SETCC)
3734 return SDValue();
3735
3736 EVT VT = N->getValueType(0);
3737 SDValue LHS = Cond.getOperand(0);
3738 SDValue RHS = Cond.getOperand(1);
3739 SDValue CC = Cond.getOperand(2);
3740
3741 SDValue True = N->getOperand(1);
3742 SDValue False = N->getOperand(2);
3743
3744 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3745 SelectionDAG &DAG = DCI.DAG;
3746 if (DAG.isConstantValueOfAnyType(True) &&
3747 !DAG.isConstantValueOfAnyType(False)) {
3748 // Swap cmp + select pair to move constant to false input.
3749 // This will allow using VOPC cndmasks more often.
3750 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3751
3752 SDLoc SL(N);
3753 ISD::CondCode NewCC =
3754 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3755
3756 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3757 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3758 }
3759
3760 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3761 SDValue MinMax
3762 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3763 // Revisit this node so we can catch min3/max3/med3 patterns.
3764 //DCI.AddToWorklist(MinMax.getNode());
3765 return MinMax;
3766 }
3767 }
3768
3769 // There's no reason to not do this if the condition has other uses.
3770 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3771}
3772
3773static bool isInv2Pi(const APFloat &APF) {
3774 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3775 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3776 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3777
3778 return APF.bitwiseIsEqual(KF16) ||
3779 APF.bitwiseIsEqual(KF32) ||
3780 APF.bitwiseIsEqual(KF64);
3781}
3782
3783 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3784// additional cost to negate them.
3785bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3786 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3787 if (C->isZero() && !C->isNegative())
3788 return true;
3789
3790 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3791 return true;
3792 }
3793
3794 return false;
3795}
3796
3797static unsigned inverseMinMax(unsigned Opc) {
3798 switch (Opc) {
3799 case ISD::FMAXNUM:
3800 return ISD::FMINNUM;
3801 case ISD::FMINNUM:
3802 return ISD::FMAXNUM;
3803 case ISD::FMAXNUM_IEEE:
3804 return ISD::FMINNUM_IEEE;
3805 case ISD::FMINNUM_IEEE:
3806 return ISD::FMAXNUM_IEEE;
3807 case AMDGPUISD::FMAX_LEGACY:
3808 return AMDGPUISD::FMIN_LEGACY;
3809 case AMDGPUISD::FMIN_LEGACY:
3810 return AMDGPUISD::FMAX_LEGACY;
3811 default:
3812 llvm_unreachable("invalid min/max opcode");
3813 }
3814}
3815
3816SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3817 DAGCombinerInfo &DCI) const {
3818 SelectionDAG &DAG = DCI.DAG;
3819 SDValue N0 = N->getOperand(0);
3820 EVT VT = N->getValueType(0);
3821
3822 unsigned Opc = N0.getOpcode();
3823
3824 // If the input has multiple uses and we can either fold the negate down, or
3825 // the other uses cannot, give up. This both prevents unprofitable
3826 // transformations and infinite loops: we won't repeatedly try to fold around
3827 // a negate that has no 'good' form.
3828 if (N0.hasOneUse()) {
3829 // This may be able to fold into the source, but at a code size cost. Don't
3830 // fold if the fold into the user is free.
3831 if (allUsesHaveSourceMods(N, 0))
3832 return SDValue();
3833 } else {
3834 if (fnegFoldsIntoOp(Opc) &&
3835 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3836 return SDValue();
3837 }
3838
3839 SDLoc SL(N);
3840 switch (Opc) {
3841 case ISD::FADD: {
3842 if (!mayIgnoreSignedZero(N0))
3843 return SDValue();
3844
3845 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3846 SDValue LHS = N0.getOperand(0);
3847 SDValue RHS = N0.getOperand(1);
3848
3849 if (LHS.getOpcode() != ISD::FNEG)
3850 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3851 else
3852 LHS = LHS.getOperand(0);
3853
3854 if (RHS.getOpcode() != ISD::FNEG)
3855 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3856 else
3857 RHS = RHS.getOperand(0);
3858
3859 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3860 if (Res.getOpcode() != ISD::FADD)
3861 return SDValue(); // Op got folded away.
3862 if (!N0.hasOneUse())
3863 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3864 return Res;
3865 }
3866 case ISD::FMUL:
3867 case AMDGPUISD::FMUL_LEGACY: {
3868 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3869 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3870 SDValue LHS = N0.getOperand(0);
3871 SDValue RHS = N0.getOperand(1);
3872
3873 if (LHS.getOpcode() == ISD::FNEG)
3874 LHS = LHS.getOperand(0);
3875 else if (RHS.getOpcode() == ISD::FNEG)
3876 RHS = RHS.getOperand(0);
3877 else
3878 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3879
3880 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3881 if (Res.getOpcode() != Opc)
3882 return SDValue(); // Op got folded away.
3883 if (!N0.hasOneUse())
3884 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3885 return Res;
3886 }
3887 case ISD::FMA:
3888 case ISD::FMAD: {
3889 // TODO: handle llvm.amdgcn.fma.legacy
3890 if (!mayIgnoreSignedZero(N0))
3891 return SDValue();
3892
3893 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3894 SDValue LHS = N0.getOperand(0);
3895 SDValue MHS = N0.getOperand(1);
3896 SDValue RHS = N0.getOperand(2);
3897
3898 if (LHS.getOpcode() == ISD::FNEG)
3899 LHS = LHS.getOperand(0);
3900 else if (MHS.getOpcode() == ISD::FNEG)
3901 MHS = MHS.getOperand(0);
3902 else
3903 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3904
3905 if (RHS.getOpcode() != ISD::FNEG)
3906 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3907 else
3908 RHS = RHS.getOperand(0);
3909
3910 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3911 if (Res.getOpcode() != Opc)
3912 return SDValue(); // Op got folded away.
3913 if (!N0.hasOneUse())
3914 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3915 return Res;
3916 }
3917 case ISD::FMAXNUM:
3918 case ISD::FMINNUM:
3919 case ISD::FMAXNUM_IEEE:
3920 case ISD::FMINNUM_IEEE:
3921 case AMDGPUISD::FMAX_LEGACY:
3922 case AMDGPUISD::FMIN_LEGACY: {
3923 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3924 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3925 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3926 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3927
3928 SDValue LHS = N0.getOperand(0);
3929 SDValue RHS = N0.getOperand(1);
3930
3931 // 0 doesn't have a negated inline immediate.
3932 // TODO: This constant check should be generalized to other operations.
3933 if (isConstantCostlierToNegate(RHS))
3934 return SDValue();
3935
3936 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3937 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3938 unsigned Opposite = inverseMinMax(Opc);
3939
3940 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3941 if (Res.getOpcode() != Opposite)
3942 return SDValue(); // Op got folded away.
3943 if (!N0.hasOneUse())
3944 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3945 return Res;
3946 }
3947 case AMDGPUISD::FMED3: {
3948 SDValue Ops[3];
3949 for (unsigned I = 0; I < 3; ++I)
3950 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3951
3952 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3953 if (Res.getOpcode() != AMDGPUISD::FMED3)
3954 return SDValue(); // Op got folded away.
3955
3956 if (!N0.hasOneUse()) {
3957 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3958 DAG.ReplaceAllUsesWith(N0, Neg);
3959
3960 for (SDNode *U : Neg->uses())
3961 DCI.AddToWorklist(U);
3962 }
3963
3964 return Res;
3965 }
3966 case ISD::FP_EXTEND:
3967 case ISD::FTRUNC:
3968 case ISD::FRINT:
3969 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3970 case ISD::FSIN:
3971 case ISD::FCANONICALIZE:
3972 case AMDGPUISD::RCP:
3973 case AMDGPUISD::RCP_LEGACY:
3974 case AMDGPUISD::RCP_IFLAG:
3975 case AMDGPUISD::SIN_HW: {
3976 SDValue CvtSrc = N0.getOperand(0);
3977 if (CvtSrc.getOpcode() == ISD::FNEG) {
3978 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3979 // (fneg (rcp (fneg x))) -> (rcp x)
3980 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3981 }
3982
3983 if (!N0.hasOneUse())
3984 return SDValue();
3985
3986 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3987 // (fneg (rcp x)) -> (rcp (fneg x))
3988 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3989 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3990 }
3991 case ISD::FP_ROUND: {
3992 SDValue CvtSrc = N0.getOperand(0);
3993
3994 if (CvtSrc.getOpcode() == ISD::FNEG) {
3995 // (fneg (fp_round (fneg x))) -> (fp_round x)
3996 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3997 CvtSrc.getOperand(0), N0.getOperand(1));
3998 }
3999
4000 if (!N0.hasOneUse())
4001 return SDValue();
4002
4003 // (fneg (fp_round x)) -> (fp_round (fneg x))
4004 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4005 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4006 }
4007 case ISD::FP16_TO_FP: {
4008 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4009 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4010 // Put the fneg back as a legal source operation that can be matched later.
4011 SDLoc SL(N);
4012
4013 SDValue Src = N0.getOperand(0);
4014 EVT SrcVT = Src.getValueType();
4015
4016 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4017 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4018 DAG.getConstant(0x8000, SL, SrcVT));
4019 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4020 }
4021 default:
4022 return SDValue();
4023 }
4024}
4025
4026SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4027 DAGCombinerInfo &DCI) const {
4028 SelectionDAG &DAG = DCI.DAG;
4029 SDValue N0 = N->getOperand(0);
4030
4031 if (!N0.hasOneUse())
4032 return SDValue();
4033
4034 switch (N0.getOpcode()) {
4035 case ISD::FP16_TO_FP: {
4036 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4037 SDLoc SL(N);
4038 SDValue Src = N0.getOperand(0);
4039 EVT SrcVT = Src.getValueType();
4040
4041 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4042 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4043 DAG.getConstant(0x7fff, SL, SrcVT));
4044 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4045 }
4046 default:
4047 return SDValue();
4048 }
4049}
4050
4051SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4052 DAGCombinerInfo &DCI) const {
4053 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4054 if (!CFP)
4055 return SDValue();
4056
4057 // XXX - Should this flush denormals?
4058 const APFloat &Val = CFP->getValueAPF();
4059 APFloat One(Val.getSemantics(), "1.0");
4060 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4061}
4062
4063SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4064 DAGCombinerInfo &DCI) const {
4065 SelectionDAG &DAG = DCI.DAG;
4066 SDLoc DL(N);
4067
4068 switch(N->getOpcode()) {
4069 default:
4070 break;
4071 case ISD::BITCAST: {
4072 EVT DestVT = N->getValueType(0);
4073
4074 // Push casts through vector builds. This helps avoid emitting a large
4075 // number of copies when materializing floating point vector constants.
4076 //
4077 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4078 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4079 if (DestVT.isVector()) {
4080 SDValue Src = N->getOperand(0);
4081 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4082 EVT SrcVT = Src.getValueType();
4083 unsigned NElts = DestVT.getVectorNumElements();
4084
4085 if (SrcVT.getVectorNumElements() == NElts) {
4086 EVT DestEltVT = DestVT.getVectorElementType();
4087
4088 SmallVector<SDValue, 8> CastedElts;
4089 SDLoc SL(N);
4090 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4091 SDValue Elt = Src.getOperand(I);
4092 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4093 }
4094
4095 return DAG.getBuildVector(DestVT, SL, CastedElts);
4096 }
4097 }
4098 }
4099
4100 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4101 break;
4102
4103 // Fold bitcasts of constants.
4104 //
4105 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4106 // TODO: Generalize and move to DAGCombiner
4107 SDValue Src = N->getOperand(0);
4108 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4109 SDLoc SL(N);
4110 uint64_t CVal = C->getZExtValue();
4111 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4112 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4113 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4114 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4115 }
4116
4117 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4118 const APInt &Val = C->getValueAPF().bitcastToAPInt();
4119 SDLoc SL(N);
4120 uint64_t CVal = Val.getZExtValue();
4121 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4122 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4123 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4124
4125 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4126 }
4127
4128 break;
4129 }
4130 case ISD::SHL: {
4131 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4132 break;
4133
4134 return performShlCombine(N, DCI);
4135 }
4136 case ISD::SRL: {
4137 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4138 break;
4139
4140 return performSrlCombine(N, DCI);
4141 }
4142 case ISD::SRA: {
4143 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4144 break;
4145
4146 return performSraCombine(N, DCI);
4147 }
4148 case ISD::TRUNCATE:
4149 return performTruncateCombine(N, DCI);
4150 case ISD::MUL:
4151 return performMulCombine(N, DCI);
4152 case ISD::SMUL_LOHI:
4153 case ISD::UMUL_LOHI:
4154 return performMulLoHiCombine(N, DCI);
4155 case ISD::MULHS:
4156 return performMulhsCombine(N, DCI);
4157 case ISD::MULHU:
4158 return performMulhuCombine(N, DCI);
4159 case AMDGPUISD::MUL_I24:
4160 case AMDGPUISD::MUL_U24:
4161 case AMDGPUISD::MULHI_I24:
4162 case AMDGPUISD::MULHI_U24:
4163 return simplifyMul24(N, DCI);
4164 case ISD::SELECT:
4165 return performSelectCombine(N, DCI);
4166 case ISD::FNEG:
4167 return performFNegCombine(N, DCI);
4168 case ISD::FABS:
4169 return performFAbsCombine(N, DCI);
4170 case AMDGPUISD::BFE_I32:
4171 case AMDGPUISD::BFE_U32: {
4172 assert(!N->getValueType(0).isVector() &&
4173 "Vector handling of BFE not implemented");
4174 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4175 if (!Width)
4176 break;
4177
4178 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4179 if (WidthVal == 0)
4180 return DAG.getConstant(0, DL, MVT::i32);
4181
4182 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4183 if (!Offset)
4184 break;
4185
4186 SDValue BitsFrom = N->getOperand(0);
4187 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4188
4189 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4190
4191 if (OffsetVal == 0) {
4192 // This is already sign / zero extended, so try to fold away extra BFEs.
4193 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4194
4195 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4196 if (OpSignBits >= SignBits)
4197 return BitsFrom;
4198
4199 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4200 if (Signed) {
4201 // This is a sign_extend_inreg. Replace it to take advantage of existing
4202 // DAG Combines. If not eliminated, we will match back to BFE during
4203 // selection.
4204
4205 // TODO: The sext_inreg of extended types ends up here, although we could
4206 // handle them in a single BFE.
4207 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4208 DAG.getValueType(SmallVT));
4209 }
4210
4211 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4212 }
4213
4214 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4215 if (Signed) {
4216 return constantFoldBFE<int32_t>(DAG,
4217 CVal->getSExtValue(),
4218 OffsetVal,
4219 WidthVal,
4220 DL);
4221 }
4222
4223 return constantFoldBFE<uint32_t>(DAG,
4224 CVal->getZExtValue(),
4225 OffsetVal,
4226 WidthVal,
4227 DL);
4228 }
4229
4230 if ((OffsetVal + WidthVal) >= 32 &&
4231 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4232 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4233 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4234 BitsFrom, ShiftVal);
4235 }
4236
4237 if (BitsFrom.hasOneUse()) {
4238 APInt Demanded = APInt::getBitsSet(32,
4239 OffsetVal,
4240 OffsetVal + WidthVal);
4241
4242 KnownBits Known;
4243 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4244 !DCI.isBeforeLegalizeOps());
4245 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4246 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4247 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4248 DCI.CommitTargetLoweringOpt(TLO);
4249 }
4250 }
4251
4252 break;
4253 }
4254 case ISD::LOAD:
4255 return performLoadCombine(N, DCI);
4256 case ISD::STORE:
4257 return performStoreCombine(N, DCI);
4258 case AMDGPUISD::RCP:
4259 case AMDGPUISD::RCP_IFLAG:
4260 return performRcpCombine(N, DCI);
4261 case ISD::AssertZext:
4262 case ISD::AssertSext:
4263 return performAssertSZExtCombine(N, DCI);
4264 case ISD::INTRINSIC_WO_CHAIN:
4265 return performIntrinsicWOChainCombine(N, DCI);
4266 }
4267 return SDValue();
4268}
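The ISD::BITCAST case near the top of PerformDAGCombine splits a 64-bit constant into 32-bit halves with Lo_32/Hi_32 (defined in MathExtras.h further below). A hypothetical standalone model of that split, with lo32/hi32 standing in for the LLVM helpers:

#include <cstdint>
#include <cstdio>

static constexpr uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
static constexpr uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }

int main() {
  // v2i32 (bitcast i64 0x1122334455667788) -> build_vector (0x55667788, 0x11223344)
  const uint64_t K = 0x1122334455667788ULL;
  std::printf("lo=0x%08x hi=0x%08x\n", lo32(K), hi32(K));
  return 0;
}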
4269
4270//===----------------------------------------------------------------------===//
4271// Helper functions
4272//===----------------------------------------------------------------------===//
4273
4274SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4275 const TargetRegisterClass *RC,
4276 Register Reg, EVT VT,
4277 const SDLoc &SL,
4278 bool RawReg) const {
4279 MachineFunction &MF = DAG.getMachineFunction();
4280 MachineRegisterInfo &MRI = MF.getRegInfo();
4281 Register VReg;
4282
4283 if (!MRI.isLiveIn(Reg)) {
4284 VReg = MRI.createVirtualRegister(RC);
4285 MRI.addLiveIn(Reg, VReg);
4286 } else {
4287 VReg = MRI.getLiveInVirtReg(Reg);
4288 }
4289
4290 if (RawReg)
4291 return DAG.getRegister(VReg, VT);
4292
4293 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4294}
4295
4296// This may be called multiple times, and nothing prevents creating multiple
4297// objects at the same offset. See if we already defined this object.
4298static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4299 int64_t Offset) {
4300 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4301 if (MFI.getObjectOffset(I) == Offset) {
4302 assert(MFI.getObjectSize(I) == Size);
4303 return I;
4304 }
4305 }
4306
4307 return MFI.CreateFixedObject(Size, Offset, true);
4308}
4309
4310SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4311 EVT VT,
4312 const SDLoc &SL,
4313 int64_t Offset) const {
4314 MachineFunction &MF = DAG.getMachineFunction();
4315 MachineFrameInfo &MFI = MF.getFrameInfo();
4316 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4317
4318 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4319 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4320
4321 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4322 MachineMemOperand::MODereferenceable |
4323 MachineMemOperand::MOInvariant);
4324}
4325
4326SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4327 const SDLoc &SL,
4328 SDValue Chain,
4329 SDValue ArgVal,
4330 int64_t Offset) const {
4331 MachineFunction &MF = DAG.getMachineFunction();
4332 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4333 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4334
4335 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4336 // Stores to the argument stack area are relative to the stack pointer.
4337 SDValue SP =
4338 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4339 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4340 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4341 MachineMemOperand::MODereferenceable);
4342 return Store;
4343}
4344
4345SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4346 const TargetRegisterClass *RC,
4347 EVT VT, const SDLoc &SL,
4348 const ArgDescriptor &Arg) const {
4349 assert(Arg && "Attempting to load missing argument");
(1) Assuming the condition is true
(2) '?' condition is true
4350
4351 SDValue V = Arg.isRegister() ?
(3) '?' condition is true
4352 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4353 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4354
4355 if (!Arg.isMasked())
(4) Taking false branch
4356 return V;
4357
4358 unsigned Mask = Arg.getMask();
4359 unsigned Shift = countTrailingZeros<unsigned>(Mask);
(5) Calling 'countTrailingZeros<unsigned int>'
(12) Returning from 'countTrailingZeros<unsigned int>'
(13) 'Shift' initialized to 32
4360 V = DAG.getNode(ISD::SRL, SL, VT, V,
4361 DAG.getShiftAmountConstant(Shift, VT, SL));
4362 return DAG.getNode(ISD::AND, SL, VT, V,
4363 DAG.getConstant(Mask >> Shift, SL, VT));
(14) The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4364}
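The path notes above show the defect: when Arg.getMask() is 0, countTrailingZeros (see MathExtras.h below) returns the type width 32, and Mask >> Shift then shifts a 32-bit unsigned value by 32, which is undefined. A minimal sketch of a guarded extraction, assuming a masked argument is never expected to carry an empty mask (this is an illustration, not the upstream fix); __builtin_ctz is the Clang/GCC builtin the header below ultimately calls.

#include <cassert>
#include <cstdint>

// Hypothetical guarded version of the masked-argument extraction above: shift
// the loaded value down to the mask's lowest set bit, then apply the shifted
// mask. Rejecting Mask == 0 keeps Shift < 32 and avoids the shift-by-32
// undefined behavior the analyzer reports at line 4363.
static uint32_t extractMaskedArg(uint32_t V, uint32_t Mask) {
  assert(Mask != 0 && "masked argument must have a non-empty mask");
  const unsigned Shift = static_cast<unsigned>(__builtin_ctz(Mask)); // defined: Mask != 0
  return (V >> Shift) & (Mask >> Shift);
}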
4365
4366uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4367 const MachineFunction &MF, const ImplicitParameter Param) const {
4368 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4369 const AMDGPUSubtarget &ST =
4370 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4371 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4372 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4373 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4374 ExplicitArgOffset;
4375 switch (Param) {
4376 case FIRST_IMPLICIT:
4377 return ArgOffset;
4378 case PRIVATE_BASE:
4379 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
4380 case SHARED_BASE:
4381 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
4382 case QUEUE_PTR:
4383 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
4384 }
4385 llvm_unreachable("unexpected implicit parameter type");
4386}
4387
4388 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4389
4390 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4391 switch ((AMDGPUISD::NodeType)Opcode) {
4392 case AMDGPUISD::FIRST_NUMBER: break;
4393 // AMDIL DAG nodes
4394 NODE_NAME_CASE(UMUL)
4395 NODE_NAME_CASE(BRANCH_COND)
4396
4397 // AMDGPU DAG nodes
4398 NODE_NAME_CASE(IF)
4399 NODE_NAME_CASE(ELSE)
4400 NODE_NAME_CASE(LOOP)
4401 NODE_NAME_CASE(CALL)
4402 NODE_NAME_CASE(TC_RETURN)
4403 NODE_NAME_CASE(TRAP)
4404 NODE_NAME_CASE(RET_FLAG)
4405 NODE_NAME_CASE(RETURN_TO_EPILOG)
4406 NODE_NAME_CASE(ENDPGM)
4407 NODE_NAME_CASE(DWORDADDR)
4408 NODE_NAME_CASE(FRACT)
4409 NODE_NAME_CASE(SETCC)
4410 NODE_NAME_CASE(SETREG)
4411 NODE_NAME_CASE(DENORM_MODE)
4412 NODE_NAME_CASE(FMA_W_CHAIN)
4413 NODE_NAME_CASE(FMUL_W_CHAIN)
4414 NODE_NAME_CASE(CLAMP)
4415 NODE_NAME_CASE(COS_HW)
4416 NODE_NAME_CASE(SIN_HW)
4417 NODE_NAME_CASE(FMAX_LEGACY)
4418 NODE_NAME_CASE(FMIN_LEGACY)
4419 NODE_NAME_CASE(FMAX3)
4420 NODE_NAME_CASE(SMAX3)
4421 NODE_NAME_CASE(UMAX3)
4422 NODE_NAME_CASE(FMIN3)
4423 NODE_NAME_CASE(SMIN3)
4424 NODE_NAME_CASE(UMIN3)
4425 NODE_NAME_CASE(FMED3)
4426 NODE_NAME_CASE(SMED3)
4427 NODE_NAME_CASE(UMED3)
4428 NODE_NAME_CASE(FDOT2)
4429 NODE_NAME_CASE(URECIP)
4430 NODE_NAME_CASE(DIV_SCALE)
4431 NODE_NAME_CASE(DIV_FMAS)
4432 NODE_NAME_CASE(DIV_FIXUP)
4433 NODE_NAME_CASE(FMAD_FTZ)
4434 NODE_NAME_CASE(RCP)
4435 NODE_NAME_CASE(RSQ)
4436 NODE_NAME_CASE(RCP_LEGACY)
4437 NODE_NAME_CASE(RCP_IFLAG)
4438 NODE_NAME_CASE(FMUL_LEGACY)
4439 NODE_NAME_CASE(RSQ_CLAMP)
4440 NODE_NAME_CASE(LDEXP)
4441 NODE_NAME_CASE(FP_CLASS)
4442 NODE_NAME_CASE(DOT4)
4443 NODE_NAME_CASE(CARRY)
4444 NODE_NAME_CASE(BORROW)
4445 NODE_NAME_CASE(BFE_U32)
4446 NODE_NAME_CASE(BFE_I32)
4447 NODE_NAME_CASE(BFI)
4448 NODE_NAME_CASE(BFM)
4449 NODE_NAME_CASE(FFBH_U32)
4450 NODE_NAME_CASE(FFBH_I32)
4451 NODE_NAME_CASE(FFBL_B32)
4452 NODE_NAME_CASE(MUL_U24)
4453 NODE_NAME_CASE(MUL_I24)
4454 NODE_NAME_CASE(MULHI_U24)
4455 NODE_NAME_CASE(MULHI_I24)
4456 NODE_NAME_CASE(MAD_U24)
4457 NODE_NAME_CASE(MAD_I24)
4458 NODE_NAME_CASE(MAD_I64_I32)
4459 NODE_NAME_CASE(MAD_U64_U32)
4460 NODE_NAME_CASE(PERM)
4461 NODE_NAME_CASE(TEXTURE_FETCH)
4462 NODE_NAME_CASE(R600_EXPORT)
4463 NODE_NAME_CASE(CONST_ADDRESS)
4464 NODE_NAME_CASE(REGISTER_LOAD)
4465 NODE_NAME_CASE(REGISTER_STORE)
4466 NODE_NAME_CASE(SAMPLE)
4467 NODE_NAME_CASE(SAMPLEB)
4468 NODE_NAME_CASE(SAMPLED)
4469 NODE_NAME_CASE(SAMPLEL)
4470 NODE_NAME_CASE(CVT_F32_UBYTE0)
4471 NODE_NAME_CASE(CVT_F32_UBYTE1)
4472 NODE_NAME_CASE(CVT_F32_UBYTE2)
4473 NODE_NAME_CASE(CVT_F32_UBYTE3)
4474 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4475 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4476 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4477 NODE_NAME_CASE(CVT_PK_I16_I32)
4478 NODE_NAME_CASE(CVT_PK_U16_U32)
4479 NODE_NAME_CASE(FP_TO_FP16)
4480 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4481 NODE_NAME_CASE(CONST_DATA_PTR)
4482 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4483 NODE_NAME_CASE(LDS)
4484 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
4485 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
4486 NODE_NAME_CASE(DUMMY_CHAIN)
4487 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4488 NODE_NAME_CASE(LOAD_D16_HI)
4489 NODE_NAME_CASE(LOAD_D16_LO)
4490 NODE_NAME_CASE(LOAD_D16_HI_I8)
4491 NODE_NAME_CASE(LOAD_D16_HI_U8)
4492 NODE_NAME_CASE(LOAD_D16_LO_I8)
4493 NODE_NAME_CASE(LOAD_D16_LO_U8)
4494 NODE_NAME_CASE(STORE_MSKOR)
4495 NODE_NAME_CASE(LOAD_CONSTANT)
4496 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4497 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4498 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4499 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4500 NODE_NAME_CASE(DS_ORDERED_COUNT)
4501 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4502 NODE_NAME_CASE(ATOMIC_INC)
4503 NODE_NAME_CASE(ATOMIC_DEC)
4504 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4505 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4506 NODE_NAME_CASE(BUFFER_LOAD)
4507 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4508 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4509 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4510 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4511 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4512 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4513 NODE_NAME_CASE(SBUFFER_LOAD)
4514 NODE_NAME_CASE(BUFFER_STORE)
4515 NODE_NAME_CASE(BUFFER_STORE_BYTE)
4516 NODE_NAME_CASE(BUFFER_STORE_SHORT)
4517 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4518 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4519 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4520 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4521 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4522 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4523 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4524 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4525 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4526 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4527 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4528 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4529 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4530 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4531 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4532 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4533 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4534 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4535 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4536
4537 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4538 }
4539 return nullptr;
4540}
4541
4542SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4543 SelectionDAG &DAG, int Enabled,
4544 int &RefinementSteps,
4545 bool &UseOneConstNR,
4546 bool Reciprocal) const {
4547 EVT VT = Operand.getValueType();
4548
4549 if (VT == MVT::f32) {
4550 RefinementSteps = 0;
4551 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4552 }
4553
4554 // TODO: There is also f64 rsq instruction, but the documentation is less
4555 // clear on its precision.
4556
4557 return SDValue();
4558}
4559
4560SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4561 SelectionDAG &DAG, int Enabled,
4562 int &RefinementSteps) const {
4563 EVT VT = Operand.getValueType();
4564
4565 if (VT == MVT::f32) {
4566 // Reciprocal, < 1 ulp error.
4567 //
4568 // This reciprocal approximation converges to < 0.5 ulp error with one
4569 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4570
4571 RefinementSteps = 0;
4572 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4573 }
4574
4575 // TODO: There is also f64 rcp instruction, but the documentation is less
4576 // clear on its precision.
4577
4578 return SDValue();
4579}
4580
4581void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4582 const SDValue Op, KnownBits &Known,
4583 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4584
4585 Known.resetAll(); // Don't know anything.
4586
4587 unsigned Opc = Op.getOpcode();
4588
4589 switch (Opc) {
4590 default:
4591 break;
4592 case AMDGPUISD::CARRY:
4593 case AMDGPUISD::BORROW: {
4594 Known.Zero = APInt::getHighBitsSet(32, 31);
4595 break;
4596 }
4597
4598 case AMDGPUISD::BFE_I32:
4599 case AMDGPUISD::BFE_U32: {
4600 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4601 if (!CWidth)
4602 return;
4603
4604 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4605
4606 if (Opc == AMDGPUISD::BFE_U32)
4607 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4608
4609 break;
4610 }
4611 case AMDGPUISD::FP_TO_FP16: {
4612 unsigned BitWidth = Known.getBitWidth();
4613
4614 // High bits are zero.
4615 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4616 break;
4617 }
4618 case AMDGPUISD::MUL_U24:
4619 case AMDGPUISD::MUL_I24: {
4620 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4621 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4622 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4623 RHSKnown.countMinTrailingZeros();
4624 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4625 // Skip extra check if all bits are known zeros.
4626 if (TrailZ >= 32)
4627 break;
4628
4629 // Truncate to 24 bits.
4630 LHSKnown = LHSKnown.trunc(24);
4631 RHSKnown = RHSKnown.trunc(24);
4632
4633 if (Opc == AMDGPUISD::MUL_I24) {
4634 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4635 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4636 unsigned MaxValBits = LHSValBits + RHSValBits;
4637 if (MaxValBits > 32)
4638 break;
4639 unsigned SignBits = 32 - MaxValBits + 1;
4640 bool LHSNegative = LHSKnown.isNegative();
4641 bool LHSNonNegative = LHSKnown.isNonNegative();
4642 bool LHSPositive = LHSKnown.isStrictlyPositive();
4643 bool RHSNegative = RHSKnown.isNegative();
4644 bool RHSNonNegative = RHSKnown.isNonNegative();
4645 bool RHSPositive = RHSKnown.isStrictlyPositive();
4646
4647 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4648 Known.Zero.setHighBits(SignBits);
4649 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4650 Known.One.setHighBits(SignBits);
4651 } else {
4652 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4653 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4654 unsigned MaxValBits = LHSValBits + RHSValBits;
4655 if (MaxValBits >= 32)
4656 break;
4657 Known.Zero.setBitsFrom(MaxValBits);
4658 }
4659 break;
4660 }
4661 case AMDGPUISD::PERM: {
4662 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4663 if (!CMask)
4664 return;
4665
4666 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4667 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4668 unsigned Sel = CMask->getZExtValue();
4669
4670 for (unsigned I = 0; I < 32; I += 8) {
4671 unsigned SelBits = Sel & 0xff;
4672 if (SelBits < 4) {
4673 SelBits *= 8;
4674 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4675 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4676 } else if (SelBits < 7) {
4677 SelBits = (SelBits & 3) * 8;
4678 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4679 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4680 } else if (SelBits == 0x0c) {
4681 Known.Zero |= 0xFFull << I;
4682 } else if (SelBits > 0x0c) {
4683 Known.One |= 0xFFull << I;
4684 }
4685 Sel >>= 8;
4686 }
4687 break;
4688 }
4689 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4690 Known.Zero.setHighBits(24);
4691 break;
4692 }
4693 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4694 Known.Zero.setHighBits(16);
4695 break;
4696 }
4697 case AMDGPUISD::LDS: {
4698 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4699 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4700
4701 Known.Zero.setHighBits(16);
4702 Known.Zero.setLowBits(Log2(Alignment));
4703 break;
4704 }
4705 case ISD::INTRINSIC_WO_CHAIN: {
4706 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4707 switch (IID) {
4708 case Intrinsic::amdgcn_mbcnt_lo:
4709 case Intrinsic::amdgcn_mbcnt_hi: {
4710 const GCNSubtarget &ST =
4711 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4712 // These return at most the wavefront size - 1.
4713 unsigned Size = Op.getValueType().getSizeInBits();
4714 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4715 break;
4716 }
4717 default:
4718 break;
4719 }
4720 }
4721 }
4722}
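The PERM case above derives per-byte knowledge from the selector constant: selector bytes 0-3 copy a byte of the second operand, 4-6 copy a byte of the first, 0x0c is a known zero byte, values above 0x0c are a known 0xff byte, and selectors 7 through 0x0b contribute nothing. A hypothetical standalone model of that derivation, assuming both operands are fully known constants (the real code works on KnownBits, not plain integers):

#include <cstdint>
#include <utility>

// Returns {KnownZero, KnownOne} for a PERM of two fully known 32-bit values,
// mirroring the per-byte cases handled by the known-bits code above.
static std::pair<uint32_t, uint32_t> permKnownBits(uint32_t Src0, uint32_t Src1,
                                                   uint32_t Sel) {
  uint32_t Zero = 0, One = 0;
  for (unsigned I = 0; I < 32; I += 8, Sel >>= 8) {
    const unsigned S = Sel & 0xff;
    if (S < 4) {                       // bytes 0-3: select from the second operand
      const uint32_t B = (Src1 >> (S * 8)) & 0xff;
      One |= B << I;
      Zero |= (~B & 0xff) << I;
    } else if (S < 7) {                // bytes 4-6: select from the first operand
      const uint32_t B = (Src0 >> ((S & 3) * 8)) & 0xff;
      One |= B << I;
      Zero |= (~B & 0xff) << I;
    } else if (S == 0x0c) {            // constant zero byte
      Zero |= 0xffu << I;
    } else if (S > 0x0c) {             // constant 0xff byte
      One |= 0xffu << I;
    }                                  // selectors 7..0x0b: nothing known
  }
  return {Zero, One};
}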
4723
4724unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4725 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4726 unsigned Depth) const {
4727 switch (Op.getOpcode()) {
4728 case AMDGPUISD::BFE_I32: {
4729 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4730 if (!Width)
4731 return 1;
4732
4733 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4734 if (!isNullConstant(Op.getOperand(1)))
4735 return SignBits;
4736
4737 // TODO: Could probably figure something out with non-0 offsets.
4738 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4739 return std::max(SignBits, Op0SignBits);
4740 }
4741
4742 case AMDGPUISD::BFE_U32: {
4743 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4744 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4745 }
4746
4747 case AMDGPUISD::CARRY:
4748 case AMDGPUISD::BORROW:
4749 return 31;
4750 case AMDGPUISD::BUFFER_LOAD_BYTE:
4751 return 25;
4752 case AMDGPUISD::BUFFER_LOAD_SHORT:
4753 return 17;
4754 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4755 return 24;
4756 case AMDGPUISD::BUFFER_LOAD_USHORT:
4757 return 16;
4758 case AMDGPUISD::FP_TO_FP16:
4759 return 16;
4760 default:
4761 return 1;
4762 }
4763}
4764
4765unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4766 GISelKnownBits &Analysis, Register R,
4767 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4768 unsigned Depth) const {
4769 const MachineInstr *MI = MRI.getVRegDef(R);
4770 if (!MI)
4771 return 1;
4772
4773 // TODO: Check range metadata on MMO.
4774 switch (MI->getOpcode()) {
4775 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4776 return 25;
4777 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4778 return 17;
4779 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4780 return 24;
4781 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4782 return 16;
4783 default:
4784 return 1;
4785 }
4786}
4787
4788bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4789 const SelectionDAG &DAG,
4790 bool SNaN,
4791 unsigned Depth) const {
4792 unsigned Opcode = Op.getOpcode();
4793 switch (Opcode) {
4794 case AMDGPUISD::FMIN_LEGACY:
4795 case AMDGPUISD::FMAX_LEGACY: {
4796 if (SNaN)
4797 return true;
4798
4799 // TODO: Can check no nans on one of the operands for each one, but which
4800 // one?
4801 return false;
4802 }
4803 case AMDGPUISD::FMUL_LEGACY:
4804 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4805 if (SNaN)
4806 return true;
4807 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4808 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4809 }
4810 case AMDGPUISD::FMED3:
4811 case AMDGPUISD::FMIN3:
4812 case AMDGPUISD::FMAX3:
4813 case AMDGPUISD::FMAD_FTZ: {
4814 if (SNaN)
4815 return true;
4816 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4817 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4818 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4819 }
4820 case AMDGPUISD::CVT_F32_UBYTE0:
4821 case AMDGPUISD::CVT_F32_UBYTE1:
4822 case AMDGPUISD::CVT_F32_UBYTE2:
4823 case AMDGPUISD::CVT_F32_UBYTE3:
4824 return true;
4825
4826 case AMDGPUISD::RCP:
4827 case AMDGPUISD::RSQ:
4828 case AMDGPUISD::RCP_LEGACY:
4829 case AMDGPUISD::RSQ_CLAMP: {
4830 if (SNaN)
4831 return true;
4832
4833 // TODO: Need is known positive check.
4834 return false;
4835 }
4836 case AMDGPUISD::LDEXP:
4837 case AMDGPUISD::FRACT: {
4838 if (SNaN)
4839 return true;
4840 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4841 }
4842 case AMDGPUISD::DIV_SCALE:
4843 case AMDGPUISD::DIV_FMAS:
4844 case AMDGPUISD::DIV_FIXUP:
4845 // TODO: Refine on operands.
4846 return SNaN;
4847 case AMDGPUISD::SIN_HW:
4848 case AMDGPUISD::COS_HW: {
4849 // TODO: Need check for infinity
4850 return SNaN;
4851 }
4852 case ISD::INTRINSIC_WO_CHAIN: {
4853 unsigned IntrinsicID
4854 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4855 // TODO: Handle more intrinsics
4856 switch (IntrinsicID) {
4857 case Intrinsic::amdgcn_cubeid:
4858 return true;
4859
4860 case Intrinsic::amdgcn_frexp_mant: {
4861 if (SNaN)
4862 return true;
4863 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4864 }
4865 case Intrinsic::amdgcn_cvt_pkrtz: {
4866 if (SNaN)
4867 return true;
4868 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4869 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4870 }
4871 case Intrinsic::amdgcn_rcp:
4872 case Intrinsic::amdgcn_rsq:
4873 case Intrinsic::amdgcn_rcp_legacy:
4874 case Intrinsic::amdgcn_rsq_legacy:
4875 case Intrinsic::amdgcn_rsq_clamp: {
4876 if (SNaN)
4877 return true;
4878
4879 // TODO: Need is known positive check.
4880 return false;
4881 }
4882 case Intrinsic::amdgcn_trig_preop:
4883 case Intrinsic::amdgcn_fdot2:
4884 // TODO: Refine on operand
4885 return SNaN;
4886 case Intrinsic::amdgcn_fma_legacy:
4887 if (SNaN)
4888 return true;
4889 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4890 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4891 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4892 default:
4893 return false;
4894 }
4895 }
4896 default:
4897 return false;
4898 }
4899}
4900
4901TargetLowering::AtomicExpansionKind
4902AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4903 switch (RMW->getOperation()) {
4904 case AtomicRMWInst::Nand:
4905 case AtomicRMWInst::FAdd:
4906 case AtomicRMWInst::FSub:
4907 return AtomicExpansionKind::CmpXChg;
4908 default:
4909 return AtomicExpansionKind::None;
4910 }
4911}
4912
4913bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
4914 unsigned Opc, LLT Ty1, LLT Ty2) const {
4915 return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
4916 Ty2 == LLT::scalar(32);
4917}

/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
(6.1) 'ZB' is not equal to ZB_Undefined
(7) Assuming 'Val' is equal to 0
(8) Taking true branch
117 return 32;
(9) Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
(6) Calling 'TrailingZerosCounter::count'
(10) Returning from 'TrailingZerosCounter::count'
(11) Returning the value 32
161}
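Under the default ZB_Width behavior a zero input reports the full bit width of the type, which is exactly what turns Shift into 32 in the report above. A small standalone check of that contract (using the Clang/GCC builtin directly, as an illustration only):

#include <cassert>
#include <limits>

int main() {
  const unsigned Mask = 0;
  const unsigned Width = std::numeric_limits<unsigned>::digits; // 32 here
  // ZB_Width-style contract: report the full width for a zero input.
  const unsigned Shift = Mask ? static_cast<unsigned>(__builtin_ctz(Mask)) : Width;
  assert(Shift == Width);
  // (Mask >> Shift) at this point would be the shift-by-32 UB flagged above.
  return 0;
}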
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
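A quick check of the shift-right formulation above, which covers N == 64 without ever evaluating the undefined 1 << 64 (a standalone copy for illustration, not the header itself):

#include <cassert>
#include <cstdint>

static uint64_t maxUIntNRef(uint64_t N) { return UINT64_MAX >> (64 - N); }

int main() {
  assert(maxUIntNRef(1) == 1);
  assert(maxUIntNRef(8) == 0xFF);
  assert(maxUIntNRef(64) == UINT64_MAX);
  return 0;
}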
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Return true if the argument contains a non-empty sequence of ones with the
575/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
576/// If true, \p MaskIdx will specify the index of the lowest set bit and \p
577/// MaskLen is updated to specify the length of the mask, else neither are
578/// updated.
579inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx,
580 unsigned &MaskLen) {
581 if (!isShiftedMask_32(Value))
582 return false;
583 MaskIdx = countTrailingZeros(Value);
584 MaskLen = countPopulation(Value);
585 return true;
586}
587
588/// Return true if the argument contains a non-empty sequence of ones with the
589/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index
590/// of the lowest set bit and \p MaskLen is updated to specify the length of the
591/// mask, else neither are updated.
592inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx,
593 unsigned &MaskLen) {
594 if (!isShiftedMask_64(Value))
595 return false;
596 MaskIdx = countTrailingZeros(Value);
597 MaskLen = countPopulation(Value);
598 return true;
599}
600
601/// Compile time Log2.
602/// Valid only for positive powers of two.
603template <size_t kValue> constexpr inline size_t CTLog2() {
604 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
605 "Value is not a valid power of 2");
606 return 1 + CTLog2<kValue / 2>();
607}
608
609template <> constexpr inline size_t CTLog2<1>() { return 0; }
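// Usage sketch (hypothetical compile-time checks, not part of the original
// header): the recursion peels one factor of two per step until it reaches
// the CTLog2<1>() base case.
static_assert(CTLog2<1>() == 0, "base case");
static_assert(CTLog2<64>() == 6, "six halvings from 64 down to 1");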
610
611/// Return the log base 2 of the specified value.
612inline double Log2(double Value) {
613#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
614 return __builtin_log(Value) / __builtin_log(2.0);
615#else
616 return log2(Value);
617#endif
618}
619
620/// Return the floor log base 2 of the specified value, -1 if the value is zero.
621/// (32 bit edition.)
622/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
623inline unsigned Log2_32(uint32_t Value) {
624 return 31 - countLeadingZeros(Value);
625}
626
627/// Return the floor log base 2 of the specified value, -1 if the value is zero.
628/// (64 bit edition.)
629inline unsigned Log2_64(uint64_t Value) {
630 return 63 - countLeadingZeros(Value);
631}
632
633/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
634/// (32 bit edition).
635/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
636inline unsigned Log2_32_Ceil(uint32_t Value) {
637 return 32 - countLeadingZeros(Value - 1);
638}
639
640/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
641/// (64 bit edition.)
642inline unsigned Log2_64_Ceil(uint64_t Value) {
643 return 64 - countLeadingZeros(Value - 1);
644}
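// Usage sketch (hypothetical exampleLog2 helper, not part of the original
// header): the floor forms subtract the leading-zero count from the top bit
// index, while the ceil forms count from Value - 1.
inline void exampleLog2() {
  assert(Log2_32(32u) == 5u && Log2_32(6u) == 2u);            // floor
  assert(Log2_64(uint64_t(1) << 40) == 40u);
  assert(Log2_32_Ceil(32u) == 5u && Log2_32_Ceil(33u) == 6u); // ceil
}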
645
646/// Return the greatest common divisor of the values using Euclid's algorithm.
647template <typename T>
648inline T greatestCommonDivisor(T A, T B) {
649 while (B) {
650 T Tmp = B;
651 B = A % B;
652 A = Tmp;
653 }
654 return A;
655}
656
657inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
658 return greatestCommonDivisor<uint64_t>(A, B);
659}
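// Usage sketch (hypothetical exampleGcd helper, not part of the original
// header): Euclid's algorithm, e.g. gcd(48, 36): 48 % 36 == 12 and
// 36 % 12 == 0, so the answer is 12.
inline void exampleGcd() {
  assert(greatestCommonDivisor<unsigned>(48u, 36u) == 12u);
  assert(GreatestCommonDivisor64(0u, 42u) == 42u); // gcd(0, n) == n
}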
660
661/// This function takes a 64-bit integer and returns the bit equivalent double.
662inline double BitsToDouble(uint64_t Bits) {
663 double D;
664 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
665 memcpy(&D, &Bits, sizeof(Bits));
666 return D;
667}
668
669/// This function takes a 32-bit integer and returns the bit equivalent float.
670inline float BitsToFloat(uint32_t Bits) {
671 float F;
672 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
673 memcpy(&F, &Bits, sizeof(Bits));
674 return F;
675}
676
677/// This function takes a double and returns the bit equivalent 64-bit integer.
678/// Note that copying doubles around changes the bits of NaNs on some hosts,
679/// notably x86, so this routine cannot be used if these bits are needed.
680inline uint64_t DoubleToBits(double Double) {
681 uint64_t Bits;
682 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
683 memcpy(&Bits, &Double, sizeof(Double));
684 return Bits;
685}
686
687/// This function takes a float and returns the bit equivalent 32-bit integer.
688/// Note that copying floats around changes the bits of NaNs on some hosts,
689/// notably x86, so this routine cannot be used if these bits are needed.
690inline uint32_t FloatToBits(float Float) {
691 uint32_t Bits;
692 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
693 memcpy(&Bits, &Float, sizeof(Float));
694 return Bits;
695}
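// Usage sketch (hypothetical exampleBitCasts helper, not part of the original
// header): the memcpy-based casts round-trip ordinary values; 1.0f is the
// IEEE-754 pattern 0x3F800000.
inline void exampleBitCasts() {
  assert(FloatToBits(1.0f) == 0x3F800000u);
  assert(BitsToFloat(0x3F800000u) == 1.0f);
  assert(DoubleToBits(BitsToDouble(0x4000000000000000ULL)) ==
         0x4000000000000000ULL); // 2.0
}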
696
697/// A and B are either alignments or offsets. Return the minimum alignment that
698/// may be assumed after adding the two together.
699constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
700 // The largest power of 2 that divides both A and B.
701 //
702 // Replace "-Value" by "1+~Value" in the following commented code to avoid
703 // MSVC warning C4146
704 // return (A | B) & -(A | B);
705 return (A | B) & (1 + ~(A | B));
706}
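// Usage sketch (hypothetical compile-time checks, not part of the original
// header): (A | B) & -(A | B) isolates the lowest set bit of A | B, i.e. the
// largest power of two dividing both values.
static_assert(MinAlign(16, 24) == 8, "common power-of-two factor");
static_assert(MinAlign(32, 32) == 32, "equal alignments are preserved");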
707
708/// Returns the next power of two (in 64-bits) that is strictly greater than A.
709/// Returns zero on overflow.
710constexpr inline uint64_t NextPowerOf2(uint64_t A) {
711 A |= (A >> 1);
712 A |= (A >> 2);
713 A |= (A >> 4);
714 A |= (A >> 8);
715 A |= (A >> 16);
716 A |= (A >> 32);
717 return A + 1;
718}
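// Usage sketch (hypothetical compile-time checks, not part of the original
// header): the shifts smear the top set bit downward, so adding one yields a
// result strictly greater than the argument.
static_assert(NextPowerOf2(0) == 1, "zero maps to one");
static_assert(NextPowerOf2(5) == 8, "rounds up to the next power of two");
static_assert(NextPowerOf2(8) == 16, "strictly greater, even for powers of two");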
719
720/// Returns the power of two which is less than or equal to the given value.
721/// Essentially, it is a floor operation across the domain of powers of two.
722inline uint64_t PowerOf2Floor(uint64_t A) {
723 if (!A) return 0;
724 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
725}
726
727/// Returns the power of two which is greater than or equal to the given value.
728/// Essentially, it is a ceil operation across the domain of powers of two.
729inline uint64_t PowerOf2Ceil(uint64_t A) {
730 if (!A)
731 return 0;
732 return NextPowerOf2(A - 1);
733}
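// Usage sketch (hypothetical examplePowerOf2Rounding helper, not part of the
// original header): floor keeps only the top set bit, ceil bumps to the next
// power of two unless the value already is one.
inline void examplePowerOf2Rounding() {
  assert(PowerOf2Floor(20u) == 16u);
  assert(PowerOf2Ceil(20u) == 32u);
  assert(PowerOf2Ceil(16u) == 16u); // already a power of two
}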
734
735/// Returns the next integer (mod 2**64) that is greater than or equal to
736/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
737///
738/// If non-zero \p Skew is specified, the return value will be a minimal
739/// integer that is greater than or equal to \p Value and equal to
740/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
741/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
742///
743/// Examples:
744/// \code
745/// alignTo(5, 8) = 8
746/// alignTo(17, 8) = 24
747/// alignTo(~0LL, 8) = 0
748/// alignTo(321, 255) = 510
749///
750/// alignTo(5, 8, 7) = 7
751/// alignTo(17, 8, 1) = 17
752/// alignTo(~0LL, 8, 3) = 3
753/// alignTo(321, 255, 42) = 552
754/// \endcode
755inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
756 assert(Align != 0u && "Align can't be 0.");
757 Skew %= Align;
758 return (Value + Align - 1 - Skew) / Align * Align + Skew;
759}
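// Usage sketch (hypothetical exampleAlignTo helper, not part of the original
// header): reproduces the documented examples, including the skewed form
// Align * N + Skew.
inline void exampleAlignTo() {
  assert(alignTo(5, 8) == 8 && alignTo(17, 8) == 24);
  assert(alignTo(5, 8, 7) == 7);        // 8 * 0 + 7 already covers 5
  assert(alignTo(321, 255, 42) == 552); // 255 * 2 + 42
}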
760
761/// Returns the next integer (mod 2**64) that is greater than or equal to
762/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
763template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
764 static_assert(Align != 0u, "Align must be non-zero");
765 return (Value + Align - 1) / Align * Align;
766}
767
768/// Returns the integer ceil(Numerator / Denominator).
769inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
770 return alignTo(Numerator, Denominator) / Denominator;
771}
772
773/// Returns the integer nearest(Numerator / Denominator).
774inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
775 return (Numerator + (Denominator / 2)) / Denominator;
776}
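// Usage sketch (hypothetical exampleDivide helper, not part of the original
// header): ceil goes through alignTo, nearest adds half the denominator
// before truncating.
inline void exampleDivide() {
  assert(divideCeil(7, 2) == 4);
  assert(divideNearest(7, 2) == 4); // 3.5 rounds up
  assert(divideNearest(5, 3) == 2); // 1.67 rounds to 2
}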
777
778/// Returns the largest uint64_t less than or equal to \p Value and is
779/// \p Skew mod \p Align. \p Align must be non-zero
780inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
781 assert(Align != 0u && "Align can't be 0.");
782 Skew %= Align;
783 return (Value - Skew) / Align * Align + Skew;
784}
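// Usage sketch (hypothetical exampleAlignDown helper, not part of the
// original header): the mirror image of alignTo, rounding downward.
inline void exampleAlignDown() {
  assert(alignDown(17, 8) == 16);
  assert(alignDown(17, 8, 1) == 17); // 8 * 2 + 1
}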
785
786/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
787/// Requires 0 < B <= 32.
788template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
789 static_assert(B > 0, "Bit width can't be 0.");
790 static_assert(B <= 32, "Bit width out of range.");
791 return int32_t(X << (32 - B)) >> (32 - B);
792}
793
794/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
795/// Requires 0 < B <= 32.
796inline int32_t SignExtend32(uint32_t X, unsigned B) {
797 assert(B > 0 && "Bit width can't be 0.");
798 assert(B <= 32 && "Bit width out of range.");
799 return int32_t(X << (32 - B)) >> (32 - B);
800}
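// Usage sketch (hypothetical exampleSignExtend32 helper, not part of the
// original header): the left shift places bit B-1 at the sign position and
// the arithmetic right shift replicates it downward.
inline void exampleSignExtend32() {
  assert(SignExtend32<8>(0xFFu) == -1);
  assert(SignExtend32(0x80u, 8) == -128);
  assert(SignExtend32(0x7Fu, 8) == 127); // sign bit clear, value unchanged
}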
801
802/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
803/// Requires 0 < B <= 64.
804template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
805 static_assert(B > 0, "Bit width can't be 0.");
806 static_assert(B <= 64, "Bit width out of range.");
807 return int64_t(x << (64 - B)) >> (64 - B);
808}
809
810/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
811/// Requires 0 < B <= 64.
812inline int64_t SignExtend64(uint64_t X, unsigned B) {
813 assert(B > 0 && "Bit width can't be 0.");
814 assert(B <= 64 && "Bit width out of range.");
815 return int64_t(X << (64 - B)) >> (64 - B);
816}
817
818/// Subtract two unsigned integers, X and Y, of type T and return the absolute
819/// value of the result.
820template <typename T>
821std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
822 return X > Y ? (X - Y) : (Y - X);
823}
824
825/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
826/// maximum representable value of T on overflow. ResultOverflowed indicates if
827/// the result is larger than the maximum representable value of type T.
828template <typename T>
829std::enable_if_t<std::is_unsigned<T>::value, T>
830SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
831 bool Dummy;
832 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
833 // Hacker's Delight, p. 29
834 T Z = X + Y;
835 Overflowed = (Z < X || Z < Y);
836 if (Overflowed)
837 return std::numeric_limits<T>::max();
838 else
839 return Z;
840}
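// Usage sketch (hypothetical exampleSaturatingAdd helper, not part of the
// original header): a wrapped sum is smaller than either operand, which is
// exactly the overflow test above.
inline void exampleSaturatingAdd() {
  bool Overflow = false;
  assert(SaturatingAdd<uint8_t>(200, 100, &Overflow) == 255 && Overflow);
  assert(SaturatingAdd<uint8_t>(100, 100, &Overflow) == 200 && !Overflow);
}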
841
842/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
843/// maximum representable value of T on overflow. ResultOverflowed indicates if
844/// the result is larger than the maximum representable value of type T.
845template <typename T>
846std::enable_if_t<std::is_unsigned<T>::value, T>
847SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
848 bool Dummy;
849 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
850
851 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
852 // because it fails for uint16_t (where multiplication can have undefined
853 // behavior due to promotion to int), and requires a division in addition
854 // to the multiplication.
855
856 Overflowed = false;
857
858 // Log2(Z) would be either Log2Z or Log2Z + 1.
859 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
860 // will necessarily be less than Log2Max as desired.
861 int Log2Z = Log2_64(X) + Log2_64(Y);
862 const T Max = std::numeric_limits<T>::max();
863 int Log2Max = Log2_64(Max);
864 if (Log2Z < Log2Max) {
865 return X * Y;
866 }
867 if (Log2Z > Log2Max) {
868 Overflowed = true;
869 return Max;
870 }
871
872 // We're going to use the top bit, and maybe overflow one
873 // bit past it. Multiply all but the bottom bit then add
874 // that on at the end.
875 T Z = (X >> 1) * Y;
876 if (Z & ~(Max >> 1)) {
877 Overflowed = true;
878 return Max;
879 }
880 Z <<= 1;
881 if (X & 1)
882 return SaturatingAdd(Z, Y, ResultOverflowed);
883
884 return Z;
885}
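// Usage sketch (hypothetical exampleSaturatingMultiply helper, not part of
// the original header): the Log2 comparison settles the clear cases; only
// the borderline case falls through to the shifted multiply.
inline void exampleSaturatingMultiply() {
  bool Overflow = false;
  assert(SaturatingMultiply<uint16_t>(300, 300, &Overflow) == 65535 &&
         Overflow);
  assert(SaturatingMultiply<uint16_t>(250, 250, &Overflow) == 62500 &&
         !Overflow);
}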
886
887/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
888/// the product. Clamp the result to the maximum representable value of T on
889/// overflow. ResultOverflowed indicates if the result is larger than the
890/// maximum representable value of type T.
891template <typename T>
892std::enable_if_t<std::is_unsigned<T>::value, T>
893SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
894 bool Dummy;
895 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
896
897 T Product = SaturatingMultiply(X, Y, &Overflowed);
898 if (Overflowed)
899 return Product;
900
901 return SaturatingAdd(A, Product, &Overflowed);
902}
903
904/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
905extern const float huge_valf;
906
907
908/// Add two signed integers, computing the two's complement truncated result,
909/// returning true if overflow occurred.
910template <typename T>
911std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
912#if __has_builtin(__builtin_add_overflow)
913 return __builtin_add_overflow(X, Y, &Result);
914#else
915 // Perform the unsigned addition.
916 using U = std::make_unsigned_t<T>;
917 const U UX = static_cast<U>(X);
918 const U UY = static_cast<U>(Y);
919 const U UResult = UX + UY;
920
921 // Convert to signed.
922 Result = static_cast<T>(UResult);
923
924 // Adding two positive numbers should result in a positive number.
925 if (X > 0 && Y > 0)
926 return Result <= 0;
927 // Adding two negatives should result in a negative number.
928 if (X < 0 && Y < 0)
929 return Result >= 0;
930 return false;
931#endif
932}
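// Usage sketch (hypothetical exampleAddOverflow helper, not part of the
// original header): the return value reports overflow; Result still receives
// the truncated two's complement sum on typical hosts.
inline void exampleAddOverflow() {
  int32_t R = 0;
  assert(!AddOverflow<int32_t>(2, 3, R) && R == 5);
  assert(AddOverflow<int32_t>(std::numeric_limits<int32_t>::max(), 1, R));
}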
933
934/// Subtract two signed integers, computing the two's complement truncated
935/// result, returning true if an overflow occurred.
936template <typename T>
937std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
938#if __has_builtin(__builtin_sub_overflow)
939 return __builtin_sub_overflow(X, Y, &Result);
940#else
941 // Perform the unsigned subtraction.
942 using U = std::make_unsigned_t<T>;
943 const U UX = static_cast<U>(X);
944 const U UY = static_cast<U>(Y);
945 const U UResult = UX - UY;
946
947 // Convert to signed.
948 Result = static_cast<T>(UResult);
949
950 // Subtracting a positive number from a negative results in a negative number.
951 if (X <= 0 && Y > 0)
952 return Result >= 0;
953 // Subtracting a negative number from a positive results in a positive number.
954 if (X >= 0 && Y < 0)
955 return Result <= 0;
956 return false;
957#endif
958}
959
960/// Multiply two signed integers, computing the two's complement truncated
961/// result, returning true if an overflow occurred.
962template <typename T>
963std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
964 // Perform the unsigned multiplication on absolute values.
965 using U = std::make_unsigned_t<T>;
966 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
967 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
968 const U UResult = UX * UY;
969
970 // Convert to signed.
971 const bool IsNegative = (X < 0) ^ (Y < 0);
972 Result = IsNegative ? (0 - UResult) : UResult;
973
974 // If any of the args was 0, result is 0 and no overflow occurs.
975 if (UX == 0 || UY == 0)
976 return false;
977
978 // UX and UY are in [1, 2^n], where n is the number of digits.
979 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
980 // positive) divided by an argument compares to the other.
981 if (IsNegative)
982 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
983 else
984 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
985}
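// Usage sketch (hypothetical exampleMulOverflow helper, not part of the
// original header): the sign of the product picks which magnitude bound the
// division check compares against.
inline void exampleMulOverflow() {
  int16_t R = 0;
  assert(!MulOverflow<int16_t>(-181, 181, R) && R == -32761);
  assert(MulOverflow<int16_t>(182, 182, R)); // 33124 does not fit in int16_t
}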
986
987} // End llvm namespace
988
989#endif