1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
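// For example (illustrative, following the logic below): a v4i8 store
// (32 bits) maps to i32, a v3f32 store (96 bits) maps to v3i32, and a
// v3i16 store (48 bits) is left unchanged since it is neither <= 32 bits
// nor a multiple of 32 bits.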
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
50
54
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
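// For example (illustrative): a value produced by sign-extending an i16 has
// at most 16 significant bits and therefore fits, while an arbitrary i32
// value reports 32 significant bits and does not.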
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather than generating calls to memset, memcpy, or memmove.
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
75 setOperationAction(ISD::LOAD, MVT::f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
78 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
81 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
84 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
87 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
90 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
93 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
96 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
99 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
117 setOperationAction(ISD::LOAD, MVT::i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
120 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
123 setOperationAction(ISD::LOAD, MVT::f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
126 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
129 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
132 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
135 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
138 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
141 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
144 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
153 setOperationAction(ISD::LOAD, MVT::i128, Promote);
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
157 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
160 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
163 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
166 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
169 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
172 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
175 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
178 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
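// Illustrative sketch of the resulting expansion:
//   (i64 (zextload i8)) -> roughly (i64 (zext (i32 (zextload i8))))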
183 for (MVT VT : MVT::integer_valuetypes())
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
240 setOperationAction(ISD::STORE, MVT::f32, Promote);
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
243 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
246 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
249 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
252 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
255 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
258 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
261 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
264 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
267 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
270 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
273 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
276 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
279 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
282 setOperationAction(ISD::STORE, MVT::i64, Promote);
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
285 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
288 setOperationAction(ISD::STORE, MVT::f64, Promote);
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
291 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
294 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
297 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
300 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
303 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
306 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
309 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
312 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
315 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
318 setOperationAction(ISD::STORE, MVT::i128, Promote);
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
371 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
372 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
373
374 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
375 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
376 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
377
378 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
379 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
380 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
381
382 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
383 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
384 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
385
386 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
387 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
388 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
389 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
390 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
393
394 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
395 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
396
397 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
398
399 // For R600, this is totally unsupported, just custom lower to produce an
400 // error.
401 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
402
403 // Library functions. These default to Expand, but we have instructions
404 // for them.
405 setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
406 ISD::FROUNDEVEN, ISD::FTRUNC},
407 {MVT::f16, MVT::f32}, Legal);
408 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
409
410 setOperationAction(ISD::FLOG2, MVT::f32, Custom);
411 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
412 setOperationAction({ISD::LROUND, ISD::LLROUND},
413 {MVT::f16, MVT::f32, MVT::f64}, Expand);
414
416 {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
417 Custom);
418
419 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
420
421 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
422
423 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
424 Expand);
425
426 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
427
428 if (Subtarget->has16BitInsts()) {
429 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
430 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
431 } else {
432 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
433 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
434 }
435
436 setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
437 Custom);
438
439 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
440 if (Subtarget->has16BitInsts()) {
442 }
443
444 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
445 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
446 // default unless marked custom/legal.
448 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
449 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
450 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
451 MVT::v16f64},
452 Custom);
453
454 if (isTypeLegal(MVT::f16))
456 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
457 Custom);
458
459 // Expand to fneg + fadd.
461
463 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
464 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
465 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
466 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
467 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
468 Custom);
469
472 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
473 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
474 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
475 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
476 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
477 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
478 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
479 Custom);
480
481 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
482 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
483
484 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
485 for (MVT VT : ScalarIntVTs) {
486 // These should use [SU]DIVREM, so set them to expand
488 Expand);
489
490 // GPU does not have divrem function for signed or unsigned.
492
493 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
495
497
498 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
500 }
501
502 // The hardware supports 32-bit FSHR, but not FSHL.
504
505 // The hardware supports 32-bit ROTR, but not ROTL.
506 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
508
510
514 MVT::i64, Custom);
516
518 Legal);
519
522 MVT::i64, Custom);
523
524 for (auto VT : {MVT::i8, MVT::i16})
526
527 static const MVT::SimpleValueType VectorIntTypes[] = {
528 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
529 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
530
531 for (MVT VT : VectorIntTypes) {
532 // Expand the following operations for the current type by default.
544 ISD::SETCC, ISD::ADDRSPACECAST},
545 VT, Expand);
546 }
547
548 static const MVT::SimpleValueType FloatVectorTypes[] = {
549 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
550 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
551
552 for (MVT VT : FloatVectorTypes) {
554 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
555 ISD::FADD, ISD::FCEIL, ISD::FCOS,
556 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
557 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
558 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
559 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
560 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
561 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
562 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
564 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
565 VT, Expand);
566 }
567
568 // This causes using an unrolled select operation rather than expansion with
569 // bit operations. This is in general better, but the alternative using BFI
570 // instructions may be better if the select sources are SGPRs.
572 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
573
575 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
576
578 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
579
581 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
582
584 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
585
587 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
588
590 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
591
593 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
594
596 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
597
599 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
600
602 setJumpIsExpensive(true);
603
606
608
609 // We want to find all load dependencies for long chains of stores to enable
610 // merging into very wide vectors. The problem is with vectors with > 4
611 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
612 // vectors are a legal type, even though we have to split the loads
613 // usually. When we can more precisely specify load legality per address
614 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
615 // smarter so that they can figure out what to do in 2 iterations without all
616 // N > 4 stores on the same chain.
618
619 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
620 // about these during lowering.
621 MaxStoresPerMemcpy = 0xffffffff;
622 MaxStoresPerMemmove = 0xffffffff;
623 MaxStoresPerMemset = 0xffffffff;
624
625 // The expansion for 64-bit division is enormous.
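// A note on the mechanism: addBypassSlowDiv(64, 32) asks the generic
// slow-division bypass (run from CodeGenPrepare) to guard each i64 divide
// with a runtime check; when both operands actually fit in 32 bits, a 32-bit
// divide is executed instead of the full 64-bit expansion.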
627 addBypassSlowDiv(64, 32);
628
629 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
635 ISD::STORE, ISD::FADD,
636 ISD::FSUB, ISD::FNEG,
637 ISD::FABS, ISD::AssertZext,
639
643}
644
646 if (getTargetMachine().Options.NoSignedZerosFPMath)
647 return true;
648
649 const auto Flags = Op.getNode()->getFlags();
650 if (Flags.hasNoSignedZeros())
651 return true;
652
653 return false;
654}
655
656//===----------------------------------------------------------------------===//
657// Target Information
658//===----------------------------------------------------------------------===//
659
661static bool fnegFoldsIntoOpcode(unsigned Opc) {
662 switch (Opc) {
663 case ISD::FADD:
664 case ISD::FSUB:
665 case ISD::FMUL:
666 case ISD::FMA:
667 case ISD::FMAD:
668 case ISD::FMINNUM:
669 case ISD::FMAXNUM:
670 case ISD::FMINNUM_IEEE:
671 case ISD::FMAXNUM_IEEE:
672 case ISD::FMINIMUM:
673 case ISD::FMAXIMUM:
674 case ISD::FMINIMUMNUM:
675 case ISD::FMAXIMUMNUM:
676 case ISD::SELECT:
677 case ISD::FSIN:
678 case ISD::FTRUNC:
679 case ISD::FRINT:
680 case ISD::FNEARBYINT:
681 case ISD::FROUNDEVEN:
683 case AMDGPUISD::RCP:
690 case AMDGPUISD::FMED3:
691 // TODO: handle llvm.amdgcn.fma.legacy
692 return true;
693 case ISD::BITCAST:
694 llvm_unreachable("bitcast is special cased");
695 default:
696 return false;
697 }
698}
699
700static bool fnegFoldsIntoOp(const SDNode *N) {
701 unsigned Opc = N->getOpcode();
702 if (Opc == ISD::BITCAST) {
703 // TODO: Is there a benefit to checking the conditions performFNegCombine
704 // does? We don't for the other cases.
705 SDValue BCSrc = N->getOperand(0);
706 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
707 return BCSrc.getNumOperands() == 2 &&
708 BCSrc.getOperand(1).getValueSizeInBits() == 32;
709 }
710
711 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
712 }
713
714 return fnegFoldsIntoOpcode(Opc);
715}
716
717/// \returns true if the operation will definitely need to use a 64-bit
718/// encoding, and thus will use a VOP3 encoding regardless of the source
719/// modifiers.
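/// For example, FMA and FMAD take three source operands and so always need
/// the 64-bit VOP3 encoding, while a two-operand f32 FADD can use a 32-bit
/// VOP2 encoding and only needs VOP3 when source modifiers are required; any
/// f64 operation is likewise VOP3-only.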
721static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
722 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
723 VT == MVT::f64;
724}
725
726/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
727/// given type when used with ISD::SELECT.
729static bool selectSupportsSourceMods(const SDNode *N) {
730 // TODO: Only applies if select will be vector
731 return N->getValueType(0) == MVT::f32;
732}
733
734// Most FP instructions support source modifiers, but this could be refined
735// slightly.
737static bool hasSourceMods(const SDNode *N) {
738 if (isa<MemSDNode>(N))
739 return false;
740
741 switch (N->getOpcode()) {
742 case ISD::CopyToReg:
743 case ISD::FDIV:
744 case ISD::FREM:
745 case ISD::INLINEASM:
746 case ISD::INLINEASM_BR:
749
750 // TODO: Should really be looking at the users of the bitcast. These are
751 // problematic because bitcasts are used to legalize all stores to integer
752 // types.
753 case ISD::BITCAST:
754 return false;
756 switch (N->getConstantOperandVal(0)) {
757 case Intrinsic::amdgcn_interp_p1:
758 case Intrinsic::amdgcn_interp_p2:
759 case Intrinsic::amdgcn_interp_mov:
760 case Intrinsic::amdgcn_interp_p1_f16:
761 case Intrinsic::amdgcn_interp_p2_f16:
762 return false;
763 default:
764 return true;
765 }
766 }
767 case ISD::SELECT:
769 default:
770 return true;
771 }
772}
773
775 unsigned CostThreshold) {
776 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
777 // it is truly free to use a source modifier in all cases. If there are
778 // multiple users, and a source modifier would force each of them into a VOP3
779 // encoding, there will be a code size increase. Try to avoid increasing code
780 // size unless we know it will save on the instruction count.
781 unsigned NumMayIncreaseSize = 0;
782 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
783
784 assert(!N->use_empty());
785
786 // XXX - Should this limit number of uses to check?
787 for (const SDNode *U : N->users()) {
788 if (!hasSourceMods(U))
789 return false;
790
791 if (!opMustUseVOP3Encoding(U, VT)) {
792 if (++NumMayIncreaseSize > CostThreshold)
793 return false;
794 }
795 }
796
797 return true;
798}
799
801 ISD::NodeType ExtendKind) const {
802 assert(!VT.isVector() && "only scalar expected");
803
804 // Round to the next multiple of 32-bits.
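// e.g. i1/i8/i16 are widened to i32, i48 is rounded up to i64, and i96 is
// already a multiple of 32 bits and is returned unchanged.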
805 unsigned Size = VT.getSizeInBits();
806 if (Size <= 32)
807 return MVT::i32;
808 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
809}
810
812 return 32;
813}
814
816 return true;
817}
818
819// The backend supports 32- and 64-bit floating-point immediates.
820// FIXME: Why are we reporting vectors of FP immediates as legal?
822 bool ForCodeSize) const {
823 EVT ScalarVT = VT.getScalarType();
824 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
825 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
826}
827
828// We don't want to shrink f64 / f32 constants.
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
899 return true;
900}
901
903 return true;
904}
905
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
912 unsigned IntrID = N->getConstantOperandVal(0);
914 }
916 unsigned IntrID = N->getConstantOperandVal(1);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
967
968 // Packed operations do not have a fabs modifier.
969 return VT == MVT::f32 || VT == MVT::f64 ||
970 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
971}
972
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
999 // Truncate is just accessing a subregister.
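// e.g. truncating i64 to i32 only reads the low 32-bit subregister and is
// free, while truncating i32 to i16 is not considered free here.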
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1005}
1006
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize == 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1032 // this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 || Dest == MVT::i64;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ADD:
1045 case ISD::SUB:
1046 case ISD::SHL:
1047 case ISD::SRL:
1048 case ISD::SRA:
1049 case ISD::AND:
1050 case ISD::OR:
1051 case ISD::XOR:
1052 case ISD::MUL:
1053 case ISD::SETCC:
1054 case ISD::SELECT:
1055 case ISD::SMIN:
1056 case ISD::SMAX:
1057 case ISD::UMIN:
1058 case ISD::UMAX:
1059 if (Subtarget->has16BitInsts() &&
1060 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1061 // Don't narrow back down to i16 if promoted to i32 already.
1062 if (!N->isDivergent() && DestVT.isInteger() &&
1063 DestVT.getScalarSizeInBits() > 1 &&
1064 DestVT.getScalarSizeInBits() <= 16 &&
1065 SrcVT.getScalarSizeInBits() > 16) {
1066 return false;
1067 }
1068 }
1069 return true;
1070 default:
1071 break;
1072 }
1073
1074 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1075 // limited number of native 64-bit operations. Shrinking an operation to fit
1076 // in a single 32-bit register should always be helpful. As currently used,
1077 // this is much less general than the name suggests, and is only used in
1078 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1079 // not profitable, and may actually be harmful.
1080 if (isa<LoadSDNode>(N))
1081 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1082
1083 return true;
1084}
1085
1087 const SDNode* N, CombineLevel Level) const {
1088 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1089 N->getOpcode() == ISD::SRL) &&
1090 "Expected shift op");
1091
1092 SDValue ShiftLHS = N->getOperand(0);
1093 if (!ShiftLHS->hasOneUse())
1094 return false;
1095
1096 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1097 !ShiftLHS.getOperand(0)->hasOneUse())
1098 return false;
1099
1100 // Always commute pre-type legalization and right shifts.
1101 // We're looking for shl(or(x,y),z) patterns.
1103 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1104 return true;
1105
1106 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1107 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1108 (N->user_begin()->getOpcode() == ISD::SRA ||
1109 N->user_begin()->getOpcode() == ISD::SRL))
1110 return false;
1111
1112 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1113 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1114 if (LHS.getOpcode() != ISD::SHL)
1115 return false;
1116 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1117 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1118 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1119 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1120 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1121 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1122 };
1123 SDValue LHS = N->getOperand(0).getOperand(0);
1124 SDValue RHS = N->getOperand(0).getOperand(1);
1125 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1126}
1127
1128//===---------------------------------------------------------------------===//
1129// TargetLowering Callbacks
1130//===---------------------------------------------------------------------===//
1131
1133 bool IsVarArg) {
1134 switch (CC) {
1142 return CC_AMDGPU;
1145 return CC_AMDGPU_CS_CHAIN;
1146 case CallingConv::C:
1147 case CallingConv::Fast:
1148 case CallingConv::Cold:
1149 return CC_AMDGPU_Func;
1152 return CC_SI_Gfx;
1155 default:
1156 reportFatalUsageError("unsupported calling convention for call");
1157 }
1158}
1159
1161 bool IsVarArg) {
1162 switch (CC) {
1165 llvm_unreachable("kernels should not be handled here");
1175 return RetCC_SI_Shader;
1178 return RetCC_SI_Gfx;
1179 case CallingConv::C:
1180 case CallingConv::Fast:
1181 case CallingConv::Cold:
1182 return RetCC_AMDGPU_Func;
1183 default:
1184 reportFatalUsageError("unsupported calling convention");
1185 }
1186}
1187
1188/// The SelectionDAGBuilder will automatically promote function arguments
1189/// with illegal types. However, this does not work for the AMDGPU targets
1190/// since the function arguments are stored in memory as these illegal types.
1191/// In order to handle this properly we need to get the original types sizes
1192/// from the LLVM IR Function and fix up the ISD::InputArg values before
1193/// passing them to AnalyzeFormalArguments()
1194
1195/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1196/// input values across multiple registers. Each item in the Ins array
1197/// represents a single value that will be stored in registers. Ins[x].VT is
1198/// the value type of the value that will be stored in the register, so
1199/// whatever SDNode we lower the argument to needs to be this type.
1200///
1201/// In order to correctly lower the arguments we need to know the size of each
1202/// argument. Since Ins[x].VT gives us the size of the register that will
1203/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1204/// for the original function argument so that we can deduce the correct memory
1205/// type to use for Ins[x]. In most cases the correct memory type will be
1206/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1207/// we have a kernel argument of type v8i8, this argument will be split into
1208/// 8 parts and each part will be represented by its own item in the Ins array.
1209/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1210/// the argument before it was split. From this, we deduce that the memory type
1211/// for each individual part is i8. We pass the memory type as LocVT to the
1212/// calling convention analysis function and the register type (Ins[x].VT) as
1213/// the ValVT.
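/// As a worked instance of the v8i8 example above (illustrative): each of the
/// 8 parts is added as a location with LocVT i8, so the in-memory offset
/// advances by MemVT.getStoreSize() == 1 byte per part, while the register
/// type from getRegisterTypeForCallingConv remains the ValVT.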
1215 CCState &State,
1216 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1217 const MachineFunction &MF = State.getMachineFunction();
1218 const Function &Fn = MF.getFunction();
1219 LLVMContext &Ctx = Fn.getParent()->getContext();
1220 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1221 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1223
1224 Align MaxAlign = Align(1);
1225 uint64_t ExplicitArgOffset = 0;
1226 const DataLayout &DL = Fn.getDataLayout();
1227
1228 unsigned InIndex = 0;
1229
1230 for (const Argument &Arg : Fn.args()) {
1231 const bool IsByRef = Arg.hasByRefAttr();
1232 Type *BaseArgTy = Arg.getType();
1233 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1234 Align Alignment = DL.getValueOrABITypeAlignment(
1235 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1236 MaxAlign = std::max(Alignment, MaxAlign);
1237 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1238
1239 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1240 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1241
1242 // We're basically throwing away everything passed into us and starting over
1243 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1244 // to us as computed in Ins.
1245 //
1246 // We also need to figure out what type legalization is trying to do to get
1247 // the correct memory offsets.
1248
1249 SmallVector<EVT, 16> ValueVTs;
1251 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1252
1253 for (unsigned Value = 0, NumValues = ValueVTs.size();
1254 Value != NumValues; ++Value) {
1255 uint64_t BasePartOffset = Offsets[Value];
1256
1257 EVT ArgVT = ValueVTs[Value];
1258 EVT MemVT = ArgVT;
1259 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1260 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1261
1262 if (NumRegs == 1) {
1263 // This argument is not split, so the IR type is the memory type.
1264 if (ArgVT.isExtended()) {
1265 // We have an extended type, like i24, so we should just use the
1266 // register type.
1267 MemVT = RegisterVT;
1268 } else {
1269 MemVT = ArgVT;
1270 }
1271 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1272 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1273 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1274 // We have a vector value which has been split into a vector with
1275 // the same scalar type, but fewer elements. This should handle
1276 // all the floating-point vector types.
1277 MemVT = RegisterVT;
1278 } else if (ArgVT.isVector() &&
1279 ArgVT.getVectorNumElements() == NumRegs) {
1280 // This arg has been split so that each element is stored in a separate
1281 // register.
1282 MemVT = ArgVT.getScalarType();
1283 } else if (ArgVT.isExtended()) {
1284 // We have an extended type, like i65.
1285 MemVT = RegisterVT;
1286 } else {
1287 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1288 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1289 if (RegisterVT.isInteger()) {
1290 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1291 } else if (RegisterVT.isVector()) {
1292 assert(!RegisterVT.getScalarType().isFloatingPoint());
1293 unsigned NumElements = RegisterVT.getVectorNumElements();
1294 assert(MemoryBits % NumElements == 0);
1295 // This vector type has been split into another vector type with
1296 // a different elements size.
1297 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1298 MemoryBits / NumElements);
1299 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1300 } else {
1301 llvm_unreachable("cannot deduce memory type.");
1302 }
1303 }
1304
1305 // Convert one element vectors to scalar.
1306 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1307 MemVT = MemVT.getScalarType();
1308
1309 // Round up vec3/vec5 argument.
1310 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1311 MemVT = MemVT.getPow2VectorType(State.getContext());
1312 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1313 MemVT = MemVT.getRoundIntegerType(State.getContext());
1314 }
1315
1316 unsigned PartOffset = 0;
1317 for (unsigned i = 0; i != NumRegs; ++i) {
1318 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1319 BasePartOffset + PartOffset,
1320 MemVT.getSimpleVT(),
1322 PartOffset += MemVT.getStoreSize();
1323 }
1324 }
1325 }
1326}
1327
1329 SDValue Chain, CallingConv::ID CallConv,
1330 bool isVarArg,
1332 const SmallVectorImpl<SDValue> &OutVals,
1333 const SDLoc &DL, SelectionDAG &DAG) const {
1334 // FIXME: Fails for r600 tests
1335 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1336 // "wave terminate should not have return values");
1337 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1338}
1339
1340//===---------------------------------------------------------------------===//
1341// Target specific lowering
1342//===---------------------------------------------------------------------===//
1343
1344/// Selects the correct CCAssignFn for a given CallingConvention value.
1349
1354
1356 SelectionDAG &DAG,
1357 MachineFrameInfo &MFI,
1358 int ClobberedFI) const {
1359 SmallVector<SDValue, 8> ArgChains;
1360 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1361 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1362
1363 // Include the original chain at the beginning of the list. When this is
1364 // used by target LowerCall hooks, this helps legalize find the
1365 // CALLSEQ_BEGIN node.
1366 ArgChains.push_back(Chain);
1367
1368 // Add a chain value for each stack argument corresponding
1369 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1370 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1371 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1372 if (FI->getIndex() < 0) {
1373 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1374 int64_t InLastByte = InFirstByte;
1375 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1376
1377 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1378 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1379 ArgChains.push_back(SDValue(L, 1));
1380 }
1381 }
1382 }
1383 }
1384
1385 // Build a tokenfactor for all the chains.
1386 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1387}
1388
1391 StringRef Reason) const {
1392 SDValue Callee = CLI.Callee;
1393 SelectionDAG &DAG = CLI.DAG;
1394
1395 const Function &Fn = DAG.getMachineFunction().getFunction();
1396
1397 StringRef FuncName("<unknown>");
1398
1400 FuncName = G->getSymbol();
1401 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1402 FuncName = G->getGlobal()->getName();
1403
1404 DAG.getContext()->diagnose(
1405 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1406
1407 if (!CLI.IsTailCall) {
1408 for (ISD::InputArg &Arg : CLI.Ins)
1409 InVals.push_back(DAG.getPOISON(Arg.VT));
1410 }
1411
1412 return DAG.getEntryNode();
1413}
1414
1416 SmallVectorImpl<SDValue> &InVals) const {
1417 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1418}
1419
1421 SelectionDAG &DAG) const {
1422 const Function &Fn = DAG.getMachineFunction().getFunction();
1423
1425 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1426 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1427 return DAG.getMergeValues(Ops, SDLoc());
1428}
1429
1431 SelectionDAG &DAG) const {
1432 switch (Op.getOpcode()) {
1433 default:
1434 Op->print(errs(), &DAG);
1435 llvm_unreachable("Custom lowering code for this "
1436 "instruction is not implemented yet!");
1437 break;
1439 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1441 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1442 case ISD::SDIVREM:
1443 return LowerSDIVREM(Op, DAG);
1444 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1445 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1446 case ISD::FRINT: return LowerFRINT(Op, DAG);
1447 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1448 case ISD::FROUNDEVEN:
1449 return LowerFROUNDEVEN(Op, DAG);
1450 case ISD::FROUND: return LowerFROUND(Op, DAG);
1451 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1452 case ISD::FLOG2:
1453 return LowerFLOG2(Op, DAG);
1454 case ISD::FLOG:
1455 case ISD::FLOG10:
1456 return LowerFLOGCommon(Op, DAG);
1457 case ISD::FEXP:
1458 case ISD::FEXP10:
1459 return lowerFEXP(Op, DAG);
1460 case ISD::FEXP2:
1461 return lowerFEXP2(Op, DAG);
1462 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1463 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1464 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1465 case ISD::FP_TO_SINT:
1466 case ISD::FP_TO_UINT:
1467 return LowerFP_TO_INT(Op, DAG);
1468 case ISD::CTTZ:
1470 case ISD::CTLZ:
1472 return LowerCTLZ_CTTZ(Op, DAG);
1473 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1474 }
1475 return Op;
1476}
1477
1480 SelectionDAG &DAG) const {
1481 switch (N->getOpcode()) {
1483 // Different parts of legalization seem to interpret which type of
1484 // sign_extend_inreg is the one to check for custom lowering. The extended
1485 // from type is what really matters, but some places check for custom
1486 // lowering of the result type. This results in trying to use
1487 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1488 // nothing here and let the illegal result integer be handled normally.
1489 return;
1490 case ISD::FLOG2:
1491 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1492 Results.push_back(Lowered);
1493 return;
1494 case ISD::FLOG:
1495 case ISD::FLOG10:
1496 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1497 Results.push_back(Lowered);
1498 return;
1499 case ISD::FEXP2:
1500 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1501 Results.push_back(Lowered);
1502 return;
1503 case ISD::FEXP:
1504 case ISD::FEXP10:
1505 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1506 Results.push_back(Lowered);
1507 return;
1508 case ISD::CTLZ:
1510 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1511 Results.push_back(Lowered);
1512 return;
1513 default:
1514 return;
1515 }
1516}
1517
1519 SDValue Op,
1520 SelectionDAG &DAG) const {
1521
1522 const DataLayout &DL = DAG.getDataLayout();
1524 const GlobalValue *GV = G->getGlobal();
1525
1526 if (!MFI->isModuleEntryFunction()) {
1527 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1528 if (std::optional<uint32_t> Address =
1530 if (IsNamedBarrier) {
1531 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1532 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1533 }
1534 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1535 } else if (IsNamedBarrier) {
1536 llvm_unreachable("named barrier should have an assigned address");
1537 }
1538 }
1539
1540 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1541 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1542 if (!MFI->isModuleEntryFunction() &&
1543 GV->getName() != "llvm.amdgcn.module.lds" &&
1545 SDLoc DL(Op);
1546 const Function &Fn = DAG.getMachineFunction().getFunction();
1548 Fn, "local memory global used by non-kernel function",
1549 DL.getDebugLoc(), DS_Warning));
1550
1551 // We currently don't have a way to correctly allocate LDS objects that
1552 // aren't directly associated with a kernel. We do force inlining of
1553 // functions that use local objects. However, if these dead functions are
1554 // not eliminated, we don't want a compile time error. Just emit a warning
1555 // and a trap, since there should be no callable path here.
1556 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1557 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1558 Trap, DAG.getRoot());
1559 DAG.setRoot(OutputChain);
1560 return DAG.getPOISON(Op.getValueType());
1561 }
1562
1563 // XXX: What does the value of G->getOffset() mean?
1564 assert(G->getOffset() == 0 &&
1565 "Do not know what to do with an non-zero offset");
1566
1567 // TODO: We could emit code to handle the initialization somewhere.
1568 // We ignore the initializer for now and legalize it to allow selection.
1569 // The initializer will anyway get errored out during assembly emission.
1570 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1571 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1572 }
1573 return SDValue();
1574}
1575
1577 SelectionDAG &DAG) const {
1579 SDLoc SL(Op);
1580
1581 EVT VT = Op.getValueType();
1582 if (VT.getVectorElementType().getSizeInBits() < 32) {
1583 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1584 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1585 unsigned NewNumElt = OpBitSize / 32;
1586 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1588 MVT::i32, NewNumElt);
1589 for (const SDUse &U : Op->ops()) {
1590 SDValue In = U.get();
1591 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1592 if (NewNumElt > 1)
1593 DAG.ExtractVectorElements(NewIn, Args);
1594 else
1595 Args.push_back(NewIn);
1596 }
1597
1598 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1599 NewNumElt * Op.getNumOperands());
1600 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1601 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1602 }
1603 }
1604
1605 for (const SDUse &U : Op->ops())
1606 DAG.ExtractVectorElements(U.get(), Args);
1607
1608 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1609}
1610
1612 SelectionDAG &DAG) const {
1613 SDLoc SL(Op);
1615 unsigned Start = Op.getConstantOperandVal(1);
1616 EVT VT = Op.getValueType();
1617 EVT SrcVT = Op.getOperand(0).getValueType();
1618
1619 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1620 unsigned NumElt = VT.getVectorNumElements();
1621 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1622 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1623
1624 // Extract 32-bit registers at a time.
1625 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1626 EVT NewVT = NumElt == 2
1627 ? MVT::i32
1628 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1629 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1630
1631 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1632 if (NumElt == 2)
1633 Tmp = Args[0];
1634 else
1635 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1636
1637 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1638 }
1639
1640 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1642
1643 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1644}
1645
1646// TODO: Handle fabs too
1648 if (Val.getOpcode() == ISD::FNEG)
1649 return Val.getOperand(0);
1650
1651 return Val;
1652}
1653
1655 if (Val.getOpcode() == ISD::FNEG)
1656 Val = Val.getOperand(0);
1657 if (Val.getOpcode() == ISD::FABS)
1658 Val = Val.getOperand(0);
1659 if (Val.getOpcode() == ISD::FCOPYSIGN)
1660 Val = Val.getOperand(0);
1661 return Val;
1662}
1663
1665 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1666 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1667 SelectionDAG &DAG = DCI.DAG;
1668 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1669 switch (CCOpcode) {
1670 case ISD::SETOEQ:
1671 case ISD::SETONE:
1672 case ISD::SETUNE:
1673 case ISD::SETNE:
1674 case ISD::SETUEQ:
1675 case ISD::SETEQ:
1676 case ISD::SETFALSE:
1677 case ISD::SETFALSE2:
1678 case ISD::SETTRUE:
1679 case ISD::SETTRUE2:
1680 case ISD::SETUO:
1681 case ISD::SETO:
1682 break;
1683 case ISD::SETULE:
1684 case ISD::SETULT: {
1685 if (LHS == True)
1686 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1687 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1688 }
1689 case ISD::SETOLE:
1690 case ISD::SETOLT:
1691 case ISD::SETLE:
1692 case ISD::SETLT: {
1693 // Ordered. Assume ordered for undefined.
1694
1695 // Only do this after legalization to avoid interfering with other combines
1696 // which might occur.
1698 !DCI.isCalledByLegalizer())
1699 return SDValue();
1700
1701 // We need to permute the operands to get the correct NaN behavior. The
1702 // selected operand is the second one based on the failing compare with NaN,
1703 // so permute it based on the compare type the hardware uses.
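// For example, (select (setolt x, y), x, y) becomes (fmin_legacy x, y): if
// either input is NaN the compare fails, so both forms produce y.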
1704 if (LHS == True)
1705 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1706 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1707 }
1708 case ISD::SETUGE:
1709 case ISD::SETUGT: {
1710 if (LHS == True)
1711 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1712 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1713 }
1714 case ISD::SETGT:
1715 case ISD::SETGE:
1716 case ISD::SETOGE:
1717 case ISD::SETOGT: {
1719 !DCI.isCalledByLegalizer())
1720 return SDValue();
1721
1722 if (LHS == True)
1723 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1724 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1725 }
1726 case ISD::SETCC_INVALID:
1727 llvm_unreachable("Invalid setcc condcode!");
1728 }
1729 return SDValue();
1730}
1731
1732/// Generate Min/Max node
1734 SDValue LHS, SDValue RHS,
1735 SDValue True, SDValue False,
1736 SDValue CC,
1737 DAGCombinerInfo &DCI) const {
1738 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1739 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1740
1741 SelectionDAG &DAG = DCI.DAG;
1742
1743 // If we can't directly match this, try to see if we can fold an fneg to
1744 // match.
1745
1748 SDValue NegTrue = peekFNeg(True);
1749
1750 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1751 // fmin/fmax.
1752 //
1753 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1754 // -> fneg (fmin_legacy lhs, K)
1755 //
1756 // TODO: Use getNegatedExpression
1757 if (LHS == NegTrue && CFalse && CRHS) {
1758 APFloat NegRHS = neg(CRHS->getValueAPF());
1759 if (NegRHS == CFalse->getValueAPF()) {
1760 SDValue Combined =
1761 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1762 if (Combined)
1763 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1764 return SDValue();
1765 }
1766 }
1767
1768 return SDValue();
1769}
1770
1771std::pair<SDValue, SDValue>
1773 SDLoc SL(Op);
1774
1775 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1776
1777 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1778 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1779
1780 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1781 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1782
1783 return std::pair(Lo, Hi);
1784}
1785
1787 SDLoc SL(Op);
1788
1789 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1790 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1791 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1792}
1793
1795 SDLoc SL(Op);
1796
1797 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1798 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1799 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1800}
1801
1802// Split a vector type into two parts. The first part is a power of two vector.
1803// The second part is whatever is left over, and is a scalar if it would
1804// otherwise be a 1-vector.
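// For example, a v7i32 splits into (v4i32, v3i32), since PowerOf2Ceil((7 + 1) / 2)
// is 4, and a v3f32 splits into (v2f32, f32), with the single leftover element
// returned as a scalar.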
1805std::pair<EVT, EVT>
1807 EVT LoVT, HiVT;
1808 EVT EltVT = VT.getVectorElementType();
1809 unsigned NumElts = VT.getVectorNumElements();
1810 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1811 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1812 HiVT = NumElts - LoNumElts == 1
1813 ? EltVT
1814 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1815 return std::pair(LoVT, HiVT);
1816}
1817
1818// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1819// scalar.
1820std::pair<SDValue, SDValue>
1822 const EVT &LoVT, const EVT &HiVT,
1823 SelectionDAG &DAG) const {
1824 EVT VT = N.getValueType();
1826 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1827 VT.getVectorNumElements() &&
1828 "More vector elements requested than available!");
1830 DAG.getVectorIdxConstant(0, DL));
1831
1832 unsigned LoNumElts = LoVT.getVectorNumElements();
1833
1834 if (HiVT.isVector()) {
1835 unsigned HiNumElts = HiVT.getVectorNumElements();
1836 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1837 // Avoid creating an extract_subvector with an index that isn't a multiple
1838 // of the result type's element count.
1840 DAG.getConstant(LoNumElts, DL, MVT::i32));
1841 return {Lo, Hi};
1842 }
1843
1845 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1846 /*Count=*/HiNumElts);
1847 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1848 return {Lo, Hi};
1849 }
1850
1852 DAG.getVectorIdxConstant(LoNumElts, DL));
1853 return {Lo, Hi};
1854}
1855
1857 SelectionDAG &DAG) const {
1859 EVT VT = Op.getValueType();
1860 SDLoc SL(Op);
1861
1862
1863 // If this is a 2 element vector, we really want to scalarize and not create
1864 // weird 1 element vectors.
1865 if (VT.getVectorNumElements() == 2) {
1866 SDValue Ops[2];
1867 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1868 return DAG.getMergeValues(Ops, SL);
1869 }
1870
1871 SDValue BasePtr = Load->getBasePtr();
1872 EVT MemVT = Load->getMemoryVT();
1873
1874 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1875
1876 EVT LoVT, HiVT;
1877 EVT LoMemVT, HiMemVT;
1878 SDValue Lo, Hi;
1879
1880 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1881 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1882 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1883
1884 unsigned Size = LoMemVT.getStoreSize();
1885 Align BaseAlign = Load->getAlign();
1886 Align HiAlign = commonAlignment(BaseAlign, Size);
1887
1888 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1889 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1890 BaseAlign, Load->getMemOperand()->getFlags());
1891 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1892 SDValue HiLoad =
1893 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1894 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1895 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1896
1897 SDValue Join;
1898 if (LoVT == HiVT) {
1899 // This is the case where the vector length is a power of two, so it was evenly split.
1900 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1901 } else {
1902 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1903 DAG.getVectorIdxConstant(0, SL));
1904 Join = DAG.getNode(
1906 VT, Join, HiLoad,
1908 }
1909
1910 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1911 LoLoad.getValue(1), HiLoad.getValue(1))};
1912
1913 return DAG.getMergeValues(Ops, SL);
1914}
1915
1917 SelectionDAG &DAG) const {
1919 EVT VT = Op.getValueType();
1920 SDValue BasePtr = Load->getBasePtr();
1921 EVT MemVT = Load->getMemoryVT();
1922 SDLoc SL(Op);
1923 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1924 Align BaseAlign = Load->getAlign();
1925 unsigned NumElements = MemVT.getVectorNumElements();
1926
1927 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1928 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1929 if (NumElements != 3 ||
1930 (BaseAlign < Align(8) &&
1931 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1932 return SplitVectorLoad(Op, DAG);
1933
1934 assert(NumElements == 3);
1935
1936 EVT WideVT =
1938 EVT WideMemVT =
1940 SDValue WideLoad = DAG.getExtLoad(
1941 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1942 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1943 return DAG.getMergeValues(
1944 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1945 DAG.getVectorIdxConstant(0, SL)),
1946 WideLoad.getValue(1)},
1947 SL);
1948}
1949
1951 SelectionDAG &DAG) const {
1953 SDValue Val = Store->getValue();
1954 EVT VT = Val.getValueType();
1955
1956 // If this is a 2 element vector, we really want to scalarize and not create
1957 // weird 1 element vectors.
1958 if (VT.getVectorNumElements() == 2)
1959 return scalarizeVectorStore(Store, DAG);
1960
1961 EVT MemVT = Store->getMemoryVT();
1962 SDValue Chain = Store->getChain();
1963 SDValue BasePtr = Store->getBasePtr();
1964 SDLoc SL(Op);
1965
1966 EVT LoVT, HiVT;
1967 EVT LoMemVT, HiMemVT;
1968 SDValue Lo, Hi;
1969
1970 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1971 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1972 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1973
1974 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1975
1976 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1977 Align BaseAlign = Store->getAlign();
1978 unsigned Size = LoMemVT.getStoreSize();
1979 Align HiAlign = commonAlignment(BaseAlign, Size);
1980
1981 SDValue LoStore =
1982 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1983 Store->getMemOperand()->getFlags());
1984 SDValue HiStore =
1985 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1986 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1987
1988 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1989}
1990
1991// This is a shortcut for integer division because we have fast i32<->f32
1992// conversions, and fast f32 reciprocal instructions. The fractional part of a
1993// float is enough to accurately represent up to a 24-bit signed integer.
1995 bool Sign) const {
1996 SDLoc DL(Op);
1997 EVT VT = Op.getValueType();
1998 SDValue LHS = Op.getOperand(0);
1999 SDValue RHS = Op.getOperand(1);
2000 MVT IntVT = MVT::i32;
2001 MVT FltVT = MVT::f32;
2002
2003 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2004 if (LHSSignBits < 9)
2005 return SDValue();
2006
2007 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2008 if (RHSSignBits < 9)
2009 return SDValue();
2010
2011 unsigned BitSize = VT.getSizeInBits();
2012 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2013 unsigned DivBits = BitSize - SignBits;
2014 if (Sign)
2015 ++DivBits;
2016
2019
2020 SDValue jq = DAG.getConstant(1, DL, IntVT);
2021
2022 if (Sign) {
2023 // char|short jq = ia ^ ib;
2024 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2025
2026 // jq = jq >> (bitsize - 2)
2027 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2028 DAG.getConstant(BitSize - 2, DL, VT));
2029
2030 // jq = jq | 0x1
2031 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2032 }
2033
2034 // int ia = (int)LHS;
2035 SDValue ia = LHS;
2036
2037 // int ib = (int)RHS;
2038 SDValue ib = RHS;
2039
2040 // float fa = (float)ia;
2041 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2042
2043 // float fb = (float)ib;
2044 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2045
2046 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2047 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2048
2049 // fq = trunc(fq);
2050 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2051
2052 // float fqneg = -fq;
2053 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2054
2056
2057 bool UseFmadFtz = false;
2058 if (Subtarget->isGCN()) {
2060 UseFmadFtz =
2062 }
2063
2064 // float fr = mad(fqneg, fb, fa);
2065 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2066 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2068 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2069
2070 // int iq = (int)fq;
2071 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2072
2073 // fr = fabs(fr);
2074 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2075
2076 // fb = fabs(fb);
2077 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2078
2079 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2080
2081 // int cv = fr >= fb;
2082 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2083
2084 // jq = (cv ? jq : 0);
2085 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2086
2087 // dst = iq + jq;
2088 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2089
2090 // Rem needs compensation; it's easier to recompute it
2091 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2092 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2093
2094 // Truncate to the number of bits this divide really needs.
2095 if (Sign) {
2096 SDValue InRegSize
2097 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2098 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2099 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2100 } else {
2101 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2102 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2103 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2104 }
2105
2106 return DAG.getMergeValues({ Div, Rem }, DL);
2107}
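
// Illustrative scalar sketch of the 24-bit trick above (signed variant; the
// helper name is invented and <cmath>/<cstdint>/<utility> are assumed). It
// relies on the same precondition checked above: both operands must already be
// known to fit in 24 bits, i.e. have at least 9 sign bits.

#include <cmath>
#include <cstdint>
#include <utility>

static std::pair<int32_t, int32_t> divRem24Sketch(int32_t A, int32_t B) {
  int32_t Jq = ((A ^ B) >> 30) | 1;             // +1/-1, sign of the quotient
  float Fa = (float)A, Fb = (float)B;           // exact for 24-bit values
  float Fq = std::trunc(Fa * (1.0f / Fb));      // 1/Fb stands in for hardware RCP
  float Fr = std::fabs(std::fma(-Fq, Fb, Fa));  // |A - Fq * B|
  int32_t Iq = (int32_t)Fq;
  if (Fr >= std::fabs(Fb))                      // estimate was one short
    Iq += Jq;
  return {Iq, A - Iq * B};                      // recompute the remainder
}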
2108
2110 SelectionDAG &DAG,
2112 SDLoc DL(Op);
2113 EVT VT = Op.getValueType();
2114
2115 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2116
2117 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2118
2119 SDValue One = DAG.getConstant(1, DL, HalfVT);
2120 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2121
2122 // Hi/Lo split
2123 SDValue LHS_Lo, LHS_Hi;
2124 SDValue LHS = Op.getOperand(0);
2125 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2126
2127 SDValue RHS_Lo, RHS_Hi;
2128 SDValue RHS = Op.getOperand(1);
2129 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2130
2131 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2132 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2133
2134 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2135 LHS_Lo, RHS_Lo);
2136
2137 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2138 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2139
2140 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2141 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2142 return;
2143 }
2144
2145 if (isTypeLegal(MVT::i64)) {
2146 // The algorithm here is based on ideas from "Software Integer Division",
2147 // Tom Rodeheffer, August 2008.
2148
2151
2152 // Compute denominator reciprocal.
2153 unsigned FMAD =
2154 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2158
2159 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2160 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2161 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2162 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2163 Cvt_Lo);
2164 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2165 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2166 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2167 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2168 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2169 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2170 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2171 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2172 Mul1);
2173 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2174 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2175 SDValue Rcp64 = DAG.getBitcast(VT,
2176 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2177
2178 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2179 SDValue One64 = DAG.getConstant(1, DL, VT);
2180 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2181 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2182
2183 // First round of UNR (Unsigned integer Newton-Raphson).
2184 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2185 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2186 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2187 SDValue Mulhi1_Lo, Mulhi1_Hi;
2188 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2189 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2190 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2191 Mulhi1_Lo, Zero1);
2192 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2193 Mulhi1_Hi, Add1_Lo.getValue(1));
2194 SDValue Add1 = DAG.getBitcast(VT,
2195 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2196
2197 // Second round of UNR.
2198 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2199 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2200 SDValue Mulhi2_Lo, Mulhi2_Hi;
2201 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2202 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2203 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2204 Mulhi2_Lo, Zero1);
2205 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2206 Mulhi2_Hi, Add2_Lo.getValue(1));
2207 SDValue Add2 = DAG.getBitcast(VT,
2208 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2209
2210 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2211
2212 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2213
2214 SDValue Mul3_Lo, Mul3_Hi;
2215 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2216 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2217 Mul3_Lo, Zero1);
2218 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2219 Mul3_Hi, Sub1_Lo.getValue(1));
2220 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2221 SDValue Sub1 = DAG.getBitcast(VT,
2222 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2223
2224 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2225 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2226 ISD::SETUGE);
2227 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2228 ISD::SETUGE);
2229 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2230
2231 // TODO: Here and below, portions of the code could be enclosed in if/endif.
2232 // Currently the control flow is unconditional and we have 4 selects after the
2233 // potential endif to substitute for PHIs.
2234
2235 // if C3 != 0 ...
2236 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2237 RHS_Lo, Zero1);
2238 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2239 RHS_Hi, Sub1_Lo.getValue(1));
2240 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2241 Zero, Sub2_Lo.getValue(1));
2242 SDValue Sub2 = DAG.getBitcast(VT,
2243 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2244
2245 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2246
2247 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2248 ISD::SETUGE);
2249 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2250 ISD::SETUGE);
2251 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2252
2253 // if (C6 != 0)
2254 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2255
2256 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2257 RHS_Lo, Zero1);
2258 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2259 RHS_Hi, Sub2_Lo.getValue(1));
2260 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2261 Zero, Sub3_Lo.getValue(1));
2262 SDValue Sub3 = DAG.getBitcast(VT,
2263 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2264
2265 // endif C6
2266 // endif C3
2267
2268 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2269 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2270
2271 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2272 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2273
2274 Results.push_back(Div);
2275 Results.push_back(Rem);
2276
2277 return;
2278 }
2279
2280 // r600 expansion.
2281 // Get Speculative values
2282 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2283 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2284
2285 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2286 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2287 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2288
2289 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2290 SDValue DIV_Lo = Zero;
2291
2292 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2293
2294 for (unsigned i = 0; i < halfBitWidth; ++i) {
2295 const unsigned bitPos = halfBitWidth - i - 1;
2296 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2297 // Get value of high bit
2298 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2299 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2300 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2301
2302 // Shift
2303 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2304 // Add LHS high bit
2305 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2306
2307 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2308 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2309
2310 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2311
2312 // Update REM
2313 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2314 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2315 }
2316
2317 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2318 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2319 Results.push_back(DIV);
2320 Results.push_back(REM);
2321}
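
// The bit loop above is restoring division: shift one dividend bit into the
// remainder per iteration and subtract the divisor whenever it fits. A plain
// C++ model of that idea (invented name; it walks all 64 bits rather than
// speculating the high half first as the expansion above does; D must be
// non-zero):

#include <cstdint>

static void udivrem64RestoringSketch(uint64_t N, uint64_t D, uint64_t &Q,
                                     uint64_t &R) {
  Q = 0;
  R = 0;
  for (int I = 63; I >= 0; --I) {
    R = (R << 1) | ((N >> I) & 1); // bring in the next dividend bit
    if (R >= D) {                  // divisor fits: emit a 1 quotient bit
      R -= D;
      Q |= UINT64_C(1) << I;
    }
  }
}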
2322
2324 SelectionDAG &DAG) const {
2325 SDLoc DL(Op);
2326 EVT VT = Op.getValueType();
2327
2328 if (VT == MVT::i64) {
2330 LowerUDIVREM64(Op, DAG, Results);
2331 return DAG.getMergeValues(Results, DL);
2332 }
2333
2334 if (VT == MVT::i32) {
2335 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2336 return Res;
2337 }
2338
2339 SDValue X = Op.getOperand(0);
2340 SDValue Y = Op.getOperand(1);
2341
2342 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2343 // algorithm used here.
2344
2345 // Initial estimate of inv(y).
2346 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2347
2348 // One round of UNR.
2349 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2350 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2351 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2352 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2353
2354 // Quotient/remainder estimate.
2355 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2356 SDValue R =
2357 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2358
2359 // First quotient/remainder refinement.
2360 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2361 SDValue One = DAG.getConstant(1, DL, VT);
2362 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2363 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2364 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2365 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2366 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2367
2368 // Second quotient/remainder refinement.
2369 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2370 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2371 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2372 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2373 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2374
2375 return DAG.getMergeValues({Q, R}, DL);
2376}
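
// Scalar model of the i32 expansion above (invented names; URECIP is modeled
// by an exact, saturated floor(2^32 / Y) estimate, which is at least as
// accurate as the hardware instruction). Y must be non-zero.

#include <cstdint>

static uint32_t mulhiU32Sketch(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32);
}

static void udivrem32Sketch(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  // Initial estimate of 2^32 / Y (saturated for Y == 1).
  uint64_t Est = (UINT64_C(1) << 32) / Y;
  uint32_t Z = Est > 0xffffffffu ? 0xffffffffu : (uint32_t)Est;

  // One round of UNR.
  Z += mulhiU32Sketch(Z, (uint32_t)(0u - Y * Z));

  // Quotient/remainder estimate.
  Q = mulhiU32Sketch(X, Z);
  R = X - Q * Y;

  // Two refinement steps, as in the selects above.
  for (int I = 0; I < 2; ++I) {
    if (R >= Y) {
      ++Q;
      R -= Y;
    }
  }
}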
2377
2379 SelectionDAG &DAG) const {
2380 SDLoc DL(Op);
2381 EVT VT = Op.getValueType();
2382
2383 SDValue LHS = Op.getOperand(0);
2384 SDValue RHS = Op.getOperand(1);
2385
2386 SDValue Zero = DAG.getConstant(0, DL, VT);
2387 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2388
2389 if (VT == MVT::i32) {
2390 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2391 return Res;
2392 }
2393
2394 if (VT == MVT::i64 &&
2395 DAG.ComputeNumSignBits(LHS) > 32 &&
2396 DAG.ComputeNumSignBits(RHS) > 32) {
2397 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2398
2399 // Hi/Lo split
2400 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2401 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2402 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2403 LHS_Lo, RHS_Lo);
2404 SDValue Res[2] = {
2405 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2406 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2407 };
2408 return DAG.getMergeValues(Res, DL);
2409 }
2410
2411 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2412 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2413 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2414 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2415
2416 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2417 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2418
2419 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2420 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2421
2422 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2423 SDValue Rem = Div.getValue(1);
2424
2425 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2426 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2427
2428 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2429 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2430
2431 SDValue Res[2] = {
2432 Div,
2433 Rem
2434 };
2435 return DAG.getMergeValues(Res, DL);
2436}
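
// The sign fixup above in plain C++ (invented name). The conditional negation
// is done on unsigned values, so it cannot overflow:

#include <cstdint>

static void sdivrem32Sketch(int32_t X, int32_t Y, int32_t &Q, int32_t &R) {
  uint32_t XSign = (uint32_t)(X >> 31); // 0 or 0xffffffff
  uint32_t YSign = (uint32_t)(Y >> 31);
  uint32_t QSign = XSign ^ YSign;       // quotient sign mask
  uint32_t RSign = XSign;               // remainder takes the sign of X

  uint32_t AbsX = ((uint32_t)X + XSign) ^ XSign; // conditional negate
  uint32_t AbsY = ((uint32_t)Y + YSign) ^ YSign;

  uint32_t UQ = AbsX / AbsY;
  uint32_t UR = AbsX % AbsY;

  Q = (int32_t)((UQ ^ QSign) - QSign);  // negate back if the signs differed
  R = (int32_t)((UR ^ RSign) - RSign);
}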
2437
2439 SDLoc SL(Op);
2440 SDValue Src = Op.getOperand(0);
2441
2442 // result = trunc(src)
2443 // if (src > 0.0 && src != result)
2444 // result += 1.0
2445
2446 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2447
2448 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2449 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2450
2451 EVT SetCCVT =
2452 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2453
2454 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2455 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2456 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2457
2458 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2459 // TODO: Should this propagate fast-math-flags?
2460 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2461}
2462
2464 SelectionDAG &DAG) {
2465 const unsigned FractBits = 52;
2466 const unsigned ExpBits = 11;
2467
2468 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2469 Hi,
2470 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2471 DAG.getConstant(ExpBits, SL, MVT::i32));
2472 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2473 DAG.getConstant(1023, SL, MVT::i32));
2474
2475 return Exp;
2476}
2477
2479 SDLoc SL(Op);
2480 SDValue Src = Op.getOperand(0);
2481
2482 assert(Op.getValueType() == MVT::f64);
2483
2484 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2485
2486 // Extract the upper half, since this is where we will find the sign and
2487 // exponent.
2488 SDValue Hi = getHiHalf64(Src, DAG);
2489
2490 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2491
2492 const unsigned FractBits = 52;
2493
2494 // Extract the sign bit.
2495 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2496 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2497
2498 // Extend back to 64-bits.
2499 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2500 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2501
2502 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2503 const SDValue FractMask
2504 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2505
2506 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2507 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2508 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2509
2510 EVT SetCCVT =
2511 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2512
2513 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2514
2515 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2516 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2517
2518 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2519 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2520
2521 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2522}
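
// Bit-level model of the f64 truncation above (invented name; memcpy stands in
// for the BITCAST nodes):

#include <cstdint>
#include <cstring>

static double truncViaExponentSketch(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));

  int Exp = (int)((Bits >> 52) & 0x7ff) - 1023;
  if (Exp < 0) {
    Bits &= UINT64_C(1) << 63;                   // |X| < 1: keep only the sign
  } else if (Exp <= 51) {
    Bits &= ~(((UINT64_C(1) << 52) - 1) >> Exp); // clear sub-integer fraction bits
  }                                              // Exp > 51: already integral

  double Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}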
2523
2525 SelectionDAG &DAG) const {
2526 SDLoc SL(Op);
2527 SDValue Src = Op.getOperand(0);
2528
2529 assert(Op.getValueType() == MVT::f64);
2530
2531 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2532 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2533 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2534
2535 // TODO: Should this propagate fast-math-flags?
2536
2537 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2538 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2539
2540 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2541
2542 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2543 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2544
2545 EVT SetCCVT =
2546 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2547 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2548
2549 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2550}
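
// The 2^52 trick above in plain C++ (invented name): adding a copysigned 2^52
// forces the FPU to discard the fraction bits, rounding to the nearest even
// integer, and subtracting it back leaves the rounded value. Magnitudes above
// 0x1.fffffffffffffp+51 are already integral and pass through. Assumes the
// default round-to-nearest-even mode and no fast-math reassociation.

#include <cmath>

static double roundEvenViaMagicSketch(double X) {
  const double Magic = std::copysign(0x1.0p+52, X);
  double Rounded = (X + Magic) - Magic;
  return std::fabs(X) > 0x1.fffffffffffffp+51 ? X : Rounded;
}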
2551
2553 SelectionDAG &DAG) const {
2554 // FNEARBYINT and FRINT are the same, except in their handling of FP
2555 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2556 // rint, so just treat them as equivalent.
2557 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2558 Op.getOperand(0));
2559}
2560
2562 auto VT = Op.getValueType();
2563 auto Arg = Op.getOperand(0u);
2564 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2565}
2566
2567// XXX - May require not supporting f32 denormals?
2568
2569// Don't handle v2f16. The extra instructions to scalarize and repack around the
2570// compare and vselect end up producing worse code than scalarizing the whole
2571// operation.
2573 SDLoc SL(Op);
2574 SDValue X = Op.getOperand(0);
2575 EVT VT = Op.getValueType();
2576
2577 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2578
2579 // TODO: Should this propagate fast-math-flags?
2580
2581 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2582
2583 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2584
2585 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2586 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2587
2588 EVT SetCCVT =
2589 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2590
2591 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2592 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2593 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2594
2595 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2596 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2597}
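
// Scalar model of the lowering above (round half away from zero built from
// trunc, compare and copysign; invented name):

#include <cmath>

static float roundHalfAwaySketch(float X) {
  float T = std::trunc(X);
  float Offset = std::fabs(X - T) >= 0.5f ? 1.0f : 0.0f;
  return T + std::copysign(Offset, X);
}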
2598
2600 SDLoc SL(Op);
2601 SDValue Src = Op.getOperand(0);
2602
2603 // result = trunc(src);
2604 // if (src < 0.0 && src != result)
2605 // result += -1.0.
2606
2607 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2608
2609 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2610 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2611
2612 EVT SetCCVT =
2613 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2614
2615 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2616 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2617 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2618
2619 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2620 // TODO: Should this propagate fast-math-flags?
2621 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2622}
2623
2624/// Return true if it's known that \p Src can never be an f32 denormal value.
2626 switch (Src.getOpcode()) {
2627 case ISD::FP_EXTEND:
2628 return Src.getOperand(0).getValueType() == MVT::f16;
2629 case ISD::FP16_TO_FP:
2630 case ISD::FFREXP:
2631 return true;
2633 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2634 switch (IntrinsicID) {
2635 case Intrinsic::amdgcn_frexp_mant:
2636 return true;
2637 default:
2638 return false;
2639 }
2640 }
2641 default:
2642 return false;
2643 }
2644
2645 llvm_unreachable("covered opcode switch");
2646}
2647
2649 SDNodeFlags Flags) {
2650 return Flags.hasApproximateFuncs();
2651}
2652
2661
2663 SDValue Src,
2664 SDNodeFlags Flags) const {
2665 SDLoc SL(Src);
2666 EVT VT = Src.getValueType();
2667 const fltSemantics &Semantics = VT.getFltSemantics();
2668 SDValue SmallestNormal =
2669 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2670
2671 // Want to scale denormals up, but negatives and 0 work just as well on the
2672 // scaled path.
2673 SDValue IsLtSmallestNormal = DAG.getSetCC(
2674 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2675 SmallestNormal, ISD::SETOLT);
2676
2677 return IsLtSmallestNormal;
2678}
2679
2681 SDNodeFlags Flags) const {
2682 SDLoc SL(Src);
2683 EVT VT = Src.getValueType();
2684 const fltSemantics &Semantics = VT.getFltSemantics();
2685 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2686
2687 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2688 SDValue IsFinite = DAG.getSetCC(
2689 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2690 Inf, ISD::SETOLT);
2691 return IsFinite;
2692}
2693
2694/// If denormal handling is required return the scaled input to FLOG2, and the
2695/// check for denormal range. Otherwise, return null values.
2696std::pair<SDValue, SDValue>
2698 SDValue Src, SDNodeFlags Flags) const {
2699 if (!needsDenormHandlingF32(DAG, Src, Flags))
2700 return {};
2701
2702 MVT VT = MVT::f32;
2703 const fltSemantics &Semantics = APFloat::IEEEsingle();
2704 SDValue SmallestNormal =
2705 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2706
2707 SDValue IsLtSmallestNormal = DAG.getSetCC(
2708 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2709 SmallestNormal, ISD::SETOLT);
2710
2711 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2712 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2713 SDValue ScaleFactor =
2714 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2715
2716 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2717 return {ScaledInput, IsLtSmallestNormal};
2718}
2719
2721 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2722 // If we have to handle denormals, scale up the input and adjust the result.
2723
2724 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2725 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2726
2727 SDLoc SL(Op);
2728 EVT VT = Op.getValueType();
2729 SDValue Src = Op.getOperand(0);
2730 SDNodeFlags Flags = Op->getFlags();
2731
2732 if (VT == MVT::f16) {
2733 // Nothing in half is a denormal when promoted to f32.
2734 assert(!Subtarget->has16BitInsts());
2735 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2736 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2737 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2738 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2739 }
2740
2741 auto [ScaledInput, IsLtSmallestNormal] =
2742 getScaledLogInput(DAG, SL, Src, Flags);
2743 if (!ScaledInput)
2744 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2745
2746 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2747
2748 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2749 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2750 SDValue ResultOffset =
2751 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2752 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2753}
2754
2755static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2756 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2757 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2758 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2759}
2760
2762 SelectionDAG &DAG) const {
2763 SDValue X = Op.getOperand(0);
2764 EVT VT = Op.getValueType();
2765 SDNodeFlags Flags = Op->getFlags();
2766 SDLoc DL(Op);
2767
2768 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2769 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2770
2771 const auto &Options = getTargetMachine().Options;
2772 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2773
2774 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2775 // Log and multiply in f32 is good enough for f16.
2776 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2777 }
2778
2779 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2780 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2781 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2782 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2783 }
2784
2785 return Lowered;
2786 }
2787
2788 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2789 if (ScaledInput)
2790 X = ScaledInput;
2791
2792 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2793
2794 SDValue R;
2795 if (Subtarget->hasFastFMAF32()) {
2796 // c+cc are ln(2)/ln(10) to more than 49 bits
2797 const float c_log10 = 0x1.344134p-2f;
2798 const float cc_log10 = 0x1.09f79ep-26f;
2799
2800 // c + cc is ln(2) to more than 49 bits
2801 const float c_log = 0x1.62e42ep-1f;
2802 const float cc_log = 0x1.efa39ep-25f;
2803
2804 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2805 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2806
2807 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2808 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2809 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2810 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2811 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2812 } else {
2813 // ch+ct is ln(2)/ln(10) to more than 36 bits
2814 const float ch_log10 = 0x1.344000p-2f;
2815 const float ct_log10 = 0x1.3509f6p-18f;
2816
2817 // ch + ct is ln(2) to more than 36 bits
2818 const float ch_log = 0x1.62e000p-1f;
2819 const float ct_log = 0x1.0bfbe8p-15f;
2820
2821 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2822 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2823
2824 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2825 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2826 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2827 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2828 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2829
2830 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2831 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2832 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2833 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2834 }
2835
2836 const bool IsFiniteOnly =
2837 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2838
2839 // TODO: Check if known finite from source value.
2840 if (!IsFiniteOnly) {
2841 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2842 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2843 }
2844
2845 if (IsScaled) {
2846 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2847 SDValue ShiftK =
2848 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2849 SDValue Shift =
2850 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2851 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2852 }
2853
2854 return R;
2855}
2856
2860
2861// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2862// promoted f16 operation.
2864 SelectionDAG &DAG, bool IsLog10,
2865 SDNodeFlags Flags) const {
2866 EVT VT = Src.getValueType();
2867 unsigned LogOp =
2868 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2869
2870 double Log2BaseInverted =
2872
2873 if (VT == MVT::f32) {
2874 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2875 if (ScaledInput) {
2876 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2877 SDValue ScaledResultOffset =
2878 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2879
2880 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2881
2882 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2883 ScaledResultOffset, Zero, Flags);
2884
2885 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2886
2887 if (Subtarget->hasFastFMAF32())
2888 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2889 Flags);
2890 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2891 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2892 }
2893 }
2894
2895 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2896 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2897
2898 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2899 Flags);
2900}
2901
2903 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2904 // If we have to handle denormals, scale up the input and adjust the result.
2905
2906 SDLoc SL(Op);
2907 EVT VT = Op.getValueType();
2908 SDValue Src = Op.getOperand(0);
2909 SDNodeFlags Flags = Op->getFlags();
2910
2911 if (VT == MVT::f16) {
2912 // Nothing in half is a denormal when promoted to f32.
2913 assert(!Subtarget->has16BitInsts());
2914 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2915 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2916 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2917 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2918 }
2919
2920 assert(VT == MVT::f32);
2921
2922 if (!needsDenormHandlingF32(DAG, Src, Flags))
2923 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2924
2925 // bool needs_scaling = x < -0x1.f80000p+6f;
2926 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2927
2928 // -nextafter(128.0, -1)
2929 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2930
2931 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2932
2933 SDValue NeedsScaling =
2934 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2935
2936 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2937 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2938
2939 SDValue AddOffset =
2940 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2941
2942 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2943 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2944
2945 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2946 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2947 SDValue ResultScale =
2948 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2949
2950 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2951}
2952
2954 SelectionDAG &DAG,
2955 SDNodeFlags Flags) const {
2956 EVT VT = X.getValueType();
2957 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2958
2959 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2960 // exp2(M_LOG2E_F * f);
2961 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2962 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2963 : (unsigned)ISD::FEXP2,
2964 SL, VT, Mul, Flags);
2965 }
2966
2967 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2968
2969 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2970 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2971
2972 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2973
2974 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2975
2976 SDValue AdjustedX =
2977 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2978
2979 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2980
2981 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2982
2983 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2984 SDValue AdjustedResult =
2985 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2986
2987 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2988 Flags);
2989}
2990
2991/// Emit an approx-funcs-appropriate lowering for exp10. Inf/NaN should still be
2992/// handled correctly.
2994 SelectionDAG &DAG,
2995 SDNodeFlags Flags) const {
2996 const EVT VT = X.getValueType();
2997 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
2998 : static_cast<unsigned>(ISD::FEXP2);
2999
3000 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3001 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3002 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3003 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3004
3005 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3006 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3007 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3008 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3009 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3010 }
3011
3012 // bool s = x < -0x1.2f7030p+5f;
3013 // x += s ? 0x1.0p+5f : 0.0f;
3014 // exp10 = exp2(x * 0x1.a92000p+1f) *
3015 // exp2(x * 0x1.4f0978p-11f) *
3016 // (s ? 0x1.9f623ep-107f : 1.0f);
3017
3018 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3019
3020 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3021 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3022
3023 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3024 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3025 SDValue AdjustedX =
3026 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3027
3028 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3029 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3030
3031 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3032 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3033 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3034 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3035
3036 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3037
3038 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3039 SDValue AdjustedResult =
3040 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3041
3042 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3043 Flags);
3044}
3045
3047 EVT VT = Op.getValueType();
3048 SDLoc SL(Op);
3049 SDValue X = Op.getOperand(0);
3050 SDNodeFlags Flags = Op->getFlags();
3051 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3052
3053 if (VT.getScalarType() == MVT::f16) {
3054 // v_exp_f16 (fmul x, log2e)
3055 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3056 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3057
3058 if (VT.isVector())
3059 return SDValue();
3060
3061 // exp(f16 x) ->
3062 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3063
3064 // Nothing in half is a denormal when promoted to f32.
3065 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3066 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3067 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3068 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3069 }
3070
3071 assert(VT == MVT::f32);
3072
3073 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3074 // library behavior. Also, is known-not-daz source sufficient?
3075 if (allowApproxFunc(DAG, Flags)) {
3076 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3077 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3078 }
3079
3080 // Algorithm:
3081 //
3082 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3083 //
3084 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3085 // n = 64*m + j, 0 <= j < 64
3086 //
3087 // e^x = 2^((64*m + j + f)/64)
3088 // = (2^m) * (2^(j/64)) * 2^(f/64)
3089 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3090 //
3091 // f = x*(64/ln(2)) - n
3092 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3093 //
3094 // e^x = (2^m) * (2^(j/64)) * e^r
3095 //
3096 // (2^(j/64)) is precomputed
3097 //
3098 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3099 // e^r = 1 + q
3100 //
3101 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3102 //
3103 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3104 SDNodeFlags FlagsNoContract = Flags;
3105 FlagsNoContract.setAllowContract(false);
3106
3107 SDValue PH, PL;
3108 if (Subtarget->hasFastFMAF32()) {
3109 const float c_exp = numbers::log2ef;
3110 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3111 const float c_exp10 = 0x1.a934f0p+1f;
3112 const float cc_exp10 = 0x1.2f346ep-24f;
3113
3114 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3115 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3116
3117 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3118 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3119 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3120 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3121 } else {
3122 const float ch_exp = 0x1.714000p+0f;
3123 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3124
3125 const float ch_exp10 = 0x1.a92000p+1f;
3126 const float cl_exp10 = 0x1.4f0978p-11f;
3127
3128 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3129 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3130
3131 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3132 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3133 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3134 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3135 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3136
3137 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3138
3139 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3140 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3141 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3142 }
3143
3144 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3145
3146 // It is unsafe to contract this fsub into the PH multiply.
3147 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3148
3149 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3150 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3151 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3152
3153 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3154
3155 SDValue UnderflowCheckConst =
3156 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3157
3158 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3159 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3160 SDValue Underflow =
3161 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3162
3163 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3164
3165 if (!Flags.hasNoInfs()) {
3166 SDValue OverflowCheckConst =
3167 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3168 SDValue Overflow =
3169 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3170 SDValue Inf =
3172 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3173 }
3174
3175 return R;
3176}
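
// Shape of the e^x path above, minus the extended-precision constant split and
// the explicit underflow/overflow selects (invented name; std::exp2 stands in
// for the hardware exp2). Only meaningful where the result neither overflows
// nor underflows:

#include <cmath>

static float expViaExp2Sketch(float X) {
  float PH = X * 0x1.715476p+0f;           // x * log2(e)
  float E = std::nearbyint(PH);            // integer part of the exponent
  float A = PH - E;                        // reduced argument, |A| <= ~0.5
  return std::ldexp(std::exp2(A), (int)E); // 2^A scaled by 2^E
}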
3177
3178static bool isCtlzOpc(unsigned Opc) {
3179 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3180}
3181
3182static bool isCttzOpc(unsigned Opc) {
3183 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3184}
3185
3187 SelectionDAG &DAG) const {
3188 auto SL = SDLoc(Op);
3189 auto Opc = Op.getOpcode();
3190 auto Arg = Op.getOperand(0u);
3191 auto ResultVT = Op.getValueType();
3192
3193 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3194 return {};
3195
3197 assert(ResultVT == Arg.getValueType());
3198
3199 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3200 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3201 SDValue NewOp;
3202
3203 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3204 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3205 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3206 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3207 } else {
3208 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3209 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3210 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3211 }
3212
3213 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3214}
3215
3217 SDLoc SL(Op);
3218 SDValue Src = Op.getOperand(0);
3219
3220 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3221 bool Ctlz = isCtlzOpc(Op.getOpcode());
3222 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3223
3224 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3225 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3226 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3227
3228 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3229 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3230 // (cttz hi:lo) -> (umin (ffbl src), 32)
3231 // (ctlz_zero_undef src) -> (ffbh src)
3232 // (cttz_zero_undef src) -> (ffbl src)
3233
3234 // The 64-bit scalar version produces a 32-bit result
3235 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3236 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3237 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3238 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3239 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3240 if (!ZeroUndef) {
3241 const SDValue ConstVal = DAG.getConstant(
3242 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3243 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3244 }
3245 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3246 }
3247
3248 SDValue Lo, Hi;
3249 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3250
3251 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3252 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3253
3254 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3255 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3256 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3257 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3258
3259 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3260 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3261 if (Ctlz)
3262 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3263 else
3264 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3265
3266 SDValue NewOpr;
3267 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3268 if (!ZeroUndef) {
3269 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3270 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3271 }
3272
3273 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3274}
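
// The 64-bit ctlz case above in plain C++ (invented names; FFBH_U32 is modeled
// by a helper that, like the hardware, returns 0xffffffff for a zero input;
// std::countl_zero requires C++20):

#include <algorithm>
#include <bit>
#include <cstdint>

static uint32_t ffbhSketch(uint32_t V) {
  return V == 0 ? 0xffffffffu : (uint32_t)std::countl_zero(V);
}

static uint32_t uaddsatSketch(uint32_t A, uint32_t B) {
  uint32_t S = A + B;
  return S < A ? 0xffffffffu : S;
}

static uint32_t ctlz64Sketch(uint64_t Src) {
  uint32_t Hi = (uint32_t)(Src >> 32), Lo = (uint32_t)Src;
  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  uint32_t FromLo = uaddsatSketch(ffbhSketch(Lo), 32);
  return std::min<uint32_t>(std::min<uint32_t>(ffbhSketch(Hi), FromLo), 64);
}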
3275
3277 bool Signed) const {
3278 // The regular method converting a 64-bit integer to float roughly consists of
3279 // 2 steps: normalization and rounding. In fact, after normalization, the
3280 // conversion from a 64-bit integer to a float is essentially the same as the
3281 // one from a 32-bit integer. The only difference is that it has more
3282 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3283 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3284 // converted into the correct float number. The basic steps for the unsigned
3285 // conversion are illustrated in the following pseudo code:
3286 //
3287 // f32 uitofp(i64 u) {
3288 // i32 hi, lo = split(u);
3289 // // Only count the leading zeros in hi as we have native support of the
3290 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3291 // // reduced to a 32-bit one automatically.
3292 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3293 // u <<= shamt;
3294 // hi, lo = split(u);
3295 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3296 // // convert it as a 32-bit integer and scale the result back.
3297 // return uitofp(hi) * 2^(32 - shamt);
3298 // }
3299 //
3300 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3301 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3302 // converted instead, followed by negation based on its sign bit.
3303
3304 SDLoc SL(Op);
3305 SDValue Src = Op.getOperand(0);
3306
3307 SDValue Lo, Hi;
3308 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3309 SDValue Sign;
3310 SDValue ShAmt;
3311 if (Signed && Subtarget->isGCN()) {
3312 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3313 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3314 // account. That is, the maximal shift is
3315 // - 32 if Lo and Hi have opposite signs;
3316 // - 33 if Lo and Hi have the same sign.
3317 //
3318 // Or, MaxShAmt = 33 + OppositeSign, where
3319 //
3320 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3321 // - -1 if Lo and Hi have opposite signs; and
3322 // - 0 otherwise.
3323 //
3324 // All in all, ShAmt is calculated as
3325 //
3326 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3327 //
3328 // or
3329 //
3330 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3331 //
3332 // to reduce the critical path.
3333 SDValue OppositeSign = DAG.getNode(
3334 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3335 DAG.getConstant(31, SL, MVT::i32));
3336 SDValue MaxShAmt =
3337 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3338 OppositeSign);
3339 // Count the leading sign bits.
3340 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3341 // Unlike the unsigned conversion, the shift should be one bit less to
3342 // preserve the sign bit.
3343 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3344 DAG.getConstant(1, SL, MVT::i32));
3345 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3346 } else {
3347 if (Signed) {
3348 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3349 // absolute value first.
3350 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3351 DAG.getConstant(63, SL, MVT::i64));
3352 SDValue Abs =
3353 DAG.getNode(ISD::XOR, SL, MVT::i64,
3354 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3355 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3356 }
3357 // Count the leading zeros.
3358 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3359 // The shift amount for signed integers is [0, 32].
3360 }
3361 // Normalize the given 64-bit integer.
3362 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3363 // Split it again.
3364 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3365 // Calculate the adjust bit for rounding.
3366 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3367 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3368 DAG.getConstant(1, SL, MVT::i32), Lo);
3369 // Get the 32-bit normalized integer.
3370 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3371 // Convert the normalized 32-bit integer into f32.
3372 unsigned Opc =
3373 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3374 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3375
3376 // Finally, we need to scale the converted floating-point value back, since the
3377 // original 64-bit integer was converted as a 32-bit one.
3378 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3379 ShAmt);
3380 // On GCN, use LDEXP directly.
3381 if (Subtarget->isGCN())
3382 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3383
3384 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3385 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3386 // exponent is enough to avoid overflowing into the sign bit.
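 // For example, with FVal = 1.0f (0x3f800000) and ShAmt = 3, Exp is
 // 3 << 23 = 0x01800000, and the integer add yields 0x41000000, the bit
 // pattern of 8.0f = 1.0f * 2^3.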
3387 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3388 DAG.getConstant(23, SL, MVT::i32));
3389 SDValue IVal =
3390 DAG.getNode(ISD::ADD, SL, MVT::i32,
3391 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3392 if (Signed) {
3393 // Set the sign bit.
3394 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3395 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3396 DAG.getConstant(31, SL, MVT::i32));
3397 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3398 }
3399 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3400}
3401
3402SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3403 bool Signed) const {
3404 SDLoc SL(Op);
3405 SDValue Src = Op.getOperand(0);
3406
3407 SDValue Lo, Hi;
3408 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3409
3410 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3411 SL, MVT::f64, Hi);
3412
3413 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3414
3415 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3416 DAG.getConstant(32, SL, MVT::i32));
3417 // TODO: Should this propagate fast-math-flags?
3418 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3419}
3420
3421SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3422 SelectionDAG &DAG) const {
3423 // TODO: Factor out code common with LowerSINT_TO_FP.
3424 EVT DestVT = Op.getValueType();
3425 SDValue Src = Op.getOperand(0);
3426 EVT SrcVT = Src.getValueType();
3427
3428 if (SrcVT == MVT::i16) {
3429 if (DestVT == MVT::f16)
3430 return Op;
3431 SDLoc DL(Op);
3432
3433 // Promote src to i32
3434 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3435 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3436 }
3437
3438 if (DestVT == MVT::bf16) {
3439 SDLoc SL(Op);
3440 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3441 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3442 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3443 }
3444
3445 if (SrcVT != MVT::i64)
3446 return Op;
3447
3448 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3449 SDLoc DL(Op);
3450
3451 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3452 SDValue FPRoundFlag =
3453 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3454 SDValue FPRound =
3455 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3456
3457 return FPRound;
3458 }
3459
3460 if (DestVT == MVT::f32)
3461 return LowerINT_TO_FP32(Op, DAG, false);
3462
3463 assert(DestVT == MVT::f64);
3464 return LowerINT_TO_FP64(Op, DAG, false);
3465}
3466
3467SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3468 SelectionDAG &DAG) const {
3469 EVT DestVT = Op.getValueType();
3470
3471 SDValue Src = Op.getOperand(0);
3472 EVT SrcVT = Src.getValueType();
3473
3474 if (SrcVT == MVT::i16) {
3475 if (DestVT == MVT::f16)
3476 return Op;
3477
3478 SDLoc DL(Op);
3479 // Promote src to i32
3480 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3481 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3482 }
3483
3484 if (DestVT == MVT::bf16) {
3485 SDLoc SL(Op);
3486 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3487 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3488 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3489 }
3490
3491 if (SrcVT != MVT::i64)
3492 return Op;
3493
3494 // TODO: Factor out code common with LowerUINT_TO_FP.
3495
3496 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3497 SDLoc DL(Op);
3498 SDValue Src = Op.getOperand(0);
3499
3500 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3501 SDValue FPRoundFlag =
3502 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3503 SDValue FPRound =
3504 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3505
3506 return FPRound;
3507 }
3508
3509 if (DestVT == MVT::f32)
3510 return LowerINT_TO_FP32(Op, DAG, true);
3511
3512 assert(DestVT == MVT::f64);
3513 return LowerINT_TO_FP64(Op, DAG, true);
3514}
3515
3516SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3517 bool Signed) const {
3518 SDLoc SL(Op);
3519
3520 SDValue Src = Op.getOperand(0);
3521 EVT SrcVT = Src.getValueType();
3522
3523 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3524
3525 // The basic idea of converting a floating point number into a pair of 32-bit
3526 // integers is illustrated as follows:
3527 //
3528 // tf := trunc(val);
3529 // hif := floor(tf * 2^-32);
3530 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3531 // hi := fptoi(hif);
3532 // lo := fptoi(lof);
3533 //
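 // For example, for val = 2^33 + 7 (exactly representable in f64):
 // tf = 2^33 + 7, hif = floor(tf * 2^-32) = 2, lof = tf - 2 * 2^32 = 7,
 // so hi = 2 and lo = 7, and the reassembled i64 is 2^33 + 7 as expected.
 //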
3534 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3535 SDValue Sign;
3536 if (Signed && SrcVT == MVT::f32) {
3537 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3538 // is not enough to hold all the significant bits of `lof` if val is
3539 // negative. To avoid the loss of precision, we need to take the absolute
3540 // value after truncating and flip the result back based on the original
3541 // signedness.
3542 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3543 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3544 DAG.getConstant(31, SL, MVT::i32));
3545 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3546 }
3547
3548 SDValue K0, K1;
3549 if (SrcVT == MVT::f64) {
3550 K0 = DAG.getConstantFP(
3551 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3552 SrcVT);
3553 K1 = DAG.getConstantFP(
3554 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3555 SrcVT);
3556 } else {
3557 K0 = DAG.getConstantFP(
3558 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3559 K1 = DAG.getConstantFP(
3560 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3561 }
3562 // TODO: Should this propagate fast-math-flags?
3563 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3564
3565 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3566
3567 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3568
3569 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3570 : ISD::FP_TO_UINT,
3571 SL, MVT::i32, FloorMul);
3572 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3573
3574 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3575 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3576
3577 if (Signed && SrcVT == MVT::f32) {
3578 assert(Sign);
3579 // Flip the result based on the signedness, which is either all 0s or 1s.
3580 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3581 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3582 // r := xor(r, sign) - sign;
3583 Result =
3584 DAG.getNode(ISD::SUB, SL, MVT::i64,
3585 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3586 }
3587
3588 return Result;
3589}
3590
3591SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3592 SDLoc DL(Op);
3593 SDValue N0 = Op.getOperand(0);
3594
3595 // Convert to target node to get known bits
3596 if (N0.getValueType() == MVT::f32)
3597 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3598
3599 if (Op->getFlags().hasApproximateFuncs()) {
3600 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3601 return SDValue();
3602 }
3603
3604 return LowerF64ToF16Safe(N0, DL, DAG);
3605}
3606
3607// Returns the converted f16 value in an i32 node.
3608SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3609 SelectionDAG &DAG) const {
3610 assert(Src.getSimpleValueType() == MVT::f64);
3611
3612 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3613 // TODO: We can generate better code for True16.
3614 const unsigned ExpMask = 0x7ff;
3615 const unsigned ExpBiasf64 = 1023;
3616 const unsigned ExpBiasf16 = 15;
3617 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3618 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3619 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3620 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3621 DAG.getConstant(32, DL, MVT::i64));
3622 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3623 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3624 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3625 DAG.getConstant(20, DL, MVT::i64));
3626 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3627 DAG.getConstant(ExpMask, DL, MVT::i32));
3628 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3629 // add the f16 bias (15) to get the biased exponent for the f16 format.
3630 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3631 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3632
3633 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3634 DAG.getConstant(8, DL, MVT::i32));
3635 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3636 DAG.getConstant(0xffe, DL, MVT::i32));
3637
3638 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3639 DAG.getConstant(0x1ff, DL, MVT::i32));
3640 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3641
3642 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3643 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3644
3645 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3646 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3647 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3648 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3649
3650 // N = M | (E << 12);
3651 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3652 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3653 DAG.getConstant(12, DL, MVT::i32)));
3654
3655 // B = clamp(1-E, 0, 13);
3656 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3657 One, E);
3658 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3659 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3660 DAG.getConstant(13, DL, MVT::i32));
3661
3662 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3663 DAG.getConstant(0x1000, DL, MVT::i32));
3664
3665 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3666 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3667 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3668 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3669
3670 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3671 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3672 DAG.getConstant(0x7, DL, MVT::i32));
3673 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3674 DAG.getConstant(2, DL, MVT::i32));
3675 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3676 One, Zero, ISD::SETEQ);
3677 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3678 One, Zero, ISD::SETGT);
3679 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3680 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3681
3682 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3683 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3684 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3685 I, V, ISD::SETEQ);
3686
3687 // Extract the sign bit.
3688 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3689 DAG.getConstant(16, DL, MVT::i32));
3690 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3691 DAG.getConstant(0x8000, DL, MVT::i32));
3692
3693 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3694}
3695
3696SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3697 SelectionDAG &DAG) const {
3698 SDValue Src = Op.getOperand(0);
3699 unsigned OpOpcode = Op.getOpcode();
3700 EVT SrcVT = Src.getValueType();
3701 EVT DestVT = Op.getValueType();
3702
3703 // Will be selected natively
3704 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3705 return Op;
3706
3707 if (SrcVT == MVT::bf16) {
3708 SDLoc DL(Op);
3709 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3710 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3711 }
3712
3713 // Promote i16 to i32
3714 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3715 SDLoc DL(Op);
3716
3717 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3718 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3719 }
3720
3721 if (DestVT != MVT::i64)
3722 return Op;
3723
3724 if (SrcVT == MVT::f16 ||
3725 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3726 SDLoc DL(Op);
3727
3728 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3729 unsigned Ext =
3730 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3731 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3732 }
3733
3734 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3735 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3736
3737 return SDValue();
3738}
3739
3740SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3741 SelectionDAG &DAG) const {
3742 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3743 MVT VT = Op.getSimpleValueType();
3744 MVT ScalarVT = VT.getScalarType();
3745
3746 assert(VT.isVector());
3747
3748 SDValue Src = Op.getOperand(0);
3749 SDLoc DL(Op);
3750
3751 // TODO: Don't scalarize on Evergreen?
3752 unsigned NElts = VT.getVectorNumElements();
3753 SmallVector<SDValue, 8> Args;
3754 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3755
3756 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3757 for (unsigned I = 0; I < NElts; ++I)
3758 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3759
3760 return DAG.getBuildVector(VT, DL, Args);
3761}
3762
3763//===----------------------------------------------------------------------===//
3764// Custom DAG optimizations
3765//===----------------------------------------------------------------------===//
3766
3767static bool isU24(SDValue Op, SelectionDAG &DAG) {
3768 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3769}
3770
3771static bool isI24(SDValue Op, SelectionDAG &DAG) {
3772 EVT VT = Op.getValueType();
3773 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3774 // as unsigned 24-bit values.
3775 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3776}
3777
3778SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3779 DAGCombinerInfo &DCI) const {
3780 SelectionDAG &DAG = DCI.DAG;
3781 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3782 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3783
3784 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3785 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3786 unsigned NewOpcode = Node24->getOpcode();
3787 if (IsIntrin) {
3788 unsigned IID = Node24->getConstantOperandVal(0);
3789 switch (IID) {
3790 case Intrinsic::amdgcn_mul_i24:
3791 NewOpcode = AMDGPUISD::MUL_I24;
3792 break;
3793 case Intrinsic::amdgcn_mul_u24:
3794 NewOpcode = AMDGPUISD::MUL_U24;
3795 break;
3796 case Intrinsic::amdgcn_mulhi_i24:
3797 NewOpcode = AMDGPUISD::MULHI_I24;
3798 break;
3799 case Intrinsic::amdgcn_mulhi_u24:
3800 NewOpcode = AMDGPUISD::MULHI_U24;
3801 break;
3802 default:
3803 llvm_unreachable("Expected 24-bit mul intrinsic");
3804 }
3805 }
3806
3807 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3808
3809 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3810 // the operands to have other uses, but will only perform simplifications that
3811 // involve bypassing some nodes for this user.
3812 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3813 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3814 if (DemandedLHS || DemandedRHS)
3815 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3816 DemandedLHS ? DemandedLHS : LHS,
3817 DemandedRHS ? DemandedRHS : RHS);
3818
3819 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3820 // operands if this node is the only user.
3821 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3822 return SDValue(Node24, 0);
3823 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3824 return SDValue(Node24, 0);
3825
3826 return SDValue();
3827}
3828
3829template <typename IntTy>
3830static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3831 uint32_t Width, const SDLoc &DL) {
3832 if (Width + Offset < 32) {
3833 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3834 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3835 if constexpr (std::is_signed_v<IntTy>) {
3836 return DAG.getSignedConstant(Result, DL, MVT::i32);
3837 } else {
3838 return DAG.getConstant(Result, DL, MVT::i32);
3839 }
3840 }
3841
3842 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3843}
3844
3845static bool hasVolatileUser(SDNode *Val) {
3846 for (SDNode *U : Val->users()) {
3847 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3848 if (M->isVolatile())
3849 return true;
3850 }
3851 }
3852
3853 return false;
3854}
3855
3856bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3857 // i32 vectors are the canonical memory type.
3858 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3859 return false;
3860
3861 if (!VT.isByteSized())
3862 return false;
3863
3864 unsigned Size = VT.getStoreSize();
3865
3866 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3867 return false;
3868
3869 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3870 return false;
3871
3872 return true;
3873}
3874
3875// Replace load of an illegal type with a bitcast from a load of a friendlier
3876// type.
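// For example, a load of v8i8 (not a legal type here) is rewritten below as a
// v2i32 load followed by a bitcast back to v8i8, via getEquivalentMemType.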
3877SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3878 DAGCombinerInfo &DCI) const {
3879 if (!DCI.isBeforeLegalize())
3880 return SDValue();
3881
3882 LoadSDNode *LN = cast<LoadSDNode>(N);
3883 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3884 return SDValue();
3885
3886 SDLoc SL(N);
3887 SelectionDAG &DAG = DCI.DAG;
3888 EVT VT = LN->getMemoryVT();
3889
3890 unsigned Size = VT.getStoreSize();
3891 Align Alignment = LN->getAlign();
3892 if (Alignment < Size && isTypeLegal(VT)) {
3893 unsigned IsFast;
3894 unsigned AS = LN->getAddressSpace();
3895
3896 // Expand unaligned loads earlier than legalization. Due to visitation order
3897 // problems during legalization, the emitted instructions to pack and unpack
3898 // the bytes again are not eliminated in the case of an unaligned copy.
3899 if (!allowsMisalignedMemoryAccesses(
3900 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3901 if (VT.isVector())
3902 return SplitVectorLoad(SDValue(LN, 0), DAG);
3903
3904 SDValue Ops[2];
3905 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3906
3907 return DAG.getMergeValues(Ops, SDLoc(N));
3908 }
3909
3910 if (!IsFast)
3911 return SDValue();
3912 }
3913
3914 if (!shouldCombineMemoryType(VT))
3915 return SDValue();
3916
3917 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3918
3919 SDValue NewLoad
3920 = DAG.getLoad(NewVT, SL, LN->getChain(),
3921 LN->getBasePtr(), LN->getMemOperand());
3922
3923 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3924 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3925 return SDValue(N, 0);
3926}
3927
3928// Replace store of an illegal type with a store of a bitcast to a friendlier
3929// type.
3930SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3931 DAGCombinerInfo &DCI) const {
3932 if (!DCI.isBeforeLegalize())
3933 return SDValue();
3934
3935 StoreSDNode *SN = cast<StoreSDNode>(N);
3936 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3937 return SDValue();
3938
3939 EVT VT = SN->getMemoryVT();
3940 unsigned Size = VT.getStoreSize();
3941
3942 SDLoc SL(N);
3943 SelectionDAG &DAG = DCI.DAG;
3944 Align Alignment = SN->getAlign();
3945 if (Alignment < Size && isTypeLegal(VT)) {
3946 unsigned IsFast;
3947 unsigned AS = SN->getAddressSpace();
3948
3949 // Expand unaligned stores earlier than legalization. Due to visitation
3950 // order problems during legalization, the emitted instructions to pack and
3951 // unpack the bytes again are not eliminated in the case of an unaligned
3952 // copy.
3953 if (!allowsMisalignedMemoryAccesses(
3954 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3955 if (VT.isVector())
3956 return SplitVectorStore(SDValue(SN, 0), DAG);
3957
3958 return expandUnalignedStore(SN, DAG);
3959 }
3960
3961 if (!IsFast)
3962 return SDValue();
3963 }
3964
3965 if (!shouldCombineMemoryType(VT))
3966 return SDValue();
3967
3968 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3969 SDValue Val = SN->getValue();
3970
3971 //DCI.AddToWorklist(Val.getNode());
3972
3973 bool OtherUses = !Val.hasOneUse();
3974 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3975 if (OtherUses) {
3976 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3977 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3978 }
3979
3980 return DAG.getStore(SN->getChain(), SL, CastVal,
3981 SN->getBasePtr(), SN->getMemOperand());
3982}
3983
3984// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3985// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3986// issues.
3987SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3988 DAGCombinerInfo &DCI) const {
3989 SelectionDAG &DAG = DCI.DAG;
3990 SDValue N0 = N->getOperand(0);
3991
3992 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3993 // (vt2 (truncate (assertzext vt0:x, vt1)))
3994 if (N0.getOpcode() == ISD::TRUNCATE) {
3995 SDValue N1 = N->getOperand(1);
3996 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3997 SDLoc SL(N);
3998
3999 SDValue Src = N0.getOperand(0);
4000 EVT SrcVT = Src.getValueType();
4001 if (SrcVT.bitsGE(ExtVT)) {
4002 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4003 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4004 }
4005 }
4006
4007 return SDValue();
4008}
4009
4010SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4011 SDNode *N, DAGCombinerInfo &DCI) const {
4012 unsigned IID = N->getConstantOperandVal(0);
4013 switch (IID) {
4014 case Intrinsic::amdgcn_mul_i24:
4015 case Intrinsic::amdgcn_mul_u24:
4016 case Intrinsic::amdgcn_mulhi_i24:
4017 case Intrinsic::amdgcn_mulhi_u24:
4018 return simplifyMul24(N, DCI);
4019 case Intrinsic::amdgcn_fract:
4020 case Intrinsic::amdgcn_rsq:
4021 case Intrinsic::amdgcn_rcp_legacy:
4022 case Intrinsic::amdgcn_rsq_legacy:
4023 case Intrinsic::amdgcn_rsq_clamp:
4024 case Intrinsic::amdgcn_tanh:
4025 case Intrinsic::amdgcn_prng_b32: {
4026 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4027 SDValue Src = N->getOperand(1);
4028 return Src.isUndef() ? Src : SDValue();
4029 }
4030 case Intrinsic::amdgcn_frexp_exp: {
4031 // frexp_exp (fneg x) -> frexp_exp x
4032 // frexp_exp (fabs x) -> frexp_exp x
4033 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4034 SDValue Src = N->getOperand(1);
4035 SDValue PeekSign = peekFPSignOps(Src);
4036 if (PeekSign == Src)
4037 return SDValue();
4038 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4039 0);
4040 }
4041 default:
4042 return SDValue();
4043 }
4044}
4045
4046/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4047/// binary operation \p Opc to it with the corresponding constant operands.
4048SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4049 DAGCombinerInfo &DCI, const SDLoc &SL,
4050 unsigned Opc, SDValue LHS,
4051 uint32_t ValLo, uint32_t ValHi) const {
4052 SelectionDAG &DAG = DCI.DAG;
4053 SDValue Lo, Hi;
4054 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4055
4056 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4057 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4058
4059 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4060 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4061
4062 // Re-visit the ands. It's possible we eliminated one of them and it could
4063 // simplify the vector.
4064 DCI.AddToWorklist(Lo.getNode());
4065 DCI.AddToWorklist(Hi.getNode());
4066
4067 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4068 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4069}
4070
4071SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4072 DAGCombinerInfo &DCI) const {
4073 EVT VT = N->getValueType(0);
4074 SDValue LHS = N->getOperand(0);
4075 SDValue RHS = N->getOperand(1);
4076 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4077 SDLoc SL(N);
4078 SelectionDAG &DAG = DCI.DAG;
4079
4080 unsigned RHSVal;
4081 if (CRHS) {
4082 RHSVal = CRHS->getZExtValue();
4083 if (!RHSVal)
4084 return LHS;
4085
4086 switch (LHS->getOpcode()) {
4087 default:
4088 break;
4089 case ISD::ZERO_EXTEND:
4090 case ISD::SIGN_EXTEND:
4091 case ISD::ANY_EXTEND: {
4092 SDValue X = LHS->getOperand(0);
4093
4094 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4095 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4096 // Prefer build_vector as the canonical form if packed types are legal.
4097 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4098 SDValue Vec = DAG.getBuildVector(
4099 MVT::v2i16, SL,
4100 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4101 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4102 }
4103
4104 // shl (ext x) => zext (shl x), if shift does not overflow int
4105 if (VT != MVT::i64)
4106 break;
4107 KnownBits Known = DAG.computeKnownBits(X);
4108 unsigned LZ = Known.countMinLeadingZeros();
4109 if (LZ < RHSVal)
4110 break;
4111 EVT XVT = X.getValueType();
4112 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4113 return DAG.getZExtOrTrunc(Shl, SL, VT);
4114 }
4115 }
4116 }
4117
4118 if (VT.getScalarType() != MVT::i64)
4119 return SDValue();
4120
4121 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4122 // common case, splitting this into a move and a 32-bit shift is faster and
4123 // the same code size.
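 // For example, a constant (shl i64:x, 48) becomes (shl (trunc x to i32), 16)
 // placed in the high element of a v2i32 build_vector whose low element is
 // zero, then bitcast back to i64.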
4124 KnownBits Known = DAG.computeKnownBits(RHS);
4125
4126 EVT ElementType = VT.getScalarType();
4127 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4128 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4129 : TargetScalarType;
4130
4131 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4132 return SDValue();
4133 SDValue ShiftAmt;
4134
4135 if (CRHS) {
4136 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4137 TargetType);
4138 } else {
4139 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4140 const SDValue ShiftMask =
4141 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4142 // This AND instruction will clamp out of bounds shift values.
4143 // It will also be removed during later instruction selection.
4144 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4145 }
4146
4147 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4148 SDValue NewShift =
4149 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4150
4151 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4152 SDValue Vec;
4153
4154 if (VT.isVector()) {
4155 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4156 unsigned NElts = TargetType.getVectorNumElements();
4157 SmallVector<SDValue, 8> HiOps;
4158 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4159
4160 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4161 for (unsigned I = 0; I != NElts; ++I)
4162 HiAndLoOps[2 * I + 1] = HiOps[I];
4163 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4164 } else {
4165 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4166 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4167 }
4168 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4169}
4170
4171SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4172 DAGCombinerInfo &DCI) const {
4173 SDValue RHS = N->getOperand(1);
4174 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4175 EVT VT = N->getValueType(0);
4176 SDValue LHS = N->getOperand(0);
4177 SelectionDAG &DAG = DCI.DAG;
4178 SDLoc SL(N);
4179
4180 if (VT.getScalarType() != MVT::i64)
4181 return SDValue();
4182
4183 // For C >= 32
4184 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
4185
4186 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4187 // common case, splitting this into a move and a 32-bit shift is faster and
4188 // the same code size.
4189 KnownBits Known = DAG.computeKnownBits(RHS);
4190
4191 EVT ElementType = VT.getScalarType();
4192 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4193 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4194 : TargetScalarType;
4195
4196 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4197 return SDValue();
4198
4199 SDValue ShiftFullAmt =
4200 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4201 SDValue ShiftAmt;
4202 if (CRHS) {
4203 unsigned RHSVal = CRHS->getZExtValue();
4204 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4205 TargetType);
4206 } else if (Known.getMinValue().getZExtValue() ==
4207 (ElementType.getSizeInBits() - 1)) {
4208 ShiftAmt = ShiftFullAmt;
4209 } else {
4210 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4211 const SDValue ShiftMask =
4212 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4213 // This AND instruction will clamp out of bounds shift values.
4214 // It will also be removed during later instruction selection.
4215 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4216 }
4217
4218 EVT ConcatType;
4219 SDValue Hi;
4220 SDLoc LHSSL(LHS);
4221 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4222 if (VT.isVector()) {
4223 unsigned NElts = TargetType.getVectorNumElements();
4224 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4225 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4226 SmallVector<SDValue, 8> HiOps(NElts);
4227 SmallVector<SDValue, 16> HiAndLoOps;
4228
4229 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4230 for (unsigned I = 0; I != NElts; ++I) {
4231 HiOps[I] = HiAndLoOps[2 * I + 1];
4232 }
4233 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4234 } else {
4235 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4236 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4237 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4238 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4239 }
4240
4241 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4242 SDValue HiShift;
4243 if (KnownLHS.isNegative()) {
4244 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4245 } else {
4246 Hi = DAG.getFreeze(Hi);
4247 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4248 }
4249 SDValue NewShift =
4250 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4251
4252 SDValue Vec;
4253 if (VT.isVector()) {
4254 unsigned NElts = TargetType.getVectorNumElements();
4255 SmallVector<SDValue, 8> HiOps;
4256 SmallVector<SDValue, 8> LoOps;
4257 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4258
4259 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4260 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4261 for (unsigned I = 0; I != NElts; ++I) {
4262 HiAndLoOps[2 * I + 1] = HiOps[I];
4263 HiAndLoOps[2 * I] = LoOps[I];
4264 }
4265 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4266 } else {
4267 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4268 }
4269 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4270}
4271
4272SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4273 DAGCombinerInfo &DCI) const {
4274 SDValue RHS = N->getOperand(1);
4275 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4276 EVT VT = N->getValueType(0);
4277 SDValue LHS = N->getOperand(0);
4278 SelectionDAG &DAG = DCI.DAG;
4279 SDLoc SL(N);
4280 unsigned RHSVal;
4281
4282 if (CRHS) {
4283 RHSVal = CRHS->getZExtValue();
4284
4285 // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
4286 // this improves the ability to match BFE patterns in isel.
4287 if (LHS.getOpcode() == ISD::AND) {
4288 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4289 unsigned MaskIdx, MaskLen;
4290 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4291 MaskIdx == RHSVal) {
4292 return DAG.getNode(ISD::AND, SL, VT,
4293 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4294 N->getOperand(1)),
4295 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4296 N->getOperand(1)));
4297 }
4298 }
4299 }
4300 }
4301
4302 if (VT.getScalarType() != MVT::i64)
4303 return SDValue();
4304
4305 // for C >= 32
4306 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4307
4308 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4309 // common case, splitting this into a move and a 32-bit shift is faster and
4310 // the same code size.
4311 KnownBits Known = DAG.computeKnownBits(RHS);
4312
4313 EVT ElementType = VT.getScalarType();
4314 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4315 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4316 : TargetScalarType;
4317
4318 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4319 return SDValue();
4320
4321 SDValue ShiftAmt;
4322 if (CRHS) {
4323 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4324 TargetType);
4325 } else {
4326 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4327 const SDValue ShiftMask =
4328 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4329 // This AND instruction will clamp out of bounds shift values.
4330 // It will also be removed during later instruction selection.
4331 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4332 }
4333
4334 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4335 EVT ConcatType;
4336 SDValue Hi;
4337 SDLoc LHSSL(LHS);
4338 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4339 if (VT.isVector()) {
4340 unsigned NElts = TargetType.getVectorNumElements();
4341 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4342 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4343 SmallVector<SDValue, 8> HiOps(NElts);
4344 SmallVector<SDValue, 16> HiAndLoOps;
4345
4346 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4347 for (unsigned I = 0; I != NElts; ++I)
4348 HiOps[I] = HiAndLoOps[2 * I + 1];
4349 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4350 } else {
4351 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4352 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4353 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4354 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4355 }
4356
4357 SDValue NewShift =
4358 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4359
4360 SDValue Vec;
4361 if (VT.isVector()) {
4362 unsigned NElts = TargetType.getVectorNumElements();
4363 SmallVector<SDValue, 8> LoOps;
4364 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4365
4366 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4367 for (unsigned I = 0; I != NElts; ++I)
4368 HiAndLoOps[2 * I] = LoOps[I];
4369 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4370 } else {
4371 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4372 }
4373 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4374}
4375
4376SDValue AMDGPUTargetLowering::performTruncateCombine(
4377 SDNode *N, DAGCombinerInfo &DCI) const {
4378 SDLoc SL(N);
4379 SelectionDAG &DAG = DCI.DAG;
4380 EVT VT = N->getValueType(0);
4381 SDValue Src = N->getOperand(0);
4382
4383 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4384 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4385 SDValue Vec = Src.getOperand(0);
4386 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4387 SDValue Elt0 = Vec.getOperand(0);
4388 EVT EltVT = Elt0.getValueType();
4389 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4390 if (EltVT.isFloatingPoint()) {
4391 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4392 EltVT.changeTypeToInteger(), Elt0);
4393 }
4394
4395 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4396 }
4397 }
4398 }
4399
4400 // Equivalent of above for accessing the high element of a vector as an
4401 // integer operation.
4402 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4403 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4404 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4405 SDValue BV = stripBitcast(Src.getOperand(0));
4406 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4407 EVT SrcEltVT = BV.getOperand(0).getValueType();
4408 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4409 unsigned BitIndex = K->getZExtValue();
4410 unsigned PartIndex = BitIndex / SrcEltSize;
4411
4412 if (PartIndex * SrcEltSize == BitIndex &&
4413 PartIndex < BV.getNumOperands()) {
4414 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4415 SDValue SrcElt =
4416 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4417 BV.getOperand(PartIndex));
4418 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4419 }
4420 }
4421 }
4422 }
4423 }
4424
4425 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4426 //
4427 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4428 // i16 (trunc (srl (i32 (trunc x), K)))
4429 if (VT.getScalarSizeInBits() < 32) {
4430 EVT SrcVT = Src.getValueType();
4431 if (SrcVT.getScalarSizeInBits() > 32 &&
4432 (Src.getOpcode() == ISD::SRL ||
4433 Src.getOpcode() == ISD::SRA ||
4434 Src.getOpcode() == ISD::SHL)) {
4435 SDValue Amt = Src.getOperand(1);
4436 KnownBits Known = DAG.computeKnownBits(Amt);
4437
4438 // - For left shifts, do the transform as long as the shift
4439 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4440 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4441 // losing information stored in the high bits when truncating.
4442 const unsigned MaxCstSize =
4443 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4444 if (Known.getMaxValue().ule(MaxCstSize)) {
4445 EVT MidVT = VT.isVector() ?
4446 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4447 VT.getVectorNumElements()) : MVT::i32;
4448
4449 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4450 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4451 Src.getOperand(0));
4452 DCI.AddToWorklist(Trunc.getNode());
4453
4454 if (Amt.getValueType() != NewShiftVT) {
4455 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4456 DCI.AddToWorklist(Amt.getNode());
4457 }
4458
4459 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4460 Trunc, Amt);
4461 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4462 }
4463 }
4464 }
4465
4466 return SDValue();
4467}
4468
4469// We need to specifically handle i64 mul here to avoid unnecessary conversion
4470// instructions. If we only match on the legalized i64 mul expansion,
4471// SimplifyDemandedBits will be unable to remove them because there will be
4472// multiple uses due to the separate mul + mulh[su].
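// For example, when both operands fit in 24 bits and the result is i64,
// getMul24 below emits mul_u24/mul_i24 for the low half and
// mulhi_u24/mulhi_i24 for the high half joined by a build_pair, instead of a
// legalized 64-bit multiply.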
4473static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4474 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4475 if (Size <= 32) {
4476 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4477 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4478 }
4479
4480 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4481 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4482
4483 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4484 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4485
4486 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4487}
4488
4489/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4490/// return SDValue().
4491static SDValue getAddOneOp(const SDNode *V) {
4492 if (V->getOpcode() != ISD::ADD)
4493 return SDValue();
4494
4495 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4496}
4497
4498SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4499 DAGCombinerInfo &DCI) const {
4500 assert(N->getOpcode() == ISD::MUL);
4501 EVT VT = N->getValueType(0);
4502
4503 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4504 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4505 // unnecessarily). isDivergent() is used as an approximation of whether the
4506 // value is in an SGPR.
4507 if (!N->isDivergent())
4508 return SDValue();
4509
4510 unsigned Size = VT.getSizeInBits();
4511 if (VT.isVector() || Size > 64)
4512 return SDValue();
4513
4514 SelectionDAG &DAG = DCI.DAG;
4515 SDLoc DL(N);
4516
4517 SDValue N0 = N->getOperand(0);
4518 SDValue N1 = N->getOperand(1);
4519
4520 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4521 // matching.
4522
4523 // mul x, (add y, 1) -> add (mul x, y), x
4524 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4525 SDValue AddOp = getAddOneOp(V.getNode());
4526 if (!AddOp)
4527 return SDValue();
4528
4529 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4530 return U->getOpcode() == ISD::MUL;
4531 }))
4532 return AddOp;
4533
4534 return SDValue();
4535 };
4536
4537 // FIXME: The selection pattern is not properly checking for commuted
4538 // operands, so we have to place the mul in the LHS
4539 if (SDValue MulOper = IsFoldableAdd(N0)) {
4540 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4541 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4542 }
4543
4544 if (SDValue MulOper = IsFoldableAdd(N1)) {
4545 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4546 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4547 }
4548
4549 // There are i16 integer mul/mad.
4550 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4551 return SDValue();
4552
4553 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4554 // in the source into any_extends if the result of the mul is truncated. Since
4555 // we can assume the high bits are whatever we want, use the underlying value
4556 // to avoid the unknown high bits from interfering.
4557 if (N0.getOpcode() == ISD::ANY_EXTEND)
4558 N0 = N0.getOperand(0);
4559
4560 if (N1.getOpcode() == ISD::ANY_EXTEND)
4561 N1 = N1.getOperand(0);
4562
4563 SDValue Mul;
4564
4565 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4566 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4567 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4568 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4569 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4570 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4571 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4572 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4573 } else {
4574 return SDValue();
4575 }
4576
4577 // We need to use sext even for MUL_U24, because MUL_U24 is used
4578 // for signed multiply of 8 and 16-bit types.
4579 return DAG.getSExtOrTrunc(Mul, DL, VT);
4580}
4581
4582SDValue
4583AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4584 DAGCombinerInfo &DCI) const {
4585 if (N->getValueType(0) != MVT::i32)
4586 return SDValue();
4587
4588 SelectionDAG &DAG = DCI.DAG;
4589 SDLoc DL(N);
4590
4591 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4592 SDValue N0 = N->getOperand(0);
4593 SDValue N1 = N->getOperand(1);
4594
4595 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4596 // in the source into any_extends if the result of the mul is truncated. Since
4597 // we can assume the high bits are whatever we want, use the underlying value
4598 // to avoid the unknown high bits from interfering.
4599 if (N0.getOpcode() == ISD::ANY_EXTEND)
4600 N0 = N0.getOperand(0);
4601 if (N1.getOpcode() == ISD::ANY_EXTEND)
4602 N1 = N1.getOperand(0);
4603
4604 // Try to use two fast 24-bit multiplies (one for each half of the result)
4605 // instead of one slow extending multiply.
4606 unsigned LoOpcode = 0;
4607 unsigned HiOpcode = 0;
4608 if (Signed) {
4609 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4610 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4611 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4612 LoOpcode = AMDGPUISD::MUL_I24;
4613 HiOpcode = AMDGPUISD::MULHI_I24;
4614 }
4615 } else {
4616 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4617 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4618 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4619 LoOpcode = AMDGPUISD::MUL_U24;
4620 HiOpcode = AMDGPUISD::MULHI_U24;
4621 }
4622 }
4623 if (!LoOpcode)
4624 return SDValue();
4625
4626 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4627 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4628 DCI.CombineTo(N, Lo, Hi);
4629 return SDValue(N, 0);
4630}
4631
4632SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4633 DAGCombinerInfo &DCI) const {
4634 EVT VT = N->getValueType(0);
4635
4636 if (!Subtarget->hasMulI24() || VT.isVector())
4637 return SDValue();
4638
4639 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4640 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4641 // unnecessarily). isDivergent() is used as an approximation of whether the
4642 // value is in an SGPR.
4643 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4644 // valu op anyway)
4645 if (Subtarget->hasSMulHi() && !N->isDivergent())
4646 return SDValue();
4647
4648 SelectionDAG &DAG = DCI.DAG;
4649 SDLoc DL(N);
4650
4651 SDValue N0 = N->getOperand(0);
4652 SDValue N1 = N->getOperand(1);
4653
4654 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4655 return SDValue();
4656
4657 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4658 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4659
4660 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4661 DCI.AddToWorklist(Mulhi.getNode());
4662 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4663}
4664
4665SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4666 DAGCombinerInfo &DCI) const {
4667 EVT VT = N->getValueType(0);
4668
4669 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4670 return SDValue();
4671
4672 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4673 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4674 // unnecessarily). isDivergent() is used as an approximation of whether the
4675 // value is in an SGPR.
4676 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4677 // valu op anyway)
4678 if (Subtarget->hasSMulHi() && !N->isDivergent())
4679 return SDValue();
4680
4681 SelectionDAG &DAG = DCI.DAG;
4682 SDLoc DL(N);
4683
4684 SDValue N0 = N->getOperand(0);
4685 SDValue N1 = N->getOperand(1);
4686
4687 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4688 return SDValue();
4689
4690 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4691 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4692
4693 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4694 DCI.AddToWorklist(Mulhi.getNode());
4695 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4696}
4697
4698SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4699 SDValue Op,
4700 const SDLoc &DL,
4701 unsigned Opc) const {
4702 EVT VT = Op.getValueType();
4703 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4704 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4705 LegalVT != MVT::i16))
4706 return SDValue();
4707
4708 if (VT != MVT::i32)
4709 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4710
4711 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4712 if (VT != MVT::i32)
4713 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4714
4715 return FFBX;
4716}
4717
4718// The native instructions return -1 on 0 input. Optimize out a select that
4719// produces -1 on 0.
4720//
4721// TODO: If zero is not undef, we could also do this if the output is compared
4722// against the bitwidth.
4723//
4724// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
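// For example, (select (setcc x, 0, eq), -1, (ctlz_zero_undef x)) matches the
// "-1 on zero input" behavior of the native ffbh_u32 instruction, so the
// compare and select can be folded away.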
4725SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4726 SDValue LHS, SDValue RHS,
4727 DAGCombinerInfo &DCI) const {
4728 if (!isNullConstant(Cond.getOperand(1)))
4729 return SDValue();
4730
4731 SelectionDAG &DAG = DCI.DAG;
4732 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4733 SDValue CmpLHS = Cond.getOperand(0);
4734
4735 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4736 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4737 if (CCOpcode == ISD::SETEQ &&
4738 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4739 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4740 unsigned Opc =
4741 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4742 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4743 }
4744
4745 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4746 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4747 if (CCOpcode == ISD::SETNE &&
4748 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4749 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4750 unsigned Opc =
4751 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4752
4753 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4754 }
4755
4756 return SDValue();
4757}
4758
4759static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4760 unsigned Op,
4761 const SDLoc &SL,
4762 SDValue Cond,
4763 SDValue N1,
4764 SDValue N2) {
4765 SelectionDAG &DAG = DCI.DAG;
4766 EVT VT = N1.getValueType();
4767
4768 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4769 N1.getOperand(0), N2.getOperand(0));
4770 DCI.AddToWorklist(NewSelect.getNode());
4771 return DAG.getNode(Op, SL, VT, NewSelect);
4772}
4773
4774// Pull a free FP operation out of a select so it may fold into uses.
4775//
4776// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4777// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4778//
4779// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4780// select c, (fabs x), +k -> fabs (select c, x, k)
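// For example, (select c, (fneg x), (fneg y)) becomes (fneg (select c, x, y)),
// leaving a single fneg that can usually be absorbed as a source modifier by
// the user of the select.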
4781SDValue
4782AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4783 SDValue N) const {
4784 SelectionDAG &DAG = DCI.DAG;
4785 SDValue Cond = N.getOperand(0);
4786 SDValue LHS = N.getOperand(1);
4787 SDValue RHS = N.getOperand(2);
4788
4789 EVT VT = N.getValueType();
4790 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4791 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4792 if (!allUsesHaveSourceMods(N.getNode()))
4793 return SDValue();
4794
4795 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4796 SDLoc(N), Cond, LHS, RHS);
4797 }
4798
4799 bool Inv = false;
4800 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4801 std::swap(LHS, RHS);
4802 Inv = true;
4803 }
4804
4805 // TODO: Support vector constants.
4806 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4807 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4808 !selectSupportsSourceMods(N.getNode())) {
4809 SDLoc SL(N);
4810 // If one side is an fneg/fabs and the other is a constant, we can push the
4811 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4812 SDValue NewLHS = LHS.getOperand(0);
4813 SDValue NewRHS = RHS;
4814
4815 // Careful: if the neg can be folded up, don't try to pull it back down.
4816 bool ShouldFoldNeg = true;
4817
4818 if (NewLHS.hasOneUse()) {
4819 unsigned Opc = NewLHS.getOpcode();
4820 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4821 ShouldFoldNeg = false;
4822 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4823 ShouldFoldNeg = false;
4824 }
4825
4826 if (ShouldFoldNeg) {
4827 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4828 return SDValue();
4829
4830 // We're going to be forced to use a source modifier anyway, there's no
4831 // point to pulling the negate out unless we can get a size reduction by
4832 // negating the constant.
4833 //
4834 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4835 // about cheaper constants.
4836 if (NewLHS.getOpcode() == ISD::FABS &&
4837 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4838 return SDValue();
4839
4840 if (!allUsesHaveSourceMods(N.getNode()))
4841 return SDValue();
4842
4843 if (LHS.getOpcode() == ISD::FNEG)
4844 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4845
4846 if (Inv)
4847 std::swap(NewLHS, NewRHS);
4848
4849 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4850 Cond, NewLHS, NewRHS);
4851 DCI.AddToWorklist(NewSelect.getNode());
4852 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4853 }
4854 }
4855
4856 return SDValue();
4857}
4858
4860 DAGCombinerInfo &DCI) const {
4861 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4862 return Folded;
4863
4864 SDValue Cond = N->getOperand(0);
4865 if (Cond.getOpcode() != ISD::SETCC)
4866 return SDValue();
4867
4868 EVT VT = N->getValueType(0);
4869 SDValue LHS = Cond.getOperand(0);
4870 SDValue RHS = Cond.getOperand(1);
4871 SDValue CC = Cond.getOperand(2);
4872
4873 SDValue True = N->getOperand(1);
4874 SDValue False = N->getOperand(2);
4875
4876 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4877 SelectionDAG &DAG = DCI.DAG;
4878 if (DAG.isConstantValueOfAnyType(True) &&
4879 !DAG.isConstantValueOfAnyType(False)) {
4880 // Swap cmp + select pair to move constant to false input.
4881 // This will allow using VOPC cndmasks more often.
4882 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4883
4884 SDLoc SL(N);
4885 ISD::CondCode NewCC =
4886 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4887
4888 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4889 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4890 }
4891
4892 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4893 SDValue MinMax
4894 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4895 // Revisit this node so we can catch min3/max3/med3 patterns.
4896 //DCI.AddToWorklist(MinMax.getNode());
4897 return MinMax;
4898 }
4899 }
4900
4901 // There's no reason to not do this if the condition has other uses.
4902 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4903}
4904
4905static bool isInv2Pi(const APFloat &APF) {
4906 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4907 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4908 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4909
4910 return APF.bitwiseIsEqual(KF16) ||
4911 APF.bitwiseIsEqual(KF32) ||
4912 APF.bitwiseIsEqual(KF64);
4913}
4914
4915// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4916// additional cost to negate them.
4919 if (C->isZero())
4920 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4921
4922 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4923 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4924
4925 return NegatibleCost::Neutral;
4926}
4927
4933
4939
4940static unsigned inverseMinMax(unsigned Opc) {
4941 switch (Opc) {
4942 case ISD::FMAXNUM:
4943 return ISD::FMINNUM;
4944 case ISD::FMINNUM:
4945 return ISD::FMAXNUM;
4946 case ISD::FMAXNUM_IEEE:
4947 return ISD::FMINNUM_IEEE;
4948 case ISD::FMINNUM_IEEE:
4949 return ISD::FMAXNUM_IEEE;
4950 case ISD::FMAXIMUM:
4951 return ISD::FMINIMUM;
4952 case ISD::FMINIMUM:
4953 return ISD::FMAXIMUM;
4954 case ISD::FMAXIMUMNUM:
4955 return ISD::FMINIMUMNUM;
4956 case ISD::FMINIMUMNUM:
4957 return ISD::FMAXIMUMNUM;
4958 case AMDGPUISD::FMAX_LEGACY:
4959 return AMDGPUISD::FMIN_LEGACY;
4960 case AMDGPUISD::FMIN_LEGACY:
4961 return AMDGPUISD::FMAX_LEGACY;
4962 default:
4963 llvm_unreachable("invalid min/max opcode");
4964 }
4965}
4966
4967/// \return true if it's profitable to try to push an fneg into its source
4968/// instruction.
4970 // If the input has multiple uses and we can either fold the negate down, or
4971 // the other uses cannot, give up. This both prevents unprofitable
4972 // transformations and infinite loops: we won't repeatedly try to fold around
4973 // a negate that has no 'good' form.
4974 if (N0.hasOneUse()) {
4975 // This may be able to fold into the source, but at a code size cost. Don't
4976 // fold if the fold into the user is free.
4977 if (allUsesHaveSourceMods(N, 0))
4978 return false;
4979 } else {
4980 if (fnegFoldsIntoOp(N0.getNode()) &&
4982 return false;
4983 }
4984
4985 return true;
4986}
4987
4988SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4989 DAGCombinerInfo &DCI) const {
4990 SelectionDAG &DAG = DCI.DAG;
4991 SDValue N0 = N->getOperand(0);
4992 EVT VT = N->getValueType(0);
4993
4994 unsigned Opc = N0.getOpcode();
4995
4996 if (!shouldFoldFNegIntoSrc(N, N0))
4997 return SDValue();
4998
4999 SDLoc SL(N);
5000 switch (Opc) {
5001 case ISD::FADD: {
5002 if (!mayIgnoreSignedZero(N0))
5003 return SDValue();
5004
5005 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5006 SDValue LHS = N0.getOperand(0);
5007 SDValue RHS = N0.getOperand(1);
5008
5009 if (LHS.getOpcode() != ISD::FNEG)
5010 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5011 else
5012 LHS = LHS.getOperand(0);
5013
5014 if (RHS.getOpcode() != ISD::FNEG)
5015 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5016 else
5017 RHS = RHS.getOperand(0);
5018
5019 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5020 if (Res.getOpcode() != ISD::FADD)
5021 return SDValue(); // Op got folded away.
5022 if (!N0.hasOneUse())
5023 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5024 return Res;
5025 }
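    // Editor's illustration (not part of the upstream source): an operand that
    // is already an fneg is stripped instead of re-negated, so for example
    //   (fneg (fadd x, (fneg y))) -> (fadd (fneg x), y)
    // and only the operand without an existing negate pays for a new fneg,
    // which source modifiers can usually absorb for free.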
5026 case ISD::FMUL:
5027  case AMDGPUISD::FMUL_LEGACY: {
5028    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5029 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5030 SDValue LHS = N0.getOperand(0);
5031 SDValue RHS = N0.getOperand(1);
5032
5033 if (LHS.getOpcode() == ISD::FNEG)
5034 LHS = LHS.getOperand(0);
5035 else if (RHS.getOpcode() == ISD::FNEG)
5036 RHS = RHS.getOperand(0);
5037 else
5038 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5039
5040 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5041 if (Res.getOpcode() != Opc)
5042 return SDValue(); // Op got folded away.
5043 if (!N0.hasOneUse())
5044 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5045 return Res;
5046 }
5047 case ISD::FMA:
5048 case ISD::FMAD: {
5049 // TODO: handle llvm.amdgcn.fma.legacy
5050 if (!mayIgnoreSignedZero(N0))
5051 return SDValue();
5052
5053 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5054 SDValue LHS = N0.getOperand(0);
5055 SDValue MHS = N0.getOperand(1);
5056 SDValue RHS = N0.getOperand(2);
5057
5058 if (LHS.getOpcode() == ISD::FNEG)
5059 LHS = LHS.getOperand(0);
5060 else if (MHS.getOpcode() == ISD::FNEG)
5061 MHS = MHS.getOperand(0);
5062 else
5063 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5064
5065 if (RHS.getOpcode() != ISD::FNEG)
5066 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5067 else
5068 RHS = RHS.getOperand(0);
5069
5070 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5071 if (Res.getOpcode() != Opc)
5072 return SDValue(); // Op got folded away.
5073 if (!N0.hasOneUse())
5074 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5075 return Res;
5076 }
5077 case ISD::FMAXNUM:
5078 case ISD::FMINNUM:
5079 case ISD::FMAXNUM_IEEE:
5080 case ISD::FMINNUM_IEEE:
5081 case ISD::FMINIMUM:
5082 case ISD::FMAXIMUM:
5083 case ISD::FMINIMUMNUM:
5084 case ISD::FMAXIMUMNUM:
5085  case AMDGPUISD::FMAX_LEGACY:
5086  case AMDGPUISD::FMIN_LEGACY: {
5087    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5088 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5089 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5090 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5091
5092 SDValue LHS = N0.getOperand(0);
5093 SDValue RHS = N0.getOperand(1);
5094
5095 // 0 doesn't have a negated inline immediate.
5096 // TODO: This constant check should be generalized to other operations.
5097    if (isConstantCostlierToNegate(RHS))
5098      return SDValue();
5099
5100 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5101 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5102 unsigned Opposite = inverseMinMax(Opc);
5103
5104 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5105 if (Res.getOpcode() != Opposite)
5106 return SDValue(); // Op got folded away.
5107 if (!N0.hasOneUse())
5108 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5109 return Res;
5110 }
5111 case AMDGPUISD::FMED3: {
5112 SDValue Ops[3];
5113 for (unsigned I = 0; I < 3; ++I)
5114 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5115
5116 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5117 if (Res.getOpcode() != AMDGPUISD::FMED3)
5118 return SDValue(); // Op got folded away.
5119
5120 if (!N0.hasOneUse()) {
5121 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5122 DAG.ReplaceAllUsesWith(N0, Neg);
5123
5124 for (SDNode *U : Neg->users())
5125 DCI.AddToWorklist(U);
5126 }
5127
5128 return Res;
5129 }
5130 case ISD::FP_EXTEND:
5131 case ISD::FTRUNC:
5132 case ISD::FRINT:
5133 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5134 case ISD::FROUNDEVEN:
5135 case ISD::FSIN:
5136 case ISD::FCANONICALIZE:
5137 case AMDGPUISD::RCP:
5138  case AMDGPUISD::RCP_LEGACY:
5139  case AMDGPUISD::RCP_IFLAG:
5140  case AMDGPUISD::SIN_HW: {
5141 SDValue CvtSrc = N0.getOperand(0);
5142 if (CvtSrc.getOpcode() == ISD::FNEG) {
5143 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5144 // (fneg (rcp (fneg x))) -> (rcp x)
5145 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5146 }
5147
5148 if (!N0.hasOneUse())
5149 return SDValue();
5150
5151 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5152 // (fneg (rcp x)) -> (rcp (fneg x))
5153 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5154 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5155 }
5156 case ISD::FP_ROUND: {
5157 SDValue CvtSrc = N0.getOperand(0);
5158
5159 if (CvtSrc.getOpcode() == ISD::FNEG) {
5160 // (fneg (fp_round (fneg x))) -> (fp_round x)
5161 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5162 CvtSrc.getOperand(0), N0.getOperand(1));
5163 }
5164
5165 if (!N0.hasOneUse())
5166 return SDValue();
5167
5168 // (fneg (fp_round x)) -> (fp_round (fneg x))
5169 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5170 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5171 }
5172 case ISD::FP16_TO_FP: {
5173 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5174 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5175 // Put the fneg back as a legal source operation that can be matched later.
5176 SDLoc SL(N);
5177
5178 SDValue Src = N0.getOperand(0);
5179 EVT SrcVT = Src.getValueType();
5180
5181 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5182 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5183 DAG.getConstant(0x8000, SL, SrcVT));
5184 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5185 }
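    // Editor's worked example (not part of the upstream source): the half
    // value sits in the low 16 bits of the integer source with its sign in
    // bit 15, so the XOR with 0x8000 negates it, e.g. 0x3c00 (1.0h) ^ 0x8000
    // = 0xbc00 (-1.0h), leaving an integer op that selection can later match
    // as a source modifier.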
5186 case ISD::SELECT: {
5187 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5188 // TODO: Invert conditions of foldFreeOpFromSelect
5189 return SDValue();
5190 }
5191 case ISD::BITCAST: {
5192 SDLoc SL(N);
5193 SDValue BCSrc = N0.getOperand(0);
5194 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5195 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5196 if (HighBits.getValueType().getSizeInBits() != 32 ||
5197 !fnegFoldsIntoOp(HighBits.getNode()))
5198 return SDValue();
5199
5200      // f64 fneg only really needs to operate on the high half of the
5201 // register, so try to force it to an f32 operation to help make use of
5202 // source modifiers.
5203 //
5204 //
5205 // fneg (f64 (bitcast (build_vector x, y))) ->
5206 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5207 // (fneg (bitcast i32:y to f32)))
5208
5209 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5210 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5211 SDValue CastBack =
5212 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5213
5214      SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
5215      Ops.back() = CastBack;
5216 DCI.AddToWorklist(NegHi.getNode());
5217 SDValue Build =
5218 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5219 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5220
5221 if (!N0.hasOneUse())
5222 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5223 return Result;
5224 }
5225
5226 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5227 BCSrc.hasOneUse()) {
5228 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5229 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5230
5231 // TODO: Cast back result for multiple uses is beneficial in some cases.
5232
5233 SDValue LHS =
5234 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5235 SDValue RHS =
5236 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5237
5238 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5239 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5240
5241 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5242 NegRHS);
5243 }
5244
5245 return SDValue();
5246 }
5247 default:
5248 return SDValue();
5249 }
5250}
5251
5252SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5253                                                 DAGCombinerInfo &DCI) const {
5254 SelectionDAG &DAG = DCI.DAG;
5255 SDValue N0 = N->getOperand(0);
5256
5257 if (!N0.hasOneUse())
5258 return SDValue();
5259
5260 switch (N0.getOpcode()) {
5261 case ISD::FP16_TO_FP: {
5262 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5263 SDLoc SL(N);
5264 SDValue Src = N0.getOperand(0);
5265 EVT SrcVT = Src.getValueType();
5266
5267 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5268 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5269 DAG.getConstant(0x7fff, SL, SrcVT));
5270 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5271 }
5272 default:
5273 return SDValue();
5274 }
5275}
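// Editor's worked example (not part of the upstream source): clearing bit 15
// of the i16 source drops the sign of the packed half, e.g. 0xbc00 (-1.0h)
// & 0x7fff = 0x3c00 (1.0h), mirroring the XOR trick used for fneg in
// performFNegCombine above.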
5276
5277SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5278                                                DAGCombinerInfo &DCI) const {
5279 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5280 if (!CFP)
5281 return SDValue();
5282
5283 // XXX - Should this flush denormals?
5284 const APFloat &Val = CFP->getValueAPF();
5285 APFloat One(Val.getSemantics(), "1.0");
5286 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5287}
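// Editor's illustration (not part of the upstream source): this folds the
// reciprocal of a floating-point constant at compile time, e.g.
// (AMDGPUISD::RCP (f32 2.0)) becomes the constant 0.5 via the APFloat
// division One / Val in the value's own semantics.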
5288
5289SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5290                                                DAGCombinerInfo &DCI) const {
5291 SelectionDAG &DAG = DCI.DAG;
5292 SDLoc DL(N);
5293
5294 switch(N->getOpcode()) {
5295 default:
5296 break;
5297 case ISD::BITCAST: {
5298 EVT DestVT = N->getValueType(0);
5299
5300 // Push casts through vector builds. This helps avoid emitting a large
5301 // number of copies when materializing floating point vector constants.
5302 //
5303 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5304 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5305 if (DestVT.isVector()) {
5306 SDValue Src = N->getOperand(0);
5307 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5308          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5309           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5310        EVT SrcVT = Src.getValueType();
5311 unsigned NElts = DestVT.getVectorNumElements();
5312
5313 if (SrcVT.getVectorNumElements() == NElts) {
5314 EVT DestEltVT = DestVT.getVectorElementType();
5315
5316 SmallVector<SDValue, 8> CastedElts;
5317 SDLoc SL(N);
5318 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5319 SDValue Elt = Src.getOperand(I);
5320 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5321 }
5322
5323 return DAG.getBuildVector(DestVT, SL, CastedElts);
5324 }
5325 }
5326 }
5327
5328 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5329 break;
5330
5331 // Fold bitcasts of constants.
5332 //
5333 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5334 // TODO: Generalize and move to DAGCombiner
5335 SDValue Src = N->getOperand(0);
5336    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5337      SDLoc SL(N);
5338 uint64_t CVal = C->getZExtValue();
5339 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5340 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5341 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5342 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5343 }
5344
5345    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5346      const APInt &Val = C->getValueAPF().bitcastToAPInt();
5347 SDLoc SL(N);
5348 uint64_t CVal = Val.getZExtValue();
5349 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5350 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5351 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5352
5353 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5354 }
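    // Editor's worked example (not part of the upstream source): for
    //   v2i32 (bitcast (i64 0x0000000100000002))
    // Lo_32/Hi_32 split the constant so the result is
    //   build_vector (i32 0x2), (i32 0x1)
    // with element 0 holding the low 32 bits of the original value.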
5355
5356 break;
5357 }
5358 case ISD::SHL:
5359 case ISD::SRA:
5360 case ISD::SRL: {
5361 // Range metadata can be invalidated when loads are converted to legal types
5362 // (e.g. v2i64 -> v4i32).
5363 // Try to convert vector shl/sra/srl before type legalization so that range
5364 // metadata can be utilized.
5365 if (!(N->getValueType(0).isVector() &&
5366          DCI.isBeforeLegalize()) &&
5367        DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5368      break;
5369 if (N->getOpcode() == ISD::SHL)
5370 return performShlCombine(N, DCI);
5371 if (N->getOpcode() == ISD::SRA)
5372 return performSraCombine(N, DCI);
5373 return performSrlCombine(N, DCI);
5374 }
5375 case ISD::TRUNCATE:
5376 return performTruncateCombine(N, DCI);
5377 case ISD::MUL:
5378 return performMulCombine(N, DCI);
5379 case AMDGPUISD::MUL_U24:
5380 case AMDGPUISD::MUL_I24: {
5381 if (SDValue Simplified = simplifyMul24(N, DCI))
5382 return Simplified;
5383 break;
5384 }
5385  case AMDGPUISD::MULHI_I24:
5386  case AMDGPUISD::MULHI_U24:
5387    return simplifyMul24(N, DCI);
5388 case ISD::SMUL_LOHI:
5389 case ISD::UMUL_LOHI:
5390 return performMulLoHiCombine(N, DCI);
5391 case ISD::MULHS:
5392 return performMulhsCombine(N, DCI);
5393 case ISD::MULHU:
5394 return performMulhuCombine(N, DCI);
5395 case ISD::SELECT:
5396 return performSelectCombine(N, DCI);
5397 case ISD::FNEG:
5398 return performFNegCombine(N, DCI);
5399 case ISD::FABS:
5400 return performFAbsCombine(N, DCI);
5401 case AMDGPUISD::BFE_I32:
5402 case AMDGPUISD::BFE_U32: {
5403 assert(!N->getValueType(0).isVector() &&
5404 "Vector handling of BFE not implemented");
5405 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5406 if (!Width)
5407 break;
5408
5409 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5410 if (WidthVal == 0)
5411 return DAG.getConstant(0, DL, MVT::i32);
5412
5413    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(2 - 1));
5414    if (!Offset)
5415 break;
5416
5417 SDValue BitsFrom = N->getOperand(0);
5418 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5419
5420 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5421
5422 if (OffsetVal == 0) {
5423 // This is already sign / zero extended, so try to fold away extra BFEs.
5424 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5425
5426 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5427 if (OpSignBits >= SignBits)
5428 return BitsFrom;
5429
5430 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5431 if (Signed) {
5432 // This is a sign_extend_inreg. Replace it to take advantage of existing
5433 // DAG Combines. If not eliminated, we will match back to BFE during
5434 // selection.
5435
5436        // TODO: The sext_inreg of extended types ends up here, although we could
5437 // handle them in a single BFE.
5438 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5439 DAG.getValueType(SmallVT));
5440 }
5441
5442 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5443 }
5444
5445 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5446 if (Signed) {
5447 return constantFoldBFE<int32_t>(DAG,
5448 CVal->getSExtValue(),
5449 OffsetVal,
5450 WidthVal,
5451 DL);
5452 }
5453
5454 return constantFoldBFE<uint32_t>(DAG,
5455 CVal->getZExtValue(),
5456 OffsetVal,
5457 WidthVal,
5458 DL);
5459 }
5460
5461 if ((OffsetVal + WidthVal) >= 32 &&
5462 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5463 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5464 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5465 BitsFrom, ShiftVal);
5466 }
5467
5468 if (BitsFrom.hasOneUse()) {
5469 APInt Demanded = APInt::getBitsSet(32,
5470 OffsetVal,
5471 OffsetVal + WidthVal);
5472
5473 KnownBits Known;
5474      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5475                                            !DCI.isBeforeLegalizeOps());
5476 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5477 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5478 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5479 DCI.CommitTargetLoweringOpt(TLO);
5480 }
5481 }
5482
5483 break;
5484 }
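  // Editor's worked example (not part of the upstream source): BFE_U32 with
  // offset 8 and width 8 extracts bits [8,15], i.e. (x >> 8) & 0xff. When
  // offset + width >= 32 the field reaches the top bit, so the combine above
  // rewrites it as a plain shift, e.g. offset 24, width 8 becomes (x >> 24)
  // (srl for BFE_U32, sra for BFE_I32).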
5485 case ISD::LOAD:
5486 return performLoadCombine(N, DCI);
5487 case ISD::STORE:
5488 return performStoreCombine(N, DCI);
5489 case AMDGPUISD::RCP:
5490  case AMDGPUISD::RCP_IFLAG:
5491    return performRcpCombine(N, DCI);
5492 case ISD::AssertZext:
5493 case ISD::AssertSext:
5494 return performAssertSZExtCombine(N, DCI);
5495  case ISD::INTRINSIC_WO_CHAIN:
5496    return performIntrinsicWOChainCombine(N, DCI);
5497 case AMDGPUISD::FMAD_FTZ: {
5498 SDValue N0 = N->getOperand(0);
5499 SDValue N1 = N->getOperand(1);
5500 SDValue N2 = N->getOperand(2);
5501 EVT VT = N->getValueType(0);
5502
5503 // FMAD_FTZ is a FMAD + flush denormals to zero.
5504 // We flush the inputs, the intermediate step, and the output.
5505    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5506    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5507    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5508    if (N0CFP && N1CFP && N2CFP) {
5509 const auto FTZ = [](const APFloat &V) {
5510 if (V.isDenormal()) {
5511 APFloat Zero(V.getSemantics(), 0);
5512 return V.isNegative() ? -Zero : Zero;
5513 }
5514 return V;
5515 };
5516
5517 APFloat V0 = FTZ(N0CFP->getValueAPF());
5518 APFloat V1 = FTZ(N1CFP->getValueAPF());
5519      APFloat V0 = FTZ(N0CFP->getValueAPF());
5520      APFloat V1 = FTZ(N1CFP->getValueAPF());
5521      APFloat V2 = FTZ(N2CFP->getValueAPF());
5522      V0.multiply(V1, APFloat::rmNearestTiesToEven);
5523      V0 = FTZ(V0);
5524 }
5525 break;
5526 }
5527 }
5528 return SDValue();
5529}
5530
5531//===----------------------------------------------------------------------===//
5532// Helper functions
5533//===----------------------------------------------------------------------===//
5534
5535SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5536                                                   const TargetRegisterClass *RC,
5537 Register Reg, EVT VT,
5538 const SDLoc &SL,
5539 bool RawReg) const {
5540  MachineFunction &MF = DAG.getMachineFunction();
5541  MachineRegisterInfo &MRI = MF.getRegInfo();
5542  Register VReg;
5543
5544 if (!MRI.isLiveIn(Reg)) {
5545 VReg = MRI.createVirtualRegister(RC);
5546 MRI.addLiveIn(Reg, VReg);
5547 } else {
5548 VReg = MRI.getLiveInVirtReg(Reg);
5549 }
5550
5551 if (RawReg)
5552 return DAG.getRegister(VReg, VT);
5553
5554 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5555}
5556
5557// This may be called multiple times, and nothing prevents creating multiple
5558// objects at the same offset. See if we already defined this object.
5559static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5560                                       int64_t Offset) {
5561 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5562 if (MFI.getObjectOffset(I) == Offset) {
5563 assert(MFI.getObjectSize(I) == Size);
5564 return I;
5565 }
5566 }
5567
5568 return MFI.CreateFixedObject(Size, Offset, true);
5569}
5570
5571SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5572                                                  EVT VT,
5573 const SDLoc &SL,
5574 int64_t Offset) const {
5575  MachineFunction &MF = DAG.getMachineFunction();
5576  MachineFrameInfo &MFI = MF.getFrameInfo();
5577 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5578
5579 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5580 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5581
5582 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5583                     MachineMemOperand::MODereferenceable |
5584                         MachineMemOperand::MOInvariant);
5585}
5586
5587SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5588                                                   const SDLoc &SL,
5589 SDValue Chain,
5590 SDValue ArgVal,
5591 int64_t Offset) const {
5592  MachineFunction &MF = DAG.getMachineFunction();
5593  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5594  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5595
5596 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5597 // Stores to the argument stack area are relative to the stack pointer.
5598 SDValue SP =
5599 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5600 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5601 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5602                               MachineMemOperand::MODereferenceable);
5603  return Store;
5604}
5605
5606SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5607                                             const TargetRegisterClass *RC,
5608 EVT VT, const SDLoc &SL,
5609 const ArgDescriptor &Arg) const {
5610 assert(Arg && "Attempting to load missing argument");
5611
5612 SDValue V = Arg.isRegister() ?
5613 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5614 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5615
5616 if (!Arg.isMasked())
5617 return V;
5618
5619 unsigned Mask = Arg.getMask();
5620 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5621 V = DAG.getNode(ISD::SRL, SL, VT, V,
5622 DAG.getShiftAmountConstant(Shift, VT, SL));
5623 return DAG.getNode(ISD::AND, SL, VT, V,
5624 DAG.getConstant(Mask >> Shift, SL, VT));
5625}
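// Editor's worked example (not part of the upstream source): for a masked
// argument descriptor with Mask = 0x3ff0, countr_zero gives Shift = 4, so the
// value is unpacked as ((V >> 4) & 0x3ff), isolating the 10-bit field packed
// into bits [4,13] of the input register.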
5626
5627uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5628    uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5629 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5630 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5631 uint64_t ArgOffset =
5632 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5633 switch (Param) {
5634 case FIRST_IMPLICIT:
5635 return ArgOffset;
5636 case PRIVATE_BASE:
5637    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5638  case SHARED_BASE:
5639 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5640 case QUEUE_PTR:
5641 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5642 }
5643 llvm_unreachable("unexpected implicit parameter type");
5644}
5645
5651
5652#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5653
5654const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5655 switch ((AMDGPUISD::NodeType)Opcode) {
5656 case AMDGPUISD::FIRST_NUMBER: break;
5657 // AMDIL DAG nodes
5658 NODE_NAME_CASE(BRANCH_COND);
5659
5660 // AMDGPU DAG nodes
5661 NODE_NAME_CASE(IF)
5662 NODE_NAME_CASE(ELSE)
5663 NODE_NAME_CASE(LOOP)
5664 NODE_NAME_CASE(CALL)
5665 NODE_NAME_CASE(TC_RETURN)
5666 NODE_NAME_CASE(TC_RETURN_GFX)
5667 NODE_NAME_CASE(TC_RETURN_GFX_WholeWave)
5668 NODE_NAME_CASE(TC_RETURN_CHAIN)
5669 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
5670 NODE_NAME_CASE(TRAP)
5671 NODE_NAME_CASE(RET_GLUE)
5672 NODE_NAME_CASE(WAVE_ADDRESS)
5673 NODE_NAME_CASE(RETURN_TO_EPILOG)
5674 NODE_NAME_CASE(ENDPGM)
5675 NODE_NAME_CASE(ENDPGM_TRAP)
5676 NODE_NAME_CASE(SIMULATED_TRAP)
5677 NODE_NAME_CASE(DWORDADDR)
5678 NODE_NAME_CASE(FRACT)
5679 NODE_NAME_CASE(SETCC)
5680 NODE_NAME_CASE(DENORM_MODE)
5681 NODE_NAME_CASE(FMA_W_CHAIN)
5682 NODE_NAME_CASE(FMUL_W_CHAIN)
5683 NODE_NAME_CASE(CLAMP)
5684 NODE_NAME_CASE(COS_HW)
5685 NODE_NAME_CASE(SIN_HW)
5686 NODE_NAME_CASE(FMAX_LEGACY)
5687 NODE_NAME_CASE(FMIN_LEGACY)
5688 NODE_NAME_CASE(FMAX3)
5689 NODE_NAME_CASE(SMAX3)
5690 NODE_NAME_CASE(UMAX3)
5691 NODE_NAME_CASE(FMIN3)
5692 NODE_NAME_CASE(SMIN3)
5693 NODE_NAME_CASE(UMIN3)
5694 NODE_NAME_CASE(FMED3)
5695 NODE_NAME_CASE(SMED3)
5696 NODE_NAME_CASE(UMED3)
5697 NODE_NAME_CASE(FMAXIMUM3)
5698 NODE_NAME_CASE(FMINIMUM3)
5699 NODE_NAME_CASE(FDOT2)
5700 NODE_NAME_CASE(URECIP)
5701 NODE_NAME_CASE(DIV_SCALE)
5702 NODE_NAME_CASE(DIV_FMAS)
5703 NODE_NAME_CASE(DIV_FIXUP)
5704 NODE_NAME_CASE(FMAD_FTZ)
5705 NODE_NAME_CASE(RCP)
5706 NODE_NAME_CASE(RSQ)
5707 NODE_NAME_CASE(RCP_LEGACY)
5708 NODE_NAME_CASE(RCP_IFLAG)
5709 NODE_NAME_CASE(LOG)
5710 NODE_NAME_CASE(EXP)
5711 NODE_NAME_CASE(FMUL_LEGACY)
5712 NODE_NAME_CASE(RSQ_CLAMP)
5713 NODE_NAME_CASE(FP_CLASS)
5714 NODE_NAME_CASE(DOT4)
5715 NODE_NAME_CASE(CARRY)
5716 NODE_NAME_CASE(BORROW)
5717 NODE_NAME_CASE(BFE_U32)
5718 NODE_NAME_CASE(BFE_I32)
5719 NODE_NAME_CASE(BFI)
5720 NODE_NAME_CASE(BFM)
5721 NODE_NAME_CASE(FFBH_U32)
5722 NODE_NAME_CASE(FFBH_I32)
5723 NODE_NAME_CASE(FFBL_B32)
5724 NODE_NAME_CASE(MUL_U24)
5725 NODE_NAME_CASE(MUL_I24)
5726 NODE_NAME_CASE(MULHI_U24)
5727 NODE_NAME_CASE(MULHI_I24)
5728 NODE_NAME_CASE(MAD_U24)
5729 NODE_NAME_CASE(MAD_I24)
5730 NODE_NAME_CASE(MAD_I64_I32)
5731 NODE_NAME_CASE(MAD_U64_U32)
5732 NODE_NAME_CASE(PERM)
5733 NODE_NAME_CASE(TEXTURE_FETCH)
5734 NODE_NAME_CASE(R600_EXPORT)
5735 NODE_NAME_CASE(CONST_ADDRESS)
5736 NODE_NAME_CASE(REGISTER_LOAD)
5737 NODE_NAME_CASE(REGISTER_STORE)
5738 NODE_NAME_CASE(CVT_F32_UBYTE0)
5739 NODE_NAME_CASE(CVT_F32_UBYTE1)
5740 NODE_NAME_CASE(CVT_F32_UBYTE2)
5741 NODE_NAME_CASE(CVT_F32_UBYTE3)
5742 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5743 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5744 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5745 NODE_NAME_CASE(CVT_PK_I16_I32)
5746 NODE_NAME_CASE(CVT_PK_U16_U32)
5747 NODE_NAME_CASE(FP_TO_FP16)
5748 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5749 NODE_NAME_CASE(CONST_DATA_PTR)
5750 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5751 NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
5752  NODE_NAME_CASE(LDS)
5753  NODE_NAME_CASE(DUMMY_CHAIN)
5754 NODE_NAME_CASE(LOAD_D16_HI)
5755 NODE_NAME_CASE(LOAD_D16_LO)
5756 NODE_NAME_CASE(LOAD_D16_HI_I8)
5757 NODE_NAME_CASE(LOAD_D16_HI_U8)
5758 NODE_NAME_CASE(LOAD_D16_LO_I8)
5759 NODE_NAME_CASE(LOAD_D16_LO_U8)
5760 NODE_NAME_CASE(STORE_MSKOR)
5761 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5762 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5763 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5764 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5765 NODE_NAME_CASE(DS_ORDERED_COUNT)
5766 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5767 NODE_NAME_CASE(BUFFER_LOAD)
5768 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5769 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5770 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5771 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5772 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5773 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5774 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5775 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5776 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5777 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5778 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5779 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5780 NODE_NAME_CASE(SBUFFER_LOAD)
5781 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5782 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5783 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5784 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5785 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5786 NODE_NAME_CASE(BUFFER_STORE)
5787 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5788 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5789 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5790 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5791 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5792 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5793 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5794 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5795 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5796 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5797 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5798 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5799 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5800 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5801 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5802 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5803 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5804 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5805 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5806 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5807 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5808 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5809 NODE_NAME_CASE(WHOLE_WAVE_SETUP)
5810 NODE_NAME_CASE(WHOLE_WAVE_RETURN)
5811 }
5812 return nullptr;
5813}
5814
5815SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5816                                              SelectionDAG &DAG, int Enabled,
5817 int &RefinementSteps,
5818 bool &UseOneConstNR,
5819 bool Reciprocal) const {
5820 EVT VT = Operand.getValueType();
5821
5822 if (VT == MVT::f32) {
5823 RefinementSteps = 0;
5824 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5825 }
5826
5827  // TODO: There is also an f64 rsq instruction, but the documentation is less
5828 // clear on its precision.
5829
5830 return SDValue();
5831}
5832
5833SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5834                                               SelectionDAG &DAG, int Enabled,
5835 int &RefinementSteps) const {
5836 EVT VT = Operand.getValueType();
5837
5838 if (VT == MVT::f32) {
5839 // Reciprocal, < 1 ulp error.
5840 //
5841    // This reciprocal approximation converges to < 0.5 ulp error with one
5842    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5843
5844 RefinementSteps = 0;
5845 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5846 }
5847
5848  // TODO: There is also an f64 rcp instruction, but the documentation is less
5849 // clear on its precision.
5850
5851 return SDValue();
5852}
5853
5854static unsigned workitemIntrinsicDim(unsigned ID) {
5855 switch (ID) {
5856 case Intrinsic::amdgcn_workitem_id_x:
5857 return 0;
5858 case Intrinsic::amdgcn_workitem_id_y:
5859 return 1;
5860 case Intrinsic::amdgcn_workitem_id_z:
5861 return 2;
5862 default:
5863 llvm_unreachable("not a workitem intrinsic");
5864 }
5865}
5866
5867void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5868    const SDValue Op, KnownBits &Known,
5869 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5870
5871 Known.resetAll(); // Don't know anything.
5872
5873 unsigned Opc = Op.getOpcode();
5874
5875 switch (Opc) {
5876 default:
5877 break;
5878 case AMDGPUISD::CARRY:
5879 case AMDGPUISD::BORROW: {
5880 Known.Zero = APInt::getHighBitsSet(32, 31);
5881 break;
5882 }
5883
5884 case AMDGPUISD::BFE_I32:
5885 case AMDGPUISD::BFE_U32: {
5886 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5887 if (!CWidth)
5888 return;
5889
5890 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5891
5892 if (Opc == AMDGPUISD::BFE_U32)
5893 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5894
5895 break;
5896 }
5897 case AMDGPUISD::FP_TO_FP16: {
5898 unsigned BitWidth = Known.getBitWidth();
5899
5900 // High bits are zero.
5901    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5902    break;
5903 }
5904 case AMDGPUISD::MUL_U24:
5905 case AMDGPUISD::MUL_I24: {
5906 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5907 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5908 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5909 RHSKnown.countMinTrailingZeros();
5910 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5911 // Skip extra check if all bits are known zeros.
5912 if (TrailZ >= 32)
5913 break;
5914
5915 // Truncate to 24 bits.
5916 LHSKnown = LHSKnown.trunc(24);
5917 RHSKnown = RHSKnown.trunc(24);
5918
5919 if (Opc == AMDGPUISD::MUL_I24) {
5920 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5921 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5922 unsigned MaxValBits = LHSValBits + RHSValBits;
5923 if (MaxValBits > 32)
5924 break;
5925 unsigned SignBits = 32 - MaxValBits + 1;
5926 bool LHSNegative = LHSKnown.isNegative();
5927 bool LHSNonNegative = LHSKnown.isNonNegative();
5928 bool LHSPositive = LHSKnown.isStrictlyPositive();
5929 bool RHSNegative = RHSKnown.isNegative();
5930 bool RHSNonNegative = RHSKnown.isNonNegative();
5931 bool RHSPositive = RHSKnown.isStrictlyPositive();
5932
5933 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5934 Known.Zero.setHighBits(SignBits);
5935 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5936 Known.One.setHighBits(SignBits);
5937 } else {
5938 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5939 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5940 unsigned MaxValBits = LHSValBits + RHSValBits;
5941 if (MaxValBits >= 32)
5942 break;
5943 Known.Zero.setBitsFrom(MaxValBits);
5944 }
5945 break;
5946 }
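  // Editor's worked example (not part of the upstream source): if the two
  // unsigned 24-bit operands are known to use at most 10 and 12 active bits,
  // the product fits in 22 bits, so the MUL_U24 path above marks bits 22..31
  // as known zero via setBitsFrom(22).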
5947 case AMDGPUISD::PERM: {
5948 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5949 if (!CMask)
5950 return;
5951
5952 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5953 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5954 unsigned Sel = CMask->getZExtValue();
5955
5956 for (unsigned I = 0; I < 32; I += 8) {
5957 unsigned SelBits = Sel & 0xff;
5958 if (SelBits < 4) {
5959 SelBits *= 8;
5960 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5961 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5962 } else if (SelBits < 7) {
5963 SelBits = (SelBits & 3) * 8;
5964 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5965 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5966 } else if (SelBits == 0x0c) {
5967 Known.Zero |= 0xFFull << I;
5968 } else if (SelBits > 0x0c) {
5969 Known.One |= 0xFFull << I;
5970 }
5971 Sel >>= 8;
5972 }
5973 break;
5974 }
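  // Editor's worked example (not part of the upstream source): each selector
  // byte picks one result byte; values 0-3 take a byte of operand 1, values
  // 4-6 take byte (sel & 3) of operand 0, 0x0c forces 0x00 and values above
  // 0x0c force 0xff. For Sel = 0x0c050400 the known bits come from: byte0 =
  // op1 byte0, byte1 = op0 byte0, byte2 = op0 byte1, byte3 known zero.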
5975  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5976    Known.Zero.setHighBits(24);
5977 break;
5978 }
5979  case AMDGPUISD::BUFFER_LOAD_USHORT: {
5980    Known.Zero.setHighBits(16);
5981 break;
5982 }
5983 case AMDGPUISD::LDS: {
5984 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5985 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5986
5987 Known.Zero.setHighBits(16);
5988 Known.Zero.setLowBits(Log2(Alignment));
5989 break;
5990 }
5991 case AMDGPUISD::SMIN3:
5992 case AMDGPUISD::SMAX3:
5993 case AMDGPUISD::SMED3:
5994 case AMDGPUISD::UMIN3:
5995 case AMDGPUISD::UMAX3:
5996 case AMDGPUISD::UMED3: {
5997 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5998 if (Known2.isUnknown())
5999 break;
6000
6001 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6002 if (Known1.isUnknown())
6003 break;
6004
6005 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6006 if (Known0.isUnknown())
6007 break;
6008
6009 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6010 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6011 Known.One = Known0.One & Known1.One & Known2.One;
6012 break;
6013 }
6014  case ISD::INTRINSIC_WO_CHAIN: {
6015    unsigned IID = Op.getConstantOperandVal(0);
6016 switch (IID) {
6017 case Intrinsic::amdgcn_workitem_id_x:
6018 case Intrinsic::amdgcn_workitem_id_y:
6019 case Intrinsic::amdgcn_workitem_id_z: {
6020 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6021          DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
6022      Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6023 break;
6024 }
6025 default:
6026 break;
6027 }
6028 }
6029 }
6030}
6031
6032unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6033    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6034 unsigned Depth) const {
6035 switch (Op.getOpcode()) {
6036 case AMDGPUISD::BFE_I32: {
6037 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6038 if (!Width)
6039 return 1;
6040
6041 unsigned SignBits = 32 - Width->getZExtValue() + 1;
6042 if (!isNullConstant(Op.getOperand(1)))
6043 return SignBits;
6044
6045 // TODO: Could probably figure something out with non-0 offsets.
6046 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6047 return std::max(SignBits, Op0SignBits);
6048 }
6049
6050 case AMDGPUISD::BFE_U32: {
6051 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6052 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6053 }
6054
6055 case AMDGPUISD::CARRY:
6056 case AMDGPUISD::BORROW:
6057    return 31;
6058  case AMDGPUISD::BUFFER_LOAD_BYTE:
6059    return 25;
6060  case AMDGPUISD::BUFFER_LOAD_SHORT:
6061    return 17;
6062  case AMDGPUISD::BUFFER_LOAD_UBYTE:
6063    return 24;
6064  case AMDGPUISD::BUFFER_LOAD_USHORT:
6065    return 16;
6066  case AMDGPUISD::FP_TO_FP16:
6067    return 16;
6068 case AMDGPUISD::SMIN3:
6069 case AMDGPUISD::SMAX3:
6070 case AMDGPUISD::SMED3:
6071 case AMDGPUISD::UMIN3:
6072 case AMDGPUISD::UMAX3:
6073 case AMDGPUISD::UMED3: {
6074 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6075 if (Tmp2 == 1)
6076 return 1; // Early out.
6077
6078 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6079 if (Tmp1 == 1)
6080 return 1; // Early out.
6081
6082 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6083 if (Tmp0 == 1)
6084 return 1; // Early out.
6085
6086 return std::min({Tmp0, Tmp1, Tmp2});
6087 }
6088 default:
6089 return 1;
6090 }
6091}
6092
6093unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6094    GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6095 const MachineRegisterInfo &MRI, unsigned Depth) const {
6096 const MachineInstr *MI = MRI.getVRegDef(R);
6097 if (!MI)
6098 return 1;
6099
6100 // TODO: Check range metadata on MMO.
6101 switch (MI->getOpcode()) {
6102 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6103 return 25;
6104 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6105 return 17;
6106 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6107 return 24;
6108 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6109 return 16;
6110 case AMDGPU::G_AMDGPU_SMED3:
6111 case AMDGPU::G_AMDGPU_UMED3: {
6112 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6113 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6114 if (Tmp2 == 1)
6115 return 1;
6116 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6117 if (Tmp1 == 1)
6118 return 1;
6119 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6120 if (Tmp0 == 1)
6121 return 1;
6122 return std::min({Tmp0, Tmp1, Tmp2});
6123 }
6124 default:
6125 return 1;
6126 }
6127}
6128
6129bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6130    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6131 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6132 unsigned Opcode = Op.getOpcode();
6133 switch (Opcode) {
6134 case AMDGPUISD::BFE_I32:
6135 case AMDGPUISD::BFE_U32:
6136 return false;
6137 }
6138  return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6139      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6140}
6141
6142bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6143    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6144 unsigned Depth) const {
6145 unsigned Opcode = Op.getOpcode();
6146 switch (Opcode) {
6147  case AMDGPUISD::FMIN_LEGACY:
6148  case AMDGPUISD::FMAX_LEGACY: {
6149    if (SNaN)
6150 return true;
6151
6152 // TODO: Can check no nans on one of the operands for each one, but which
6153 // one?
6154 return false;
6155 }
6156  case AMDGPUISD::FMUL_LEGACY:
6157  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6158    if (SNaN)
6159 return true;
6160 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6161 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6162 }
6163 case AMDGPUISD::FMED3:
6164 case AMDGPUISD::FMIN3:
6165 case AMDGPUISD::FMAX3:
6166  case AMDGPUISD::FMINIMUM3:
6167  case AMDGPUISD::FMAXIMUM3:
6168  case AMDGPUISD::FMAD_FTZ: {
6169 if (SNaN)
6170 return true;
6171 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6172 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6173 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6174 }
6175  case AMDGPUISD::CVT_F32_UBYTE0:
6176  case AMDGPUISD::CVT_F32_UBYTE1:
6177  case AMDGPUISD::CVT_F32_UBYTE2:
6178  case AMDGPUISD::CVT_F32_UBYTE3:
6179    return true;
6180
6181 case AMDGPUISD::RCP:
6182 case AMDGPUISD::RSQ:
6183  case AMDGPUISD::RCP_LEGACY:
6184  case AMDGPUISD::RSQ_CLAMP: {
6185 if (SNaN)
6186 return true;
6187
6188 // TODO: Need is known positive check.
6189 return false;
6190 }
6191 case ISD::FLDEXP:
6192 case AMDGPUISD::FRACT: {
6193 if (SNaN)
6194 return true;
6195 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6196 }
6197  case AMDGPUISD::DIV_SCALE:
6198  case AMDGPUISD::DIV_FMAS:
6199  case AMDGPUISD::DIV_FIXUP:
6200    // TODO: Refine on operands.
6201 return SNaN;
6202 case AMDGPUISD::SIN_HW:
6203 case AMDGPUISD::COS_HW: {
6204 // TODO: Need check for infinity
6205 return SNaN;
6206 }
6207  case ISD::INTRINSIC_WO_CHAIN: {
6208    unsigned IntrinsicID = Op.getConstantOperandVal(0);
6209 // TODO: Handle more intrinsics
6210 switch (IntrinsicID) {
6211 case Intrinsic::amdgcn_cubeid:
6212 case Intrinsic::amdgcn_cvt_off_f32_i4:
6213 return true;
6214
6215 case Intrinsic::amdgcn_frexp_mant: {
6216 if (SNaN)
6217 return true;
6218 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6219 }
6220 case Intrinsic::amdgcn_cvt_pkrtz: {
6221 if (SNaN)
6222 return true;
6223 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6224 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6225 }
6226 case Intrinsic::amdgcn_rcp:
6227 case Intrinsic::amdgcn_rsq:
6228 case Intrinsic::amdgcn_rcp_legacy:
6229 case Intrinsic::amdgcn_rsq_legacy:
6230 case Intrinsic::amdgcn_rsq_clamp:
6231 case Intrinsic::amdgcn_tanh: {
6232 if (SNaN)
6233 return true;
6234
6235 // TODO: Need is known positive check.
6236 return false;
6237 }
6238 case Intrinsic::amdgcn_trig_preop:
6239 case Intrinsic::amdgcn_fdot2:
6240 // TODO: Refine on operand
6241 return SNaN;
6242 case Intrinsic::amdgcn_fma_legacy:
6243 if (SNaN)
6244 return true;
6245 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6246 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6247 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6248 default:
6249 return false;
6250 }
6251 }
6252 default:
6253 return false;
6254 }
6255}
6256
6257bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6258                                               Register N0, Register N1) const {
6259 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6260}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1414
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1181
const fltSemantics & getSemantics() const
Definition APFloat.h:1457
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1199
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1158
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function...
Definition Function.h:270
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by this operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
Wrapper class representing virtual and physical registers.
Definition Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
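As a small illustration of the node-building entries above (getNode together with getShiftAmountConstant), a hedged sketch: DAG, DL and X stand in for values a combine would already have, and the helper name buildHighHalfShift is hypothetical:

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Form (srl X, 16) on an i32 value; the shift amount uses the target's
  // preferred shift-amount type for i32.
  static SDValue buildHighHalfShift(SelectionDAG &DAG, const SDLoc &DL,
                                    SDValue X) {
    SDValue Amt = DAG.getShiftAmountConstant(16, MVT::i32, DL);
    return DAG.getNode(ISD::SRL, DL, MVT::i32, X, Amt);
  }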
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
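computeKnownBits and MaskedValueIsZero are the usual way a combine proves that high bits are zero before narrowing an operation. A sketch under the assumption that Op is an i32 value; the helper name is made up:

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // True if the top 16 bits of the i32 value Op are known to be zero.
  static bool highHalfIsZero(SelectionDAG &DAG, SDValue Op) {
    APInt HighMask = APInt::getHighBitsSet(32, 16);
    return DAG.MaskedValueIsZero(Op, HighMask);
  }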
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
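The setTruncStoreAction and setLoadExtAction hooks above are normally invoked from a target's TargetLowering constructor. A purely illustrative subclass, assuming nothing about any real target (ExampleTLI is not an LLVM class):

  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  class ExampleTLI : public TargetLowering {
  public:
    explicit ExampleTLI(const TargetMachine &TM) : TargetLowering(TM) {
      // Extending i1 loads are not natively supported here: promote them.
      for (MVT VT : MVT::integer_valuetypes()) {
        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
        setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      }
      // Truncating an i64 store down to i1 must be expanded.
      setTruncStoreAction(MVT::i64, MVT::i1, Expand);
    }
  };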
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
Definition MathExtras.h:49
constexpr double ln10
Definition MathExtras.h:50
constexpr float log2ef
Definition MathExtras.h:66
constexpr double log2e
Definition MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:222
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
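Hi_32 and Lo_32 split a 64-bit integer into its 32-bit halves, the scalar counterpart of the 64-bit SDValue splitting done elsewhere in this file. A tiny sketch; splitImm64 is an illustrative name:

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  using namespace llvm;

  // Split a 64-bit immediate into its 32-bit halves.
  static void splitImm64(uint64_t Imm, uint32_t &Lo, uint32_t &Hi) {
    Lo = Lo_32(Imm);
    Hi = Hi_32(Imm);
  }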
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1569
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
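A sketch of the alignment arithmetic the helpers above provide, of the kind used when splitting a memory access in two; all values here are made-up inputs:

  #include "llvm/Support/Alignment.h"
  #include <cstdint>
  using namespace llvm;

  static void alignmentExamples() {
    Align Base(8);
    Align Second = commonAlignment(Base, /*Offset=*/4); // Align(4)
    uint64_t Rounded = alignTo(/*Size=*/6, Base);       // 8
    unsigned LogBase = Log2(Base);                      // 3
    (void)Second; (void)Rounded; (void)LogBase;
  }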
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
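A short sketch of how the EVT queries above are typically combined when massaging vector types; the function widenToPow2Integer is illustrative and not part of this file:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // Round a vector type up to a power-of-two element count and take the
  // equivalently sized integer view of the result.
  static EVT widenToPow2Integer(LLVMContext &Ctx, EVT VT) {
    if (VT.isVector() && !VT.isPow2VectorType())
      VT = VT.getPow2VectorType(Ctx);   // e.g. v3f32 -> v4f32
    return VT.changeTypeToInteger();    // e.g. v4f32 -> v4i32
  }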
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:129
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:269
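The KnownBits queries above are the building blocks for range checks such as the 24-bit-multiply tests in this file; a hedged sketch with hypothetical helper names:

  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // Could every possible value fit in an unsigned / signed 24-bit integer?
  static bool fitsInU24(const KnownBits &Known) {
    return Known.countMaxActiveBits() <= 24;
  }
  static bool fitsInI24(const KnownBits &Known) {
    return Known.countMaxSignificantBits() <= 24;
  }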
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...