1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
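// For example, a v2i8 (16-bit) vector becomes i16, a v4i16 (64-bit) vector
// becomes v2i32, and an oddly sized type such as v3i16 (48 bits) is returned
// unchanged.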
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
50
53}
54
 56 // In order for this to be a signed 24-bit value, bit 23 must
 57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
 64 // Always lower memset, memcpy, and memmove intrinsics to load/store
 65 // instructions, rather than generating calls to memset, memcpy, or memmove.
69
70 // Lower floating point store/load to integer store/load to reduce the number
71 // of patterns in tablegen.
73 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
74
76 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
77
79 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
80
82 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
83
85 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
86
88 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
89
91 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
92
94 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
95
97 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
98
100 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
101
102 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
104
105 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
107
108 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
110
111 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
113
115 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
116
118 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
119
121 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
122
124 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
125
127 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
128
130 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
131
133 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
134
136 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
137
139 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
140
142 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
143
144 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
145 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
149
151 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
152
153 // TODO: Would be better to consume as directly legal
155 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
156
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
159
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
162
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
165
167 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
168
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
171
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
174
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
177
178 // There are no 64-bit extloads. These should be done as a 32-bit extload and
179 // an extension to 64-bit.
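  // For example, a 64-bit sign-extending load from an i32 memory type is
  // expanded into a 32-bit load followed by a sign extension to i64.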
180 for (MVT VT : MVT::integer_valuetypes())
182 Expand);
183
184 for (MVT VT : MVT::integer_valuetypes()) {
185 if (VT == MVT::i64)
186 continue;
187
188 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
189 setLoadExtAction(Op, VT, MVT::i1, Promote);
190 setLoadExtAction(Op, VT, MVT::i8, Legal);
191 setLoadExtAction(Op, VT, MVT::i16, Legal);
192 setLoadExtAction(Op, VT, MVT::i32, Expand);
193 }
194 }
195
197 for (auto MemVT :
198 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
200 Expand);
201
202 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
203 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
204 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
205 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
216
217 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
223
224 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
236
238 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
239
241 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
245
247 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
251
253 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
278
280 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
281
283 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
284
286 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
287
289 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
290
292 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
293
295 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
296
298 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
299
301 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
302
304 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
305
307 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
308
310 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
311
313 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
314
316 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
317
318 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
319 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
320 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
321 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
322
323 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
324 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
325 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
326 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
327
328 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
329 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
330 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
331 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
332 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
333 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
334 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
335 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
336 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
337 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
338 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
339 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
340 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
341 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
342
343 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
344 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
345 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
346
347 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
348 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
349 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
350
351 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
352
353 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
354 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
355 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
356 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
357 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
358 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
359 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
360
361 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
362 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
363 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
364 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
365 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
366
367 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
368 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
369 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
370
371 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
372 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
373 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
 374 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
 376 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
 378 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
379
380 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
381 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
382
384
385 // For R600, this is totally unsupported, just custom lower to produce an
386 // error.
388
389 // Library functions. These default to Expand, but we have instructions
390 // for them.
393 MVT::f32, Legal);
394
396 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
397
400 Custom);
401
402 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
403
404 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
405
406 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
407
408 if (Subtarget->has16BitInsts())
409 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
410 else {
411 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
413 }
414
416 Custom);
417
418 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
419 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
420 // default unless marked custom/legal.
423 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
424 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
425 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
426 Custom);
427
428 // Expand to fneg + fadd.
430
432 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
433 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
434 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
435 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
436 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
437 Custom);
438
439 // FIXME: Why is v8f16/v8bf16 missing?
442 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
443 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
444 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
445 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
446 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
447 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
448 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
449 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
450 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
451 MVT::v32i16, MVT::v32f16, MVT::v32bf16},
452 Custom);
453
455 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
456
457 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
458 for (MVT VT : ScalarIntVTs) {
459 // These should use [SU]DIVREM, so set them to expand
461 Expand);
462
463 // GPU does not have divrem function for signed or unsigned.
465
466 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
468
470
471 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
473 }
474
475 // The hardware supports 32-bit FSHR, but not FSHL.
477
478 // The hardware supports 32-bit ROTR, but not ROTL.
479 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
481
483
487 MVT::i64, Custom);
489
491 Legal);
492
495 MVT::i64, Custom);
496
497 for (auto VT : {MVT::i8, MVT::i16})
499
500 static const MVT::SimpleValueType VectorIntTypes[] = {
501 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
502 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
503
504 for (MVT VT : VectorIntTypes) {
505 // Expand the following operations for the current type by default.
517 ISD::SETCC},
518 VT, Expand);
519 }
520
521 static const MVT::SimpleValueType FloatVectorTypes[] = {
522 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
523 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
524
525 for (MVT VT : FloatVectorTypes) {
538 VT, Expand);
539 }
540
541 // This causes using an unrolled select operation rather than expansion with
 542 // bit operations. This is generally better, but the alternative using BFI
543 // instructions may be better if the select sources are SGPRs.
545 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
546
548 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
549
551 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
552
554 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
555
557 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
558
560 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
561
563 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
564
566 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
567
569 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
570
572 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
573
574 // Disable most libcalls.
575 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
576 if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
577 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
578 }
579
581 setJumpIsExpensive(true);
582
583 // FIXME: This is only partially true. If we have to do vector compares, any
584 // SGPR pair can be a condition register. If we have a uniform condition, we
585 // are better off doing SALU operations, where there is only one SCC. For now,
586 // we don't have a way of knowing during instruction selection if a condition
587 // will be uniform and we always use vector compares. Assume we are using
588 // vector compares until that is fixed.
590
593
595
596 // We want to find all load dependencies for long chains of stores to enable
597 // merging into very wide vectors. The problem is with vectors with > 4
598 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
599 // vectors are a legal type, even though we have to split the loads
600 // usually. When we can more precisely specify load legality per address
601 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
602 // smarter so that they can figure out what to do in 2 iterations without all
603 // N > 4 stores on the same chain.
605
606 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
607 // about these during lowering.
608 MaxStoresPerMemcpy = 0xffffffff;
609 MaxStoresPerMemmove = 0xffffffff;
610 MaxStoresPerMemset = 0xffffffff;
611
612 // The expansion for 64-bit division is enormous.
614 addBypassSlowDiv(64, 32);
615
626
630}
631
633 if (getTargetMachine().Options.NoSignedZerosFPMath)
634 return true;
635
636 const auto Flags = Op.getNode()->getFlags();
637 if (Flags.hasNoSignedZeros())
638 return true;
639
640 return false;
641}
642
643//===----------------------------------------------------------------------===//
644// Target Information
645//===----------------------------------------------------------------------===//
646
648static bool fnegFoldsIntoOpcode(unsigned Opc) {
649 switch (Opc) {
650 case ISD::FADD:
651 case ISD::FSUB:
652 case ISD::FMUL:
653 case ISD::FMA:
654 case ISD::FMAD:
655 case ISD::FMINNUM:
656 case ISD::FMAXNUM:
659 case ISD::FMINIMUM:
660 case ISD::FMAXIMUM:
661 case ISD::SELECT:
662 case ISD::FSIN:
663 case ISD::FTRUNC:
664 case ISD::FRINT:
665 case ISD::FNEARBYINT:
666 case ISD::FROUNDEVEN:
668 case AMDGPUISD::RCP:
675 case AMDGPUISD::FMED3:
676 // TODO: handle llvm.amdgcn.fma.legacy
677 return true;
678 case ISD::BITCAST:
679 llvm_unreachable("bitcast is special cased");
680 default:
681 return false;
682 }
683}
684
685static bool fnegFoldsIntoOp(const SDNode *N) {
686 unsigned Opc = N->getOpcode();
687 if (Opc == ISD::BITCAST) {
688 // TODO: Is there a benefit to checking the conditions performFNegCombine
689 // does? We don't for the other cases.
690 SDValue BCSrc = N->getOperand(0);
691 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
692 return BCSrc.getNumOperands() == 2 &&
693 BCSrc.getOperand(1).getValueSizeInBits() == 32;
694 }
695
696 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
697 }
698
699 return fnegFoldsIntoOpcode(Opc);
700}
701
 702/// \returns true if the operation will definitely need to use a 64-bit
703/// encoding, and thus will use a VOP3 encoding regardless of the source
704/// modifiers.
706static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
707 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
708 VT == MVT::f64;
709}
710
 711/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers
 712/// for the given type when used for ISD::SELECT.
714static bool selectSupportsSourceMods(const SDNode *N) {
715 // TODO: Only applies if select will be vector
716 return N->getValueType(0) == MVT::f32;
717}
718
719// Most FP instructions support source modifiers, but this could be refined
720// slightly.
722static bool hasSourceMods(const SDNode *N) {
723 if (isa<MemSDNode>(N))
724 return false;
725
726 switch (N->getOpcode()) {
727 case ISD::CopyToReg:
728 case ISD::FDIV:
729 case ISD::FREM:
730 case ISD::INLINEASM:
734
735 // TODO: Should really be looking at the users of the bitcast. These are
736 // problematic because bitcasts are used to legalize all stores to integer
737 // types.
738 case ISD::BITCAST:
739 return false;
741 switch (N->getConstantOperandVal(0)) {
742 case Intrinsic::amdgcn_interp_p1:
743 case Intrinsic::amdgcn_interp_p2:
744 case Intrinsic::amdgcn_interp_mov:
745 case Intrinsic::amdgcn_interp_p1_f16:
746 case Intrinsic::amdgcn_interp_p2_f16:
747 return false;
748 default:
749 return true;
750 }
751 }
752 case ISD::SELECT:
754 default:
755 return true;
756 }
757}
758
760 unsigned CostThreshold) {
 761 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
 762 // for them it is truly free to use a source modifier. If there are multiple
 763 // users and each one would require a VOP3 encoding, there will be a code
 764 // size increase. Try to avoid increasing code size unless we know it will
 765 // save on the instruction count.
766 unsigned NumMayIncreaseSize = 0;
767 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
768
769 assert(!N->use_empty());
770
771 // XXX - Should this limit number of uses to check?
772 for (const SDNode *U : N->uses()) {
773 if (!hasSourceMods(U))
774 return false;
775
776 if (!opMustUseVOP3Encoding(U, VT)) {
777 if (++NumMayIncreaseSize > CostThreshold)
778 return false;
779 }
780 }
781
782 return true;
783}
784
786 ISD::NodeType ExtendKind) const {
787 assert(!VT.isVector() && "only scalar expected");
788
789 // Round to the next multiple of 32-bits.
790 unsigned Size = VT.getSizeInBits();
791 if (Size <= 32)
792 return MVT::i32;
793 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
794}
795
797 return MVT::i32;
798}
799
801 return true;
802}
803
804// The backend supports 32 and 64 bit floating point immediates.
805// FIXME: Why are we reporting vectors of FP immediates as legal?
807 bool ForCodeSize) const {
808 EVT ScalarVT = VT.getScalarType();
809 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
810 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
811}
812
813// We don't want to shrink f64 / f32 constants.
815 EVT ScalarVT = VT.getScalarType();
816 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
817}
818
820 ISD::LoadExtType ExtTy,
821 EVT NewVT) const {
822 // TODO: This may be worth removing. Check regression tests for diffs.
824 return false;
825
826 unsigned NewSize = NewVT.getStoreSizeInBits();
827
828 // If we are reducing to a 32-bit load or a smaller multi-dword load,
829 // this is always better.
830 if (NewSize >= 32)
831 return true;
832
833 EVT OldVT = N->getValueType(0);
834 unsigned OldSize = OldVT.getStoreSizeInBits();
835
836 MemSDNode *MN = cast<MemSDNode>(N);
837 unsigned AS = MN->getAddressSpace();
838 // Do not shrink an aligned scalar load to sub-dword.
839 // Scalar engine cannot do sub-dword loads.
840 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
841 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
844 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
845 MN->isInvariant())) &&
847 return false;
848
849 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
850 // extloads, so doing one requires using a buffer_load. In cases where we
851 // still couldn't use a scalar load, using the wider load shouldn't really
852 // hurt anything.
853
854 // If the old size already had to be an extload, there's no harm in continuing
855 // to reduce the width.
856 return (OldSize < 32);
857}
858
860 const SelectionDAG &DAG,
861 const MachineMemOperand &MMO) const {
862
863 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
864
865 if (LoadTy.getScalarType() == MVT::i32)
866 return false;
867
868 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
869 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
870
871 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
872 return false;
873
874 unsigned Fast = 0;
876 CastTy, MMO, &Fast) &&
877 Fast;
878}
879
880// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
881// profitable with the expansion for 64-bit since it's generally good to
882// speculate things.
884 return true;
885}
886
888 return true;
889}
890
892 switch (N->getOpcode()) {
893 case ISD::EntryToken:
894 case ISD::TokenFactor:
895 return true;
897 unsigned IntrID = N->getConstantOperandVal(0);
899 }
900 case ISD::LOAD:
901 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
903 return true;
904 return false;
905 case AMDGPUISD::SETCC: // ballot-style instruction
906 return true;
907 }
908 return false;
909}
910
912 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
913 NegatibleCost &Cost, unsigned Depth) const {
914
915 switch (Op.getOpcode()) {
916 case ISD::FMA:
917 case ISD::FMAD: {
918 // Negating a fma is not free if it has users without source mods.
919 if (!allUsesHaveSourceMods(Op.getNode()))
920 return SDValue();
921 break;
922 }
923 case AMDGPUISD::RCP: {
924 SDValue Src = Op.getOperand(0);
925 EVT VT = Op.getValueType();
926 SDLoc SL(Op);
927
928 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
929 ForCodeSize, Cost, Depth + 1);
930 if (NegSrc)
931 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
932 return SDValue();
933 }
934 default:
935 break;
936 }
937
938 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
939 ForCodeSize, Cost, Depth);
940}
941
942//===---------------------------------------------------------------------===//
943// Target Properties
944//===---------------------------------------------------------------------===//
945
948
949 // Packed operations do not have a fabs modifier.
950 return VT == MVT::f32 || VT == MVT::f64 ||
951 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
952}
953
956 // Report this based on the end legalized type.
957 VT = VT.getScalarType();
958 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
959}
960
962 unsigned NumElem,
963 unsigned AS) const {
964 return true;
965}
966
968 // There are few operations which truly have vector input operands. Any vector
969 // operation is going to involve operations on each component, and a
970 // build_vector will be a copy per element, so it always makes sense to use a
971 // build_vector input in place of the extracted element to avoid a copy into a
972 // super register.
973 //
974 // We should probably only do this if all users are extracts only, but this
975 // should be the common case.
976 return true;
977}
978
980 // Truncate is just accessing a subregister.
981
982 unsigned SrcSize = Source.getSizeInBits();
983 unsigned DestSize = Dest.getSizeInBits();
984
 985 return DestSize < SrcSize && DestSize % 32 == 0;
986}
987
989 // Truncate is just accessing a subregister.
990
991 unsigned SrcSize = Source->getScalarSizeInBits();
992 unsigned DestSize = Dest->getScalarSizeInBits();
993
 994 if (DestSize == 16 && Subtarget->has16BitInsts())
995 return SrcSize >= 32;
996
997 return DestSize < SrcSize && DestSize % 32 == 0;
998}
999
1001 unsigned SrcSize = Src->getScalarSizeInBits();
1002 unsigned DestSize = Dest->getScalarSizeInBits();
1003
1004 if (SrcSize == 16 && Subtarget->has16BitInsts())
1005 return DestSize >= 32;
1006
1007 return SrcSize == 32 && DestSize == 64;
1008}
1009
1011 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1012 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
 1013 // this will enable reducing 64-bit operations to 32-bit, which is always
1014 // good.
1015
1016 if (Src == MVT::i16)
 1017 return Dest == MVT::i32 || Dest == MVT::i64;
1018
1019 return Src == MVT::i32 && Dest == MVT::i64;
1020}
1021
1023 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1024 // limited number of native 64-bit operations. Shrinking an operation to fit
1025 // in a single 32-bit register should always be helpful. As currently used,
1026 // this is much less general than the name suggests, and is only used in
1027 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1028 // not profitable, and may actually be harmful.
1029 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1030}
1031
1033 const SDNode* N, CombineLevel Level) const {
1034 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1035 N->getOpcode() == ISD::SRL) &&
1036 "Expected shift op");
1037 // Always commute pre-type legalization and right shifts.
1038 // We're looking for shl(or(x,y),z) patterns.
1040 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1041 return true;
1042
 1043 // If the only user is an i32 right-shift, don't destroy a BFE pattern.
1044 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1045 (N->use_begin()->getOpcode() == ISD::SRA ||
1046 N->use_begin()->getOpcode() == ISD::SRL))
1047 return false;
1048
1049 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1050 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1051 if (LHS.getOpcode() != ISD::SHL)
1052 return false;
1053 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1054 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1055 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1056 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1057 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1058 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1059 };
1060 SDValue LHS = N->getOperand(0).getOperand(0);
1061 SDValue RHS = N->getOperand(0).getOperand(1);
1062 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1063}
1064
1065//===---------------------------------------------------------------------===//
1066// TargetLowering Callbacks
1067//===---------------------------------------------------------------------===//
1068
1070 bool IsVarArg) {
1071 switch (CC) {
1079 return CC_AMDGPU;
1082 return CC_AMDGPU_CS_CHAIN;
1083 case CallingConv::C:
1084 case CallingConv::Fast:
1085 case CallingConv::Cold:
1086 return CC_AMDGPU_Func;
1088 return CC_SI_Gfx;
1091 default:
1092 report_fatal_error("Unsupported calling convention for call");
1093 }
1094}
1095
1097 bool IsVarArg) {
1098 switch (CC) {
1101 llvm_unreachable("kernels should not be handled here");
1111 return RetCC_SI_Shader;
1113 return RetCC_SI_Gfx;
1114 case CallingConv::C:
1115 case CallingConv::Fast:
1116 case CallingConv::Cold:
1117 return RetCC_AMDGPU_Func;
1118 default:
1119 report_fatal_error("Unsupported calling convention.");
1120 }
1121}
1122
1123/// The SelectionDAGBuilder will automatically promote function arguments
1124/// with illegal types. However, this does not work for the AMDGPU targets
1125/// since the function arguments are stored in memory as these illegal types.
1126/// In order to handle this properly we need to get the original types sizes
1127/// from the LLVM IR Function and fixup the ISD:InputArg values before
1128/// passing them to AnalyzeFormalArguments()
1129
1130/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1131/// input values across multiple registers. Each item in the Ins array
1132/// represents a single value that will be stored in registers. Ins[x].VT is
1133/// the value type of the value that will be stored in the register, so
1134/// whatever SDNode we lower the argument to needs to be this type.
1135///
1136/// In order to correctly lower the arguments we need to know the size of each
1137/// argument. Since Ins[x].VT gives us the size of the register that will
1138/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1139/// for the original function argument so that we can deduce the correct memory
1140/// type to use for Ins[x]. In most cases the correct memory type will be
1141/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1142/// we have a kernel argument of type v8i8, this argument will be split into
1143/// 8 parts and each part will be represented by its own item in the Ins array.
1144/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1145/// the argument before it was split. From this, we deduce that the memory type
1146/// for each individual part is i8. We pass the memory type as LocVT to the
1147/// calling convention analysis function and the register type (Ins[x].VT) as
1148/// the ValVT.
1150 CCState &State,
1151 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1152 const MachineFunction &MF = State.getMachineFunction();
1153 const Function &Fn = MF.getFunction();
1154 LLVMContext &Ctx = Fn.getParent()->getContext();
1155 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1156 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1158
1159 Align MaxAlign = Align(1);
1160 uint64_t ExplicitArgOffset = 0;
1161 const DataLayout &DL = Fn.getDataLayout();
1162
1163 unsigned InIndex = 0;
1164
1165 for (const Argument &Arg : Fn.args()) {
1166 const bool IsByRef = Arg.hasByRefAttr();
1167 Type *BaseArgTy = Arg.getType();
1168 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1169 Align Alignment = DL.getValueOrABITypeAlignment(
1170 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1171 MaxAlign = std::max(Alignment, MaxAlign);
1172 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1173
1174 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1175 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1176
1177 // We're basically throwing away everything passed into us and starting over
1178 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1179 // to us as computed in Ins.
1180 //
1181 // We also need to figure out what type legalization is trying to do to get
1182 // the correct memory offsets.
1183
1184 SmallVector<EVT, 16> ValueVTs;
1186 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1187
1188 for (unsigned Value = 0, NumValues = ValueVTs.size();
1189 Value != NumValues; ++Value) {
1190 uint64_t BasePartOffset = Offsets[Value];
1191
1192 EVT ArgVT = ValueVTs[Value];
1193 EVT MemVT = ArgVT;
1194 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1195 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1196
1197 if (NumRegs == 1) {
1198 // This argument is not split, so the IR type is the memory type.
1199 if (ArgVT.isExtended()) {
1200 // We have an extended type, like i24, so we should just use the
1201 // register type.
1202 MemVT = RegisterVT;
1203 } else {
1204 MemVT = ArgVT;
1205 }
1206 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1207 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1208 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1209 // We have a vector value which has been split into a vector with
1210 // the same scalar type, but fewer elements. This should handle
1211 // all the floating-point vector types.
1212 MemVT = RegisterVT;
1213 } else if (ArgVT.isVector() &&
1214 ArgVT.getVectorNumElements() == NumRegs) {
1215 // This arg has been split so that each element is stored in a separate
1216 // register.
1217 MemVT = ArgVT.getScalarType();
1218 } else if (ArgVT.isExtended()) {
1219 // We have an extended type, like i65.
1220 MemVT = RegisterVT;
1221 } else {
1222 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1223 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1224 if (RegisterVT.isInteger()) {
1225 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1226 } else if (RegisterVT.isVector()) {
1227 assert(!RegisterVT.getScalarType().isFloatingPoint());
1228 unsigned NumElements = RegisterVT.getVectorNumElements();
1229 assert(MemoryBits % NumElements == 0);
1230 // This vector type has been split into another vector type with
 1231 // a different element size.
1232 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1233 MemoryBits / NumElements);
1234 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1235 } else {
1236 llvm_unreachable("cannot deduce memory type.");
1237 }
1238 }
1239
1240 // Convert one element vectors to scalar.
1241 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1242 MemVT = MemVT.getScalarType();
1243
1244 // Round up vec3/vec5 argument.
1245 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1246 assert(MemVT.getVectorNumElements() == 3 ||
1247 MemVT.getVectorNumElements() == 5 ||
1248 (MemVT.getVectorNumElements() >= 9 &&
1249 MemVT.getVectorNumElements() <= 12));
1250 MemVT = MemVT.getPow2VectorType(State.getContext());
1251 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1252 MemVT = MemVT.getRoundIntegerType(State.getContext());
1253 }
1254
1255 unsigned PartOffset = 0;
1256 for (unsigned i = 0; i != NumRegs; ++i) {
1257 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1258 BasePartOffset + PartOffset,
1259 MemVT.getSimpleVT(),
1261 PartOffset += MemVT.getStoreSize();
1262 }
1263 }
1264 }
1265}
1266
1268 SDValue Chain, CallingConv::ID CallConv,
1269 bool isVarArg,
1271 const SmallVectorImpl<SDValue> &OutVals,
1272 const SDLoc &DL, SelectionDAG &DAG) const {
1273 // FIXME: Fails for r600 tests
1274 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1275 // "wave terminate should not have return values");
1276 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1277}
1278
1279//===---------------------------------------------------------------------===//
1280// Target specific lowering
1281//===---------------------------------------------------------------------===//
1282
1283/// Selects the correct CCAssignFn for a given CallingConvention value.
1285 bool IsVarArg) {
1286 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1287}
1288
1290 bool IsVarArg) {
1292}
1293
1295 SelectionDAG &DAG,
1296 MachineFrameInfo &MFI,
1297 int ClobberedFI) const {
1298 SmallVector<SDValue, 8> ArgChains;
1299 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1300 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1301
1302 // Include the original chain at the beginning of the list. When this is
1303 // used by target LowerCall hooks, this helps legalize find the
1304 // CALLSEQ_BEGIN node.
1305 ArgChains.push_back(Chain);
1306
 1307 // Add a chain value for each stack argument that overlaps the clobbered object.
1308 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1309 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1310 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1311 if (FI->getIndex() < 0) {
1312 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1313 int64_t InLastByte = InFirstByte;
1314 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1315
1316 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1317 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1318 ArgChains.push_back(SDValue(L, 1));
1319 }
1320 }
1321 }
1322 }
1323
1324 // Build a tokenfactor for all the chains.
1325 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1326}
1327
1330 StringRef Reason) const {
1331 SDValue Callee = CLI.Callee;
1332 SelectionDAG &DAG = CLI.DAG;
1333
1334 const Function &Fn = DAG.getMachineFunction().getFunction();
1335
1336 StringRef FuncName("<unknown>");
1337
1338 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1339 FuncName = G->getSymbol();
1340 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1341 FuncName = G->getGlobal()->getName();
1342
1344 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1345 DAG.getContext()->diagnose(NoCalls);
1346
1347 if (!CLI.IsTailCall) {
1348 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1349 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1350 }
1351
1352 return DAG.getEntryNode();
1353}
1354
1356 SmallVectorImpl<SDValue> &InVals) const {
1357 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1358}
1359
1361 SelectionDAG &DAG) const {
1362 const Function &Fn = DAG.getMachineFunction().getFunction();
1363
1364 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1365 SDLoc(Op).getDebugLoc());
1366 DAG.getContext()->diagnose(NoDynamicAlloca);
1367 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1368 return DAG.getMergeValues(Ops, SDLoc());
1369}
1370
1372 SelectionDAG &DAG) const {
1373 switch (Op.getOpcode()) {
1374 default:
1375 Op->print(errs(), &DAG);
1376 llvm_unreachable("Custom lowering code for this "
1377 "instruction is not implemented yet!");
1378 break;
1380 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1382 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1383 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1384 case ISD::FREM: return LowerFREM(Op, DAG);
1385 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1386 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1387 case ISD::FRINT: return LowerFRINT(Op, DAG);
1388 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1389 case ISD::FROUNDEVEN:
1390 return LowerFROUNDEVEN(Op, DAG);
1391 case ISD::FROUND: return LowerFROUND(Op, DAG);
1392 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1393 case ISD::FLOG2:
1394 return LowerFLOG2(Op, DAG);
1395 case ISD::FLOG:
1396 case ISD::FLOG10:
1397 return LowerFLOGCommon(Op, DAG);
1398 case ISD::FEXP:
1399 case ISD::FEXP10:
1400 return lowerFEXP(Op, DAG);
1401 case ISD::FEXP2:
1402 return lowerFEXP2(Op, DAG);
1403 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1404 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1405 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1406 case ISD::FP_TO_SINT:
1407 case ISD::FP_TO_UINT:
1408 return LowerFP_TO_INT(Op, DAG);
1409 case ISD::CTTZ:
1411 case ISD::CTLZ:
1413 return LowerCTLZ_CTTZ(Op, DAG);
1415 }
1416 return Op;
1417}
1418
1421 SelectionDAG &DAG) const {
1422 switch (N->getOpcode()) {
1424 // Different parts of legalization seem to interpret which type of
1425 // sign_extend_inreg is the one to check for custom lowering. The extended
1426 // from type is what really matters, but some places check for custom
1427 // lowering of the result type. This results in trying to use
1428 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1429 // nothing here and let the illegal result integer be handled normally.
1430 return;
1431 case ISD::FLOG2:
1432 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1433 Results.push_back(Lowered);
1434 return;
1435 case ISD::FLOG:
1436 case ISD::FLOG10:
1437 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1438 Results.push_back(Lowered);
1439 return;
1440 case ISD::FEXP2:
1441 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1442 Results.push_back(Lowered);
1443 return;
1444 case ISD::FEXP:
1445 case ISD::FEXP10:
1446 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1447 Results.push_back(Lowered);
1448 return;
1449 case ISD::CTLZ:
1451 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1452 Results.push_back(Lowered);
1453 return;
1454 default:
1455 return;
1456 }
1457}
1458
1460 SDValue Op,
1461 SelectionDAG &DAG) const {
1462
1463 const DataLayout &DL = DAG.getDataLayout();
1464 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1465 const GlobalValue *GV = G->getGlobal();
1466
1467 if (!MFI->isModuleEntryFunction()) {
1468 if (std::optional<uint32_t> Address =
1470 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1471 }
1472 }
1473
1474 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1475 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1476 if (!MFI->isModuleEntryFunction() &&
1477 GV->getName() != "llvm.amdgcn.module.lds") {
1478 SDLoc DL(Op);
1479 const Function &Fn = DAG.getMachineFunction().getFunction();
1480 DiagnosticInfoUnsupported BadLDSDecl(
1481 Fn, "local memory global used by non-kernel function",
1482 DL.getDebugLoc(), DS_Warning);
1483 DAG.getContext()->diagnose(BadLDSDecl);
1484
1485 // We currently don't have a way to correctly allocate LDS objects that
1486 // aren't directly associated with a kernel. We do force inlining of
1487 // functions that use local objects. However, if these dead functions are
1488 // not eliminated, we don't want a compile time error. Just emit a warning
1489 // and a trap, since there should be no callable path here.
1490 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1491 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1492 Trap, DAG.getRoot());
1493 DAG.setRoot(OutputChain);
1494 return DAG.getUNDEF(Op.getValueType());
1495 }
1496
1497 // XXX: What does the value of G->getOffset() mean?
1498 assert(G->getOffset() == 0 &&
1499 "Do not know what to do with an non-zero offset");
1500
1501 // TODO: We could emit code to handle the initialization somewhere.
1502 // We ignore the initializer for now and legalize it to allow selection.
1503 // The initializer will anyway get errored out during assembly emission.
1504 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1505 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1506 }
1507 return SDValue();
1508}
1509
1511 SelectionDAG &DAG) const {
1513 SDLoc SL(Op);
1514
1515 EVT VT = Op.getValueType();
1516 if (VT.getVectorElementType().getSizeInBits() < 32) {
1517 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1518 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1519 unsigned NewNumElt = OpBitSize / 32;
1520 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1522 MVT::i32, NewNumElt);
1523 for (const SDUse &U : Op->ops()) {
1524 SDValue In = U.get();
1525 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1526 if (NewNumElt > 1)
1527 DAG.ExtractVectorElements(NewIn, Args);
1528 else
1529 Args.push_back(NewIn);
1530 }
1531
1532 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1533 NewNumElt * Op.getNumOperands());
1534 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1535 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1536 }
1537 }
1538
1539 for (const SDUse &U : Op->ops())
1540 DAG.ExtractVectorElements(U.get(), Args);
1541
1542 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1543}
1544
1546 SelectionDAG &DAG) const {
1547 SDLoc SL(Op);
1549 unsigned Start = Op.getConstantOperandVal(1);
1550 EVT VT = Op.getValueType();
1551 EVT SrcVT = Op.getOperand(0).getValueType();
1552
1553 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1554 unsigned NumElt = VT.getVectorNumElements();
1555 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1556 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1557
1558 // Extract 32-bit registers at a time.
1559 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1560 EVT NewVT = NumElt == 2
1561 ? MVT::i32
1562 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1563 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1564
1565 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1566 if (NumElt == 2)
1567 Tmp = Args[0];
1568 else
1569 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1570
1571 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1572 }
1573
1574 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1576
1577 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1578}
1579
1580// TODO: Handle fabs too
1582 if (Val.getOpcode() == ISD::FNEG)
1583 return Val.getOperand(0);
1584
1585 return Val;
1586}
1587
1589 if (Val.getOpcode() == ISD::FNEG)
1590 Val = Val.getOperand(0);
1591 if (Val.getOpcode() == ISD::FABS)
1592 Val = Val.getOperand(0);
1593 if (Val.getOpcode() == ISD::FCOPYSIGN)
1594 Val = Val.getOperand(0);
1595 return Val;
1596}
1597
1599 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1600 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1601 SelectionDAG &DAG = DCI.DAG;
1602 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1603 switch (CCOpcode) {
1604 case ISD::SETOEQ:
1605 case ISD::SETONE:
1606 case ISD::SETUNE:
1607 case ISD::SETNE:
1608 case ISD::SETUEQ:
1609 case ISD::SETEQ:
1610 case ISD::SETFALSE:
1611 case ISD::SETFALSE2:
1612 case ISD::SETTRUE:
1613 case ISD::SETTRUE2:
1614 case ISD::SETUO:
1615 case ISD::SETO:
1616 break;
1617 case ISD::SETULE:
1618 case ISD::SETULT: {
1619 if (LHS == True)
1620 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1621 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1622 }
1623 case ISD::SETOLE:
1624 case ISD::SETOLT:
1625 case ISD::SETLE:
1626 case ISD::SETLT: {
1627 // Ordered. Assume ordered for undefined.
1628
1629 // Only do this after legalization to avoid interfering with other combines
1630 // which might occur.
1632 !DCI.isCalledByLegalizer())
1633 return SDValue();
1634
1635 // We need to permute the operands to get the correct NaN behavior. The
1636 // selected operand is the second one based on the failing compare with NaN,
1637 // so permute it based on the compare type the hardware uses.
1638 if (LHS == True)
1639 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1640 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1641 }
1642 case ISD::SETUGE:
1643 case ISD::SETUGT: {
1644 if (LHS == True)
1645 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1646 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1647 }
1648 case ISD::SETGT:
1649 case ISD::SETGE:
1650 case ISD::SETOGE:
1651 case ISD::SETOGT: {
1653 !DCI.isCalledByLegalizer())
1654 return SDValue();
1655
1656 if (LHS == True)
1657 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1658 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1659 }
1660 case ISD::SETCC_INVALID:
1661 llvm_unreachable("Invalid setcc condcode!");
1662 }
1663 return SDValue();
1664}
1665
1666/// Generate Min/Max node
1668 SDValue LHS, SDValue RHS,
1669 SDValue True, SDValue False,
1670 SDValue CC,
1671 DAGCombinerInfo &DCI) const {
1672 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1673 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1674
1675 SelectionDAG &DAG = DCI.DAG;
1676
1677 // If we can't directly match this, try to see if we can fold an fneg to
1678 // match.
1679
1680 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1681 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1682 SDValue NegTrue = peekFNeg(True);
1683
1684 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1685 // fmin/fmax.
1686 //
1687 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1688 // -> fneg (fmin_legacy lhs, K)
1689 //
1690 // TODO: Use getNegatedExpression
1691 if (LHS == NegTrue && CFalse && CRHS) {
1692 APFloat NegRHS = neg(CRHS->getValueAPF());
1693 if (NegRHS == CFalse->getValueAPF()) {
1694 SDValue Combined =
1695 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1696 if (Combined)
1697 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1698 return SDValue();
1699 }
1700 }
1701
1702 return SDValue();
1703}
1704
1705std::pair<SDValue, SDValue>
1707 SDLoc SL(Op);
1708
1709 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1710
1711 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1712 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1713
1714 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1715 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1716
1717 return std::pair(Lo, Hi);
1718}
1719
1721 SDLoc SL(Op);
1722
1723 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1724 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1725 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1726}
1727
1729 SDLoc SL(Op);
1730
1731 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1732 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1733 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1734}
1735
1736// Split a vector type into two parts. The first part is a power of two vector.
1737// The second part is whatever is left over, and is a scalar if it would
1738// otherwise be a 1-vector.
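// For example, v5i32 splits into (v4i32, i32) and v7f32 splits into
// (v4f32, v3f32).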
1739std::pair<EVT, EVT>
1741 EVT LoVT, HiVT;
1742 EVT EltVT = VT.getVectorElementType();
1743 unsigned NumElts = VT.getVectorNumElements();
1744 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1745 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1746 HiVT = NumElts - LoNumElts == 1
1747 ? EltVT
1748 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1749 return std::pair(LoVT, HiVT);
1750}
1751
1752// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1753// scalar.
1754std::pair<SDValue, SDValue>
1756 const EVT &LoVT, const EVT &HiVT,
1757 SelectionDAG &DAG) const {
1759 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1760 N.getValueType().getVectorNumElements() &&
1761 "More vector elements requested than available!");
1763 DAG.getVectorIdxConstant(0, DL));
1764 SDValue Hi = DAG.getNode(
1766 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1767 return std::pair(Lo, Hi);
1768}
1769
1771 SelectionDAG &DAG) const {
1772 LoadSDNode *Load = cast<LoadSDNode>(Op);
1773 EVT VT = Op.getValueType();
1774 SDLoc SL(Op);
1775
1776
1777 // If this is a 2 element vector, we really want to scalarize and not create
1778 // weird 1 element vectors.
1779 if (VT.getVectorNumElements() == 2) {
1780 SDValue Ops[2];
1781 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1782 return DAG.getMergeValues(Ops, SL);
1783 }
1784
1785 SDValue BasePtr = Load->getBasePtr();
1786 EVT MemVT = Load->getMemoryVT();
1787
1788 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1789
1790 EVT LoVT, HiVT;
1791 EVT LoMemVT, HiMemVT;
1792 SDValue Lo, Hi;
1793
1794 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1795 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1796 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1797
1798 unsigned Size = LoMemVT.getStoreSize();
1799 Align BaseAlign = Load->getAlign();
1800 Align HiAlign = commonAlignment(BaseAlign, Size);
1801
1802 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1803 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1804 BaseAlign, Load->getMemOperand()->getFlags());
1805 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1806 SDValue HiLoad =
1807 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1808 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1809 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1810
1811 SDValue Join;
1812 if (LoVT == HiVT) {
1813 // This is the case that the vector is power of two so was evenly split.
1814 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1815 } else {
1816 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1817 DAG.getVectorIdxConstant(0, SL));
1818 Join = DAG.getNode(
1820 VT, Join, HiLoad,
1822 }
1823
1824 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1825 LoLoad.getValue(1), HiLoad.getValue(1))};
1826
1827 return DAG.getMergeValues(Ops, SL);
1828}
1829
1831 SelectionDAG &DAG) const {
1832 LoadSDNode *Load = cast<LoadSDNode>(Op);
1833 EVT VT = Op.getValueType();
1834 SDValue BasePtr = Load->getBasePtr();
1835 EVT MemVT = Load->getMemoryVT();
1836 SDLoc SL(Op);
1837 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1838 Align BaseAlign = Load->getAlign();
1839 unsigned NumElements = MemVT.getVectorNumElements();
1840
1841 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1842 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
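  // For example, an 8-byte-aligned v3i32 load is widened to a v4i32 load and
  // the unused fourth element is dropped with the EXTRACT_SUBVECTOR below.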
1843 if (NumElements != 3 ||
1844 (BaseAlign < Align(8) &&
1845 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1846 return SplitVectorLoad(Op, DAG);
1847
1848 assert(NumElements == 3);
1849
1850 EVT WideVT =
1852 EVT WideMemVT =
1854 SDValue WideLoad = DAG.getExtLoad(
1855 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1856 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1857 return DAG.getMergeValues(
1858 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1859 DAG.getVectorIdxConstant(0, SL)),
1860 WideLoad.getValue(1)},
1861 SL);
1862}
1863
1864SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1865 SelectionDAG &DAG) const {
1866 StoreSDNode *Store = cast<StoreSDNode>(Op);
1867 SDValue Val = Store->getValue();
1868 EVT VT = Val.getValueType();
1869
1870 // If this is a 2 element vector, we really want to scalarize and not create
1871 // weird 1 element vectors.
1872 if (VT.getVectorNumElements() == 2)
1873 return scalarizeVectorStore(Store, DAG);
1874
1875 EVT MemVT = Store->getMemoryVT();
1876 SDValue Chain = Store->getChain();
1877 SDValue BasePtr = Store->getBasePtr();
1878 SDLoc SL(Op);
1879
1880 EVT LoVT, HiVT;
1881 EVT LoMemVT, HiMemVT;
1882 SDValue Lo, Hi;
1883
1884 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1885 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1886 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1887
1888 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1889
1890 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1891 Align BaseAlign = Store->getAlign();
1892 unsigned Size = LoMemVT.getStoreSize();
1893 Align HiAlign = commonAlignment(BaseAlign, Size);
1894
1895 SDValue LoStore =
1896 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1897 Store->getMemOperand()->getFlags());
1898 SDValue HiStore =
1899 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1900 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1901
1902 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1903}
1904
1905// This is a shortcut for integer division because we have fast i32<->f32
1906// conversions, and fast f32 reciprocal instructions. The fractional part of a
1907// float is enough to accurately represent up to a 24-bit signed integer.
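// With at least 9 sign bits on each operand (checked below), |LHS| and |RHS|
// fit in 24 bits or fewer, so they are exactly representable in f32 (24-bit
// significand) and the reciprocal-based estimate plus the jq correction
// recovers the exact quotient.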
1908SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1909 bool Sign) const {
1910 SDLoc DL(Op);
1911 EVT VT = Op.getValueType();
1912 SDValue LHS = Op.getOperand(0);
1913 SDValue RHS = Op.getOperand(1);
1914 MVT IntVT = MVT::i32;
1915 MVT FltVT = MVT::f32;
1916
1917 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1918 if (LHSSignBits < 9)
1919 return SDValue();
1920
1921 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1922 if (RHSSignBits < 9)
1923 return SDValue();
1924
1925 unsigned BitSize = VT.getSizeInBits();
1926 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1927 unsigned DivBits = BitSize - SignBits;
1928 if (Sign)
1929 ++DivBits;
1930
1931
1932 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1933 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1934 SDValue jq = DAG.getConstant(1, DL, IntVT);
1935
1936 if (Sign) {
1937 // char|short jq = ia ^ ib;
1938 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1939
1940 // jq = jq >> (bitsize - 2)
1941 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1942 DAG.getConstant(BitSize - 2, DL, VT));
1943
1944 // jq = jq | 0x1
1945 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1946 }
1947
1948 // int ia = (int)LHS;
1949 SDValue ia = LHS;
1950
1951 // int ib = (int)RHS;
1952 SDValue ib = RHS;
1953
1954 // float fa = (float)ia;
1955 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1956
1957 // float fb = (float)ib;
1958 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1959
1960 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1961 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1962
1963 // fq = trunc(fq);
1964 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1965
1966 // float fqneg = -fq;
1967 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1968
1969 MachineFunction &MF = DAG.getMachineFunction();
1970
1971 bool UseFmadFtz = false;
1972 if (Subtarget->isGCN()) {
1973 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1974 UseFmadFtz =
1975 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1976 }
1977
1978 // float fr = mad(fqneg, fb, fa);
1979 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1980 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1981 : (unsigned)ISD::FMAD;
1982 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1983
1984 // int iq = (int)fq;
1985 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1986
1987 // fr = fabs(fr);
1988 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1989
1990 // fb = fabs(fb);
1991 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1992
1993 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1994
1995 // int cv = fr >= fb;
1996 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1997
1998 // jq = (cv ? jq : 0);
1999 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2000
2001 // dst = iq + jq;
2002 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2003
2004 // Rem needs compensation; it's easier to recompute it.
2005 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2006 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2007
2008 // Truncate to number of bits this divide really is.
2009 if (Sign) {
2010 SDValue InRegSize
2011 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2012 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2013 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2014 } else {
2015 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2016 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2017 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2018 }
2019
2020 return DAG.getMergeValues({ Div, Rem }, DL);
2021}
2022
2023void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2024 SelectionDAG &DAG,
2025 SmallVectorImpl<SDValue> &Results) const {
2026 SDLoc DL(Op);
2027 EVT VT = Op.getValueType();
2028
2029 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2030
2031 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2032
2033 SDValue One = DAG.getConstant(1, DL, HalfVT);
2034 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2035
2036 //HiLo split
2037 SDValue LHS_Lo, LHS_Hi;
2038 SDValue LHS = Op.getOperand(0);
2039 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2040
2041 SDValue RHS_Lo, RHS_Hi;
2042 SDValue RHS = Op.getOperand(1);
2043 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2044
2045 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2046 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2047
2048 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2049 LHS_Lo, RHS_Lo);
2050
2051 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2052 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2053
2054 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2055 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2056 return;
2057 }
2058
2059 if (isTypeLegal(MVT::i64)) {
2060 // The algorithm here is based on ideas from "Software Integer Division",
2061 // Tom Rodeheffer, August 2008.
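 // Each UNR step below refines a 0.64 fixed-point estimate r of 2^64/RHS:
 // the error term e = (-RHS * r) mod 2^64 is formed with a MUL, and the
 // correction mulhu(r, e) is added back to r with carries propagated through
 // the 32-bit halves.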
2062
2063 MachineFunction &MF = DAG.getMachineFunction();
2064 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2065
2066 // Compute denominator reciprocal.
2067 unsigned FMAD =
2068 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2069 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2070 ? (unsigned)ISD::FMAD
2071 : (unsigned)AMDGPUISD::FMAD_FTZ;
2072
2073 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2074 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2075 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2076 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2077 Cvt_Lo);
2078 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2079 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2080 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2081 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2082 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2083 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2084 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2085 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2086 Mul1);
2087 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2088 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2089 SDValue Rcp64 = DAG.getBitcast(VT,
2090 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2091
2092 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2093 SDValue One64 = DAG.getConstant(1, DL, VT);
2094 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2095 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2096
2097 // First round of UNR (Unsigned integer Newton-Raphson).
2098 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2099 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2100 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2101 SDValue Mulhi1_Lo, Mulhi1_Hi;
2102 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2103 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2104 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2105 Mulhi1_Lo, Zero1);
2106 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2107 Mulhi1_Hi, Add1_Lo.getValue(1));
2108 SDValue Add1 = DAG.getBitcast(VT,
2109 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2110
2111 // Second round of UNR.
2112 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2113 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2114 SDValue Mulhi2_Lo, Mulhi2_Hi;
2115 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2116 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2117 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2118 Mulhi2_Lo, Zero1);
2119 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2120 Mulhi2_Hi, Add2_Lo.getValue(1));
2121 SDValue Add2 = DAG.getBitcast(VT,
2122 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2123
2124 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2125
2126 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2127
2128 SDValue Mul3_Lo, Mul3_Hi;
2129 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2130 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2131 Mul3_Lo, Zero1);
2132 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2133 Mul3_Hi, Sub1_Lo.getValue(1));
2134 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2135 SDValue Sub1 = DAG.getBitcast(VT,
2136 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2137
2138 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2139 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2140 ISD::SETUGE);
2141 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2142 ISD::SETUGE);
2143 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2144
2145 // TODO: Here and below portions of the code can be enclosed into if/endif.
2146 // Currently control flow is unconditional and we have 4 selects after
2147 // potential endif to substitute PHIs.
2148
2149 // if C3 != 0 ...
2150 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2151 RHS_Lo, Zero1);
2152 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2153 RHS_Hi, Sub1_Lo.getValue(1));
2154 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2155 Zero, Sub2_Lo.getValue(1));
2156 SDValue Sub2 = DAG.getBitcast(VT,
2157 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2158
2159 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2160
2161 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2162 ISD::SETUGE);
2163 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2164 ISD::SETUGE);
2165 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2166
2167 // if (C6 != 0)
2168 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2169
2170 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2171 RHS_Lo, Zero1);
2172 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2173 RHS_Hi, Sub2_Lo.getValue(1));
2174 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2175 Zero, Sub3_Lo.getValue(1));
2176 SDValue Sub3 = DAG.getBitcast(VT,
2177 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2178
2179 // endif C6
2180 // endif C3
2181
2182 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2183 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2184
2185 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2186 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2187
2188 Results.push_back(Div);
2189 Results.push_back(Rem);
2190
2191 return;
2192 }
2193
2194 // r600 expansion.
2195 // Get Speculative values
2196 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2197 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2198
2199 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2200 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2201 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2202
2203 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2204 SDValue DIV_Lo = Zero;
2205
2206 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2207
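 // Restoring long division over the low 32 bits: each iteration shifts the
 // next dividend bit into REM and, if REM >= RHS, subtracts RHS and sets the
 // corresponding quotient bit in DIV_Lo.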
2208 for (unsigned i = 0; i < halfBitWidth; ++i) {
2209 const unsigned bitPos = halfBitWidth - i - 1;
2210 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2211 // Get value of high bit
2212 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2213 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2214 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2215
2216 // Shift
2217 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2218 // Add LHS high bit
2219 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2220
2221 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2222 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2223
2224 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2225
2226 // Update REM
2227 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2228 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2229 }
2230
2231 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2232 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2233 Results.push_back(DIV);
2234 Results.push_back(REM);
2235}
2236
2237SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2238 SelectionDAG &DAG) const {
2239 SDLoc DL(Op);
2240 EVT VT = Op.getValueType();
2241
2242 if (VT == MVT::i64) {
2243 SmallVector<SDValue, 2> Results;
2244 LowerUDIVREM64(Op, DAG, Results);
2245 return DAG.getMergeValues(Results, DL);
2246 }
2247
2248 if (VT == MVT::i32) {
2249 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2250 return Res;
2251 }
2252
2253 SDValue X = Op.getOperand(0);
2254 SDValue Y = Op.getOperand(1);
2255
2256 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2257 // algorithm used here.
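 // In short: URECIP gives an estimate of 2^32/Y, one Newton-Raphson step
 // sharpens it, and the resulting quotient may still be slightly too small,
 // which the two conditional fix-up rounds below correct.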
2258
2259 // Initial estimate of inv(y).
2260 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2261
2262 // One round of UNR.
2263 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2264 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2265 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2266 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2267
2268 // Quotient/remainder estimate.
2269 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2270 SDValue R =
2271 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2272
2273 // First quotient/remainder refinement.
2274 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2275 SDValue One = DAG.getConstant(1, DL, VT);
2276 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2277 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2278 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2279 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2280 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2281
2282 // Second quotient/remainder refinement.
2283 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2284 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2285 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2286 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2287 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2288
2289 return DAG.getMergeValues({Q, R}, DL);
2290}
2291
2292SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2293 SelectionDAG &DAG) const {
2294 SDLoc DL(Op);
2295 EVT VT = Op.getValueType();
2296
2297 SDValue LHS = Op.getOperand(0);
2298 SDValue RHS = Op.getOperand(1);
2299
2300 SDValue Zero = DAG.getConstant(0, DL, VT);
2301 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2302
2303 if (VT == MVT::i32) {
2304 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2305 return Res;
2306 }
2307
2308 if (VT == MVT::i64 &&
2309 DAG.ComputeNumSignBits(LHS) > 32 &&
2310 DAG.ComputeNumSignBits(RHS) > 32) {
2311 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2312
2313 //HiLo split
2314 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2315 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2316 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2317 LHS_Lo, RHS_Lo);
2318 SDValue Res[2] = {
2319 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2320 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2321 };
2322 return DAG.getMergeValues(Res, DL);
2323 }
2324
2325 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2326 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2327 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2328 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2329
2330 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2331 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2332
2333 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2334 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2335
2336 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2337 SDValue Rem = Div.getValue(1);
2338
2339 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2340 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2341
2342 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2343 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2344
2345 SDValue Res[2] = {
2346 Div,
2347 Rem
2348 };
2349 return DAG.getMergeValues(Res, DL);
2350}
2351
2352// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
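// For example, frem(5.5, 2.0): fdiv gives 2.75, ftrunc gives 2.0, and
// fma(-2.0, 2.0, 5.5) = 1.5, the expected remainder.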
2353SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2354 SDLoc SL(Op);
2355 EVT VT = Op.getValueType();
2356 auto Flags = Op->getFlags();
2357 SDValue X = Op.getOperand(0);
2358 SDValue Y = Op.getOperand(1);
2359
2360 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2361 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2362 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2363 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2364 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2365}
2366
2367SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2368 SDLoc SL(Op);
2369 SDValue Src = Op.getOperand(0);
2370
2371 // result = trunc(src)
2372 // if (src > 0.0 && src != result)
2373 // result += 1.0
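 // e.g. src = 2.3: trunc = 2.0, and since 2.3 > 0.0 and 2.3 != 2.0 the result
 // becomes 3.0; for src = -2.3 the truncated value -2.0 is already the ceiling.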
2374
2375 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2376
2377 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2378 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2379
2380 EVT SetCCVT =
2381 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2382
2383 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2384 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2385 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2386
2387 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2388 // TODO: Should this propagate fast-math-flags?
2389 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2390}
2391
2392static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2393 SelectionDAG &DAG) {
2394 const unsigned FractBits = 52;
2395 const unsigned ExpBits = 11;
2396
2397 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2398 Hi,
2399 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2400 DAG.getConstant(ExpBits, SL, MVT::i32));
2401 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2402 DAG.getConstant(1023, SL, MVT::i32));
2403
2404 return Exp;
2405}
2406
2407SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2408 SDLoc SL(Op);
2409 SDValue Src = Op.getOperand(0);
2410
2411 assert(Op.getValueType() == MVT::f64);
2412
2413 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2414
2415 // Extract the upper half, since this is where we will find the sign and
2416 // exponent.
2417 SDValue Hi = getHiHalf64(Src, DAG);
2418
2419 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2420
2421 const unsigned FractBits = 52;
2422
2423 // Extract the sign bit.
2424 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2425 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2426
2427 // Extend back to 64-bits.
2428 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2429 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2430
2431 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2432 const SDValue FractMask
2433 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2434
2435 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2436 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2437 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2438
2439 EVT SetCCVT =
2440 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2441
2442 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2443
2444 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2445 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2446
2447 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2448 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2449
2450 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2451}
2452
2453SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2454 SelectionDAG &DAG) const {
2455 SDLoc SL(Op);
2456 SDValue Src = Op.getOperand(0);
2457
2458 assert(Op.getValueType() == MVT::f64);
2459
2460 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2461 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2462 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2463
2464 // TODO: Should this propagate fast-math-flags?
2465
2466 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2467 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2468
2469 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2470
2471 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2472 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2473
2474 EVT SetCCVT =
2475 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2476 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2477
2478 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2479}
2480
2481SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2482 SelectionDAG &DAG) const {
2483 // FNEARBYINT and FRINT are the same, except in their handling of FP
2484 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2485 // rint, so just treat them as equivalent.
2486 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2487 Op.getOperand(0));
2488}
2489
2490SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2491 auto VT = Op.getValueType();
2492 auto Arg = Op.getOperand(0u);
2493 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2494}
2495
2496// XXX - May require not supporting f32 denormals?
2497
2498// Don't handle v2f16. The extra instructions to scalarize and repack around the
2499// compare and vselect end up producing worse code than scalarizing the whole
2500// operation.
2501SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2502 SDLoc SL(Op);
2503 SDValue X = Op.getOperand(0);
2504 EVT VT = Op.getValueType();
2505
2506 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2507
2508 // TODO: Should this propagate fast-math-flags?
2509
2510 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2511
2512 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2513
2514 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2515 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2516
2517 EVT SetCCVT =
2518 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2519
2520 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2521 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2522 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2523
2524 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2525 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2526}
2527
2528SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2529 SDLoc SL(Op);
2530 SDValue Src = Op.getOperand(0);
2531
2532 // result = trunc(src);
2533 // if (src < 0.0 && src != result)
2534 // result += -1.0.
2535
2536 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2537
2538 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2539 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2540
2541 EVT SetCCVT =
2542 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2543
2544 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2545 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2546 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2547
2548 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2549 // TODO: Should this propagate fast-math-flags?
2550 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2551}
2552
2553/// Return true if it's known that \p Src can never be an f32 denormal value.
2554static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2555 switch (Src.getOpcode()) {
2556 case ISD::FP_EXTEND:
2557 return Src.getOperand(0).getValueType() == MVT::f16;
2558 case ISD::FP16_TO_FP:
2559 case ISD::FFREXP:
2560 return true;
2561 case ISD::INTRINSIC_WO_CHAIN: {
2562 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2563 switch (IntrinsicID) {
2564 case Intrinsic::amdgcn_frexp_mant:
2565 return true;
2566 default:
2567 return false;
2568 }
2569 }
2570 default:
2571 return false;
2572 }
2573
2574 llvm_unreachable("covered opcode switch");
2575}
2576
2577static bool allowApproxFunc(const SelectionDAG &DAG,
2578 SDNodeFlags Flags) {
2579 if (Flags.hasApproximateFuncs())
2580 return true;
2581 auto &Options = DAG.getTarget().Options;
2582 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2583}
2584
2585static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2586 SDValue Src,
2587 SDNodeFlags Flags) {
2588 return !valueIsKnownNeverF32Denorm(Src) &&
2589 DAG.getMachineFunction()
2590 .getDenormalMode(APFloat::IEEEsingle())
2591 .Input != DenormalMode::IEEE;
2592}
2593
2594SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2595 SDValue Src,
2596 SDNodeFlags Flags) const {
2597 SDLoc SL(Src);
2598 EVT VT = Src.getValueType();
2599 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2600 SDValue SmallestNormal =
2601 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2602
2603 // Want to scale denormals up, but negatives and 0 work just as well on the
2604 // scaled path.
2605 SDValue IsLtSmallestNormal = DAG.getSetCC(
2606 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2607 SmallestNormal, ISD::SETOLT);
2608
2609 return IsLtSmallestNormal;
2610}
2611
2612SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2613 SDNodeFlags Flags) const {
2614 SDLoc SL(Src);
2615 EVT VT = Src.getValueType();
2616 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2617 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2618
2619 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2620 SDValue IsFinite = DAG.getSetCC(
2621 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2622 Inf, ISD::SETOLT);
2623 return IsFinite;
2624}
2625
2626/// If denormal handling is required return the scaled input to FLOG2, and the
2627/// check for denormal range. Otherwise, return null values.
2628std::pair<SDValue, SDValue>
2629AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2630 SDValue Src, SDNodeFlags Flags) const {
2631 if (!needsDenormHandlingF32(DAG, Src, Flags))
2632 return {};
2633
2634 MVT VT = MVT::f32;
2635 const fltSemantics &Semantics = APFloat::IEEEsingle();
2636 SDValue SmallestNormal =
2637 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2638
2639 SDValue IsLtSmallestNormal = DAG.getSetCC(
2640 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2641 SmallestNormal, ISD::SETOLT);
2642
2643 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2644 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2645 SDValue ScaleFactor =
2646 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2647
2648 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2649 return {ScaledInput, IsLtSmallestNormal};
2650}
2651
2652SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2653 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2654 // If we have to handle denormals, scale up the input and adjust the result.
2655
2656 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2657 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
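 // e.g. for the f32 denormal x = 0x1.0p-140: the scaled input 0x1.0p-108 is
 // normal, v_log_f32 returns -108.0, and subtracting 32.0 yields the correct
 // log2(x) = -140.0.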
2658
2659 SDLoc SL(Op);
2660 EVT VT = Op.getValueType();
2661 SDValue Src = Op.getOperand(0);
2662 SDNodeFlags Flags = Op->getFlags();
2663
2664 if (VT == MVT::f16) {
2665 // Nothing in half is a denormal when promoted to f32.
2666 assert(!Subtarget->has16BitInsts());
2667 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2668 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2669 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2670 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2671 }
2672
2673 auto [ScaledInput, IsLtSmallestNormal] =
2674 getScaledLogInput(DAG, SL, Src, Flags);
2675 if (!ScaledInput)
2676 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2677
2678 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2679
2680 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2681 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2682 SDValue ResultOffset =
2683 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2684 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2685}
2686
2687static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2688 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2689 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2690 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2691}
2692
2693SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2694 SelectionDAG &DAG) const {
2695 SDValue X = Op.getOperand(0);
2696 EVT VT = Op.getValueType();
2697 SDNodeFlags Flags = Op->getFlags();
2698 SDLoc DL(Op);
2699
2700 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2701 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2702
2703 const auto &Options = getTargetMachine().Options;
2704 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2705 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2706
2707 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2708 // Log and multiply in f32 is good enough for f16.
2709 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2710 }
2711
2712 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2713 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2714 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2715 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2716 }
2717
2718 return Lowered;
2719 }
2720
2721 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2722 if (ScaledInput)
2723 X = ScaledInput;
2724
2725 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2726
2727 SDValue R;
2728 if (Subtarget->hasFastFMAF32()) {
2729 // c+cc are ln(2)/ln(10) to more than 49 bits
2730 const float c_log10 = 0x1.344134p-2f;
2731 const float cc_log10 = 0x1.09f79ep-26f;
2732
2733 // c + cc is ln(2) to more than 49 bits
2734 const float c_log = 0x1.62e42ep-1f;
2735 const float cc_log = 0x1.efa39ep-25f;
2736
2737 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2738 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2739
2740 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2741 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2742 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2743 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2744 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2745 } else {
2746 // ch+ct is ln(2)/ln(10) to more than 36 bits
2747 const float ch_log10 = 0x1.344000p-2f;
2748 const float ct_log10 = 0x1.3509f6p-18f;
2749
2750 // ch + ct is ln(2) to more than 36 bits
2751 const float ch_log = 0x1.62e000p-1f;
2752 const float ct_log = 0x1.0bfbe8p-15f;
2753
2754 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2755 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2756
2757 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2758 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2759 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2760 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2761 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2762
2763 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2764 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2765 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2766 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2767 }
2768
2769 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2770 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2771
2772 // TODO: Check if known finite from source value.
2773 if (!IsFiniteOnly) {
2774 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2775 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2776 }
2777
2778 if (IsScaled) {
2779 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2780 SDValue ShiftK =
2781 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2782 SDValue Shift =
2783 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2784 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2785 }
2786
2787 return R;
2788}
2789
2791 return LowerFLOGCommon(Op, DAG);
2792}
2793
2794// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2795// promoted f16 operation.
2796SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2797 SelectionDAG &DAG, bool IsLog10,
2798 SDNodeFlags Flags) const {
2799 EVT VT = Src.getValueType();
2800 unsigned LogOp =
2801 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2802
2803 double Log2BaseInverted =
2804 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2805
2806 if (VT == MVT::f32) {
2807 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2808 if (ScaledInput) {
2809 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2810 SDValue ScaledResultOffset =
2811 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2812
2813 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2814
2815 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2816 ScaledResultOffset, Zero, Flags);
2817
2818 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2819
2820 if (Subtarget->hasFastFMAF32())
2821 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2822 Flags);
2823 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2824 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2825 }
2826 }
2827
2828 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2829 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2830
2831 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2832 Flags);
2833}
2834
2835SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2836 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2837 // If we have to handle denormals, scale up the input and adjust the result.
2838
2839 SDLoc SL(Op);
2840 EVT VT = Op.getValueType();
2841 SDValue Src = Op.getOperand(0);
2842 SDNodeFlags Flags = Op->getFlags();
2843
2844 if (VT == MVT::f16) {
2845 // Nothing in half is a denormal when promoted to f32.
2846 assert(!Subtarget->has16BitInsts());
2847 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2848 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2849 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2850 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2851 }
2852
2853 assert(VT == MVT::f32);
2854
2855 if (!needsDenormHandlingF32(DAG, Src, Flags))
2856 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2857
2858 // bool needs_scaling = x < -0x1.f80000p+6f;
2859 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
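 // The scaling relies on 2^(x + 64) * 2^-64 == 2^x: shifting the argument up
 // by 64 keeps the intermediate v_exp_f32 result in the normal range for
 // inputs near the denormal cutoff.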
2860
2861 // -nextafter(128.0, -1)
2862 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2863
2864 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2865
2866 SDValue NeedsScaling =
2867 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2868
2869 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2870 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2871
2872 SDValue AddOffset =
2873 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2874
2875 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2876 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2877
2878 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2879 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2880 SDValue ResultScale =
2881 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2882
2883 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2884}
2885
2886SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2887 SelectionDAG &DAG,
2888 SDNodeFlags Flags) const {
2889 EVT VT = X.getValueType();
2890 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2891
2892 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2893 // exp2(M_LOG2E_F * f);
2894 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2895 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2896 : (unsigned)ISD::FEXP2,
2897 SL, VT, Mul, Flags);
2898 }
2899
2900 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2901
2902 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2903 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2904
2905 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2906
2907 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2908
2909 SDValue AdjustedX =
2910 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2911
2912 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2913
2914 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2915
2916 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2917 SDValue AdjustedResult =
2918 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2919
2920 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2921 Flags);
2922}
2923
2924/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2925/// handled correctly.
2926SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2927 SelectionDAG &DAG,
2928 SDNodeFlags Flags) const {
2929 const EVT VT = X.getValueType();
2930 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2931
2932 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2933 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2934 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2935 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2936
2937 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2938 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2939 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2940 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2941 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2942 }
2943
2944 // bool s = x < -0x1.2f7030p+5f;
2945 // x += s ? 0x1.0p+5f : 0.0f;
2946 // exp10 = exp2(x * 0x1.a92000p+1f) *
2947 // exp2(x * 0x1.4f0978p-11f) *
2948 // (s ? 0x1.9f623ep-107f : 1.0f);
2949
2950 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2951
2952 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2953 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2954
2955 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2956 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2957 SDValue AdjustedX =
2958 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2959
2960 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2961 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2962
2963 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2964 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2965 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2966 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2967
2968 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2969
2970 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2971 SDValue AdjustedResult =
2972 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2973
2974 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2975 Flags);
2976}
2977
2978SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2979 EVT VT = Op.getValueType();
2980 SDLoc SL(Op);
2981 SDValue X = Op.getOperand(0);
2982 SDNodeFlags Flags = Op->getFlags();
2983 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2984
2985 if (VT.getScalarType() == MVT::f16) {
2986 // v_exp_f16 (fmul x, log2e)
2987 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2988 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2989
2990 if (VT.isVector())
2991 return SDValue();
2992
2993 // exp(f16 x) ->
2994 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2995
2996 // Nothing in half is a denormal when promoted to f32.
2997 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2998 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2999 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3000 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3001 }
3002
3003 assert(VT == MVT::f32);
3004
3005 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3006 // library behavior. Also, is known-not-daz source sufficient?
3007 if (allowApproxFunc(DAG, Flags)) {
3008 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3009 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3010 }
3011
3012 // Algorithm:
3013 //
3014 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3015 //
3016 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3017 // n = 64*m + j, 0 <= j < 64
3018 //
3019 // e^x = 2^((64*m + j + f)/64)
3020 // = (2^m) * (2^(j/64)) * 2^(f/64)
3021 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3022 //
3023 // f = x*(64/ln(2)) - n
3024 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3025 //
3026 // e^x = (2^m) * (2^(j/64)) * e^r
3027 //
3028 // (2^(j/64)) is precomputed
3029 //
3030 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3031 // e^r = 1 + q
3032 //
3033 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3034 //
3035 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
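 // The implementation below follows the same split-and-scale idea: PH + PL
 // approximates x * log2(e) (or x * log2(10) for exp10) split into high and
 // low parts, E = round(PH) supplies the 2^m factor via ldexp, and the
 // remaining small argument (PH - E) + PL is handled by v_exp_f32.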
3036 SDNodeFlags FlagsNoContract = Flags;
3037 FlagsNoContract.setAllowContract(false);
3038
3039 SDValue PH, PL;
3040 if (Subtarget->hasFastFMAF32()) {
3041 const float c_exp = numbers::log2ef;
3042 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3043 const float c_exp10 = 0x1.a934f0p+1f;
3044 const float cc_exp10 = 0x1.2f346ep-24f;
3045
3046 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3047 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3048
3049 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3050 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3051 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3052 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3053 } else {
3054 const float ch_exp = 0x1.714000p+0f;
3055 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3056
3057 const float ch_exp10 = 0x1.a92000p+1f;
3058 const float cl_exp10 = 0x1.4f0978p-11f;
3059
3060 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3061 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3062
3063 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3064 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3065 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3066 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3067 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3068
3069 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3070
3071 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3072 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3073 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3074 }
3075
3076 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3077
3078 // It is unsafe to contract this fsub into the PH multiply.
3079 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3080
3081 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3082 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3083 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3084
3085 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3086
3087 SDValue UnderflowCheckConst =
3088 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3089
3090 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3091 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3092 SDValue Underflow =
3093 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3094
3095 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3096 const auto &Options = getTargetMachine().Options;
3097
3098 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3099 SDValue OverflowCheckConst =
3100 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3101 SDValue Overflow =
3102 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3103 SDValue Inf =
3104 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3105 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3106 }
3107
3108 return R;
3109}
3110
3111static bool isCtlzOpc(unsigned Opc) {
3112 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3113}
3114
3115static bool isCttzOpc(unsigned Opc) {
3116 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3117}
3118
3119SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3120 SelectionDAG &DAG) const {
3121 auto SL = SDLoc(Op);
3122 auto Opc = Op.getOpcode();
3123 auto Arg = Op.getOperand(0u);
3124 auto ResultVT = Op.getValueType();
3125
3126 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3127 return {};
3128
3129 assert(isCtlzOpc(Opc));
3130 assert(ResultVT == Arg.getValueType());
3131
3132 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3133 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3134 SDValue NewOp;
3135
3136 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3137 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3138 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3139 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3140 } else {
3141 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3142 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3143 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3144 }
3145
3146 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3147}
3148
3149SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3150 SDLoc SL(Op);
3151 SDValue Src = Op.getOperand(0);
3152
3153 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3154 bool Ctlz = isCtlzOpc(Op.getOpcode());
3155 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3156
3157 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3158 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3159 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3160
3161 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3162 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3163 // (cttz hi:lo) -> (umin (ffbl src), 32)
3164 // (ctlz_zero_undef src) -> (ffbh src)
3165 // (cttz_zero_undef src) -> (ffbl src)
3166
3167 // The 64-bit scalar version produces a 32-bit result:
3168 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3169 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3170 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3171 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3172 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3173 if (!ZeroUndef) {
3174 const SDValue ConstVal = DAG.getConstant(
3175 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3176 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3177 }
3178 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3179 }
3180
3181 SDValue Lo, Hi;
3182 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3183
3184 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3185 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3186
3187 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3188 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3189 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3190 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3191
3192 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3193 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3194 if (Ctlz)
3195 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3196 else
3197 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3198
3199 SDValue NewOpr;
3200 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3201 if (!ZeroUndef) {
3202 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3203 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3204 }
3205
3206 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3207}
3208
3209SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3210 bool Signed) const {
3211 // The regular method of converting a 64-bit integer to float roughly consists of
3212 // 2 steps: normalization and rounding. In fact, after normalization, the
3213 // conversion from a 64-bit integer to a float is essentially the same as the
3214 // one from a 32-bit integer. The only difference is that it has more
3215 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3216 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3217 // converted into the correct float number. The basic steps for the unsigned
3218 // conversion are illustrated in the following pseudo code:
3219 //
3220 // f32 uitofp(i64 u) {
3221 // i32 hi, lo = split(u);
3222 // // Only count the leading zeros in hi as we have native support of the
3223 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3224 // // reduced to a 32-bit one automatically.
3225 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3226 // u <<= shamt;
3227 // hi, lo = split(u);
3228 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3229 // // convert it as a 32-bit integer and scale the result back.
3230 // return uitofp(hi) * 2^(32 - shamt);
3231 // }
3232 //
3233 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3234 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3235 // converted instead, followed by negation based on its sign bit.
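 // The (lo != 0) adjust bit below acts as a sticky bit, so the single 32-bit
 // int-to-float conversion rounds the same way the full 64-bit value would.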
3236
3237 SDLoc SL(Op);
3238 SDValue Src = Op.getOperand(0);
3239
3240 SDValue Lo, Hi;
3241 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3242 SDValue Sign;
3243 SDValue ShAmt;
3244 if (Signed && Subtarget->isGCN()) {
3245 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3246 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3247 // account. That is, the maximal shift is
3248 // - 32 if Lo and Hi have opposite signs;
3249 // - 33 if Lo and Hi have the same sign.
3250 //
3251 // Or, MaxShAmt = 33 + OppositeSign, where
3252 //
3253 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3254 // - -1 if Lo and Hi have opposite signs; and
3255 // - 0 otherwise.
3256 //
3257 // All in all, ShAmt is calculated as
3258 //
3259 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3260 //
3261 // or
3262 //
3263 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3264 //
3265 // to reduce the critical path.
3266 SDValue OppositeSign = DAG.getNode(
3267 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3268 DAG.getConstant(31, SL, MVT::i32));
3269 SDValue MaxShAmt =
3270 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3271 OppositeSign);
3272 // Count the leading sign bits.
3273 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3274 // Different from unsigned conversion, the shift should be one bit less to
3275 // preserve the sign bit.
3276 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3277 DAG.getConstant(1, SL, MVT::i32));
3278 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3279 } else {
3280 if (Signed) {
3281 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3282 // absolute value first.
3283 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3284 DAG.getConstant(63, SL, MVT::i64));
3285 SDValue Abs =
3286 DAG.getNode(ISD::XOR, SL, MVT::i64,
3287 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3288 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3289 }
3290 // Count the leading zeros.
3291 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3292 // The shift amount for signed integers is [0, 32].
3293 }
3294 // Normalize the given 64-bit integer.
3295 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3296 // Split it again.
3297 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3298 // Calculate the adjust bit for rounding.
3299 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3300 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3301 DAG.getConstant(1, SL, MVT::i32), Lo);
3302 // Get the 32-bit normalized integer.
3303 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3304 // Convert the normalized 32-bit integer into f32.
3305 unsigned Opc =
3306 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3307 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3308
3309 // Finally, need to scale back the converted floating number as the original
3310 // 64-bit integer is converted as a 32-bit one.
3311 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3312 ShAmt);
3313 // On GCN, use LDEXP directly.
3314 if (Subtarget->isGCN())
3315 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3316
3317 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3318 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3319 // exponent is enough to avoid overflowing into the sign bit.
3320 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3321 DAG.getConstant(23, SL, MVT::i32));
3322 SDValue IVal =
3323 DAG.getNode(ISD::ADD, SL, MVT::i32,
3324 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3325 if (Signed) {
3326 // Set the sign bit.
3327 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3328 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3329 DAG.getConstant(31, SL, MVT::i32));
3330 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3331 }
3332 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3333}
3334
3335SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3336 bool Signed) const {
3337 SDLoc SL(Op);
3338 SDValue Src = Op.getOperand(0);
3339
3340 SDValue Lo, Hi;
3341 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3342
3343 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3344 SL, MVT::f64, Hi);
3345
3346 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3347
3348 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3349 DAG.getConstant(32, SL, MVT::i32));
3350 // TODO: Should this propagate fast-math-flags?
3351 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3352}
3353
3354SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3355 SelectionDAG &DAG) const {
3356 // TODO: Factor out code common with LowerSINT_TO_FP.
3357 EVT DestVT = Op.getValueType();
3358 SDValue Src = Op.getOperand(0);
3359 EVT SrcVT = Src.getValueType();
3360
3361 if (SrcVT == MVT::i16) {
3362 if (DestVT == MVT::f16)
3363 return Op;
3364 SDLoc DL(Op);
3365
3366 // Promote src to i32
3367 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3368 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3369 }
3370
3371 if (DestVT == MVT::bf16) {
3372 SDLoc SL(Op);
3373 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3374 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3375 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3376 }
3377
3378 if (SrcVT != MVT::i64)
3379 return Op;
3380
3381 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3382 SDLoc DL(Op);
3383
3384 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3385 SDValue FPRoundFlag =
3386 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3387 SDValue FPRound =
3388 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3389
3390 return FPRound;
3391 }
3392
3393 if (DestVT == MVT::f32)
3394 return LowerINT_TO_FP32(Op, DAG, false);
3395
3396 assert(DestVT == MVT::f64);
3397 return LowerINT_TO_FP64(Op, DAG, false);
3398}
3399
3400SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3401 SelectionDAG &DAG) const {
3402 EVT DestVT = Op.getValueType();
3403
3404 SDValue Src = Op.getOperand(0);
3405 EVT SrcVT = Src.getValueType();
3406
3407 if (SrcVT == MVT::i16) {
3408 if (DestVT == MVT::f16)
3409 return Op;
3410
3411 SDLoc DL(Op);
3412 // Promote src to i32
3413 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3414 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3415 }
3416
3417 if (DestVT == MVT::bf16) {
3418 SDLoc SL(Op);
3419 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3420 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3421 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3422 }
3423
3424 if (SrcVT != MVT::i64)
3425 return Op;
3426
3427 // TODO: Factor out code common with LowerUINT_TO_FP.
3428
3429 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3430 SDLoc DL(Op);
3431 SDValue Src = Op.getOperand(0);
3432
3433 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3434 SDValue FPRoundFlag =
3435 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3436 SDValue FPRound =
3437 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3438
3439 return FPRound;
3440 }
3441
3442 if (DestVT == MVT::f32)
3443 return LowerINT_TO_FP32(Op, DAG, true);
3444
3445 assert(DestVT == MVT::f64);
3446 return LowerINT_TO_FP64(Op, DAG, true);
3447}
3448
3449 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3450 bool Signed) const {
3451 SDLoc SL(Op);
3452
3453 SDValue Src = Op.getOperand(0);
3454 EVT SrcVT = Src.getValueType();
3455
3456 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3457
3458 // The basic idea of converting a floating point number into a pair of 32-bit
3459 // integers is illustrated as follows:
3460 //
3461 // tf := trunc(val);
3462 // hif := floor(tf * 2^-32);
3463 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3464 // hi := fptoi(hif);
3465 // lo := fptoi(lof);
3466 //
3467 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3468 SDValue Sign;
3469 if (Signed && SrcVT == MVT::f32) {
3470 // However, a 32-bit floating point number has only 23 bits mantissa and
3471 // it's not enough to hold all the significant bits of `lof` if val is
3472 // negative. To avoid the loss of precision, We need to take the absolute
3473 // value after truncating and flip the result back based on the original
3474 // signedness.
3475 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3476 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3477 DAG.getConstant(31, SL, MVT::i32));
3478 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3479 }
3480
3481 SDValue K0, K1;
3482 if (SrcVT == MVT::f64) {
3483 K0 = DAG.getConstantFP(
3484 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3485 SrcVT);
3486 K1 = DAG.getConstantFP(
3487 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3488 SrcVT);
3489 } else {
3490 K0 = DAG.getConstantFP(
3491 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3492 K1 = DAG.getConstantFP(
3493 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3494 }
3495 // TODO: Should this propagate fast-math-flags?
3496 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3497
3498 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3499
3500 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3501
3502 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3503 : ISD::FP_TO_UINT,
3504 SL, MVT::i32, FloorMul);
3505 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3506
3507 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3508 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3509
3510 if (Signed && SrcVT == MVT::f32) {
3511 assert(Sign);
3512 // Flip the result based on the signedness, which is either all 0s or 1s.
3513 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3514 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3515 // r := xor(r, sign) - sign;
3516 Result =
3517 DAG.getNode(ISD::SUB, SL, MVT::i64,
3518 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3519 }
3520
3521 return Result;
3522}
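// Scalar sketch of the unsigned path above (illustrative only, assuming
// <cstdint> and <cmath>, and an in-range, non-negative input). The FMA computes
// lof = tf - hif * 2^32 exactly, so no precision is lost splitting the halves.
static uint64_t f64ToUint64Ref(double Val) {
  double Tf  = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1p-32);   // upper 32 bits, still as a double
  double Lof = std::fma(Hif, -0x1p32, Tf); // lower 32 bits, always non-negative
  uint64_t Hi = static_cast<uint32_t>(Hif);
  uint64_t Lo = static_cast<uint32_t>(Lof);
  return (Hi << 32) | Lo;
}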
3523
3524 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3525 SDLoc DL(Op);
3526 SDValue N0 = Op.getOperand(0);
3527
3528 // Convert to target node to get known bits
3529 if (N0.getValueType() == MVT::f32)
3530 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3531
3532 if (getTargetMachine().Options.UnsafeFPMath) {
3533 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3534 return SDValue();
3535 }
3536
3537 assert(N0.getSimpleValueType() == MVT::f64);
3538
3539 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3540 const unsigned ExpMask = 0x7ff;
3541 const unsigned ExpBiasf64 = 1023;
3542 const unsigned ExpBiasf16 = 15;
3543 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3544 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3545 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3546 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3547 DAG.getConstant(32, DL, MVT::i64));
3548 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3549 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3550 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3551 DAG.getConstant(20, DL, MVT::i64));
3552 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3553 DAG.getConstant(ExpMask, DL, MVT::i32));
3554 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3555 // add the f16 bias (15) to get the biased exponent for the f16 format.
3556 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3557 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3558
3559 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3560 DAG.getConstant(8, DL, MVT::i32));
3561 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3562 DAG.getConstant(0xffe, DL, MVT::i32));
3563
3564 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3565 DAG.getConstant(0x1ff, DL, MVT::i32));
3566 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3567
3568 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3569 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3570
3571 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3572 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3573 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3574 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3575
3576 // N = M | (E << 12);
3577 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3578 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3579 DAG.getConstant(12, DL, MVT::i32)));
3580
3581 // B = clamp(1-E, 0, 13);
3582 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3583 One, E);
3584 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3585 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3586 DAG.getConstant(13, DL, MVT::i32));
3587
3588 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3589 DAG.getConstant(0x1000, DL, MVT::i32));
3590
3591 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3592 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3593 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3594 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3595
3596 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3597 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3598 DAG.getConstant(0x7, DL, MVT::i32));
3599 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3600 DAG.getConstant(2, DL, MVT::i32));
3601 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3602 One, Zero, ISD::SETEQ);
3603 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3604 One, Zero, ISD::SETGT);
3605 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3606 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3607
3608 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3609 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3610 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3611 I, V, ISD::SETEQ);
3612
3613 // Extract the sign bit.
3614 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3615 DAG.getConstant(16, DL, MVT::i32));
3616 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3617 DAG.getConstant(0x8000, DL, MVT::i32));
3618
3619 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3620 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3621}
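// The exponent handling above re-biases from the f64 encoding (bias 1023) to
// the f16 encoding (bias 15). A sketch of just that step for a normal double
// (illustrative only, assuming <cstdint>/<cstring>; the full lowering also
// handles rounding, denormals, overflow and the sign bit):
static int32_t f16BiasedExponentRef(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  int32_t E64 = static_cast<int32_t>((Bits >> 52) & 0x7ff); // biased f64 exponent
  return E64 - 1023 + 15;
}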
3622
3623 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3624 SelectionDAG &DAG) const {
3625 SDValue Src = Op.getOperand(0);
3626 unsigned OpOpcode = Op.getOpcode();
3627 EVT SrcVT = Src.getValueType();
3628 EVT DestVT = Op.getValueType();
3629
3630 // Will be selected natively
3631 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3632 return Op;
3633
3634 if (SrcVT == MVT::bf16) {
3635 SDLoc DL(Op);
3636 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3637 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3638 }
3639
3640 // Promote i16 to i32
3641 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3642 SDLoc DL(Op);
3643
3644 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3645 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3646 }
3647
3648 if (DestVT != MVT::i64)
3649 return Op;
3650
3651 if (SrcVT == MVT::f16 ||
3652 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3653 SDLoc DL(Op);
3654
3655 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3656 unsigned Ext =
3657 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3658 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3659 }
3660
3661 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3662 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3663
3664 return SDValue();
3665}
3666
3667 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3668 SelectionDAG &DAG) const {
3669 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3670 MVT VT = Op.getSimpleValueType();
3671 MVT ScalarVT = VT.getScalarType();
3672
3673 assert(VT.isVector());
3674
3675 SDValue Src = Op.getOperand(0);
3676 SDLoc DL(Op);
3677
3678 // TODO: Don't scalarize on Evergreen?
3679 unsigned NElts = VT.getVectorNumElements();
3680 SmallVector<SDValue, 8> Args;
3681 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3682
3683 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3684 for (unsigned I = 0; I < NElts; ++I)
3685 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3686
3687 return DAG.getBuildVector(VT, DL, Args);
3688}
3689
3690//===----------------------------------------------------------------------===//
3691// Custom DAG optimizations
3692//===----------------------------------------------------------------------===//
3693
3694static bool isU24(SDValue Op, SelectionDAG &DAG) {
3695 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3696}
3697
3698static bool isI24(SDValue Op, SelectionDAG &DAG) {
3699 EVT VT = Op.getValueType();
3700 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3701 // as unsigned 24-bit values.
3702 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3703 }
3704
3705 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3706 DAGCombinerInfo &DCI) const {
3707 SelectionDAG &DAG = DCI.DAG;
3708 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3709 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3710
3711 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3712 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3713 unsigned NewOpcode = Node24->getOpcode();
3714 if (IsIntrin) {
3715 unsigned IID = Node24->getConstantOperandVal(0);
3716 switch (IID) {
3717 case Intrinsic::amdgcn_mul_i24:
3718 NewOpcode = AMDGPUISD::MUL_I24;
3719 break;
3720 case Intrinsic::amdgcn_mul_u24:
3721 NewOpcode = AMDGPUISD::MUL_U24;
3722 break;
3723 case Intrinsic::amdgcn_mulhi_i24:
3724 NewOpcode = AMDGPUISD::MULHI_I24;
3725 break;
3726 case Intrinsic::amdgcn_mulhi_u24:
3727 NewOpcode = AMDGPUISD::MULHI_U24;
3728 break;
3729 default:
3730 llvm_unreachable("Expected 24-bit mul intrinsic");
3731 }
3732 }
3733
3734 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3735
3736 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3737 // the operands to have other uses, but will only perform simplifications that
3738 // involve bypassing some nodes for this user.
3739 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3740 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3741 if (DemandedLHS || DemandedRHS)
3742 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3743 DemandedLHS ? DemandedLHS : LHS,
3744 DemandedRHS ? DemandedRHS : RHS);
3745
3746 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3747 // operands if this node is the only user.
3748 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3749 return SDValue(Node24, 0);
3750 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3751 return SDValue(Node24, 0);
3752
3753 return SDValue();
3754}
3755
3756template <typename IntTy>
3757 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3758 uint32_t Width, const SDLoc &DL) {
3759 if (Width + Offset < 32) {
3760 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3761 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3762 return DAG.getConstant(Result, DL, MVT::i32);
3763 }
3764
3765 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3766}
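// The constant fold above mirrors the BFE_U32/BFE_I32 semantics: extract Width
// bits starting at Offset, then zero- or sign-extend the field. Scalar sketch
// (illustrative only, assuming <cstdint>, two's complement, and Width != 0):
static uint32_t bfeU32Ref(uint32_t Src, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32)
    return (Src << (32 - Offset - Width)) >> (32 - Width); // zero-extend the field
  return Src >> Offset;
}
static int32_t bfeI32Ref(int32_t Src, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32)
    return static_cast<int32_t>(static_cast<uint32_t>(Src)
                                << (32 - Offset - Width)) >> (32 - Width);
  return Src >> Offset; // arithmetic shift sign-extends
}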
3767
3768static bool hasVolatileUser(SDNode *Val) {
3769 for (SDNode *U : Val->uses()) {
3770 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3771 if (M->isVolatile())
3772 return true;
3773 }
3774 }
3775
3776 return false;
3777}
3778
3779 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3780 // i32 vectors are the canonical memory type.
3781 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3782 return false;
3783
3784 if (!VT.isByteSized())
3785 return false;
3786
3787 unsigned Size = VT.getStoreSize();
3788
3789 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3790 return false;
3791
3792 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3793 return false;
3794
3795 return true;
3796}
3797
3798// Replace load of an illegal type with a store of a bitcast to a friendlier
3799// type.
3800 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3801 DAGCombinerInfo &DCI) const {
3802 if (!DCI.isBeforeLegalize())
3803 return SDValue();
3804
3805 LoadSDNode *LN = cast<LoadSDNode>(N);
3806 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3807 return SDValue();
3808
3809 SDLoc SL(N);
3810 SelectionDAG &DAG = DCI.DAG;
3811 EVT VT = LN->getMemoryVT();
3812
3813 unsigned Size = VT.getStoreSize();
3814 Align Alignment = LN->getAlign();
3815 if (Alignment < Size && isTypeLegal(VT)) {
3816 unsigned IsFast;
3817 unsigned AS = LN->getAddressSpace();
3818
3819 // Expand unaligned loads earlier than legalization. Due to visitation order
3820 // problems during legalization, the emitted instructions to pack and unpack
3821 // the bytes again are not eliminated in the case of an unaligned copy.
3822 if (!allowsMisalignedMemoryAccesses(
3823 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3824 if (VT.isVector())
3825 return SplitVectorLoad(SDValue(LN, 0), DAG);
3826
3827 SDValue Ops[2];
3828 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3829
3830 return DAG.getMergeValues(Ops, SDLoc(N));
3831 }
3832
3833 if (!IsFast)
3834 return SDValue();
3835 }
3836
3837 if (!shouldCombineMemoryType(VT))
3838 return SDValue();
3839
3840 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3841
3842 SDValue NewLoad
3843 = DAG.getLoad(NewVT, SL, LN->getChain(),
3844 LN->getBasePtr(), LN->getMemOperand());
3845
3846 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3847 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3848 return SDValue(N, 0);
3849}
3850
3851// Replace store of an illegal type with a store of a bitcast to a friendlier
3852// type.
3853 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3854 DAGCombinerInfo &DCI) const {
3855 if (!DCI.isBeforeLegalize())
3856 return SDValue();
3857
3858 StoreSDNode *SN = cast<StoreSDNode>(N);
3859 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3860 return SDValue();
3861
3862 EVT VT = SN->getMemoryVT();
3863 unsigned Size = VT.getStoreSize();
3864
3865 SDLoc SL(N);
3866 SelectionDAG &DAG = DCI.DAG;
3867 Align Alignment = SN->getAlign();
3868 if (Alignment < Size && isTypeLegal(VT)) {
3869 unsigned IsFast;
3870 unsigned AS = SN->getAddressSpace();
3871
3872 // Expand unaligned stores earlier than legalization. Due to visitation
3873 // order problems during legalization, the emitted instructions to pack and
3874 // unpack the bytes again are not eliminated in the case of an unaligned
3875 // copy.
3876 if (!allowsMisalignedMemoryAccesses(
3877 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3878 if (VT.isVector())
3879 return SplitVectorStore(SDValue(SN, 0), DAG);
3880
3881 return expandUnalignedStore(SN, DAG);
3882 }
3883
3884 if (!IsFast)
3885 return SDValue();
3886 }
3887
3888 if (!shouldCombineMemoryType(VT))
3889 return SDValue();
3890
3891 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3892 SDValue Val = SN->getValue();
3893
3894 //DCI.AddToWorklist(Val.getNode());
3895
3896 bool OtherUses = !Val.hasOneUse();
3897 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3898 if (OtherUses) {
3899 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3900 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3901 }
3902
3903 return DAG.getStore(SN->getChain(), SL, CastVal,
3904 SN->getBasePtr(), SN->getMemOperand());
3905}
3906
3907// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3908// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3909// issues.
3910 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3911 DAGCombinerInfo &DCI) const {
3912 SelectionDAG &DAG = DCI.DAG;
3913 SDValue N0 = N->getOperand(0);
3914
3915 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3916 // (vt2 (truncate (assertzext vt0:x, vt1)))
3917 if (N0.getOpcode() == ISD::TRUNCATE) {
3918 SDValue N1 = N->getOperand(1);
3919 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3920 SDLoc SL(N);
3921
3922 SDValue Src = N0.getOperand(0);
3923 EVT SrcVT = Src.getValueType();
3924 if (SrcVT.bitsGE(ExtVT)) {
3925 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3926 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3927 }
3928 }
3929
3930 return SDValue();
3931}
3932
3933 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3934 SDNode *N, DAGCombinerInfo &DCI) const {
3935 unsigned IID = N->getConstantOperandVal(0);
3936 switch (IID) {
3937 case Intrinsic::amdgcn_mul_i24:
3938 case Intrinsic::amdgcn_mul_u24:
3939 case Intrinsic::amdgcn_mulhi_i24:
3940 case Intrinsic::amdgcn_mulhi_u24:
3941 return simplifyMul24(N, DCI);
3942 case Intrinsic::amdgcn_fract:
3943 case Intrinsic::amdgcn_rsq:
3944 case Intrinsic::amdgcn_rcp_legacy:
3945 case Intrinsic::amdgcn_rsq_legacy:
3946 case Intrinsic::amdgcn_rsq_clamp: {
3947 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3948 SDValue Src = N->getOperand(1);
3949 return Src.isUndef() ? Src : SDValue();
3950 }
3951 case Intrinsic::amdgcn_frexp_exp: {
3952 // frexp_exp (fneg x) -> frexp_exp x
3953 // frexp_exp (fabs x) -> frexp_exp x
3954 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3955 SDValue Src = N->getOperand(1);
3956 SDValue PeekSign = peekFPSignOps(Src);
3957 if (PeekSign == Src)
3958 return SDValue();
3959 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3960 0);
3961 }
3962 default:
3963 return SDValue();
3964 }
3965}
3966
3967/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3968/// binary operation \p Opc to it with the corresponding constant operands.
3969 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3970 DAGCombinerInfo &DCI, const SDLoc &SL,
3971 unsigned Opc, SDValue LHS,
3972 uint32_t ValLo, uint32_t ValHi) const {
3973 SelectionDAG &DAG = DCI.DAG;
3974 SDValue Lo, Hi;
3975 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3976
3977 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3978 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3979
3980 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3981 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3982
3983 // Re-visit the ands. It's possible we eliminated one of them and it could
3984 // simplify the vector.
3985 DCI.AddToWorklist(Lo.getNode());
3986 DCI.AddToWorklist(Hi.getNode());
3987
3988 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3989 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3990}
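// The split above works because bitwise operations act on each 32-bit half
// independently. For example, with AND (illustrative sketch, assuming <cstdint>):
static uint64_t and64SplitRef(uint64_t X, uint32_t ValLo, uint32_t ValHi) {
  uint32_t Lo = static_cast<uint32_t>(X) & ValLo;
  uint32_t Hi = static_cast<uint32_t>(X >> 32) & ValHi;
  return (static_cast<uint64_t>(Hi) << 32) | Lo; // == X & ((uint64_t)ValHi << 32 | ValLo)
}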
3991
3992 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3993 DAGCombinerInfo &DCI) const {
3994 EVT VT = N->getValueType(0);
3995
3996 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3997 if (!RHS)
3998 return SDValue();
3999
4000 SDValue LHS = N->getOperand(0);
4001 unsigned RHSVal = RHS->getZExtValue();
4002 if (!RHSVal)
4003 return LHS;
4004
4005 SDLoc SL(N);
4006 SelectionDAG &DAG = DCI.DAG;
4007
4008 switch (LHS->getOpcode()) {
4009 default:
4010 break;
4011 case ISD::ZERO_EXTEND:
4012 case ISD::SIGN_EXTEND:
4013 case ISD::ANY_EXTEND: {
4014 SDValue X = LHS->getOperand(0);
4015
4016 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4017 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4018 // Prefer build_vector as the canonical form if packed types are legal.
4019 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4020 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
4021 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
4022 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4023 }
4024
4025 // shl (ext x) => zext (shl x), if shift does not overflow int
4026 if (VT != MVT::i64)
4027 break;
4028 KnownBits Known = DAG.computeKnownBits(X);
4029 unsigned LZ = Known.countMinLeadingZeros();
4030 if (LZ < RHSVal)
4031 break;
4032 EVT XVT = X.getValueType();
4033 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4034 return DAG.getZExtOrTrunc(Shl, SL, VT);
4035 }
4036 }
4037
4038 if (VT != MVT::i64)
4039 return SDValue();
4040
4041 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4042
4043 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4044 // common case, splitting this into a move and a 32-bit shift is faster and
4045 // the same code size.
4046 if (RHSVal < 32)
4047 return SDValue();
4048
4049 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4050
4051 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4052 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4053
4054 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4055
4056 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4057 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4058}
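// The i64 fold above relies on a simple identity: for 32 <= C <= 63 the low
// result half is zero and the high half is the low input half shifted by
// C - 32. Scalar sketch (illustrative only, assuming <cstdint>):
static uint64_t shl64SplitRef(uint64_t X, uint32_t C) { // requires 32 <= C <= 63
  uint32_t NewHi = static_cast<uint32_t>(X) << (C - 32);
  return static_cast<uint64_t>(NewHi) << 32; // low 32 bits are zero
}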
4059
4060 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4061 DAGCombinerInfo &DCI) const {
4062 if (N->getValueType(0) != MVT::i64)
4063 return SDValue();
4064
4065 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4066 if (!RHS)
4067 return SDValue();
4068
4069 SelectionDAG &DAG = DCI.DAG;
4070 SDLoc SL(N);
4071 unsigned RHSVal = RHS->getZExtValue();
4072
4073 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4074 if (RHSVal == 32) {
4075 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4076 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4077 DAG.getConstant(31, SL, MVT::i32));
4078
4079 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4080 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4081 }
4082
4083 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4084 if (RHSVal == 63) {
4085 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4086 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4087 DAG.getConstant(31, SL, MVT::i32));
4088 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4089 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4090 }
4091
4092 return SDValue();
4093}
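// Scalar forms of the two folds above (illustrative only, assuming <cstdint>
// and the usual two's-complement conversions): shifting right arithmetically by
// 32 moves the high half into the low half and fills the high half with sign
// copies; shifting by 63 leaves sign copies in both halves.
static int64_t sra64By32Ref(int64_t X) {
  int32_t Hi = static_cast<int32_t>(static_cast<uint64_t>(X) >> 32);
  uint32_t SignFill = Hi < 0 ? 0xffffffffu : 0u;
  return static_cast<int64_t>((static_cast<uint64_t>(SignFill) << 32) |
                              static_cast<uint32_t>(Hi));
}
static int64_t sra64By63Ref(int64_t X) {
  return X < 0 ? -1 : 0; // both halves become copies of the sign bit
}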
4094
4095 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4096 DAGCombinerInfo &DCI) const {
4097 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4098 if (!RHS)
4099 return SDValue();
4100
4101 EVT VT = N->getValueType(0);
4102 SDValue LHS = N->getOperand(0);
4103 unsigned ShiftAmt = RHS->getZExtValue();
4104 SelectionDAG &DAG = DCI.DAG;
4105 SDLoc SL(N);
4106
4107 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4108 // this improves the ability to match BFE patterns in isel.
4109 if (LHS.getOpcode() == ISD::AND) {
4110 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4111 unsigned MaskIdx, MaskLen;
4112 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4113 MaskIdx == ShiftAmt) {
4114 return DAG.getNode(
4115 ISD::AND, SL, VT,
4116 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4117 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4118 }
4119 }
4120 }
4121
4122 if (VT != MVT::i64)
4123 return SDValue();
4124
4125 if (ShiftAmt < 32)
4126 return SDValue();
4127
4128 // srl i64:x, C for C >= 32
4129 // =>
4130 // build_pair (srl hi_32(x), C - 32), 0
4131 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4132
4133 SDValue Hi = getHiHalf64(LHS, DAG);
4134
4135 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4136 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4137
4138 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4139
4140 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4141}
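// The AND fold above is the identity (x & m) >> c == (x >> c) & (m >> c); when
// m is a shifted mask whose lowest set bit sits at position c, the right-hand
// mask becomes a contiguous low mask, which is exactly a BFE pattern. Sketch
// (illustrative only, assuming <cstdint> and c < 32):
static uint32_t srlOfAndRef(uint32_t X, uint32_t Mask, uint32_t C) {
  return (X >> C) & (Mask >> C); // same value as (X & Mask) >> C
}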
4142
4143 SDValue AMDGPUTargetLowering::performTruncateCombine(
4144 SDNode *N, DAGCombinerInfo &DCI) const {
4145 SDLoc SL(N);
4146 SelectionDAG &DAG = DCI.DAG;
4147 EVT VT = N->getValueType(0);
4148 SDValue Src = N->getOperand(0);
4149
4150 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4151 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4152 SDValue Vec = Src.getOperand(0);
4153 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4154 SDValue Elt0 = Vec.getOperand(0);
4155 EVT EltVT = Elt0.getValueType();
4156 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4157 if (EltVT.isFloatingPoint()) {
4158 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4159 EltVT.changeTypeToInteger(), Elt0);
4160 }
4161
4162 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4163 }
4164 }
4165 }
4166
4167 // Equivalent of above for accessing the high element of a vector as an
4168 // integer operation.
4169 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4170 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4171 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4172 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4173 SDValue BV = stripBitcast(Src.getOperand(0));
4174 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4175 BV.getValueType().getVectorNumElements() == 2) {
4176 SDValue SrcElt = BV.getOperand(1);
4177 EVT SrcEltVT = SrcElt.getValueType();
4178 if (SrcEltVT.isFloatingPoint()) {
4179 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4180 SrcEltVT.changeTypeToInteger(), SrcElt);
4181 }
4182
4183 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4184 }
4185 }
4186 }
4187 }
4188
4189 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4190 //
4191 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4192 // i16 (trunc (srl (i32 (trunc x), K)))
4193 if (VT.getScalarSizeInBits() < 32) {
4194 EVT SrcVT = Src.getValueType();
4195 if (SrcVT.getScalarSizeInBits() > 32 &&
4196 (Src.getOpcode() == ISD::SRL ||
4197 Src.getOpcode() == ISD::SRA ||
4198 Src.getOpcode() == ISD::SHL)) {
4199 SDValue Amt = Src.getOperand(1);
4200 KnownBits Known = DAG.computeKnownBits(Amt);
4201
4202 // - For left shifts, do the transform as long as the shift
4203 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4204 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4205 // losing information stored in the high bits when truncating.
4206 const unsigned MaxCstSize =
4207 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4208 if (Known.getMaxValue().ule(MaxCstSize)) {
4209 EVT MidVT = VT.isVector() ?
4210 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4211 VT.getVectorNumElements()) : MVT::i32;
4212
4213 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4214 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4215 Src.getOperand(0));
4216 DCI.AddToWorklist(Trunc.getNode());
4217
4218 if (Amt.getValueType() != NewShiftVT) {
4219 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4220 DCI.AddToWorklist(Amt.getNode());
4221 }
4222
4223 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4224 Trunc, Amt);
4225 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4226 }
4227 }
4228 }
4229
4230 return SDValue();
4231}
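// The shrink above is safe because, for a right shift by K <= 32 - 16, every
// bit that survives the final i16 truncate already lives in the low 32 bits of
// the source. Scalar sketch (illustrative only, assuming <cstdint>):
static uint16_t truncOfSrl64Ref(uint64_t X, uint32_t K) { // requires K <= 16
  return static_cast<uint16_t>(static_cast<uint32_t>(X) >> K);
  // same as static_cast<uint16_t>(X >> K): bits [K, K+15] come from the low word
}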
4232
4233// We need to specifically handle i64 mul here to avoid unnecessary conversion
4234// instructions. If we only match on the legalized i64 mul expansion,
4235// SimplifyDemandedBits will be unable to remove them because there will be
4236// multiple uses due to the separate mul + mulh[su].
4237static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4238 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4239 if (Size <= 32) {
4240 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4241 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4242 }
4243
4244 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4245 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4246
4247 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4248 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4249
4250 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4251}
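// MUL_*24 / MULHI_*24 read only the low 24 bits of each operand and together
// cover the full 48-bit product. A scalar model of the unsigned pair built
// above (illustrative only, assuming <cstdint>):
static uint64_t mulU24PairRef(uint32_t A, uint32_t B) {
  uint64_t Prod = static_cast<uint64_t>(A & 0xffffff) * (B & 0xffffff);
  uint32_t Lo = static_cast<uint32_t>(Prod);       // what MUL_U24 produces
  uint32_t Hi = static_cast<uint32_t>(Prod >> 32); // what MULHI_U24 produces
  return (static_cast<uint64_t>(Hi) << 32) | Lo;   // the BUILD_PAIR result
}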
4252
4253/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4254/// return SDValue().
4255static SDValue getAddOneOp(const SDNode *V) {
4256 if (V->getOpcode() != ISD::ADD)
4257 return SDValue();
4258
4259 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4260}
4261
4262 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4263 DAGCombinerInfo &DCI) const {
4264 assert(N->getOpcode() == ISD::MUL);
4265 EVT VT = N->getValueType(0);
4266
4267 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4268 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4269 // unnecessarily). isDivergent() is used as an approximation of whether the
4270 // value is in an SGPR.
4271 if (!N->isDivergent())
4272 return SDValue();
4273
4274 unsigned Size = VT.getSizeInBits();
4275 if (VT.isVector() || Size > 64)
4276 return SDValue();
4277
4278 SelectionDAG &DAG = DCI.DAG;
4279 SDLoc DL(N);
4280
4281 SDValue N0 = N->getOperand(0);
4282 SDValue N1 = N->getOperand(1);
4283
4284 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4285 // matching.
4286
4287 // mul x, (add y, 1) -> add (mul x, y), x
4288 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4289 SDValue AddOp = getAddOneOp(V.getNode());
4290 if (!AddOp)
4291 return SDValue();
4292
4293 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4294 return U->getOpcode() == ISD::MUL;
4295 }))
4296 return AddOp;
4297
4298 return SDValue();
4299 };
4300
4301 // FIXME: The selection pattern is not properly checking for commuted
4302 // operands, so we have to place the mul in the LHS
4303 if (SDValue MulOper = IsFoldableAdd(N0)) {
4304 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4305 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4306 }
4307
4308 if (SDValue MulOper = IsFoldableAdd(N1)) {
4309 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4310 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4311 }
4312
4313 // There are i16 integer mul/mad.
4314 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4315 return SDValue();
4316
4317 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4318 // in the source into any_extends if the result of the mul is truncated. Since
4319 // we can assume the high bits are whatever we want, use the underlying value
4320 // to avoid the unknown high bits from interfering.
4321 if (N0.getOpcode() == ISD::ANY_EXTEND)
4322 N0 = N0.getOperand(0);
4323
4324 if (N1.getOpcode() == ISD::ANY_EXTEND)
4325 N1 = N1.getOperand(0);
4326
4327 SDValue Mul;
4328
4329 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4330 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4331 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4332 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4333 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4334 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4335 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4336 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4337 } else {
4338 return SDValue();
4339 }
4340
4341 // We need to use sext even for MUL_U24, because MUL_U24 is used
4342 // for signed multiply of 8 and 16-bit types.
4343 return DAG.getSExtOrTrunc(Mul, DL, VT);
4344}
4345
4346SDValue
4347 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4348 DAGCombinerInfo &DCI) const {
4349 if (N->getValueType(0) != MVT::i32)
4350 return SDValue();
4351
4352 SelectionDAG &DAG = DCI.DAG;
4353 SDLoc DL(N);
4354
4355 SDValue N0 = N->getOperand(0);
4356 SDValue N1 = N->getOperand(1);
4357
4358 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4359 // in the source into any_extends if the result of the mul is truncated. Since
4360 // we can assume the high bits are whatever we want, use the underlying value
4361 // to avoid the unknown high bits from interfering.
4362 if (N0.getOpcode() == ISD::ANY_EXTEND)
4363 N0 = N0.getOperand(0);
4364 if (N1.getOpcode() == ISD::ANY_EXTEND)
4365 N1 = N1.getOperand(0);
4366
4367 // Try to use two fast 24-bit multiplies (one for each half of the result)
4368 // instead of one slow extending multiply.
4369 unsigned LoOpcode, HiOpcode;
4370 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4371 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4372 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4373 LoOpcode = AMDGPUISD::MUL_U24;
4374 HiOpcode = AMDGPUISD::MULHI_U24;
4375 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4376 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4377 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4378 LoOpcode = AMDGPUISD::MUL_I24;
4379 HiOpcode = AMDGPUISD::MULHI_I24;
4380 } else {
4381 return SDValue();
4382 }
4383
4384 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4385 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4386 DCI.CombineTo(N, Lo, Hi);
4387 return SDValue(N, 0);
4388}
4389
4390 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4391 DAGCombinerInfo &DCI) const {
4392 EVT VT = N->getValueType(0);
4393
4394 if (!Subtarget->hasMulI24() || VT.isVector())
4395 return SDValue();
4396
4397 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4398 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4399 // unnecessarily). isDivergent() is used as an approximation of whether the
4400 // value is in an SGPR.
4401 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4402 // valu op anyway)
4403 if (Subtarget->hasSMulHi() && !N->isDivergent())
4404 return SDValue();
4405
4406 SelectionDAG &DAG = DCI.DAG;
4407 SDLoc DL(N);
4408
4409 SDValue N0 = N->getOperand(0);
4410 SDValue N1 = N->getOperand(1);
4411
4412 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4413 return SDValue();
4414
4415 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4416 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4417
4418 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4419 DCI.AddToWorklist(Mulhi.getNode());
4420 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4421}
4422
4423 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4424 DAGCombinerInfo &DCI) const {
4425 EVT VT = N->getValueType(0);
4426
4427 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4428 return SDValue();
4429
4430 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4431 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4432 // unnecessarily). isDivergent() is used as an approximation of whether the
4433 // value is in an SGPR.
4434 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4435 // valu op anyway)
4436 if (Subtarget->hasSMulHi() && !N->isDivergent())
4437 return SDValue();
4438
4439 SelectionDAG &DAG = DCI.DAG;
4440 SDLoc DL(N);
4441
4442 SDValue N0 = N->getOperand(0);
4443 SDValue N1 = N->getOperand(1);
4444
4445 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4446 return SDValue();
4447
4448 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4449 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4450
4451 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4452 DCI.AddToWorklist(Mulhi.getNode());
4453 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4454}
4455
4456SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4457 SDValue Op,
4458 const SDLoc &DL,
4459 unsigned Opc) const {
4460 EVT VT = Op.getValueType();
4461 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4462 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4463 LegalVT != MVT::i16))
4464 return SDValue();
4465
4466 if (VT != MVT::i32)
4467 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4468
4469 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4470 if (VT != MVT::i32)
4471 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4472
4473 return FFBX;
4474}
4475
4476// The native instructions return -1 on 0 input. Optimize out a select that
4477// produces -1 on 0.
4478//
4479// TODO: If zero is not undef, we could also do this if the output is compared
4480// against the bitwidth.
4481//
4482// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4483 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4484 SDValue LHS, SDValue RHS,
4485 DAGCombinerInfo &DCI) const {
4486 if (!isNullConstant(Cond.getOperand(1)))
4487 return SDValue();
4488
4489 SelectionDAG &DAG = DCI.DAG;
4490 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4491 SDValue CmpLHS = Cond.getOperand(0);
4492
4493 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4494 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4495 if (CCOpcode == ISD::SETEQ &&
4496 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4497 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4498 unsigned Opc =
4499 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4500 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4501 }
4502
4503 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4504 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4505 if (CCOpcode == ISD::SETNE &&
4506 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4507 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4508 unsigned Opc =
4509 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4510
4511 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4512 }
4513
4514 return SDValue();
4515}
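// The combine above works because the hardware find-first-bit instructions
// already return -1 for a zero input, making the wrapping select redundant.
// Scalar model of the ffbh_u32 behaviour (illustrative only, assuming <cstdint>
// and a compiler that provides __builtin_clz):
static int32_t ffbhU32Ref(uint32_t X) {
  return X == 0 ? -1 : static_cast<int32_t>(__builtin_clz(X));
}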
4516
4517 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4518 unsigned Op,
4519 const SDLoc &SL,
4520 SDValue Cond,
4521 SDValue N1,
4522 SDValue N2) {
4523 SelectionDAG &DAG = DCI.DAG;
4524 EVT VT = N1.getValueType();
4525
4526 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4527 N1.getOperand(0), N2.getOperand(0));
4528 DCI.AddToWorklist(NewSelect.getNode());
4529 return DAG.getNode(Op, SL, VT, NewSelect);
4530}
4531
4532// Pull a free FP operation out of a select so it may fold into uses.
4533//
4534// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4535// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4536//
4537// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4538// select c, (fabs x), +k -> fabs (select c, x, k)
4539SDValue
4540 AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4541 SDValue N) const {
4542 SelectionDAG &DAG = DCI.DAG;
4543 SDValue Cond = N.getOperand(0);
4544 SDValue LHS = N.getOperand(1);
4545 SDValue RHS = N.getOperand(2);
4546
4547 EVT VT = N.getValueType();
4548 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4549 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4550 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4551 return SDValue();
4552
4553 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4554 SDLoc(N), Cond, LHS, RHS);
4555 }
4556
4557 bool Inv = false;
4558 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4559 std::swap(LHS, RHS);
4560 Inv = true;
4561 }
4562
4563 // TODO: Support vector constants.
4564 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4565 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4566 !selectSupportsSourceMods(N.getNode())) {
4567 SDLoc SL(N);
4568 // If one side is an fneg/fabs and the other is a constant, we can push the
4569 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4570 SDValue NewLHS = LHS.getOperand(0);
4571 SDValue NewRHS = RHS;
4572
4573 // Careful: if the neg can be folded up, don't try to pull it back down.
4574 bool ShouldFoldNeg = true;
4575
4576 if (NewLHS.hasOneUse()) {
4577 unsigned Opc = NewLHS.getOpcode();
4578 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4579 ShouldFoldNeg = false;
4580 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4581 ShouldFoldNeg = false;
4582 }
4583
4584 if (ShouldFoldNeg) {
4585 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4586 return SDValue();
4587
4588 // We're going to be forced to use a source modifier anyway, there's no
4589 // point to pulling the negate out unless we can get a size reduction by
4590 // negating the constant.
4591 //
4592 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4593 // about cheaper constants.
4594 if (NewLHS.getOpcode() == ISD::FABS &&
4595 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4596 return SDValue();
4597
4599 return SDValue();
4600
4601 if (LHS.getOpcode() == ISD::FNEG)
4602 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4603
4604 if (Inv)
4605 std::swap(NewLHS, NewRHS);
4606
4607 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4608 Cond, NewLHS, NewRHS);
4609 DCI.AddToWorklist(NewSelect.getNode());
4610 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4611 }
4612 }
4613
4614 return SDValue();
4615}
4616
4617 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4618 DAGCombinerInfo &DCI) const {
4619 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4620 return Folded;
4621
4622 SDValue Cond = N->getOperand(0);
4623 if (Cond.getOpcode() != ISD::SETCC)
4624 return SDValue();
4625
4626 EVT VT = N->getValueType(0);
4627 SDValue LHS = Cond.getOperand(0);
4628 SDValue RHS = Cond.getOperand(1);
4629 SDValue CC = Cond.getOperand(2);
4630
4631 SDValue True = N->getOperand(1);
4632 SDValue False = N->getOperand(2);
4633
4634 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4635 SelectionDAG &DAG = DCI.DAG;
4636 if (DAG.isConstantValueOfAnyType(True) &&
4637 !DAG.isConstantValueOfAnyType(False)) {
4638 // Swap cmp + select pair to move constant to false input.
4639 // This will allow using VOPC cndmasks more often.
4640 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4641
4642 SDLoc SL(N);
4643 ISD::CondCode NewCC =
4644 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4645
4646 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4647 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4648 }
4649
4650 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4651 SDValue MinMax
4652 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4653 // Revisit this node so we can catch min3/max3/med3 patterns.
4654 //DCI.AddToWorklist(MinMax.getNode());
4655 return MinMax;
4656 }
4657 }
4658
4659 // There's no reason to not do this if the condition has other uses.
4660 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4661}
4662
4663static bool isInv2Pi(const APFloat &APF) {
4664 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4665 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4666 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4667
4668 return APF.bitwiseIsEqual(KF16) ||
4669 APF.bitwiseIsEqual(KF32) ||
4670 APF.bitwiseIsEqual(KF64);
4671}
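// The bit patterns above are the f16, f32 and f64 encodings of 1/(2*pi),
// roughly 0.15915494. A quick check for the f32 case (illustrative only,
// assuming <cstdint> and <cstring>):
static bool isInv2PiF32Ref(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits == 0x3e22f983u; // the bits of 0.15915494f, i.e. 1.0f / (2.0f * pi)
}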
4672
4673 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4674// additional cost to negate them.
4675 TargetLowering::NegatibleCost
4676 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4677 if (C->isZero())
4678 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4679
4680 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4681 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4682
4684}
4685
4686 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4687 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4688 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4689 return false;
4690}
4691
4692 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4693 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4694 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4695 return false;
4696}
4697
4698static unsigned inverseMinMax(unsigned Opc) {
4699 switch (Opc) {
4700 case ISD::FMAXNUM:
4701 return ISD::FMINNUM;
4702 case ISD::FMINNUM:
4703 return ISD::FMAXNUM;
4704 case ISD::FMAXNUM_IEEE:
4705 return ISD::FMINNUM_IEEE;
4706 case ISD::FMINNUM_IEEE:
4707 return ISD::FMAXNUM_IEEE;
4708 case ISD::FMAXIMUM:
4709 return ISD::FMINIMUM;
4710 case ISD::FMINIMUM:
4711 return ISD::FMAXIMUM;
4712 case AMDGPUISD::FMAX_LEGACY:
4713 return AMDGPUISD::FMIN_LEGACY;
4714 case AMDGPUISD::FMIN_LEGACY:
4715 return AMDGPUISD::FMAX_LEGACY;
4716 default:
4717 llvm_unreachable("invalid min/max opcode");
4718 }
4719}
4720
4721/// \return true if it's profitable to try to push an fneg into its source
4722/// instruction.
4723 static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4724 // If the input has multiple uses and we can either fold the negate down, or
4725 // the other uses cannot, give up. This both prevents unprofitable
4726 // transformations and infinite loops: we won't repeatedly try to fold around
4727 // a negate that has no 'good' form.
4728 if (N0.hasOneUse()) {
4729 // This may be able to fold into the source, but at a code size cost. Don't
4730 // fold if the fold into the user is free.
4731 if (allUsesHaveSourceMods(N, 0))
4732 return false;
4733 } else {
4734 if (fnegFoldsIntoOp(N0.getNode()) &&
4735 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4736 return false;
4737 }
4738
4739 return true;
4740}
4741
4742 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4743 DAGCombinerInfo &DCI) const {
4744 SelectionDAG &DAG = DCI.DAG;
4745 SDValue N0 = N->getOperand(0);
4746 EVT VT = N->getValueType(0);
4747
4748 unsigned Opc = N0.getOpcode();
4749
4750 if (!shouldFoldFNegIntoSrc(N, N0))
4751 return SDValue();
4752
4753 SDLoc SL(N);
4754 switch (Opc) {
4755 case ISD::FADD: {
4756 if (!mayIgnoreSignedZero(N0))
4757 return SDValue();
4758
4759 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4760 SDValue LHS = N0.getOperand(0);
4761 SDValue RHS = N0.getOperand(1);
4762
4763 if (LHS.getOpcode() != ISD::FNEG)
4764 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4765 else
4766 LHS = LHS.getOperand(0);
4767
4768 if (RHS.getOpcode() != ISD::FNEG)
4769 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4770 else
4771 RHS = RHS.getOperand(0);
4772
4773 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4774 if (Res.getOpcode() != ISD::FADD)
4775 return SDValue(); // Op got folded away.
4776 if (!N0.hasOneUse())
4777 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4778 return Res;
4779 }
4780 case ISD::FMUL:
4781 case AMDGPUISD::FMUL_LEGACY: {
4782 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4783 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4784 SDValue LHS = N0.getOperand(0);
4785 SDValue RHS = N0.getOperand(1);
4786
4787 if (LHS.getOpcode() == ISD::FNEG)
4788 LHS = LHS.getOperand(0);
4789 else if (RHS.getOpcode() == ISD::FNEG)
4790 RHS = RHS.getOperand(0);
4791 else
4792 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4793
4794 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4795 if (Res.getOpcode() != Opc)
4796 return SDValue(); // Op got folded away.
4797 if (!N0.hasOneUse())
4798 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4799 return Res;
4800 }
4801 case ISD::FMA:
4802 case ISD::FMAD: {
4803 // TODO: handle llvm.amdgcn.fma.legacy
4804 if (!mayIgnoreSignedZero(N0))
4805 return SDValue();
4806
4807 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4808 SDValue LHS = N0.getOperand(0);
4809 SDValue MHS = N0.getOperand(1);
4810 SDValue RHS = N0.getOperand(2);
4811
4812 if (LHS.getOpcode() == ISD::FNEG)
4813 LHS = LHS.getOperand(0);
4814 else if (MHS.getOpcode() == ISD::FNEG)
4815 MHS = MHS.getOperand(0);
4816 else
4817 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4818
4819 if (RHS.getOpcode() != ISD::FNEG)
4820 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4821 else
4822 RHS = RHS.getOperand(0);
4823
4824 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4825 if (Res.getOpcode() != Opc)
4826 return SDValue(); // Op got folded away.
4827 if (!N0.hasOneUse())
4828 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4829 return Res;
4830 }
4831 case ISD::FMAXNUM:
4832 case ISD::FMINNUM:
4833 case ISD::FMAXNUM_IEEE:
4834 case ISD::FMINNUM_IEEE:
4835 case ISD::FMINIMUM:
4836 case ISD::FMAXIMUM:
4837 case AMDGPUISD::FMAX_LEGACY:
4838 case AMDGPUISD::FMIN_LEGACY: {
4839 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4840 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4841 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4842 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4843
4844 SDValue LHS = N0.getOperand(0);
4845 SDValue RHS = N0.getOperand(1);
4846
4847 // 0 doesn't have a negated inline immediate.
4848 // TODO: This constant check should be generalized to other operations.
4849 if (isConstantCostlierToNegate(RHS))
4850 return SDValue();
4851
4852 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4853 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4854 unsigned Opposite = inverseMinMax(Opc);
4855
4856 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4857 if (Res.getOpcode() != Opposite)
4858 return SDValue(); // Op got folded away.
4859 if (!N0.hasOneUse())
4860 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4861 return Res;
4862 }
4863 case AMDGPUISD::FMED3: {
4864 SDValue Ops[3];
4865 for (unsigned I = 0; I < 3; ++I)
4866 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4867
4868 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4869 if (Res.getOpcode() != AMDGPUISD::FMED3)
4870 return SDValue(); // Op got folded away.
4871
4872 if (!N0.hasOneUse()) {
4873 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4874 DAG.ReplaceAllUsesWith(N0, Neg);
4875
4876 for (SDNode *U : Neg->uses())
4877 DCI.AddToWorklist(U);
4878 }
4879
4880 return Res;
4881 }
4882 case ISD::FP_EXTEND:
4883 case ISD::FTRUNC:
4884 case ISD::FRINT:
4885 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4886 case ISD::FROUNDEVEN:
4887 case ISD::FSIN:
4888 case ISD::FCANONICALIZE:
4889 case AMDGPUISD::RCP:
4890 case AMDGPUISD::RCP_LEGACY:
4891 case AMDGPUISD::RCP_IFLAG:
4892 case AMDGPUISD::SIN_HW: {
4893 SDValue CvtSrc = N0.getOperand(0);
4894 if (CvtSrc.getOpcode() == ISD::FNEG) {
4895 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4896 // (fneg (rcp (fneg x))) -> (rcp x)
4897 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4898 }
4899
4900 if (!N0.hasOneUse())
4901 return SDValue();
4902
4903 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4904 // (fneg (rcp x)) -> (rcp (fneg x))
4905 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4906 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4907 }
4908 case ISD::FP_ROUND: {
4909 SDValue CvtSrc = N0.getOperand(0);
4910
4911 if (CvtSrc.getOpcode() == ISD::FNEG) {
4912 // (fneg (fp_round (fneg x))) -> (fp_round x)
4913 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4914 CvtSrc.getOperand(0), N0.getOperand(1));
4915 }
4916
4917 if (!N0.hasOneUse())
4918 return SDValue();
4919
4920 // (fneg (fp_round x)) -> (fp_round (fneg x))
4921 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4922 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4923 }
4924 case ISD::FP16_TO_FP: {
4925 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4926 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4927 // Put the fneg back as a legal source operation that can be matched later.
4928 SDLoc SL(N);
4929
4930 SDValue Src = N0.getOperand(0);
4931 EVT SrcVT = Src.getValueType();
4932
4933 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4934 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4935 DAG.getConstant(0x8000, SL, SrcVT));
4936 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4937 }
4938 case ISD::SELECT: {
4939 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4940 // TODO: Invert conditions of foldFreeOpFromSelect
4941 return SDValue();
4942 }
4943 case ISD::BITCAST: {
4944 SDLoc SL(N);
4945 SDValue BCSrc = N0.getOperand(0);
4946 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4947 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4948 if (HighBits.getValueType().getSizeInBits() != 32 ||
4949 !fnegFoldsIntoOp(HighBits.getNode()))
4950 return SDValue();
4951
4952 // f64 fneg only really needs to operate on the high half of the
4953 // register, so try to force it to an f32 operation to help make use of
4954 // source modifiers.
4955 //
4956 //
4957 // fneg (f64 (bitcast (build_vector x, y))) ->
4958 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4959 // (fneg (bitcast i32:y to f32)))
4960
4961 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4962 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4963 SDValue CastBack =
4964 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4965
4966 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4967 Ops.back() = CastBack;
4968 DCI.AddToWorklist(NegHi.getNode());
4969 SDValue Build =
4970 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4971 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4972
4973 if (!N0.hasOneUse())
4974 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4975 return Result;
4976 }
4977
4978 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4979 BCSrc.hasOneUse()) {
4980 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4981 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4982
4983 // TODO: Cast back result for multiple uses is beneficial in some cases.
4984
4985 SDValue LHS =
4986 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4987 SDValue RHS =
4988 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4989
4990 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4991 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4992
4993 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4994 NegRHS);
4995 }
4996
4997 return SDValue();
4998 }
4999 default:
5000 return SDValue();
5001 }
5002}
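// Several cases above (BITCAST, FP16_TO_FP) lean on fneg being a pure sign-bit
// flip of the underlying encoding, so an f64 negate only touches the high
// 32-bit word and an f16 negate is an xor with 0x8000. Scalar sketches
// (illustrative only, assuming <cstdint> and <cstring>):
static double fnegF64ViaHighWordRef(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  Bits ^= 1ull << 63; // only the upper word changes
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}
static uint16_t fnegF16BitsRef(uint16_t HalfBits) {
  return HalfBits ^ 0x8000; // matches the xor emitted for fp16_to_fp sources
}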
5003
5004 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5005 DAGCombinerInfo &DCI) const {
5006 SelectionDAG &DAG = DCI.DAG;
5007 SDValue N0 = N->getOperand(0);
5008
5009 if (!N0.hasOneUse())
5010 return SDValue();
5011
5012 switch (N0.getOpcode()) {
5013 case ISD::FP16_TO_FP: {
5014 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5015 SDLoc SL(N);
5016 SDValue Src = N0.getOperand(0);
5017 EVT SrcVT = Src.getValueType();
5018
5019 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5020 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5021 DAG.getConstant(0x7fff, SL, SrcVT));
5022 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5023 }
5024 default:
5025 return SDValue();
5026 }
5027}
5028
5029 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5030 DAGCombinerInfo &DCI) const {
5031 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5032 if (!CFP)
5033 return SDValue();
5034
5035 // XXX - Should this flush denormals?
5036 const APFloat &Val = CFP->getValueAPF();
5037 APFloat One(Val.getSemantics(), "1.0");
5038 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5039}
5040
5041 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5042 DAGCombinerInfo &DCI) const {
5043 SelectionDAG &DAG = DCI.DAG;
5044 SDLoc DL(N);
5045
5046 switch(N->getOpcode()) {
5047 default:
5048 break;
5049 case ISD::BITCAST: {
5050 EVT DestVT = N->getValueType(0);
5051
5052 // Push casts through vector builds. This helps avoid emitting a large
5053 // number of copies when materializing floating point vector constants.
5054 //
5055 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5056 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5057 if (DestVT.isVector()) {
5058 SDValue Src = N->getOperand(0);
5059 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5060 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5061 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5062 EVT SrcVT = Src.getValueType();
5063 unsigned NElts = DestVT.getVectorNumElements();
5064
5065 if (SrcVT.getVectorNumElements() == NElts) {
5066 EVT DestEltVT = DestVT.getVectorElementType();
5067
5068 SmallVector<SDValue, 8> CastedElts;
5069 SDLoc SL(N);
5070 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5071 SDValue Elt = Src.getOperand(I);
5072 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5073 }
5074
5075 return DAG.getBuildVector(DestVT, SL, CastedElts);
5076 }
5077 }
5078 }
5079
5080 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5081 break;
5082
5083 // Fold bitcasts of constants.
5084 //
5085 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5086 // TODO: Generalize and move to DAGCombiner
5087 SDValue Src = N->getOperand(0);
5088 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5089 SDLoc SL(N);
5090 uint64_t CVal = C->getZExtValue();
5091 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5092 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5093 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5094 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5095 }
5096
5097 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5098 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5099 SDLoc SL(N);
5100 uint64_t CVal = Val.getZExtValue();
5101 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5102 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5103 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5104
5105 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5106 }
5107
5108 break;
5109 }
5110 case ISD::SHL: {
5111 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5112 break;
5113
5114 return performShlCombine(N, DCI);
5115 }
5116 case ISD::SRL: {
5117 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5118 break;
5119
5120 return performSrlCombine(N, DCI);
5121 }
5122 case ISD::SRA: {
5123 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5124 break;
5125
5126 return performSraCombine(N, DCI);
5127 }
5128 case ISD::TRUNCATE:
5129 return performTruncateCombine(N, DCI);
5130 case ISD::MUL:
5131 return performMulCombine(N, DCI);
5132 case AMDGPUISD::MUL_U24:
5133 case AMDGPUISD::MUL_I24: {
5134 if (SDValue Simplified = simplifyMul24(N, DCI))
5135 return Simplified;
5136 break;
5137 }
5138 case AMDGPUISD::MULHI_I24:
5139 case AMDGPUISD::MULHI_U24:
5140 return simplifyMul24(N, DCI);
5141 case ISD::SMUL_LOHI:
5142 case ISD::UMUL_LOHI:
5143 return performMulLoHiCombine(N, DCI);
5144 case ISD::MULHS:
5145 return performMulhsCombine(N, DCI);
5146 case ISD::MULHU:
5147 return performMulhuCombine(N, DCI);
5148 case ISD::SELECT:
5149 return performSelectCombine(N, DCI);
5150 case ISD::FNEG:
5151 return performFNegCombine(N, DCI);
5152 case ISD::FABS:
5153 return performFAbsCombine(N, DCI);
5154 case AMDGPUISD::BFE_I32:
5155 case AMDGPUISD::BFE_U32: {
5156 assert(!N->getValueType(0).isVector() &&
5157 "Vector handling of BFE not implemented");
5158 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5159 if (!Width)
5160 break;
5161
5162 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5163 if (WidthVal == 0)
5164 return DAG.getConstant(0, DL, MVT::i32);
5165
5166 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5167 if (!Offset)
5168 break;
5169
5170 SDValue BitsFrom = N->getOperand(0);
5171 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5172
5173 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5174
5175 if (OffsetVal == 0) {
5176 // This is already sign / zero extended, so try to fold away extra BFEs.
5177 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5178
5179 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5180 if (OpSignBits >= SignBits)
5181 return BitsFrom;
5182
5183 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5184 if (Signed) {
5185 // This is a sign_extend_inreg. Replace it to take advantage of existing
5186 // DAG Combines. If not eliminated, we will match back to BFE during
5187 // selection.
5188
5189 // TODO: The sext_inreg of extended types ends up here, although we could
5190 // handle them in a single BFE.
5191 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5192 DAG.getValueType(SmallVT));
5193 }
5194
5195 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5196 }
5197
5198 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5199 if (Signed) {
5200 return constantFoldBFE<int32_t>(DAG,
5201 CVal->getSExtValue(),
5202 OffsetVal,
5203 WidthVal,
5204 DL);
5205 }
5206
5207 return constantFoldBFE<uint32_t>(DAG,
5208 CVal->getZExtValue(),
5209 OffsetVal,
5210 WidthVal,
5211 DL);
5212 }
5213
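 // If the extracted field reaches bit 31, the BFE is equivalent to a plain
 // shift right by the offset (arithmetic for the signed form, logical for the
 // unsigned form); keep the offset==16/width==16 case for SDWA selection.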
5214 if ((OffsetVal + WidthVal) >= 32 &&
5215 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5216 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5217 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5218 BitsFrom, ShiftVal);
5219 }
5220
5221 if (BitsFrom.hasOneUse()) {
5222 APInt Demanded = APInt::getBitsSet(32,
5223 OffsetVal,
5224 OffsetVal + WidthVal);
5225
5226 KnownBits Known;
5227 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5228 !DCI.isBeforeLegalizeOps());
5229 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5230 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5231 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5232 DCI.CommitTargetLoweringOpt(TLO);
5233 }
5234 }
5235
5236 break;
5237 }
5238 case ISD::LOAD:
5239 return performLoadCombine(N, DCI);
5240 case ISD::STORE:
5241 return performStoreCombine(N, DCI);
5242 case AMDGPUISD::RCP:
5243 case AMDGPUISD::RCP_IFLAG:
5244 return performRcpCombine(N, DCI);
5245 case ISD::AssertZext:
5246 case ISD::AssertSext:
5247 return performAssertSZExtCombine(N, DCI);
5248 case ISD::INTRINSIC_WO_CHAIN:
5249 return performIntrinsicWOChainCombine(N, DCI);
5250 case AMDGPUISD::FMAD_FTZ: {
5251 SDValue N0 = N->getOperand(0);
5252 SDValue N1 = N->getOperand(1);
5253 SDValue N2 = N->getOperand(2);
5254 EVT VT = N->getValueType(0);
5255
5256 // FMAD_FTZ is a FMAD + flush denormals to zero.
5257 // We flush the inputs, the intermediate step, and the output.
5258 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5259 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5260 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5261 if (N0CFP && N1CFP && N2CFP) {
5262 const auto FTZ = [](const APFloat &V) {
5263 if (V.isDenormal()) {
5264 APFloat Zero(V.getSemantics(), 0);
5265 return V.isNegative() ? -Zero : Zero;
5266 }
5267 return V;
5268 };
5269
5270 APFloat V0 = FTZ(N0CFP->getValueAPF());
5271 APFloat V1 = FTZ(N1CFP->getValueAPF());
5272 APFloat V2 = FTZ(N2CFP->getValueAPF());
5273 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5274 V0 = FTZ(V0);
5275 V0.add(V2, APFloat::rmNearestTiesToEven);
5276 return DAG.getConstantFP(FTZ(V0), DL, VT);
5277 }
5278 break;
5279 }
5280 }
5281 return SDValue();
5282}
5283
5284//===----------------------------------------------------------------------===//
5285// Helper functions
5286//===----------------------------------------------------------------------===//
5287
5288 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5289 const TargetRegisterClass *RC,
5290 Register Reg, EVT VT,
5291 const SDLoc &SL,
5292 bool RawReg) const {
5293 MachineFunction &MF = DAG.getMachineFunction();
5294 MachineRegisterInfo &MRI = MF.getRegInfo();
5295 Register VReg;
5296
5297 if (!MRI.isLiveIn(Reg)) {
5298 VReg = MRI.createVirtualRegister(RC);
5299 MRI.addLiveIn(Reg, VReg);
5300 } else {
5301 VReg = MRI.getLiveInVirtReg(Reg);
5302 }
5303
5304 if (RawReg)
5305 return DAG.getRegister(VReg, VT);
5306
5307 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5308}
5309
5310// This may be called multiple times, and nothing prevents creating multiple
5311// objects at the same offset. See if we already defined this object.
5312 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5313 int64_t Offset) {
5314 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5315 if (MFI.getObjectOffset(I) == Offset) {
5316 assert(MFI.getObjectSize(I) == Size);
5317 return I;
5318 }
5319 }
5320
5321 return MFI.CreateFixedObject(Size, Offset, true);
5322}
5323
5324 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5325 EVT VT,
5326 const SDLoc &SL,
5327 int64_t Offset) const {
5328 MachineFunction &MF = DAG.getMachineFunction();
5329 MachineFrameInfo &MFI = MF.getFrameInfo();
5330 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5331
5332 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5333 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5334
5335 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5338}
5339
5340 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5341 const SDLoc &SL,
5342 SDValue Chain,
5343 SDValue ArgVal,
5344 int64_t Offset) const {
5345 MachineFunction &MF = DAG.getMachineFunction();
5346 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5347 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5348
5349 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5350 // Stores to the argument stack area are relative to the stack pointer.
5351 SDValue SP =
5352 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5353 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5354 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5355 MachineMemOperand::MODereferenceable);
5356 return Store;
5357}
5358
5359 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5360 const TargetRegisterClass *RC,
5361 EVT VT, const SDLoc &SL,
5362 const ArgDescriptor &Arg) const {
5363 assert(Arg && "Attempting to load missing argument");
5364
5365 SDValue V = Arg.isRegister() ?
5366 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5367 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5368
5369 if (!Arg.isMasked())
5370 return V;
5371
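 // A masked argument shares its register with other values; shift the field
 // down to bit 0 and mask off the bits that belong to the other arguments.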
5372 unsigned Mask = Arg.getMask();
5373 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5374 V = DAG.getNode(ISD::SRL, SL, VT, V,
5375 DAG.getShiftAmountConstant(Shift, VT, SL));
5376 return DAG.getNode(ISD::AND, SL, VT, V,
5377 DAG.getConstant(Mask >> Shift, SL, VT));
5378}
5379
5380 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5381 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5382 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5383 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5384 uint64_t ArgOffset =
5385 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5386 switch (Param) {
5387 case FIRST_IMPLICIT:
5388 return ArgOffset;
5389 case PRIVATE_BASE:
5390 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5391 case SHARED_BASE:
5392 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5393 case QUEUE_PTR:
5394 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5395 }
5396 llvm_unreachable("unexpected implicit parameter type");
5397}
5398
5399 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5400 const MachineFunction &MF, const ImplicitParameter Param) const {
5401 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5402 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5403 }
5404
5405#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5406
5407const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5408 switch ((AMDGPUISD::NodeType)Opcode) {
5409 case AMDGPUISD::FIRST_NUMBER: break;
5410 // AMDIL DAG nodes
5411 NODE_NAME_CASE(UMUL);
5412 NODE_NAME_CASE(BRANCH_COND);
5413
5414 // AMDGPU DAG nodes
5415 NODE_NAME_CASE(IF)
5416 NODE_NAME_CASE(ELSE)
5417 NODE_NAME_CASE(LOOP)
5418 NODE_NAME_CASE(CALL)
5419 NODE_NAME_CASE(TC_RETURN)
5420 NODE_NAME_CASE(TC_RETURN_GFX)
5421 NODE_NAME_CASE(TC_RETURN_CHAIN)
5422 NODE_NAME_CASE(TRAP)
5423 NODE_NAME_CASE(RET_GLUE)
5424 NODE_NAME_CASE(WAVE_ADDRESS)
5425 NODE_NAME_CASE(RETURN_TO_EPILOG)
5426 NODE_NAME_CASE(ENDPGM)
5427 NODE_NAME_CASE(ENDPGM_TRAP)
5428 NODE_NAME_CASE(SIMULATED_TRAP)
5429 NODE_NAME_CASE(DWORDADDR)
5430 NODE_NAME_CASE(FRACT)
5431 NODE_NAME_CASE(SETCC)
5432 NODE_NAME_CASE(SETREG)
5433 NODE_NAME_CASE(DENORM_MODE)
5434 NODE_NAME_CASE(FMA_W_CHAIN)
5435 NODE_NAME_CASE(FMUL_W_CHAIN)
5436 NODE_NAME_CASE(CLAMP)
5437 NODE_NAME_CASE(COS_HW)
5438 NODE_NAME_CASE(SIN_HW)
5439 NODE_NAME_CASE(FMAX_LEGACY)
5440 NODE_NAME_CASE(FMIN_LEGACY)
5441 NODE_NAME_CASE(FMAX3)
5442 NODE_NAME_CASE(SMAX3)
5443 NODE_NAME_CASE(UMAX3)
5444 NODE_NAME_CASE(FMIN3)
5445 NODE_NAME_CASE(SMIN3)
5446 NODE_NAME_CASE(UMIN3)
5447 NODE_NAME_CASE(FMED3)
5448 NODE_NAME_CASE(SMED3)
5449 NODE_NAME_CASE(UMED3)
5450 NODE_NAME_CASE(FMAXIMUM3)
5451 NODE_NAME_CASE(FMINIMUM3)
5452 NODE_NAME_CASE(FDOT2)
5453 NODE_NAME_CASE(URECIP)
5454 NODE_NAME_CASE(DIV_SCALE)
5455 NODE_NAME_CASE(DIV_FMAS)
5456 NODE_NAME_CASE(DIV_FIXUP)
5457 NODE_NAME_CASE(FMAD_FTZ)
5458 NODE_NAME_CASE(RCP)
5459 NODE_NAME_CASE(RSQ)
5460 NODE_NAME_CASE(RCP_LEGACY)
5461 NODE_NAME_CASE(RCP_IFLAG)
5462 NODE_NAME_CASE(LOG)
5463 NODE_NAME_CASE(EXP)
5464 NODE_NAME_CASE(FMUL_LEGACY)
5465 NODE_NAME_CASE(RSQ_CLAMP)
5466 NODE_NAME_CASE(FP_CLASS)
5467 NODE_NAME_CASE(DOT4)
5468 NODE_NAME_CASE(CARRY)
5469 NODE_NAME_CASE(BORROW)
5470 NODE_NAME_CASE(BFE_U32)
5471 NODE_NAME_CASE(BFE_I32)
5472 NODE_NAME_CASE(BFI)
5473 NODE_NAME_CASE(BFM)
5474 NODE_NAME_CASE(FFBH_U32)
5475 NODE_NAME_CASE(FFBH_I32)
5476 NODE_NAME_CASE(FFBL_B32)
5477 NODE_NAME_CASE(MUL_U24)
5478 NODE_NAME_CASE(MUL_I24)
5479 NODE_NAME_CASE(MULHI_U24)
5480 NODE_NAME_CASE(MULHI_I24)
5481 NODE_NAME_CASE(MAD_U24)
5482 NODE_NAME_CASE(MAD_I24)
5483 NODE_NAME_CASE(MAD_I64_I32)
5484 NODE_NAME_CASE(MAD_U64_U32)
5485 NODE_NAME_CASE(PERM)
5486 NODE_NAME_CASE(TEXTURE_FETCH)
5487 NODE_NAME_CASE(R600_EXPORT)
5488 NODE_NAME_CASE(CONST_ADDRESS)
5489 NODE_NAME_CASE(REGISTER_LOAD)
5490 NODE_NAME_CASE(REGISTER_STORE)
5491 NODE_NAME_CASE(SAMPLE)
5492 NODE_NAME_CASE(SAMPLEB)
5493 NODE_NAME_CASE(SAMPLED)
5494 NODE_NAME_CASE(SAMPLEL)
5495 NODE_NAME_CASE(CVT_F32_UBYTE0)
5496 NODE_NAME_CASE(CVT_F32_UBYTE1)
5497 NODE_NAME_CASE(CVT_F32_UBYTE2)
5498 NODE_NAME_CASE(CVT_F32_UBYTE3)
5499 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5500 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5501 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5502 NODE_NAME_CASE(CVT_PK_I16_I32)
5503 NODE_NAME_CASE(CVT_PK_U16_U32)
5504 NODE_NAME_CASE(FP_TO_FP16)
5505 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5506 NODE_NAME_CASE(CONST_DATA_PTR)
5507 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5508 NODE_NAME_CASE(LDS)
5509 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5510 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5511 NODE_NAME_CASE(DUMMY_CHAIN)
5512 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5513 NODE_NAME_CASE(LOAD_D16_HI)
5514 NODE_NAME_CASE(LOAD_D16_LO)
5515 NODE_NAME_CASE(LOAD_D16_HI_I8)
5516 NODE_NAME_CASE(LOAD_D16_HI_U8)
5517 NODE_NAME_CASE(LOAD_D16_LO_I8)
5518 NODE_NAME_CASE(LOAD_D16_LO_U8)
5519 NODE_NAME_CASE(STORE_MSKOR)
5520 NODE_NAME_CASE(LOAD_CONSTANT)
5521 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5522 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5523 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5524 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5525 NODE_NAME_CASE(DS_ORDERED_COUNT)
5526 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5527 NODE_NAME_CASE(BUFFER_LOAD)
5528 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5529 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5530 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5531 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5532 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5533 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5534 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5535 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5536 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5537 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5538 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5539 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5540 NODE_NAME_CASE(SBUFFER_LOAD)
5541 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5542 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5543 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5544 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5545 NODE_NAME_CASE(BUFFER_STORE)
5546 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5547 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5548 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5549 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5550 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5551 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5552 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5553 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5554 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5555 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5556 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5557 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5558 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5559 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5560 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5561 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5562 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5563 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5564 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5565 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5566 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5567 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5568
5569 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5570 }
5571 return nullptr;
5572}
5573
5574 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5575 SelectionDAG &DAG, int Enabled,
5576 int &RefinementSteps,
5577 bool &UseOneConstNR,
5578 bool Reciprocal) const {
5579 EVT VT = Operand.getValueType();
5580
5581 if (VT == MVT::f32) {
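 // Use the hardware f32 rsq directly; no Newton-Raphson refinement steps are
 // requested.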
5582 RefinementSteps = 0;
5583 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5584 }
5585
5586 // TODO: There is also an f64 rsq instruction, but the documentation is less
5587 // clear on its precision.
5588
5589 return SDValue();
5590}
5591
5592 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5593 SelectionDAG &DAG, int Enabled,
5594 int &RefinementSteps) const {
5595 EVT VT = Operand.getValueType();
5596
5597 if (VT == MVT::f32) {
5598 // Reciprocal, < 1 ulp error.
5599 //
5600 // This reciprocal approximation converges to < 0.5 ulp error with one
5601 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5602
5603 RefinementSteps = 0;
5604 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5605 }
5606
5607 // TODO: There is also an f64 rcp instruction, but the documentation is less
5608 // clear on its precision.
5609
5610 return SDValue();
5611}
5612
5613static unsigned workitemIntrinsicDim(unsigned ID) {
5614 switch (ID) {
5615 case Intrinsic::amdgcn_workitem_id_x:
5616 return 0;
5617 case Intrinsic::amdgcn_workitem_id_y:
5618 return 1;
5619 case Intrinsic::amdgcn_workitem_id_z:
5620 return 2;
5621 default:
5622 llvm_unreachable("not a workitem intrinsic");
5623 }
5624}
5625
5626 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5627 const SDValue Op, KnownBits &Known,
5628 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5629
5630 Known.resetAll(); // Don't know anything.
5631
5632 unsigned Opc = Op.getOpcode();
5633
5634 switch (Opc) {
5635 default:
5636 break;
5637 case AMDGPUISD::CARRY:
5638 case AMDGPUISD::BORROW: {
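 // Carry/borrow results are 0 or 1, so every bit above the low bit is zero.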
5639 Known.Zero = APInt::getHighBitsSet(32, 31);
5640 break;
5641 }
5642
5643 case AMDGPUISD::BFE_I32:
5644 case AMDGPUISD::BFE_U32: {
5645 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5646 if (!CWidth)
5647 return;
5648
5649 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5650
5651 if (Opc == AMDGPUISD::BFE_U32)
5652 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5653
5654 break;
5655 }
5656 case AMDGPUISD::FP_TO_FP16: {
5657 unsigned BitWidth = Known.getBitWidth();
5658
5659 // High bits are zero.
5660 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5661 break;
5662 }
5663 case AMDGPUISD::MUL_U24:
5664 case AMDGPUISD::MUL_I24: {
5665 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5666 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5667 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5668 RHSKnown.countMinTrailingZeros();
5669 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5670 // Skip extra check if all bits are known zeros.
5671 if (TrailZ >= 32)
5672 break;
5673
5674 // Truncate to 24 bits.
5675 LHSKnown = LHSKnown.trunc(24);
5676 RHSKnown = RHSKnown.trunc(24);
5677
5678 if (Opc == AMDGPUISD::MUL_I24) {
5679 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5680 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5681 unsigned MaxValBits = LHSValBits + RHSValBits;
5682 if (MaxValBits > 32)
5683 break;
5684 unsigned SignBits = 32 - MaxValBits + 1;
5685 bool LHSNegative = LHSKnown.isNegative();
5686 bool LHSNonNegative = LHSKnown.isNonNegative();
5687 bool LHSPositive = LHSKnown.isStrictlyPositive();
5688 bool RHSNegative = RHSKnown.isNegative();
5689 bool RHSNonNegative = RHSKnown.isNonNegative();
5690 bool RHSPositive = RHSKnown.isStrictlyPositive();
5691
5692 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5693 Known.Zero.setHighBits(SignBits);
5694 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5695 Known.One.setHighBits(SignBits);
5696 } else {
5697 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5698 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5699 unsigned MaxValBits = LHSValBits + RHSValBits;
5700 if (MaxValBits >= 32)
5701 break;
5702 Known.Zero.setBitsFrom(MaxValBits);
5703 }
5704 break;
5705 }
5706 case AMDGPUISD::PERM: {
5707 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5708 if (!CMask)
5709 return;
5710
5711 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5712 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5713 unsigned Sel = CMask->getZExtValue();
5714
5715 for (unsigned I = 0; I < 32; I += 8) {
5716 unsigned SelBits = Sel & 0xff;
5717 if (SelBits < 4) {
5718 SelBits *= 8;
5719 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5720 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5721 } else if (SelBits < 7) {
5722 SelBits = (SelBits & 3) * 8;
5723 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5724 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5725 } else if (SelBits == 0x0c) {
5726 Known.Zero |= 0xFFull << I;
5727 } else if (SelBits > 0x0c) {
5728 Known.One |= 0xFFull << I;
5729 }
5730 Sel >>= 8;
5731 }
5732 break;
5733 }
5734 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5735 Known.Zero.setHighBits(24);
5736 break;
5737 }
5738 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5739 Known.Zero.setHighBits(16);
5740 break;
5741 }
5742 case AMDGPUISD::LDS: {
5743 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5744 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5745
5746 Known.Zero.setHighBits(16);
5747 Known.Zero.setLowBits(Log2(Alignment));
5748 break;
5749 }
5750 case AMDGPUISD::SMIN3:
5751 case AMDGPUISD::SMAX3:
5752 case AMDGPUISD::SMED3:
5753 case AMDGPUISD::UMIN3:
5754 case AMDGPUISD::UMAX3:
5755 case AMDGPUISD::UMED3: {
5756 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5757 if (Known2.isUnknown())
5758 break;
5759
5760 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5761 if (Known1.isUnknown())
5762 break;
5763
5764 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5765 if (Known0.isUnknown())
5766 break;
5767
5768 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5769 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5770 Known.One = Known0.One & Known1.One & Known2.One;
5771 break;
5772 }
5773 case ISD::INTRINSIC_WO_CHAIN: {
5774 unsigned IID = Op.getConstantOperandVal(0);
5775 switch (IID) {
5776 case Intrinsic::amdgcn_workitem_id_x:
5777 case Intrinsic::amdgcn_workitem_id_y:
5778 case Intrinsic::amdgcn_workitem_id_z: {
5779 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5780 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5781 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5782 break;
5783 }
5784 default:
5785 break;
5786 }
5787 }
5788 }
5789}
5790
5791 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5792 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5793 unsigned Depth) const {
5794 switch (Op.getOpcode()) {
5795 case AMDGPUISD::BFE_I32: {
5796 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5797 if (!Width)
5798 return 1;
5799
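 // A signed extract of Width bits is sign-extended from bit Width - 1, so at
 // least 32 - Width + 1 of the high bits are copies of the sign bit.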
5800 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5801 if (!isNullConstant(Op.getOperand(1)))
5802 return SignBits;
5803
5804 // TODO: Could probably figure something out with non-0 offsets.
5805 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5806 return std::max(SignBits, Op0SignBits);
5807 }
5808
5809 case AMDGPUISD::BFE_U32: {
5810 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5811 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5812 }
5813
5814 case AMDGPUISD::CARRY:
5815 case AMDGPUISD::BORROW:
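 // These produce 0 or 1, so the top 31 bits all match the (zero) sign bit.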
5816 return 31;
5817 case AMDGPUISD::BUFFER_LOAD_BYTE:
5818 return 25;
5819 case AMDGPUISD::BUFFER_LOAD_SHORT:
5820 return 17;
5821 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5822 return 24;
5823 case AMDGPUISD::BUFFER_LOAD_USHORT:
5824 return 16;
5825 case AMDGPUISD::FP_TO_FP16:
5826 return 16;
5827 case AMDGPUISD::SMIN3:
5828 case AMDGPUISD::SMAX3:
5829 case AMDGPUISD::SMED3:
5830 case AMDGPUISD::UMIN3:
5831 case AMDGPUISD::UMAX3:
5832 case AMDGPUISD::UMED3: {
5833 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5834 if (Tmp2 == 1)
5835 return 1; // Early out.
5836
5837 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5838 if (Tmp1 == 1)
5839 return 1; // Early out.
5840
5841 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5842 if (Tmp0 == 1)
5843 return 1; // Early out.
5844
5845 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5846 }
5847 default:
5848 return 1;
5849 }
5850}
5851
5852 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5853 GISelKnownBits &Analysis, Register R,
5854 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5855 unsigned Depth) const {
5856 const MachineInstr *MI = MRI.getVRegDef(R);
5857 if (!MI)
5858 return 1;
5859
5860 // TODO: Check range metadata on MMO.
5861 switch (MI->getOpcode()) {
5862 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5863 return 25;
5864 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5865 return 17;
5866 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5867 return 24;
5868 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5869 return 16;
5870 case AMDGPU::G_AMDGPU_SMED3:
5871 case AMDGPU::G_AMDGPU_UMED3: {
5872 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5873 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5874 if (Tmp2 == 1)
5875 return 1;
5876 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5877 if (Tmp1 == 1)
5878 return 1;
5879 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5880 if (Tmp0 == 1)
5881 return 1;
5882 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5883 }
5884 default:
5885 return 1;
5886 }
5887}
5888
5889 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5890 const SelectionDAG &DAG,
5891 bool SNaN,
5892 unsigned Depth) const {
5893 unsigned Opcode = Op.getOpcode();
5894 switch (Opcode) {
5895 case AMDGPUISD::FMIN_LEGACY:
5896 case AMDGPUISD::FMAX_LEGACY: {
5897 if (SNaN)
5898 return true;
5899
5900 // TODO: Can check no nans on one of the operands for each one, but which
5901 // one?
5902 return false;
5903 }
5904 case AMDGPUISD::FMUL_LEGACY:
5905 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5906 if (SNaN)
5907 return true;
5908 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5909 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5910 }
5911 case AMDGPUISD::FMED3:
5912 case AMDGPUISD::FMIN3:
5913 case AMDGPUISD::FMAX3:
5914 case AMDGPUISD::FMINIMUM3:
5915 case AMDGPUISD::FMAXIMUM3:
5916 case AMDGPUISD::FMAD_FTZ: {
5917 if (SNaN)
5918 return true;
5919 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5920 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5921 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5922 }
5923 case AMDGPUISD::CVT_F32_UBYTE0:
5924 case AMDGPUISD::CVT_F32_UBYTE1:
5925 case AMDGPUISD::CVT_F32_UBYTE2:
5926 case AMDGPUISD::CVT_F32_UBYTE3:
5927 return true;
5928
5929 case AMDGPUISD::RCP:
5930 case AMDGPUISD::RSQ:
5931 case AMDGPUISD::RCP_LEGACY:
5932 case AMDGPUISD::RSQ_CLAMP: {
5933 if (SNaN)
5934 return true;
5935
5936 // TODO: Need is known positive check.
5937 return false;
5938 }
5939 case ISD::FLDEXP:
5940 case AMDGPUISD::FRACT: {
5941 if (SNaN)
5942 return true;
5943 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5944 }
5945 case AMDGPUISD::DIV_SCALE:
5946 case AMDGPUISD::DIV_FMAS:
5947 case AMDGPUISD::DIV_FIXUP:
5948 // TODO: Refine on operands.
5949 return SNaN;
5950 case AMDGPUISD::SIN_HW:
5951 case AMDGPUISD::COS_HW: {
5952 // TODO: Need check for infinity
5953 return SNaN;
5954 }
5955 case ISD::INTRINSIC_WO_CHAIN: {
5956 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5957 // TODO: Handle more intrinsics
5958 switch (IntrinsicID) {
5959 case Intrinsic::amdgcn_cubeid:
5960 return true;
5961
5962 case Intrinsic::amdgcn_frexp_mant: {
5963 if (SNaN)
5964 return true;
5965 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5966 }
5967 case Intrinsic::amdgcn_cvt_pkrtz: {
5968 if (SNaN)
5969 return true;
5970 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5971 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5972 }
5973 case Intrinsic::amdgcn_rcp:
5974 case Intrinsic::amdgcn_rsq:
5975 case Intrinsic::amdgcn_rcp_legacy:
5976 case Intrinsic::amdgcn_rsq_legacy:
5977 case Intrinsic::amdgcn_rsq_clamp: {
5978 if (SNaN)
5979 return true;
5980
5981 // TODO: Need is known positive check.
5982 return false;
5983 }
5984 case Intrinsic::amdgcn_trig_preop:
5985 case Intrinsic::amdgcn_fdot2:
5986 // TODO: Refine on operand
5987 return SNaN;
5988 case Intrinsic::amdgcn_fma_legacy:
5989 if (SNaN)
5990 return true;
5991 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5992 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5993 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5994 default:
5995 return false;
5996 }
5997 }
5998 default:
5999 return false;
6000 }
6001}
6002
6003 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6004 Register N0, Register N1) const {
6005 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6006}
6007
6008 TargetLowering::AtomicExpansionKind
6009 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
6010 switch (RMW->getOperation()) {
6011 case AtomicRMWInst::Nand:
6012 case AtomicRMWInst::FAdd:
6013 case AtomicRMWInst::FSub:
6014 case AtomicRMWInst::FMax:
6015 case AtomicRMWInst::FMin:
6016 return AtomicExpansionKind::CmpXChg;
6017 case AtomicRMWInst::Xchg: {
6018 const DataLayout &DL = RMW->getFunction()->getDataLayout();
6019 unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
6020 if (ValSize == 32 || ValSize == 64)
6021 return AtomicExpansionKind::None;
6022 return AtomicExpansionKind::CmpXChg;
6023 }
6024 default: {
6025 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
6026 unsigned Size = IntTy->getBitWidth();
6027 if (Size == 32 || Size == 64)
6028 return AtomicExpansionKind::None;
6029 }
6030
6031 return AtomicExpansionKind::CmpXChg;
6032 }
6033 }
6034}
6035
6036/// Whether it is profitable to sink the operands of an
6037/// Instruction I to the basic block of I.
6038/// This helps use modifiers such as abs and neg more often.
6039 bool AMDGPUTargetLowering::shouldSinkOperands(
6040 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6041 using namespace PatternMatch;
6042
6043 for (auto &Op : I->operands()) {
6044 // Ensure we are not already sinking this operand.
6045 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
6046 continue;
6047
6048 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
6049 Ops.push_back(&Op);
6050 }
6051
6052 return !Ops.empty();
6053}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_READONLY
Definition: Compiler.h:227
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:266
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1313
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1086
const fltSemantics & getSemantics() const
Definition: APFloat.h:1356
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1104
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1058
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:998
Class for arbitrary precision integers.
Definition: APInt.h:77
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1499
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1371
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1365
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:237
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1129
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:285
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:275
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1368
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
@ FAdd
*p = old + v
Definition: Instructions.h:732
@ FSub
*p = old - v
Definition: Instructions.h:735
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:743
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:739
@ Nand
*p = ~(old & v)
Definition: Instructions.h:717
BinOp getOperation() const
Definition: Instructions.h:786
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:362
iterator_range< arg_iterator > args()
Definition: Function.h:855
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:565
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
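A minimal sketch of the kind of query used when checking whether a value is a candidate for a 24-bit operation, assuming an i32 SDValue Op:
// Op fits in a signed 24-bit value iff its top 9 bits are all copies of the sign bit.
bool FitsInS24 = DAG.ComputeNumSignBits(Op) >= 9;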
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
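A minimal sketch, assuming an SDValue Op:
// Query what is statically known about Op's bits and use it to prove a property.
KnownBits Known = DAG.computeKnownBits(Op);
bool NonNeg = Known.isNonNegative(); // sign bit known to be zero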
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
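A minimal sketch, assuming a 64-bit pointer-typed SDValue Ptr:
// The low two bits being provably zero implies at least 4-byte alignment.
bool AtLeastAlign4 = DAG.MaskedValueIsZero(Ptr, APInt::getLowBitsSet(64, 2));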
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
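A minimal sketch, assuming a pointer SDValue Ptr and an SDLoc SL:
// Address 4 bytes past Ptr, built with flags appropriate for object addressing.
SDValue PlusFour = DAG.getObjectPtrOffset(SL, Ptr, TypeSize::getFixed(4));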
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:574
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
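A minimal sketch, assuming an i64 SDValue Val:
// Decompose Val into its low and high i32 halves via EXTRACT_ELEMENT.
auto [ValLo, ValHi] = DAG.SplitScalar(Val, DL, MVT::i32, MVT::i32);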
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes=true) const
Returns the type for the shift amount of a shift opcode.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be u...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
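A minimal sketch of how such a helper is typically consumed, assuming a LoadSDNode *LD and a TargetLowering reference TLI:
// The pair is {replacement value, new chain}; wire them to the load's two results.
auto [Value, Chain] = TLI.expandUnalignedLoad(LD, DAG);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), Value);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), Chain);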
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:737
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:567
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1262
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:495
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1052
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:797
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:551
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:927
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:954
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:736
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1073
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1077
@ FCANONICALIZE
Returns the platform-specific canonical encoding of a floating-point number.
Definition: ISDOpcodes.h:508
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:515
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1258
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:608
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:581
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:999
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:543
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:756
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1062
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:812
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:682
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:750
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1118
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1229
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:532
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:959
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1115
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:523
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1554
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1534
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
MaybeAlign getAlign(const Function &F, unsigned Index)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
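A minimal, self-contained sketch of splitting a 64-bit immediate into the halves a pair of 32-bit registers would hold:
#include "llvm/Support/MathExtras.h"
uint64_t Imm = 0x123456789ABCDEF0ULL;
uint32_t ImmLo = llvm::Lo_32(Imm); // 0x9ABCDEF0
uint32_t ImmHi = llvm::Hi_32(Imm); // 0x12345678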
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
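A minimal sketch of rounding a byte count up to an alignment boundary:
#include "llvm/Support/Alignment.h"
uint64_t PaddedSize = llvm::alignTo(10, llvm::Align(8)); // 16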
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
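A minimal sketch: the alignment that can still be assumed after adding a byte offset to a pointer of known alignment:
llvm::Align BaseAlign(16);
llvm::Align OffAlign = llvm::commonAlignment(BaseAlign, 8); // Align(8)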
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1440
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:271
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:246
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:272
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:269
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:462
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:141
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:97
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:231
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:134
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:103
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:94
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:258
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...