1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39 // Find a larger type to do a load / store of a vector with.
40 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49 }
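// Worked examples for getEquivalentMemType above (editorial illustration, not
// part of the upstream source):
//   getEquivalentMemType(Ctx, MVT::v2i8)  -> i16   (16 bits fit in one i16)
//   getEquivalentMemType(Ctx, MVT::v4i16) -> v2i32 (64 bits = 2 x i32)
//   getEquivalentMemType(Ctx, MVT::v3i16) -> v3i16 (48 bits, not a multiple of
//                                            32, so the type is returned as-is)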
50
51 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 return DAG.computeKnownBits(Op).countMaxActiveBits();
53 }
54
55 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59 }
60
61 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather than generating calls to memset, memcpy or memmove.
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
75 setOperationAction(ISD::LOAD, MVT::f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
78 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
81 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
84 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
87 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
90 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
93 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
96 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
99 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
117 setOperationAction(ISD::LOAD, MVT::i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
120 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
123 setOperationAction(ISD::LOAD, MVT::f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
126 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
129 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
132 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
135 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
138 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
141 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
144 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
153 setOperationAction(ISD::LOAD, MVT::i128, Promote);
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
157 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
160 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
163 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
166 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
169 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
172 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
175 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
178 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
184 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, MVT::i64, VT,
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
240 setOperationAction(ISD::STORE, MVT::f32, Promote);
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
243 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
246 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
249 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
252 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
255 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
258 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
261 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
264 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
267 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
270 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
273 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
276 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
279 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
282 setOperationAction(ISD::STORE, MVT::i64, Promote);
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
285 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
288 setOperationAction(ISD::STORE, MVT::f64, Promote);
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
291 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
294 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
297 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
300 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
303 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
306 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
309 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
312 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
315 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
318 setOperationAction(ISD::STORE, MVT::i128, Promote);
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
371 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
372 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
373
374 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
375 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
376 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
377 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
379 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
381 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
382
383 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
384 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
385
387
388 // For R600, this is totally unsupported, just custom lower to produce an
389 // error.
391
392 // Library functions. These default to Expand, but we have instructions
393 // for them.
396 MVT::f32, Legal);
397
399 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
401 {MVT::f16, MVT::f32, MVT::f64}, Expand);
402
405 Custom);
406
407 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
408
409 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
410
411 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
412 Expand);
413
414 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
415
416 if (Subtarget->has16BitInsts())
417 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
418 else {
419 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
421 }
422
424 Custom);
425
426 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
427 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
428 // default unless marked custom/legal.
430 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
431 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
432 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
433 MVT::v16f64},
434 Custom);
435
436 if (isTypeLegal(MVT::f16))
438 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
439 Custom);
440
441 // Expand to fneg + fadd.
443
445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
446 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
454 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
455 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
456 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
457 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
458 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
459 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
460 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
461 Custom);
462
464 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
465
466 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
467 for (MVT VT : ScalarIntVTs) {
468 // These should use [SU]DIVREM, so set them to expand
470 Expand);
471
472 // GPU does not have divrem function for signed or unsigned.
474
475 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
477
479
480 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
482 }
483
484 // The hardware supports 32-bit FSHR, but not FSHL.
486
487 // The hardware supports 32-bit ROTR, but not ROTL.
488 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
490
492
496 MVT::i64, Custom);
498
500 Legal);
501
504 MVT::i64, Custom);
505
506 for (auto VT : {MVT::i8, MVT::i16})
508
509 static const MVT::SimpleValueType VectorIntTypes[] = {
510 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
511 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
512
513 for (MVT VT : VectorIntTypes) {
514 // Expand the following operations for the current type by default.
527 VT, Expand);
528 }
529
530 static const MVT::SimpleValueType FloatVectorTypes[] = {
531 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
532 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
533
534 for (MVT VT : FloatVectorTypes) {
547 VT, Expand);
548 }
549
550 // This causes us to use an unrolled select operation rather than expansion with
551 // bit operations. This is in general better, but the alternative using BFI
552 // instructions may be better if the select sources are SGPRs.
553 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
554 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
555
556 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
557 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
558
559 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
560 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
561
562 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
563 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
564
565 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
566 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
567
568 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
569 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
570
571 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
572 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
573
574 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
575 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
576
577 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
578 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
579
580 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
581 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
582
584 setJumpIsExpensive(true);
585
586 // FIXME: This is only partially true. If we have to do vector compares, any
587 // SGPR pair can be a condition register. If we have a uniform condition, we
588 // are better off doing SALU operations, where there is only one SCC. For now,
589 // we don't have a way of knowing during instruction selection if a condition
590 // will be uniform and we always use vector compares. Assume we are using
591 // vector compares until that is fixed.
593
596
598
599 // We want to find all load dependencies for long chains of stores to enable
600 // merging into very wide vectors. The problem is with vectors with > 4
601 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
602 // vectors are a legal type, even though we have to split the loads
603 // usually. When we can more precisely specify load legality per address
604 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
605 // smarter so that they can figure out what to do in 2 iterations without all
606 // N > 4 stores on the same chain.
608
609 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
610 // about these during lowering.
611 MaxStoresPerMemcpy = 0xffffffff;
612 MaxStoresPerMemmove = 0xffffffff;
613 MaxStoresPerMemset = 0xffffffff;
614
615 // The expansion for 64-bit division is enormous.
617 addBypassSlowDiv(64, 32);
618
629
633}
634
635 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
636 if (getTargetMachine().Options.NoSignedZerosFPMath)
637 return true;
638
639 const auto Flags = Op.getNode()->getFlags();
640 if (Flags.hasNoSignedZeros())
641 return true;
642
643 return false;
644}
645
646//===----------------------------------------------------------------------===//
647// Target Information
648//===----------------------------------------------------------------------===//
649
651static bool fnegFoldsIntoOpcode(unsigned Opc) {
652 switch (Opc) {
653 case ISD::FADD:
654 case ISD::FSUB:
655 case ISD::FMUL:
656 case ISD::FMA:
657 case ISD::FMAD:
658 case ISD::FMINNUM:
659 case ISD::FMAXNUM:
662 case ISD::FMINIMUM:
663 case ISD::FMAXIMUM:
664 case ISD::SELECT:
665 case ISD::FSIN:
666 case ISD::FTRUNC:
667 case ISD::FRINT:
668 case ISD::FNEARBYINT:
669 case ISD::FROUNDEVEN:
671 case AMDGPUISD::RCP:
678 case AMDGPUISD::FMED3:
679 // TODO: handle llvm.amdgcn.fma.legacy
680 return true;
681 case ISD::BITCAST:
682 llvm_unreachable("bitcast is special cased");
683 default:
684 return false;
685 }
686}
687
688static bool fnegFoldsIntoOp(const SDNode *N) {
689 unsigned Opc = N->getOpcode();
690 if (Opc == ISD::BITCAST) {
691 // TODO: Is there a benefit to checking the conditions performFNegCombine
692 // does? We don't for the other cases.
693 SDValue BCSrc = N->getOperand(0);
694 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
695 return BCSrc.getNumOperands() == 2 &&
696 BCSrc.getOperand(1).getValueSizeInBits() == 32;
697 }
698
699 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
700 }
701
702 return fnegFoldsIntoOpcode(Opc);
703}
704
705 /// Returns true if the operation will definitely need to use a 64-bit
706 /// encoding, and thus will use a VOP3 encoding regardless of the source
707 /// modifiers.
709 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
710 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
711 VT == MVT::f64;
712}
713
714 /// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
715 /// this type when used for ISD::SELECT.
717 static bool selectSupportsSourceMods(const SDNode *N) {
718 // TODO: Only applies if select will be vector
719 return N->getValueType(0) == MVT::f32;
720}
721
722// Most FP instructions support source modifiers, but this could be refined
723// slightly.
725static bool hasSourceMods(const SDNode *N) {
726 if (isa<MemSDNode>(N))
727 return false;
728
729 switch (N->getOpcode()) {
730 case ISD::CopyToReg:
731 case ISD::FDIV:
732 case ISD::FREM:
733 case ISD::INLINEASM:
737
738 // TODO: Should really be looking at the users of the bitcast. These are
739 // problematic because bitcasts are used to legalize all stores to integer
740 // types.
741 case ISD::BITCAST:
742 return false;
744 switch (N->getConstantOperandVal(0)) {
745 case Intrinsic::amdgcn_interp_p1:
746 case Intrinsic::amdgcn_interp_p2:
747 case Intrinsic::amdgcn_interp_mov:
748 case Intrinsic::amdgcn_interp_p1_f16:
749 case Intrinsic::amdgcn_interp_p2_f16:
750 return false;
751 default:
752 return true;
753 }
754 }
755 case ISD::SELECT:
757 default:
758 return true;
759 }
760}
761
762 static bool allUsesHaveSourceMods(const SDNode *N,
763 unsigned CostThreshold) {
764 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
765 // it is truly free to use a source modifier in all cases. If there are
766 // multiple users and each of them would be forced into VOP3, there will be
767 // a code size increase. Try to avoid increasing code size unless we know it
768 // will save on the instruction count.
769 unsigned NumMayIncreaseSize = 0;
770 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
771
772 assert(!N->use_empty());
773
774 // XXX - Should this limit number of uses to check?
775 for (const SDNode *U : N->users()) {
776 if (!hasSourceMods(U))
777 return false;
778
779 if (!opMustUseVOP3Encoding(U, VT)) {
780 if (++NumMayIncreaseSize > CostThreshold)
781 return false;
782 }
783 }
784
785 return true;
786}
787
788 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
789 ISD::NodeType ExtendKind) const {
790 assert(!VT.isVector() && "only scalar expected");
791
792 // Round to the next multiple of 32-bits.
793 unsigned Size = VT.getSizeInBits();
794 if (Size <= 32)
795 return MVT::i32;
796 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
797}
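// Worked examples for getTypeForExtReturn above (editorial illustration, not
// part of the upstream source): i1, i8 and i16 return values widen to i32;
// i48 rounds up to 32 * ((48 + 31) / 32) = 64 bits, i.e. i64; i96 is already
// a multiple of 32 bits and stays i96.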
798
800 return MVT::i32;
801}
802
804 return true;
805}
806
807// The backend supports 32 and 64 bit floating point immediates.
808// FIXME: Why are we reporting vectors of FP immediates as legal?
810 bool ForCodeSize) const {
811 EVT ScalarVT = VT.getScalarType();
812 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
813 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
814}
815
816// We don't want to shrink f64 / f32 constants.
818 EVT ScalarVT = VT.getScalarType();
819 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
820}
821
823 ISD::LoadExtType ExtTy,
824 EVT NewVT) const {
825 // TODO: This may be worth removing. Check regression tests for diffs.
827 return false;
828
829 unsigned NewSize = NewVT.getStoreSizeInBits();
830
831 // If we are reducing to a 32-bit load or a smaller multi-dword load,
832 // this is always better.
833 if (NewSize >= 32)
834 return true;
835
836 EVT OldVT = N->getValueType(0);
837 unsigned OldSize = OldVT.getStoreSizeInBits();
838
839 MemSDNode *MN = cast<MemSDNode>(N);
840 unsigned AS = MN->getAddressSpace();
841 // Do not shrink an aligned scalar load to sub-dword.
842 // Scalar engine cannot do sub-dword loads.
843 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
844 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
847 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
848 MN->isInvariant())) &&
850 return false;
851
852 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
853 // extloads, so doing one requires using a buffer_load. In cases where we
854 // still couldn't use a scalar load, using the wider load shouldn't really
855 // hurt anything.
856
857 // If the old size already had to be an extload, there's no harm in continuing
858 // to reduce the width.
859 return (OldSize < 32);
860}
861
863 const SelectionDAG &DAG,
864 const MachineMemOperand &MMO) const {
865
866 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
867
868 if (LoadTy.getScalarType() == MVT::i32)
869 return false;
870
871 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
872 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
873
874 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
875 return false;
876
877 unsigned Fast = 0;
879 CastTy, MMO, &Fast) &&
880 Fast;
881}
882
883// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
884// profitable with the expansion for 64-bit since it's generally good to
885// speculate things.
887 return true;
888}
889
891 return true;
892}
893
895 switch (N->getOpcode()) {
896 case ISD::EntryToken:
897 case ISD::TokenFactor:
898 return true;
900 unsigned IntrID = N->getConstantOperandVal(0);
902 }
904 unsigned IntrID = N->getConstantOperandVal(1);
906 }
907 case ISD::LOAD:
908 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
910 return true;
911 return false;
912 case AMDGPUISD::SETCC: // ballot-style instruction
913 return true;
914 }
915 return false;
916}
917
918 SDValue AMDGPUTargetLowering::getNegatedExpression(
919 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
920 NegatibleCost &Cost, unsigned Depth) const {
921
922 switch (Op.getOpcode()) {
923 case ISD::FMA:
924 case ISD::FMAD: {
925 // Negating a fma is not free if it has users without source mods.
926 if (!allUsesHaveSourceMods(Op.getNode()))
927 return SDValue();
928 break;
929 }
930 case AMDGPUISD::RCP: {
931 SDValue Src = Op.getOperand(0);
932 EVT VT = Op.getValueType();
933 SDLoc SL(Op);
934
935 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
936 ForCodeSize, Cost, Depth + 1);
937 if (NegSrc)
938 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
939 return SDValue();
940 }
941 default:
942 break;
943 }
944
945 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
946 ForCodeSize, Cost, Depth);
947}
948
949//===---------------------------------------------------------------------===//
950// Target Properties
951//===---------------------------------------------------------------------===//
952
955
956 // Packed operations do not have a fabs modifier.
957 return VT == MVT::f32 || VT == MVT::f64 ||
958 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
959}
960
963 // Report this based on the end legalized type.
964 VT = VT.getScalarType();
965 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
966}
967
969 unsigned NumElem,
970 unsigned AS) const {
971 return true;
972}
973
975 // There are few operations which truly have vector input operands. Any vector
976 // operation is going to involve operations on each component, and a
977 // build_vector will be a copy per element, so it always makes sense to use a
978 // build_vector input in place of the extracted element to avoid a copy into a
979 // super register.
980 //
981 // We should probably only do this if all users are extracts only, but this
982 // should be the common case.
983 return true;
984}
985
987 // Truncate is just accessing a subregister.
988
989 unsigned SrcSize = Source.getSizeInBits();
990 unsigned DestSize = Dest.getSizeInBits();
991
992 return DestSize < SrcSize && DestSize % 32 == 0;
993}
994
996 // Truncate is just accessing a subregister.
997
998 unsigned SrcSize = Source->getScalarSizeInBits();
999 unsigned DestSize = Dest->getScalarSizeInBits();
1000
1001 if (DestSize == 16 && Subtarget->has16BitInsts())
1002 return SrcSize >= 32;
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1005}
1006
1008 unsigned SrcSize = Src->getScalarSizeInBits();
1009 unsigned DestSize = Dest->getScalarSizeInBits();
1010
1011 if (SrcSize == 16 && Subtarget->has16BitInsts())
1012 return DestSize >= 32;
1013
1014 return SrcSize == 32 && DestSize == 64;
1015}
1016
1018 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1019 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1020 // this will enable reducing 64-bit operations to 32-bit, which is always
1021 // good.
1022
1023 if (Src == MVT::i16)
1024 return Dest == MVT::i32 || Dest == MVT::i64;
1025
1026 return Src == MVT::i32 && Dest == MVT::i64;
1027}
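// Editorial illustration (not part of the upstream source): zext i32 -> i64
// becomes the register pair {src, 0}, so the only cost is a mov of 0 into the
// high half, which isZExtFree above treats as free. i16 -> i32/i64 is likewise
// free because 16-bit values already occupy 32-bit registers.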
1028
1030 EVT DestVT) const {
1031 switch (N->getOpcode()) {
1032 case ISD::ADD:
1033 case ISD::SUB:
1034 case ISD::SHL:
1035 case ISD::SRL:
1036 case ISD::SRA:
1037 case ISD::AND:
1038 case ISD::OR:
1039 case ISD::XOR:
1040 case ISD::MUL:
1041 case ISD::SETCC:
1042 case ISD::SELECT:
1043 if (Subtarget->has16BitInsts() &&
1044 (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
1045 // Don't narrow back down to i16 if promoted to i32 already.
1046 if (!N->isDivergent() && DestVT.isInteger() &&
1047 DestVT.getScalarSizeInBits() > 1 &&
1048 DestVT.getScalarSizeInBits() <= 16 &&
1049 SrcVT.getScalarSizeInBits() > 16) {
1050 return false;
1051 }
1052 }
1053 return true;
1054 default:
1055 break;
1056 }
1057
1058 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1059 // limited number of native 64-bit operations. Shrinking an operation to fit
1060 // in a single 32-bit register should always be helpful. As currently used,
1061 // this is much less general than the name suggests, and is only used in
1062 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1063 // not profitable, and may actually be harmful.
1064 if (isa<LoadSDNode>(N))
1065 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1066
1067 return true;
1068}
1069
1071 const SDNode* N, CombineLevel Level) const {
1072 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1073 N->getOpcode() == ISD::SRL) &&
1074 "Expected shift op");
1075
1076 SDValue ShiftLHS = N->getOperand(0);
1077 if (!ShiftLHS->hasOneUse())
1078 return false;
1079
1080 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1081 !ShiftLHS.getOperand(0)->hasOneUse())
1082 return false;
1083
1084 // Always commute pre-type legalization and right shifts.
1085 // We're looking for shl(or(x,y),z) patterns.
1087 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1088 return true;
1089
1090 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1091 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1092 (N->user_begin()->getOpcode() == ISD::SRA ||
1093 N->user_begin()->getOpcode() == ISD::SRL))
1094 return false;
1095
1096 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1097 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1098 if (LHS.getOpcode() != ISD::SHL)
1099 return false;
1100 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1101 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1102 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1103 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1104 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1105 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1106 };
1107 SDValue LHS = N->getOperand(0).getOperand(0);
1108 SDValue RHS = N->getOperand(0).getOperand(1);
1109 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1110}
1111
1112//===---------------------------------------------------------------------===//
1113// TargetLowering Callbacks
1114//===---------------------------------------------------------------------===//
1115
1117 bool IsVarArg) {
1118 switch (CC) {
1126 return CC_AMDGPU;
1129 return CC_AMDGPU_CS_CHAIN;
1130 case CallingConv::C:
1131 case CallingConv::Fast:
1132 case CallingConv::Cold:
1133 return CC_AMDGPU_Func;
1135 return CC_SI_Gfx;
1138 default:
1139 report_fatal_error("Unsupported calling convention for call");
1140 }
1141}
1142
1144 bool IsVarArg) {
1145 switch (CC) {
1148 llvm_unreachable("kernels should not be handled here");
1158 return RetCC_SI_Shader;
1160 return RetCC_SI_Gfx;
1161 case CallingConv::C:
1162 case CallingConv::Fast:
1163 case CallingConv::Cold:
1164 return RetCC_AMDGPU_Func;
1165 default:
1166 report_fatal_error("Unsupported calling convention.");
1167 }
1168}
1169
1170/// The SelectionDAGBuilder will automatically promote function arguments
1171/// with illegal types. However, this does not work for the AMDGPU targets
1172/// since the function arguments are stored in memory as these illegal types.
1173/// In order to handle this properly we need to get the original types sizes
1174 /// from the LLVM IR Function and fix up the ISD::InputArg values before
1175/// passing them to AnalyzeFormalArguments()
1176
1177/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1178/// input values across multiple registers. Each item in the Ins array
1179/// represents a single value that will be stored in registers. Ins[x].VT is
1180/// the value type of the value that will be stored in the register, so
1181/// whatever SDNode we lower the argument to needs to be this type.
1182///
1183/// In order to correctly lower the arguments we need to know the size of each
1184/// argument. Since Ins[x].VT gives us the size of the register that will
1185/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1186/// for the original function argument so that we can deduce the correct memory
1187/// type to use for Ins[x]. In most cases the correct memory type will be
1188/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1189/// we have a kernel argument of type v8i8, this argument will be split into
1190/// 8 parts and each part will be represented by its own item in the Ins array.
1191/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1192/// the argument before it was split. From this, we deduce that the memory type
1193/// for each individual part is i8. We pass the memory type as LocVT to the
1194/// calling convention analysis function and the register type (Ins[x].VT) as
1195/// the ValVT.
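// Editorial schematic of the v8i8 example from the comment above (not part of
// the upstream source). Each of the 8 Ins entries produced for the split
// argument carries:
//   Ins[x].ArgVT = v8i8       // the original, pre-split IR type
//   Ins[x].VT    = <reg type> // whatever register type legalization chose
// and this function records LocVT = i8, the per-part in-memory type, so the
// calling convention analysis sees the layout the kernel argument actually has
// in memory.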
1196 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1197 CCState &State,
1198 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1199 const MachineFunction &MF = State.getMachineFunction();
1200 const Function &Fn = MF.getFunction();
1201 LLVMContext &Ctx = Fn.getParent()->getContext();
1202 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1203 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1205
1206 Align MaxAlign = Align(1);
1207 uint64_t ExplicitArgOffset = 0;
1208 const DataLayout &DL = Fn.getDataLayout();
1209
1210 unsigned InIndex = 0;
1211
1212 for (const Argument &Arg : Fn.args()) {
1213 const bool IsByRef = Arg.hasByRefAttr();
1214 Type *BaseArgTy = Arg.getType();
1215 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1216 Align Alignment = DL.getValueOrABITypeAlignment(
1217 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1218 MaxAlign = std::max(Alignment, MaxAlign);
1219 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1220
1221 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1222 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1223
1224 // We're basically throwing away everything passed into us and starting over
1225 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1226 // to us as computed in Ins.
1227 //
1228 // We also need to figure out what type legalization is trying to do to get
1229 // the correct memory offsets.
1230
1231 SmallVector<EVT, 16> ValueVTs;
1233 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1234
1235 for (unsigned Value = 0, NumValues = ValueVTs.size();
1236 Value != NumValues; ++Value) {
1237 uint64_t BasePartOffset = Offsets[Value];
1238
1239 EVT ArgVT = ValueVTs[Value];
1240 EVT MemVT = ArgVT;
1241 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1242 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1243
1244 if (NumRegs == 1) {
1245 // This argument is not split, so the IR type is the memory type.
1246 if (ArgVT.isExtended()) {
1247 // We have an extended type, like i24, so we should just use the
1248 // register type.
1249 MemVT = RegisterVT;
1250 } else {
1251 MemVT = ArgVT;
1252 }
1253 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1254 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1255 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1256 // We have a vector value which has been split into a vector with
1257 // the same scalar type, but fewer elements. This should handle
1258 // all the floating-point vector types.
1259 MemVT = RegisterVT;
1260 } else if (ArgVT.isVector() &&
1261 ArgVT.getVectorNumElements() == NumRegs) {
1262 // This arg has been split so that each element is stored in a separate
1263 // register.
1264 MemVT = ArgVT.getScalarType();
1265 } else if (ArgVT.isExtended()) {
1266 // We have an extended type, like i65.
1267 MemVT = RegisterVT;
1268 } else {
1269 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1270 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1271 if (RegisterVT.isInteger()) {
1272 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1273 } else if (RegisterVT.isVector()) {
1274 assert(!RegisterVT.getScalarType().isFloatingPoint());
1275 unsigned NumElements = RegisterVT.getVectorNumElements();
1276 assert(MemoryBits % NumElements == 0);
1277 // This vector type has been split into another vector type with
1278 // a different element size.
1279 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1280 MemoryBits / NumElements);
1281 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1282 } else {
1283 llvm_unreachable("cannot deduce memory type.");
1284 }
1285 }
1286
1287 // Convert one element vectors to scalar.
1288 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1289 MemVT = MemVT.getScalarType();
1290
1291 // Round up vec3/vec5 argument.
1292 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1293 MemVT = MemVT.getPow2VectorType(State.getContext());
1294 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1295 MemVT = MemVT.getRoundIntegerType(State.getContext());
1296 }
1297
1298 unsigned PartOffset = 0;
1299 for (unsigned i = 0; i != NumRegs; ++i) {
1300 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1301 BasePartOffset + PartOffset,
1302 MemVT.getSimpleVT(),
1304 PartOffset += MemVT.getStoreSize();
1305 }
1306 }
1307 }
1308}
1309
1311 SDValue Chain, CallingConv::ID CallConv,
1312 bool isVarArg,
1314 const SmallVectorImpl<SDValue> &OutVals,
1315 const SDLoc &DL, SelectionDAG &DAG) const {
1316 // FIXME: Fails for r600 tests
1317 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1318 // "wave terminate should not have return values");
1319 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1320}
1321
1322//===---------------------------------------------------------------------===//
1323// Target specific lowering
1324//===---------------------------------------------------------------------===//
1325
1326/// Selects the correct CCAssignFn for a given CallingConvention value.
1328 bool IsVarArg) {
1329 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1330}
1331
1333 bool IsVarArg) {
1335}
1336
1338 SelectionDAG &DAG,
1339 MachineFrameInfo &MFI,
1340 int ClobberedFI) const {
1341 SmallVector<SDValue, 8> ArgChains;
1342 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1343 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1344
1345 // Include the original chain at the beginning of the list. When this is
1346 // used by target LowerCall hooks, this helps legalize find the
1347 // CALLSEQ_BEGIN node.
1348 ArgChains.push_back(Chain);
1349
1350 // Add a chain value for each stack argument load that overlaps the clobbered area.
1351 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1352 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1353 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1354 if (FI->getIndex() < 0) {
1355 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1356 int64_t InLastByte = InFirstByte;
1357 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1358
1359 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1360 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1361 ArgChains.push_back(SDValue(L, 1));
1362 }
1363 }
1364 }
1365 }
1366
1367 // Build a tokenfactor for all the chains.
1368 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1369}
1370
1373 StringRef Reason) const {
1374 SDValue Callee = CLI.Callee;
1375 SelectionDAG &DAG = CLI.DAG;
1376
1377 const Function &Fn = DAG.getMachineFunction().getFunction();
1378
1379 StringRef FuncName("<unknown>");
1380
1381 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1382 FuncName = G->getSymbol();
1383 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1384 FuncName = G->getGlobal()->getName();
1385
1387 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1388 DAG.getContext()->diagnose(NoCalls);
1389
1390 if (!CLI.IsTailCall) {
1391 for (ISD::InputArg &Arg : CLI.Ins)
1392 InVals.push_back(DAG.getUNDEF(Arg.VT));
1393 }
1394
1395 return DAG.getEntryNode();
1396}
1397
1399 SmallVectorImpl<SDValue> &InVals) const {
1400 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1401}
1402
1404 SelectionDAG &DAG) const {
1405 const Function &Fn = DAG.getMachineFunction().getFunction();
1406
1407 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1408 SDLoc(Op).getDebugLoc());
1409 DAG.getContext()->diagnose(NoDynamicAlloca);
1410 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1411 return DAG.getMergeValues(Ops, SDLoc());
1412}
1413
1415 SelectionDAG &DAG) const {
1416 switch (Op.getOpcode()) {
1417 default:
1418 Op->print(errs(), &DAG);
1419 llvm_unreachable("Custom lowering code for this "
1420 "instruction is not implemented yet!");
1421 break;
1423 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1425 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1426 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1427 case ISD::FREM: return LowerFREM(Op, DAG);
1428 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1429 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1430 case ISD::FRINT: return LowerFRINT(Op, DAG);
1431 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1432 case ISD::FROUNDEVEN:
1433 return LowerFROUNDEVEN(Op, DAG);
1434 case ISD::FROUND: return LowerFROUND(Op, DAG);
1435 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1436 case ISD::FLOG2:
1437 return LowerFLOG2(Op, DAG);
1438 case ISD::FLOG:
1439 case ISD::FLOG10:
1440 return LowerFLOGCommon(Op, DAG);
1441 case ISD::FEXP:
1442 case ISD::FEXP10:
1443 return lowerFEXP(Op, DAG);
1444 case ISD::FEXP2:
1445 return lowerFEXP2(Op, DAG);
1446 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1447 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1448 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1449 case ISD::FP_TO_SINT:
1450 case ISD::FP_TO_UINT:
1451 return LowerFP_TO_INT(Op, DAG);
1452 case ISD::CTTZ:
1454 case ISD::CTLZ:
1456 return LowerCTLZ_CTTZ(Op, DAG);
1458 }
1459 return Op;
1460}
1461
1464 SelectionDAG &DAG) const {
1465 switch (N->getOpcode()) {
1467 // Different parts of legalization seem to interpret which type of
1468 // sign_extend_inreg is the one to check for custom lowering. The extended
1469 // from type is what really matters, but some places check for custom
1470 // lowering of the result type. This results in trying to use
1471 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1472 // nothing here and let the illegal result integer be handled normally.
1473 return;
1474 case ISD::FLOG2:
1475 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1476 Results.push_back(Lowered);
1477 return;
1478 case ISD::FLOG:
1479 case ISD::FLOG10:
1480 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1481 Results.push_back(Lowered);
1482 return;
1483 case ISD::FEXP2:
1484 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1485 Results.push_back(Lowered);
1486 return;
1487 case ISD::FEXP:
1488 case ISD::FEXP10:
1489 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1490 Results.push_back(Lowered);
1491 return;
1492 case ISD::CTLZ:
1494 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1495 Results.push_back(Lowered);
1496 return;
1497 default:
1498 return;
1499 }
1500}
1501
1503 SDValue Op,
1504 SelectionDAG &DAG) const {
1505
1506 const DataLayout &DL = DAG.getDataLayout();
1507 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1508 const GlobalValue *GV = G->getGlobal();
1509
1510 if (!MFI->isModuleEntryFunction()) {
1511 if (std::optional<uint32_t> Address =
1513 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1514 }
1515 }
1516
1517 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1518 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1519 if (!MFI->isModuleEntryFunction() &&
1520 GV->getName() != "llvm.amdgcn.module.lds" &&
1521 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
1522 SDLoc DL(Op);
1523 const Function &Fn = DAG.getMachineFunction().getFunction();
1524 DiagnosticInfoUnsupported BadLDSDecl(
1525 Fn, "local memory global used by non-kernel function",
1526 DL.getDebugLoc(), DS_Warning);
1527 DAG.getContext()->diagnose(BadLDSDecl);
1528
1529 // We currently don't have a way to correctly allocate LDS objects that
1530 // aren't directly associated with a kernel. We do force inlining of
1531 // functions that use local objects. However, if these dead functions are
1532 // not eliminated, we don't want a compile time error. Just emit a warning
1533 // and a trap, since there should be no callable path here.
1534 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1535 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1536 Trap, DAG.getRoot());
1537 DAG.setRoot(OutputChain);
1538 return DAG.getUNDEF(Op.getValueType());
1539 }
1540
1541 // XXX: What does the value of G->getOffset() mean?
1542 assert(G->getOffset() == 0 &&
1543 "Do not know what to do with an non-zero offset");
1544
1545 // TODO: We could emit code to handle the initialization somewhere.
1546 // We ignore the initializer for now and legalize it to allow selection.
1547 // The initializer will be diagnosed as an error during assembly emission anyway.
1548 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1549 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1550 }
1551 return SDValue();
1552}
1553
1555 SelectionDAG &DAG) const {
1557 SDLoc SL(Op);
1558
1559 EVT VT = Op.getValueType();
1560 if (VT.getVectorElementType().getSizeInBits() < 32) {
1561 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1562 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1563 unsigned NewNumElt = OpBitSize / 32;
1564 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1566 MVT::i32, NewNumElt);
1567 for (const SDUse &U : Op->ops()) {
1568 SDValue In = U.get();
1569 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1570 if (NewNumElt > 1)
1571 DAG.ExtractVectorElements(NewIn, Args);
1572 else
1573 Args.push_back(NewIn);
1574 }
1575
1576 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1577 NewNumElt * Op.getNumOperands());
1578 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1579 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1580 }
1581 }
1582
1583 for (const SDUse &U : Op->ops())
1584 DAG.ExtractVectorElements(U.get(), Args);
1585
1586 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1587}
1588
1590 SelectionDAG &DAG) const {
1591 SDLoc SL(Op);
1593 unsigned Start = Op.getConstantOperandVal(1);
1594 EVT VT = Op.getValueType();
1595 EVT SrcVT = Op.getOperand(0).getValueType();
1596
1597 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1598 unsigned NumElt = VT.getVectorNumElements();
1599 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1600 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1601
1602 // Extract 32-bit registers at a time.
1603 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1604 EVT NewVT = NumElt == 2
1605 ? MVT::i32
1606 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1607 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1608
1609 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1610 if (NumElt == 2)
1611 Tmp = Args[0];
1612 else
1613 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1614
1615 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1616 }
1617
1618 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1620
1621 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1622}
1623
1624// TODO: Handle fabs too
1625 static SDValue peekFNeg(SDValue Val) {
1626 if (Val.getOpcode() == ISD::FNEG)
1627 return Val.getOperand(0);
1628
1629 return Val;
1630}
1631
1633 if (Val.getOpcode() == ISD::FNEG)
1634 Val = Val.getOperand(0);
1635 if (Val.getOpcode() == ISD::FABS)
1636 Val = Val.getOperand(0);
1637 if (Val.getOpcode() == ISD::FCOPYSIGN)
1638 Val = Val.getOperand(0);
1639 return Val;
1640}
1641
1642 SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1643 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1644 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1645 SelectionDAG &DAG = DCI.DAG;
1646 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1647 switch (CCOpcode) {
1648 case ISD::SETOEQ:
1649 case ISD::SETONE:
1650 case ISD::SETUNE:
1651 case ISD::SETNE:
1652 case ISD::SETUEQ:
1653 case ISD::SETEQ:
1654 case ISD::SETFALSE:
1655 case ISD::SETFALSE2:
1656 case ISD::SETTRUE:
1657 case ISD::SETTRUE2:
1658 case ISD::SETUO:
1659 case ISD::SETO:
1660 break;
1661 case ISD::SETULE:
1662 case ISD::SETULT: {
1663 if (LHS == True)
1664 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1665 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1666 }
1667 case ISD::SETOLE:
1668 case ISD::SETOLT:
1669 case ISD::SETLE:
1670 case ISD::SETLT: {
1671 // Ordered. Assume ordered for undefined.
1672
1673 // Only do this after legalization to avoid interfering with other combines
1674 // which might occur.
1676 !DCI.isCalledByLegalizer())
1677 return SDValue();
1678
1679 // We need to permute the operands to get the correct NaN behavior. The
1680 // selected operand is the second one based on the failing compare with NaN,
1681 // so permute it based on the compare type the hardware uses.
1682 if (LHS == True)
1683 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1684 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1685 }
1686 case ISD::SETUGE:
1687 case ISD::SETUGT: {
1688 if (LHS == True)
1689 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1690 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1691 }
1692 case ISD::SETGT:
1693 case ISD::SETGE:
1694 case ISD::SETOGE:
1695 case ISD::SETOGT: {
1697 !DCI.isCalledByLegalizer())
1698 return SDValue();
1699
1700 if (LHS == True)
1701 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1702 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1703 }
1704 case ISD::SETCC_INVALID:
1705 llvm_unreachable("Invalid setcc condcode!");
1706 }
1707 return SDValue();
1708}
1709
1710/// Generate Min/Max node
1711 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1712 SDValue LHS, SDValue RHS,
1713 SDValue True, SDValue False,
1714 SDValue CC,
1715 DAGCombinerInfo &DCI) const {
1716 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1717 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1718
1719 SelectionDAG &DAG = DCI.DAG;
1720
1721 // If we can't directly match this, try to see if we can fold an fneg to
1722 // match.
1723
1724 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1725 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1726 SDValue NegTrue = peekFNeg(True);
1727
1728 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1729 // fmin/fmax.
1730 //
1731 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1732 // -> fneg (fmin_legacy lhs, K)
1733 //
1734 // TODO: Use getNegatedExpression
1735 if (LHS == NegTrue && CFalse && CRHS) {
1736 APFloat NegRHS = neg(CRHS->getValueAPF());
1737 if (NegRHS == CFalse->getValueAPF()) {
1738 SDValue Combined =
1739 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1740 if (Combined)
1741 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1742 return SDValue();
1743 }
1744 }
1745
1746 return SDValue();
1747}
1748
1749std::pair<SDValue, SDValue>
1751 SDLoc SL(Op);
1752
1753 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1754
1755 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1756 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1757
1758 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1759 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1760
1761 return std::pair(Lo, Hi);
1762}
1763
1765 SDLoc SL(Op);
1766
1767 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1768 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1769 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1770}
1771
1773 SDLoc SL(Op);
1774
1775 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1776 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1777 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1778}
1779
1780// Split a vector type into two parts. The first part is a power of two vector.
1781// The second part is whatever is left over, and is a scalar if it would
1782// otherwise be a 1-vector.
1783std::pair<EVT, EVT>
1785 EVT LoVT, HiVT;
1786 EVT EltVT = VT.getVectorElementType();
1787 unsigned NumElts = VT.getVectorNumElements();
1788 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1789 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1790 HiVT = NumElts - LoNumElts == 1
1791 ? EltVT
1792 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1793 return std::pair(LoVT, HiVT);
1794}
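// Worked examples for getSplitDestVTs above (editorial illustration, not part
// of the upstream source):
//   v3i32 -> (v2i32, i32)   // PowerOf2Ceil((3 + 1) / 2) = 2; the single
//                           // leftover element becomes a scalar
//   v7f32 -> (v4f32, v3f32) // PowerOf2Ceil(4) = 4, remainder 3
//   v8i16 -> (v4i16, v4i16) // even power-of-two split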
1795
1796// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1797// scalar.
1798std::pair<SDValue, SDValue>
1800 const EVT &LoVT, const EVT &HiVT,
1801 SelectionDAG &DAG) const {
1803 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1804 N.getValueType().getVectorNumElements() &&
1805 "More vector elements requested than available!");
1807 DAG.getVectorIdxConstant(0, DL));
1808 SDValue Hi = DAG.getNode(
1810 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1811 return std::pair(Lo, Hi);
1812}
1813
1815 SelectionDAG &DAG) const {
1816 LoadSDNode *Load = cast<LoadSDNode>(Op);
1817 EVT VT = Op.getValueType();
1818 SDLoc SL(Op);
1819
1820
1821 // If this is a 2 element vector, we really want to scalarize and not create
1822 // weird 1 element vectors.
1823 if (VT.getVectorNumElements() == 2) {
1824 SDValue Ops[2];
1825 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1826 return DAG.getMergeValues(Ops, SL);
1827 }
1828
1829 SDValue BasePtr = Load->getBasePtr();
1830 EVT MemVT = Load->getMemoryVT();
1831
1832 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1833
1834 EVT LoVT, HiVT;
1835 EVT LoMemVT, HiMemVT;
1836 SDValue Lo, Hi;
1837
1838 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1839 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1840 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1841
1842 unsigned Size = LoMemVT.getStoreSize();
1843 Align BaseAlign = Load->getAlign();
1844 Align HiAlign = commonAlignment(BaseAlign, Size);
1845
1846 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1847 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1848 BaseAlign, Load->getMemOperand()->getFlags());
1849 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1850 SDValue HiLoad =
1851 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1852 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1853 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1854
1855 SDValue Join;
1856 if (LoVT == HiVT) {
1857 // This is the case that the vector is power of two so was evenly split.
1858 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1859 } else {
1860 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1861 DAG.getVectorIdxConstant(0, SL));
1862 Join = DAG.getNode(
1864 VT, Join, HiLoad,
1866 }
1867
1868 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1869 LoLoad.getValue(1), HiLoad.getValue(1))};
1870
1871 return DAG.getMergeValues(Ops, SL);
1872}
1873
1875 SelectionDAG &DAG) const {
1876 LoadSDNode *Load = cast<LoadSDNode>(Op);
1877 EVT VT = Op.getValueType();
1878 SDValue BasePtr = Load->getBasePtr();
1879 EVT MemVT = Load->getMemoryVT();
1880 SDLoc SL(Op);
1881 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1882 Align BaseAlign = Load->getAlign();
1883 unsigned NumElements = MemVT.getVectorNumElements();
1884
1885 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1886 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
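 // For example, a <3 x i32> load with 16-byte (or 8-byte) alignment, or one
 // known to be dereferenceable for 16 bytes, is widened to a <4 x i32> load
 // and trimmed with EXTRACT_SUBVECTOR below, while a 4-byte-aligned <3 x i32>
 // load falls back to SplitVectorLoad.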
1887 if (NumElements != 3 ||
1888 (BaseAlign < Align(8) &&
1889 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1890 return SplitVectorLoad(Op, DAG);
1891
1892 assert(NumElements == 3);
1893
1894 EVT WideVT =
1895 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1896 EVT WideMemVT =
1897 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1898 SDValue WideLoad = DAG.getExtLoad(
1899 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1900 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1901 return DAG.getMergeValues(
1902 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1903 DAG.getVectorIdxConstant(0, SL)),
1904 WideLoad.getValue(1)},
1905 SL);
1906}
1907
1908SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1909 SelectionDAG &DAG) const {
1910 StoreSDNode *Store = cast<StoreSDNode>(Op);
1911 SDValue Val = Store->getValue();
1912 EVT VT = Val.getValueType();
1913
1914 // If this is a 2 element vector, we really want to scalarize and not create
1915 // weird 1 element vectors.
1916 if (VT.getVectorNumElements() == 2)
1917 return scalarizeVectorStore(Store, DAG);
1918
1919 EVT MemVT = Store->getMemoryVT();
1920 SDValue Chain = Store->getChain();
1921 SDValue BasePtr = Store->getBasePtr();
1922 SDLoc SL(Op);
1923
1924 EVT LoVT, HiVT;
1925 EVT LoMemVT, HiMemVT;
1926 SDValue Lo, Hi;
1927
1928 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1929 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1930 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1931
1932 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1933
1934 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1935 Align BaseAlign = Store->getAlign();
1936 unsigned Size = LoMemVT.getStoreSize();
1937 Align HiAlign = commonAlignment(BaseAlign, Size);
1938
1939 SDValue LoStore =
1940 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1941 Store->getMemOperand()->getFlags());
1942 SDValue HiStore =
1943 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1944 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1945
1946 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1947}
1948
1949// This is a shortcut for integer division because we have fast i32<->f32
1950// conversions, and fast f32 reciprocal instructions. The fractional part of a
1951// float is enough to accurately represent up to a 24-bit signed integer.
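A rough scalar model of the trick built by the DAG below, for illustration only: the helper name is hypothetical, an exact 1.0f/fb stands in for the hardware reciprocal, and both operands are assumed to already fit in 24 bits.

#include <cmath>
#include <utility>

static std::pair<unsigned, unsigned> udivrem24(unsigned lhs, unsigned rhs) {
  float fa = (float)lhs;                    // exact: the values fit in 24 bits
  float fb = (float)rhs;
  float fq = truncf(fa * (1.0f / fb));      // quotient estimate
  unsigned iq = (unsigned)fq;
  float fr = fabsf(-fq * fb + fa);          // residual of the estimate
  unsigned div = iq + (fr >= fb ? 1u : 0u); // bump an estimate that came up short
  return {div, lhs - div * rhs};            // remainder is recomputed
}

For example, udivrem24(100, 7) gives fq = trunc(14.28...) = 14, fr = 2 < 7, so no bump is needed and the result is (14, 2).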
1952SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1953 bool Sign) const {
1954 SDLoc DL(Op);
1955 EVT VT = Op.getValueType();
1956 SDValue LHS = Op.getOperand(0);
1957 SDValue RHS = Op.getOperand(1);
1958 MVT IntVT = MVT::i32;
1959 MVT FltVT = MVT::f32;
1960
1961 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1962 if (LHSSignBits < 9)
1963 return SDValue();
1964
1965 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1966 if (RHSSignBits < 9)
1967 return SDValue();
1968
1969 unsigned BitSize = VT.getSizeInBits();
1970 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1971 unsigned DivBits = BitSize - SignBits;
1972 if (Sign)
1973 ++DivBits;
1974
1975 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1976 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1977
1978 SDValue jq = DAG.getConstant(1, DL, IntVT);
1979
1980 if (Sign) {
1981 // char|short jq = ia ^ ib;
1982 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1983
1984 // jq = jq >> (bitsize - 2)
1985 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1986 DAG.getConstant(BitSize - 2, DL, VT));
1987
1988 // jq = jq | 0x1
1989 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1990 }
1991
1992 // int ia = (int)LHS;
1993 SDValue ia = LHS;
1994
1995 // int ib = (int)RHS;
1996 SDValue ib = RHS;
1997
1998 // float fa = (float)ia;
1999 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2000
2001 // float fb = (float)ib;
2002 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2003
2004 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2005 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2006
2007 // fq = trunc(fq);
2008 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2009
2010 // float fqneg = -fq;
2011 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2012
2013 MachineFunction &MF = DAG.getMachineFunction();
2014
2015 bool UseFmadFtz = false;
2016 if (Subtarget->isGCN()) {
2017 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2018 UseFmadFtz =
2019 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2020 }
2021
2022 // float fr = mad(fqneg, fb, fa);
2023 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2024 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2025 : (unsigned)ISD::FMAD;
2026 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2027
2028 // int iq = (int)fq;
2029 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2030
2031 // fr = fabs(fr);
2032 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2033
2034 // fb = fabs(fb);
2035 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2036
2037 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2038
2039 // int cv = fr >= fb;
2040 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2041
2042 // jq = (cv ? jq : 0);
2043 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2044
2045 // dst = iq + jq;
2046 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2047
2048 // Rem needs compensation, it's easier to recompute it
2049 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2050 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2051
2052 // Truncate to number of bits this divide really is.
2053 if (Sign) {
2054 SDValue InRegSize
2055 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2056 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2057 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2058 } else {
2059 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2060 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2061 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2062 }
2063
2064 return DAG.getMergeValues({ Div, Rem }, DL);
2065}
2066
2067void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2068 SelectionDAG &DAG,
2069 SmallVectorImpl<SDValue> &Results) const {
2070 SDLoc DL(Op);
2071 EVT VT = Op.getValueType();
2072
2073 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2074
2075 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2076
2077 SDValue One = DAG.getConstant(1, DL, HalfVT);
2078 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2079
2080 //HiLo split
2081 SDValue LHS_Lo, LHS_Hi;
2082 SDValue LHS = Op.getOperand(0);
2083 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2084
2085 SDValue RHS_Lo, RHS_Hi;
2086 SDValue RHS = Op.getOperand(1);
2087 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2088
2089 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2090 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2091
2092 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2093 LHS_Lo, RHS_Lo);
2094
2095 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2096 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2097
2098 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2099 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2100 return;
2101 }
2102
2103 if (isTypeLegal(MVT::i64)) {
2104 // The algorithm here is based on ideas from "Software Integer Division",
2105 // Tom Rodeheffer, August 2008.
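 // The two Newton-Raphson rounds below refine Rcp64 toward an approximation
 // of 2^64 / RHS, so Mulhi3 = mulhu(LHS, Rcp64) is a quotient estimate that
 // is never too high; the two compare-and-select correction steps that follow
 // only ever raise it by one or two to reach the exact quotient.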
2106
2107 MachineFunction &MF = DAG.getMachineFunction();
2108 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2109
2110 // Compute denominator reciprocal.
2111 unsigned FMAD =
2112 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2113 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2114 ? (unsigned)ISD::FMAD
2115 : (unsigned)AMDGPUISD::FMAD_FTZ;
2116
2117 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2118 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2119 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2120 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2121 Cvt_Lo);
2122 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2123 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2124 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2125 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2126 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2127 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2128 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2129 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2130 Mul1);
2131 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2132 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2133 SDValue Rcp64 = DAG.getBitcast(VT,
2134 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2135
2136 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2137 SDValue One64 = DAG.getConstant(1, DL, VT);
2138 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2139 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2140
2141 // First round of UNR (Unsigned integer Newton-Raphson).
2142 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2143 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2144 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2145 SDValue Mulhi1_Lo, Mulhi1_Hi;
2146 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2147 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2148 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2149 Mulhi1_Lo, Zero1);
2150 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2151 Mulhi1_Hi, Add1_Lo.getValue(1));
2152 SDValue Add1 = DAG.getBitcast(VT,
2153 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2154
2155 // Second round of UNR.
2156 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2157 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2158 SDValue Mulhi2_Lo, Mulhi2_Hi;
2159 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2160 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2161 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2162 Mulhi2_Lo, Zero1);
2163 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2164 Mulhi2_Hi, Add2_Lo.getValue(1));
2165 SDValue Add2 = DAG.getBitcast(VT,
2166 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2167
2168 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2169
2170 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2171
2172 SDValue Mul3_Lo, Mul3_Hi;
2173 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2174 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2175 Mul3_Lo, Zero1);
2176 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2177 Mul3_Hi, Sub1_Lo.getValue(1));
2178 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2179 SDValue Sub1 = DAG.getBitcast(VT,
2180 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2181
2182 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2183 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2184 ISD::SETUGE);
2185 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2186 ISD::SETUGE);
2187 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2188
2189 // TODO: Here and below portions of the code can be enclosed into if/endif.
2190 // Currently control flow is unconditional and we have 4 selects after
2191 // potential endif to substitute PHIs.
2192
2193 // if C3 != 0 ...
2194 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2195 RHS_Lo, Zero1);
2196 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2197 RHS_Hi, Sub1_Lo.getValue(1));
2198 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2199 Zero, Sub2_Lo.getValue(1));
2200 SDValue Sub2 = DAG.getBitcast(VT,
2201 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2202
2203 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2204
2205 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2206 ISD::SETUGE);
2207 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2208 ISD::SETUGE);
2209 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2210
2211 // if (C6 != 0)
2212 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2213
2214 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2215 RHS_Lo, Zero1);
2216 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2217 RHS_Hi, Sub2_Lo.getValue(1));
2218 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2219 Zero, Sub3_Lo.getValue(1));
2220 SDValue Sub3 = DAG.getBitcast(VT,
2221 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2222
2223 // endif C6
2224 // endif C3
2225
2226 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2227 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2228
2229 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2230 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2231
2232 Results.push_back(Div);
2233 Results.push_back(Rem);
2234
2235 return;
2236 }
2237
2238 // r600 expansion.
2239 // Get Speculative values
2240 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2241 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2242
2243 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2244 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2245 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2246
2247 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2248 SDValue DIV_Lo = Zero;
2249
2250 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2251
2252 for (unsigned i = 0; i < halfBitWidth; ++i) {
2253 const unsigned bitPos = halfBitWidth - i - 1;
2254 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2255 // Get value of high bit
2256 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2257 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2258 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2259
2260 // Shift
2261 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2262 // Add LHS high bit
2263 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2264
2265 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2266 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2267
2268 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2269
2270 // Update REM
2271 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2272 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2273 }
2274
2275 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2276 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2277 Results.push_back(DIV);
2278 Results.push_back(REM);
2279}
2280
2281SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2282 SelectionDAG &DAG) const {
2283 SDLoc DL(Op);
2284 EVT VT = Op.getValueType();
2285
2286 if (VT == MVT::i64) {
2287 SmallVector<SDValue, 2> Results;
2288 LowerUDIVREM64(Op, DAG, Results);
2289 return DAG.getMergeValues(Results, DL);
2290 }
2291
2292 if (VT == MVT::i32) {
2293 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2294 return Res;
2295 }
2296
2297 SDValue X = Op.getOperand(0);
2298 SDValue Y = Op.getOperand(1);
2299
2300 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2301 // algorithm used here.
2302
2303 // Initial estimate of inv(y).
2304 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2305
2306 // One round of UNR.
2307 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2308 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2309 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2310 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2311
2312 // Quotient/remainder estimate.
2313 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2314 SDValue R =
2315 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2316
2317 // First quotient/remainder refinement.
2318 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2319 SDValue One = DAG.getConstant(1, DL, VT);
2320 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2321 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2322 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2323 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2324 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2325
2326 // Second quotient/remainder refinement.
2327 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2328 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2329 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2330 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2331 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2332
2333 return DAG.getMergeValues({Q, R}, DL);
2334}
2335
2336SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2337 SelectionDAG &DAG) const {
2338 SDLoc DL(Op);
2339 EVT VT = Op.getValueType();
2340
2341 SDValue LHS = Op.getOperand(0);
2342 SDValue RHS = Op.getOperand(1);
2343
2344 SDValue Zero = DAG.getConstant(0, DL, VT);
2345 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2346
2347 if (VT == MVT::i32) {
2348 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2349 return Res;
2350 }
2351
2352 if (VT == MVT::i64 &&
2353 DAG.ComputeNumSignBits(LHS) > 32 &&
2354 DAG.ComputeNumSignBits(RHS) > 32) {
2355 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2356
2357 //HiLo split
2358 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2359 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2360 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2361 LHS_Lo, RHS_Lo);
2362 SDValue Res[2] = {
2363 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2364 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2365 };
2366 return DAG.getMergeValues(Res, DL);
2367 }
2368
2369 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2370 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2371 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2372 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2373
2374 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2375 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2376
2377 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2378 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2379
2380 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2381 SDValue Rem = Div.getValue(1);
2382
2383 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2384 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2385
2386 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2387 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2388
2389 SDValue Res[2] = {
2390 Div,
2391 Rem
2392 };
2393 return DAG.getMergeValues(Res, DL);
2394}
2395
2396// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2397SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2398 SDLoc SL(Op);
2399 EVT VT = Op.getValueType();
2400 auto Flags = Op->getFlags();
2401 SDValue X = Op.getOperand(0);
2402 SDValue Y = Op.getOperand(1);
2403
2404 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2405 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2406 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2407 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2408 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2409}
2410
2411SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2412 SDLoc SL(Op);
2413 SDValue Src = Op.getOperand(0);
2414
2415 // result = trunc(src)
2416 // if (src > 0.0 && src != result)
2417 // result += 1.0
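 // For example, src = 2.5 gives trunc = 2.0; since 2.5 > 0.0 and 2.5 != 2.0,
 // 1.0 is added and the result is 3.0. For src = -2.5, trunc = -2.0 and the
 // sign test fails, so -2.0 is already the ceiling.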
2418
2419 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2420
2421 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2422 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2423
2424 EVT SetCCVT =
2425 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2426
2427 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2428 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2429 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2430
2431 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2432 // TODO: Should this propagate fast-math-flags?
2433 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2434}
2435
2436static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2437 SelectionDAG &DAG) {
2438 const unsigned FractBits = 52;
2439 const unsigned ExpBits = 11;
2440
2441 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2442 Hi,
2443 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2444 DAG.getConstant(ExpBits, SL, MVT::i32));
2445 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2446 DAG.getConstant(1023, SL, MVT::i32));
2447
2448 return Exp;
2449}
2450
2451SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2452 SDLoc SL(Op);
2453 SDValue Src = Op.getOperand(0);
2454
2455 assert(Op.getValueType() == MVT::f64);
2456
2457 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2458
2459 // Extract the upper half, since this is where we will find the sign and
2460 // exponent.
2461 SDValue Hi = getHiHalf64(Src, DAG);
2462
2463 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2464
2465 const unsigned FractBits = 52;
2466
2467 // Extract the sign bit.
2468 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2469 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2470
2471 // Extend back to 64-bits.
2472 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2473 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2474
2475 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2476 const SDValue FractMask
2477 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2478
2479 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2480 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2481 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2482
2483 EVT SetCCVT =
2484 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2485
2486 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2487
2488 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2489 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2490
2491 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2492 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2493
2494 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2495}
2496
2497SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2498 SelectionDAG &DAG) const {
2499 SDLoc SL(Op);
2500 SDValue Src = Op.getOperand(0);
2501
2502 assert(Op.getValueType() == MVT::f64);
2503
2504 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2505 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2506 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2507
2508 // TODO: Should this propagate fast-math-flags?
2509
2510 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2511 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2512
2513 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2514
2515 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2516 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2517
2518 EVT SetCCVT =
2519 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2520 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2521
2522 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2523}
2524
2525SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2526 SelectionDAG &DAG) const {
2527 // FNEARBYINT and FRINT are the same, except in their handling of FP
2528 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2529 // rint, so just treat them as equivalent.
2530 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2531 Op.getOperand(0));
2532}
2533
2534SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2535 auto VT = Op.getValueType();
2536 auto Arg = Op.getOperand(0u);
2537 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2538}
2539
2540// XXX - May require not supporting f32 denormals?
2541
2542// Don't handle v2f16. The extra instructions to scalarize and repack around the
2543// compare and vselect end up producing worse code than scalarizing the whole
2544// operation.
2545SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2546 SDLoc SL(Op);
2547 SDValue X = Op.getOperand(0);
2548 EVT VT = Op.getValueType();
2549
2550 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2551
2552 // TODO: Should this propagate fast-math-flags?
2553
2554 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2555
2556 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2557
2558 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2559 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2560
2561 EVT SetCCVT =
2562 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2563
2564 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2565 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2566 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2567
2568 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2569 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2570}
2571
2572SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2573 SDLoc SL(Op);
2574 SDValue Src = Op.getOperand(0);
2575
2576 // result = trunc(src);
2577 // if (src < 0.0 && src != result)
2578 // result += -1.0.
2579
2580 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2581
2582 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2583 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2584
2585 EVT SetCCVT =
2586 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2587
2588 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2589 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2590 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2591
2592 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2593 // TODO: Should this propagate fast-math-flags?
2594 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2595}
2596
2597/// Return true if it's known that \p Src can never be an f32 denormal value.
2598static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2599 switch (Src.getOpcode()) {
2600 case ISD::FP_EXTEND:
2601 return Src.getOperand(0).getValueType() == MVT::f16;
2602 case ISD::FP16_TO_FP:
2603 case ISD::FFREXP:
2604 return true;
2605 case ISD::INTRINSIC_WO_CHAIN: {
2606 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2607 switch (IntrinsicID) {
2608 case Intrinsic::amdgcn_frexp_mant:
2609 return true;
2610 default:
2611 return false;
2612 }
2613 }
2614 default:
2615 return false;
2616 }
2617
2618 llvm_unreachable("covered opcode switch");
2619}
2620
2621static bool allowApproxFunc(const SelectionDAG &DAG,
2622 SDNodeFlags Flags) {
2623 if (Flags.hasApproximateFuncs())
2624 return true;
2625 auto &Options = DAG.getTarget().Options;
2626 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2627}
2628
2629static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2630 SDValue Src,
2631 SDNodeFlags Flags) {
2632 return !valueIsKnownNeverF32Denorm(Src) &&
2633 DAG.getMachineFunction()
2634 .getDenormalMode(APFloat::IEEEsingle())
2635 .Output != DenormalMode::IEEE;
2636}
2637
2638SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2639 SDValue Src,
2640 SDNodeFlags Flags) const {
2641 SDLoc SL(Src);
2642 EVT VT = Src.getValueType();
2643 const fltSemantics &Semantics = VT.getFltSemantics();
2644 SDValue SmallestNormal =
2645 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2646
2647 // Want to scale denormals up, but negatives and 0 work just as well on the
2648 // scaled path.
2649 SDValue IsLtSmallestNormal = DAG.getSetCC(
2650 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2651 SmallestNormal, ISD::SETOLT);
2652
2653 return IsLtSmallestNormal;
2654}
2655
2656SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2657 SDNodeFlags Flags) const {
2658 SDLoc SL(Src);
2659 EVT VT = Src.getValueType();
2660 const fltSemantics &Semantics = VT.getFltSemantics();
2661 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2662
2663 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2664 SDValue IsFinite = DAG.getSetCC(
2665 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2666 Inf, ISD::SETOLT);
2667 return IsFinite;
2668}
2669
2670/// If denormal handling is required return the scaled input to FLOG2, and the
2671/// check for denormal range. Otherwise, return null values.
2672std::pair<SDValue, SDValue>
2673AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2674 SDValue Src, SDNodeFlags Flags) const {
2675 if (!needsDenormHandlingF32(DAG, Src, Flags))
2676 return {};
2677
2678 MVT VT = MVT::f32;
2679 const fltSemantics &Semantics = APFloat::IEEEsingle();
2680 SDValue SmallestNormal =
2681 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2682
2683 SDValue IsLtSmallestNormal = DAG.getSetCC(
2684 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2685 SmallestNormal, ISD::SETOLT);
2686
2687 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2688 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2689 SDValue ScaleFactor =
2690 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2691
2692 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2693 return {ScaledInput, IsLtSmallestNormal};
2694}
2695
2696SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2697 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2698 // If we have to handle denormals, scale up the input and adjust the result.
2699
2700 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2701 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
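 // For example, x = 0x1.0p-140 is an f32 denormal: the scaled path evaluates
 // amdgpu_log2(0x1.0p-140 * 0x1.0p+32) = log2(0x1.0p-108) = -108.0 and then
 // subtracts 32.0, recovering the exact answer -140.0.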
2702
2703 SDLoc SL(Op);
2704 EVT VT = Op.getValueType();
2705 SDValue Src = Op.getOperand(0);
2706 SDNodeFlags Flags = Op->getFlags();
2707
2708 if (VT == MVT::f16) {
2709 // Nothing in half is a denormal when promoted to f32.
2710 assert(!Subtarget->has16BitInsts());
2711 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2712 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2713 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2714 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2715 }
2716
2717 auto [ScaledInput, IsLtSmallestNormal] =
2718 getScaledLogInput(DAG, SL, Src, Flags);
2719 if (!ScaledInput)
2720 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2721
2722 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2723
2724 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2725 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2726 SDValue ResultOffset =
2727 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2728 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2729}
2730
2731static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2732 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2733 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2734 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2735}
2736
2737SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2738 SelectionDAG &DAG) const {
2739 SDValue X = Op.getOperand(0);
2740 EVT VT = Op.getValueType();
2741 SDNodeFlags Flags = Op->getFlags();
2742 SDLoc DL(Op);
2743
2744 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2745 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2746
2747 const auto &Options = getTargetMachine().Options;
2748 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2749 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2750
2751 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2752 // Log and multiply in f32 is good enough for f16.
2753 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2754 }
2755
2756 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2757 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2758 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2759 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2760 }
2761
2762 return Lowered;
2763 }
2764
2765 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2766 if (ScaledInput)
2767 X = ScaledInput;
2768
2769 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2770
2771 SDValue R;
2772 if (Subtarget->hasFastFMAF32()) {
2773 // c+cc are ln(2)/ln(10) to more than 49 bits
2774 const float c_log10 = 0x1.344134p-2f;
2775 const float cc_log10 = 0x1.09f79ep-26f;
2776
2777 // c + cc is ln(2) to more than 49 bits
2778 const float c_log = 0x1.62e42ep-1f;
2779 const float cc_log = 0x1.efa39ep-25f;
2780
2781 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2782 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2783
2784 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2785 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2786 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2787 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2788 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2789 } else {
2790 // ch+ct is ln(2)/ln(10) to more than 36 bits
2791 const float ch_log10 = 0x1.344000p-2f;
2792 const float ct_log10 = 0x1.3509f6p-18f;
2793
2794 // ch + ct is ln(2) to more than 36 bits
2795 const float ch_log = 0x1.62e000p-1f;
2796 const float ct_log = 0x1.0bfbe8p-15f;
2797
2798 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2799 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2800
2801 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2802 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2803 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2804 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2805 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2806
2807 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2808 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2809 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2810 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2811 }
2812
2813 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2814 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2815
2816 // TODO: Check if known finite from source value.
2817 if (!IsFiniteOnly) {
2818 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2819 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2820 }
2821
2822 if (IsScaled) {
2823 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2824 SDValue ShiftK =
2825 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2826 SDValue Shift =
2827 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2828 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2829 }
2830
2831 return R;
2832}
2833
2834SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2835 return LowerFLOGCommon(Op, DAG);
2836}
2837
2838// Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2839// promoted f16 operation.
2840SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2841 SelectionDAG &DAG, bool IsLog10,
2842 SDNodeFlags Flags) const {
2843 EVT VT = Src.getValueType();
2844 unsigned LogOp =
2845 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2846
2847 double Log2BaseInverted =
2848 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2849
2850 if (VT == MVT::f32) {
2851 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2852 if (ScaledInput) {
2853 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2854 SDValue ScaledResultOffset =
2855 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2856
2857 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2858
2859 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2860 ScaledResultOffset, Zero, Flags);
2861
2862 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2863
2864 if (Subtarget->hasFastFMAF32())
2865 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2866 Flags);
2867 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2868 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2869 }
2870 }
2871
2872 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2873 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2874
2875 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2876 Flags);
2877}
2878
2879SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2880 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2881 // If we have to handle denormals, scale up the input and adjust the result.
2882
2883 SDLoc SL(Op);
2884 EVT VT = Op.getValueType();
2885 SDValue Src = Op.getOperand(0);
2886 SDNodeFlags Flags = Op->getFlags();
2887
2888 if (VT == MVT::f16) {
2889 // Nothing in half is a denormal when promoted to f32.
2890 assert(!Subtarget->has16BitInsts());
2891 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2892 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2893 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2894 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2895 }
2896
2897 assert(VT == MVT::f32);
2898
2899 if (!needsDenormHandlingF32(DAG, Src, Flags))
2900 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2901
2902 // bool needs_scaling = x < -0x1.f80000p+6f;
2903 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
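 // For example, x = -140.0 is below the -0x1.f80000p+6 (= -126.0) threshold:
 // the scaled path computes v_exp_f32(-140 + 64) * 0x1.0p-64 = 2^-76 * 2^-64
 // = 2^-140, so the intermediate exp2 result stays in the normal range and
 // only the final multiply produces the denormal result.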
2904
2905 // -nextafter(128.0, -1)
2906 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2907
2908 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2909
2910 SDValue NeedsScaling =
2911 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2912
2913 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2914 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2915
2916 SDValue AddOffset =
2917 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2918
2919 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2920 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2921
2922 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2923 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2924 SDValue ResultScale =
2925 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2926
2927 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2928}
2929
2930SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, SDLoc SL,
2931 SelectionDAG &DAG,
2932 SDNodeFlags Flags) const {
2933 EVT VT = X.getValueType();
2934 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2935
2936 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2937 // exp2(M_LOG2E_F * f);
2938 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2939 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2940 : (unsigned)ISD::FEXP2,
2941 SL, VT, Mul, Flags);
2942 }
2943
2944 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2945
2946 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2947 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2948
2949 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2950
2951 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2952
2953 SDValue AdjustedX =
2954 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2955
2956 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2957
2958 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2959
2960 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2961 SDValue AdjustedResult =
2962 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2963
2964 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2965 Flags);
2966}
2967
2968/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2969/// handled correctly.
2970SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, SDLoc SL,
2971 SelectionDAG &DAG,
2972 SDNodeFlags Flags) const {
2973 const EVT VT = X.getValueType();
2974 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2975
2976 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2977 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2978 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2979 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2980
2981 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2982 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2983 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2984 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2985 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2986 }
2987
2988 // bool s = x < -0x1.2f7030p+5f;
2989 // x += s ? 0x1.0p+5f : 0.0f;
2990 // exp10 = exp2(x * 0x1.a92000p+1f) *
2991 // exp2(x * 0x1.4f0978p-11f) *
2992 // (s ? 0x1.9f623ep-107f : 1.0f);
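 // The two constants are a split of log2(10): 0x1.a92000p+1f + 0x1.4f0978p-11f
 // is approximately 3.3219280949 = log2(10), so the product of the two exp2
 // calls evaluates 2^(x*log2(10)) = 10^x while keeping low-order bits of the
 // constant that a single f32 multiply would drop.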
2993
2994 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2995
2996 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2997 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2998
2999 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3000 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3001 SDValue AdjustedX =
3002 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3003
3004 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3005 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3006
3007 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3008 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3009 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3010 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3011
3012 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3013
3014 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3015 SDValue AdjustedResult =
3016 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3017
3018 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3019 Flags);
3020}
3021
3022SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3023 EVT VT = Op.getValueType();
3024 SDLoc SL(Op);
3025 SDValue X = Op.getOperand(0);
3026 SDNodeFlags Flags = Op->getFlags();
3027 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3028
3029 if (VT.getScalarType() == MVT::f16) {
3030 // v_exp_f16 (fmul x, log2e)
3031 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3032 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3033
3034 if (VT.isVector())
3035 return SDValue();
3036
3037 // exp(f16 x) ->
3038 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3039
3040 // Nothing in half is a denormal when promoted to f32.
3041 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3042 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3043 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3044 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3045 }
3046
3047 assert(VT == MVT::f32);
3048
3049 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3050 // library behavior. Also, is known-not-daz source sufficient?
3051 if (allowApproxFunc(DAG, Flags)) {
3052 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3053 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3054 }
3055
3056 // Algorithm:
3057 //
3058 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3059 //
3060 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3061 // n = 64*m + j, 0 <= j < 64
3062 //
3063 // e^x = 2^((64*m + j + f)/64)
3064 // = (2^m) * (2^(j/64)) * 2^(f/64)
3065 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3066 //
3067 // f = x*(64/ln(2)) - n
3068 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3069 //
3070 // e^x = (2^m) * (2^(j/64)) * e^r
3071 //
3072 // (2^(j/64)) is precomputed
3073 //
3074 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3075 // e^r = 1 + q
3076 //
3077 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3078 //
3079 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
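 // In the code below the 2^(j/64) table is folded into the hardware exp2:
 // PH + PL is a high/low split of x*log2(e) (or x*log2(10) for exp10),
 // E = roundeven(PH) plays the role of the integer part, and the result is
 // ldexp(v_exp_f32((PH - E) + PL), E), followed by explicit underflow and
 // overflow clamping.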
3080 SDNodeFlags FlagsNoContract = Flags;
3081 FlagsNoContract.setAllowContract(false);
3082
3083 SDValue PH, PL;
3084 if (Subtarget->hasFastFMAF32()) {
3085 const float c_exp = numbers::log2ef;
3086 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3087 const float c_exp10 = 0x1.a934f0p+1f;
3088 const float cc_exp10 = 0x1.2f346ep-24f;
3089
3090 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3091 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3092
3093 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3094 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3095 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3096 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3097 } else {
3098 const float ch_exp = 0x1.714000p+0f;
3099 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3100
3101 const float ch_exp10 = 0x1.a92000p+1f;
3102 const float cl_exp10 = 0x1.4f0978p-11f;
3103
3104 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3105 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3106
3107 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3108 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3109 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3110 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3111 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3112
3113 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3114
3115 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3116 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3117 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3118 }
3119
3120 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3121
3122 // It is unsafe to contract this fsub into the PH multiply.
3123 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3124
3125 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3126 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3127 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3128
3129 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3130
3131 SDValue UnderflowCheckConst =
3132 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3133
3134 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3135 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3136 SDValue Underflow =
3137 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3138
3139 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3140 const auto &Options = getTargetMachine().Options;
3141
3142 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3143 SDValue OverflowCheckConst =
3144 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3145 SDValue Overflow =
3146 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3147 SDValue Inf =
3148 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3149 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3150 }
3151
3152 return R;
3153}
3154
3155static bool isCtlzOpc(unsigned Opc) {
3156 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3157}
3158
3159static bool isCttzOpc(unsigned Opc) {
3160 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3161}
3162
3163SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3164 SelectionDAG &DAG) const {
3165 auto SL = SDLoc(Op);
3166 auto Opc = Op.getOpcode();
3167 auto Arg = Op.getOperand(0u);
3168 auto ResultVT = Op.getValueType();
3169
3170 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3171 return {};
3172
3173 assert(isCtlzOpc(Opc));
3174 assert(ResultVT == Arg.getValueType());
3175
3176 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3177 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3178 SDValue NewOp;
3179
3180 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3181 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3182 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3183 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3184 } else {
3185 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3186 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3187 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3188 }
3189
3190 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3191}
3192
3193SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3194 SDLoc SL(Op);
3195 SDValue Src = Op.getOperand(0);
3196
3197 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3198 bool Ctlz = isCtlzOpc(Op.getOpcode());
3199 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3200
3201 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3202 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3203 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3204
3205 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3206 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3207 // (cttz hi:lo) -> (umin (ffbl src), 32)
3208 // (ctlz_zero_undef src) -> (ffbh src)
3209 // (cttz_zero_undef src) -> (ffbl src)
3210
3211 // The 64-bit scalar version produces a 32-bit result:
3212 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3213 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3214 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3215 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3216 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3217 if (!ZeroUndef) {
3218 const SDValue ConstVal = DAG.getConstant(
3219 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3220 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3221 }
3222 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3223 }
3224
3225 SDValue Lo, Hi;
3226 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3227
3228 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3229 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3230
3231 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3232 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3233 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3234 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
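 // For example, when the high word is zero (and the input is not), ffbh
 // returns -1 (all ones) for it, so the uaddsat(ffbh(lo), 32) term, at most 63
 // here, wins the umin and yields 32 + ctlz(lo); when both words are zero the
 // saturating add also produces all ones and the final umin with 64 gives the
 // defined result.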
3235
3236 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3237 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3238 if (Ctlz)
3239 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3240 else
3241 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3242
3243 SDValue NewOpr;
3244 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3245 if (!ZeroUndef) {
3246 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3247 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3248 }
3249
3250 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3251}
3252
3253SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3254 bool Signed) const {
3255 // The regular method converting a 64-bit integer to float roughly consists of
3256 // 2 steps: normalization and rounding. In fact, after normalization, the
3257 // conversion from a 64-bit integer to a float is essentially the same as the
3258 // one from a 32-bit integer. The only difference is that it has more
3259 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3260 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3261 // converted into the correct float number. The basic steps for the unsigned
3262 // conversion are illustrated in the following pseudo code:
3263 //
3264 // f32 uitofp(i64 u) {
3265 // i32 hi, lo = split(u);
3266 // // Only count the leading zeros in hi as we have native support of the
3267 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3268 // // reduced to a 32-bit one automatically.
3269 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3270 // u <<= shamt;
3271 // hi, lo = split(u);
3272 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3273 // // convert it as a 32-bit integer and scale the result back.
3274 // return uitofp(hi) * 2^(32 - shamt);
3275 // }
3276 //
3277 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3278 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3279 // converted instead, followed by negation based on its sign bit.
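A scalar model of the unsigned pseudo code above, for illustration only (the helper name is hypothetical; a compiler builtin and ldexpf stand in for the hardware ctlz and ldexp):

#include <cmath>
#include <cstdint>

static float uitofp_i64(uint64_t u) {
  uint32_t hi = (uint32_t)(u >> 32);
  unsigned shamt = hi ? __builtin_clz(hi) : 32; // only count leading zeros in hi
  u <<= shamt;
  hi = (uint32_t)(u >> 32);
  uint32_t lo = (uint32_t)u;
  hi |= (lo != 0) ? 1 : 0;                    // sticky bit so the f32 rounding is correct
  return ldexpf((float)hi, 32 - (int)shamt);  // scale the 32-bit conversion back
}

For u = 2^40 + 5, for instance, shamt = 23, the shifted high word is 2^31 with a nonzero low word, so hi becomes 2^31 + 1, which converts to 2^31 in f32 and scales back to 2^40, the correctly rounded result.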
3280
3281 SDLoc SL(Op);
3282 SDValue Src = Op.getOperand(0);
3283
3284 SDValue Lo, Hi;
3285 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3286 SDValue Sign;
3287 SDValue ShAmt;
3288 if (Signed && Subtarget->isGCN()) {
3289 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3290 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3291 // account. That is, the maximal shift is
3292 // - 32 if Lo and Hi have opposite signs;
3293 // - 33 if Lo and Hi have the same sign.
3294 //
3295 // Or, MaxShAmt = 33 + OppositeSign, where
3296 //
3297 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3298 // - -1 if Lo and Hi have opposite signs; and
3299 // - 0 otherwise.
3300 //
3301 // All in all, ShAmt is calculated as
3302 //
3303 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3304 //
3305 // or
3306 //
3307 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3308 //
3309 // to reduce the critical path.
3310 SDValue OppositeSign = DAG.getNode(
3311 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3312 DAG.getConstant(31, SL, MVT::i32));
3313 SDValue MaxShAmt =
3314 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3315 OppositeSign);
3316 // Count the leading sign bits.
3317 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3318 // Different from unsigned conversion, the shift should be one bit less to
3319 // preserve the sign bit.
3320 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3321 DAG.getConstant(1, SL, MVT::i32));
3322 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3323 } else {
3324 if (Signed) {
3325 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3326 // absolute value first.
3327 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3328 DAG.getConstant(63, SL, MVT::i64));
3329 SDValue Abs =
3330 DAG.getNode(ISD::XOR, SL, MVT::i64,
3331 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3332 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3333 }
3334 // Count the leading zeros.
3335 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3336 // The shift amount for signed integers is [0, 32].
3337 }
3338 // Normalize the given 64-bit integer.
3339 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3340 // Split it again.
3341 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3342 // Calculate the adjust bit for rounding.
3343 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3344 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3345 DAG.getConstant(1, SL, MVT::i32), Lo);
3346 // Get the 32-bit normalized integer.
3347 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3348 // Convert the normalized 32-bit integer into f32.
3349 unsigned Opc =
3350 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3351 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3352
3353 // Finally, need to scale back the converted floating number as the original
3354 // 64-bit integer is converted as a 32-bit one.
3355 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3356 ShAmt);
3357 // On GCN, use LDEXP directly.
3358 if (Subtarget->isGCN())
3359 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3360
3361 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3362 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3363 // exponent is enough to avoid overflowing into the sign bit.
3364 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3365 DAG.getConstant(23, SL, MVT::i32));
3366 SDValue IVal =
3367 DAG.getNode(ISD::ADD, SL, MVT::i32,
3368 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3369 if (Signed) {
3370 // Set the sign bit.
3371 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3372 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3373 DAG.getConstant(31, SL, MVT::i32));
3374 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3375 }
3376 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3377}
3378
3379SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3380 bool Signed) const {
3381 SDLoc SL(Op);
3382 SDValue Src = Op.getOperand(0);
3383
3384 SDValue Lo, Hi;
3385 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3386
3387 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3388 SL, MVT::f64, Hi);
3389
3390 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3391
3392 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3393 DAG.getConstant(32, SL, MVT::i32));
3394 // TODO: Should this propagate fast-math-flags?
3395 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3396}
3397
3398SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3399 SelectionDAG &DAG) const {
3400 // TODO: Factor out code common with LowerSINT_TO_FP.
3401 EVT DestVT = Op.getValueType();
3402 SDValue Src = Op.getOperand(0);
3403 EVT SrcVT = Src.getValueType();
3404
3405 if (SrcVT == MVT::i16) {
3406 if (DestVT == MVT::f16)
3407 return Op;
3408 SDLoc DL(Op);
3409
3410 // Promote src to i32
3411 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3412 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3413 }
3414
3415 if (DestVT == MVT::bf16) {
3416 SDLoc SL(Op);
3417 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3418 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3419 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3420 }
3421
3422 if (SrcVT != MVT::i64)
3423 return Op;
3424
3425 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3426 SDLoc DL(Op);
3427
3428 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3429 SDValue FPRoundFlag =
3430 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3431 SDValue FPRound =
3432 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3433
3434 return FPRound;
3435 }
3436
3437 if (DestVT == MVT::f32)
3438 return LowerINT_TO_FP32(Op, DAG, false);
3439
3440 assert(DestVT == MVT::f64);
3441 return LowerINT_TO_FP64(Op, DAG, false);
3442}
3443
3444 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3445                                               SelectionDAG &DAG) const {
3446 EVT DestVT = Op.getValueType();
3447
3448 SDValue Src = Op.getOperand(0);
3449 EVT SrcVT = Src.getValueType();
3450
3451 if (SrcVT == MVT::i16) {
3452 if (DestVT == MVT::f16)
3453 return Op;
3454
3455 SDLoc DL(Op);
3456 // Promote src to i32
3457 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3458 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3459 }
3460
3461 if (DestVT == MVT::bf16) {
3462 SDLoc SL(Op);
3463 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3464 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3465 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3466 }
3467
3468 if (SrcVT != MVT::i64)
3469 return Op;
3470
3471 // TODO: Factor out code common with LowerUINT_TO_FP.
3472
3473 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3474 SDLoc DL(Op);
3475 SDValue Src = Op.getOperand(0);
3476
3477 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3478 SDValue FPRoundFlag =
3479 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3480 SDValue FPRound =
3481 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3482
3483 return FPRound;
3484 }
3485
3486 if (DestVT == MVT::f32)
3487 return LowerINT_TO_FP32(Op, DAG, true);
3488
3489 assert(DestVT == MVT::f64);
3490 return LowerINT_TO_FP64(Op, DAG, true);
3491}
3492
3493 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3494                                                bool Signed) const {
3495 SDLoc SL(Op);
3496
3497 SDValue Src = Op.getOperand(0);
3498 EVT SrcVT = Src.getValueType();
3499
3500 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3501
3502 // The basic idea of converting a floating point number into a pair of 32-bit
3503 // integers is illustrated as follows:
3504 //
3505 // tf := trunc(val);
3506 // hif := floor(tf * 2^-32);
3507 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3508 // hi := fptoi(hif);
3509 // lo := fptoi(lof);
3510 //
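 // Worked example with an f64 input: val = 8589934600.0 (= 2^33 + 8) gives
 // tf = 8589934600.0, hif = floor(tf * 2^-32) = 2.0,
 // lof = fma(hif, -2^32, tf) = 8.0, so hi = 2 and lo = 8, i.e. the i64 value
 // 0x0000000200000008 = 8589934600.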
3511 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3512 SDValue Sign;
3513 if (Signed && SrcVT == MVT::f32) {
3514     // However, a 32-bit floating-point number has only a 23-bit mantissa,
3515     // which is not enough to hold all the significant bits of `lof` when val
3516     // is negative. To avoid losing precision, we take the absolute value
3517     // after truncating and flip the result back based on the original
3518     // signedness.
3519 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3520 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3521 DAG.getConstant(31, SL, MVT::i32));
3522 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3523 }
3524
3525 SDValue K0, K1;
3526 if (SrcVT == MVT::f64) {
3527 K0 = DAG.getConstantFP(
3528 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3529 SrcVT);
3530 K1 = DAG.getConstantFP(
3531 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3532 SrcVT);
3533 } else {
3534 K0 = DAG.getConstantFP(
3535 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3536 K1 = DAG.getConstantFP(
3537 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3538 }
3539 // TODO: Should this propagate fast-math-flags?
3540 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3541
3542 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3543
3544 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3545
3546 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3547                                                          : ISD::FP_TO_UINT,
3548                            SL, MVT::i32, FloorMul);
3549 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3550
3551 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3552 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3553
3554 if (Signed && SrcVT == MVT::f32) {
3555 assert(Sign);
3556     // Flip the result based on the sign mask, which is either all 0s or all 1s.
3557 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3558 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3559 // r := xor(r, sign) - sign;
3560 Result =
3561 DAG.getNode(ISD::SUB, SL, MVT::i64,
3562 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
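 // (r ^ s) - s negates r when s is all ones and is a no-op when s is zero;
 // e.g. with Result = 5 and Sign = -1, (5 ^ -1) - (-1) = -6 + 1 = -5.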
3563 }
3564
3565 return Result;
3566}
3567
3568 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3569   SDLoc DL(Op);
3570 SDValue N0 = Op.getOperand(0);
3571
3572 // Convert to target node to get known bits
3573 if (N0.getValueType() == MVT::f32)
3574 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3575
3576 if (getTargetMachine().Options.UnsafeFPMath) {
3577 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3578 return SDValue();
3579 }
3580
3581 assert(N0.getSimpleValueType() == MVT::f64);
3582
3583 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3584 const unsigned ExpMask = 0x7ff;
3585 const unsigned ExpBiasf64 = 1023;
3586 const unsigned ExpBiasf16 = 15;
3587 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3588 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3589 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3590 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3591 DAG.getConstant(32, DL, MVT::i64));
3592 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3593 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3594 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3595 DAG.getConstant(20, DL, MVT::i64));
3596 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3597 DAG.getConstant(ExpMask, DL, MVT::i32));
3598 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3599 // add the f16 bias (15) to get the biased exponent for the f16 format.
3600 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3601 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
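 // For example, 1.0 has f64 biased exponent 1023; 1023 - 1023 + 15 = 15 is
 // the f16 biased exponent of 1.0.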
3602
3603 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3604 DAG.getConstant(8, DL, MVT::i32));
3605 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3606 DAG.getConstant(0xffe, DL, MVT::i32));
3607
3608 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3609 DAG.getConstant(0x1ff, DL, MVT::i32));
3610 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3611
3612 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3613 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3614
3615 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3616 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3617 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3618 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
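 // 0x7c00 is the all-ones f16 exponent (Inf/NaN); if the source mantissa bits
 // in M are nonzero the input was a NaN, so 0x0200 is also set to produce a
 // quiet NaN rather than infinity. I is selected further below when E == 1039,
 // i.e. when the f64 exponent field was all ones (2047 - 1023 + 15 = 1039).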
3619
3620 // N = M | (E << 12);
3621 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3622 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3623 DAG.getConstant(12, DL, MVT::i32)));
3624
3625 // B = clamp(1-E, 0, 13);
3626 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3627 One, E);
3628 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3629 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3630 DAG.getConstant(13, DL, MVT::i32));
3631
3632 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3633 DAG.getConstant(0x1000, DL, MVT::i32));
3634
3635 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3636 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3637 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3638 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
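 // Denormal path (used below when E < 1): SigSetHigh is the significand with
 // the implicit leading one (0x1000) made explicit, D is that value shifted
 // right by B, and D1 is a sticky bit recording whether any bits were shifted
 // out (D0 != SigSetHigh), so they still participate in rounding.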
3639
3640 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3641 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3642 DAG.getConstant(0x7, DL, MVT::i32));
3643 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3644 DAG.getConstant(2, DL, MVT::i32));
3645 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3646 One, Zero, ISD::SETEQ);
3647 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3648 One, Zero, ISD::SETGT);
3649 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3650 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
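 // V carries the result with two extra low bits (plus the sticky information
 // folded in earlier). VLow3 values 3, 6 and 7 are exactly the cases where
 // round-to-nearest-even must add one after the shift: above the halfway point
 // (0b011, 0b111) or an exact tie with an odd result bit (0b110).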
3651
3652 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3653 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3654 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3655 I, V, ISD::SETEQ);
3656
3657 // Extract the sign bit.
3658 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3659 DAG.getConstant(16, DL, MVT::i32));
3660 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3661 DAG.getConstant(0x8000, DL, MVT::i32));
3662
3663 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3664 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3665}
3666
3667 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3668                                              SelectionDAG &DAG) const {
3669 SDValue Src = Op.getOperand(0);
3670 unsigned OpOpcode = Op.getOpcode();
3671 EVT SrcVT = Src.getValueType();
3672 EVT DestVT = Op.getValueType();
3673
3674 // Will be selected natively
3675 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3676 return Op;
3677
3678 if (SrcVT == MVT::bf16) {
3679 SDLoc DL(Op);
3680 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3681 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3682 }
3683
3684 // Promote i16 to i32
3685 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3686 SDLoc DL(Op);
3687
3688 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3689 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3690 }
3691
3692 if (DestVT != MVT::i64)
3693 return Op;
3694
3695 if (SrcVT == MVT::f16 ||
3696 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3697 SDLoc DL(Op);
3698
3699 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3700 unsigned Ext =
3701         OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3702     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3703 }
3704
3705 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3706 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3707
3708 return SDValue();
3709}
3710
3711 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3712                                                      SelectionDAG &DAG) const {
3713 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3714 MVT VT = Op.getSimpleValueType();
3715 MVT ScalarVT = VT.getScalarType();
3716
3717 assert(VT.isVector());
3718
3719 SDValue Src = Op.getOperand(0);
3720 SDLoc DL(Op);
3721
3722 // TODO: Don't scalarize on Evergreen?
3723 unsigned NElts = VT.getVectorNumElements();
3724   SmallVector<SDValue, 8> Args;
3725   DAG.ExtractVectorElements(Src, Args, 0, NElts);
3726
3727 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3728 for (unsigned I = 0; I < NElts; ++I)
3729 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3730
3731 return DAG.getBuildVector(VT, DL, Args);
3732}
3733
3734//===----------------------------------------------------------------------===//
3735// Custom DAG optimizations
3736//===----------------------------------------------------------------------===//
3737
3738static bool isU24(SDValue Op, SelectionDAG &DAG) {
3739 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3740}
3741
3742static bool isI24(SDValue Op, SelectionDAG &DAG) {
3743 EVT VT = Op.getValueType();
3744   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3745                                      // as unsigned 24-bit values.
3746          AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3747 }
3748
3749 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3750                                             DAGCombinerInfo &DCI) const {
3751   SelectionDAG &DAG = DCI.DAG;
3752 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3753 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3754
3755 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3756 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3757 unsigned NewOpcode = Node24->getOpcode();
3758 if (IsIntrin) {
3759 unsigned IID = Node24->getConstantOperandVal(0);
3760 switch (IID) {
3761 case Intrinsic::amdgcn_mul_i24:
3762 NewOpcode = AMDGPUISD::MUL_I24;
3763 break;
3764 case Intrinsic::amdgcn_mul_u24:
3765 NewOpcode = AMDGPUISD::MUL_U24;
3766 break;
3767 case Intrinsic::amdgcn_mulhi_i24:
3768 NewOpcode = AMDGPUISD::MULHI_I24;
3769 break;
3770 case Intrinsic::amdgcn_mulhi_u24:
3771 NewOpcode = AMDGPUISD::MULHI_U24;
3772 break;
3773 default:
3774 llvm_unreachable("Expected 24-bit mul intrinsic");
3775 }
3776 }
3777
3778 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3779
3780 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3781 // the operands to have other uses, but will only perform simplifications that
3782 // involve bypassing some nodes for this user.
3783 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3784 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3785 if (DemandedLHS || DemandedRHS)
3786 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3787 DemandedLHS ? DemandedLHS : LHS,
3788 DemandedRHS ? DemandedRHS : RHS);
3789
3790 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3791 // operands if this node is the only user.
3792 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3793 return SDValue(Node24, 0);
3794 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3795 return SDValue(Node24, 0);
3796
3797 return SDValue();
3798}
3799
3800template <typename IntTy>
3801 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3802                                uint32_t Width, const SDLoc &DL) {
3803 if (Width + Offset < 32) {
3804 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3805 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3806 if constexpr (std::is_signed_v<IntTy>) {
3807 return DAG.getSignedConstant(Result, DL, MVT::i32);
3808 } else {
3809 return DAG.getConstant(Result, DL, MVT::i32);
3810 }
3811 }
3812
3813 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3814}
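// Worked example of the fold above: extracting Width = 8 bits at Offset = 8
// from Src0 = 0xABCD1234 computes Shl = 0xABCD1234 << 16 = 0x12340000; a
// logical shift right by 24 then yields 0x12, the byte at bits [8, 16). With a
// signed IntTy the arithmetic shift sign-extends instead when the top bit of
// the extracted field is set.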
3815
3816static bool hasVolatileUser(SDNode *Val) {
3817 for (SDNode *U : Val->users()) {
3818 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3819 if (M->isVolatile())
3820 return true;
3821 }
3822 }
3823
3824 return false;
3825}
3826
3827 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3828   // i32 vectors are the canonical memory type.
3829 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3830 return false;
3831
3832 if (!VT.isByteSized())
3833 return false;
3834
3835 unsigned Size = VT.getStoreSize();
3836
3837 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3838 return false;
3839
3840 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3841 return false;
3842
3843 return true;
3844}
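// For example, a v4i8 load (store size 4, but a vector whose scalar type is
// not i32 and which is typically not legal on AMDGPU) passes these checks and
// is a candidate for combining into an i32 load, while plain i8/i16/i32
// scalar loads are left alone.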
3845
3846 // Replace a load of an illegal type with a load of a bitcast to a friendlier
3847 // type.
3848 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3849                                                  DAGCombinerInfo &DCI) const {
3850 if (!DCI.isBeforeLegalize())
3851 return SDValue();
3852
3853 LoadSDNode *LN = cast<LoadSDNode>(N);
3854 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3855 return SDValue();
3856
3857 SDLoc SL(N);
3858 SelectionDAG &DAG = DCI.DAG;
3859 EVT VT = LN->getMemoryVT();
3860
3861 unsigned Size = VT.getStoreSize();
3862 Align Alignment = LN->getAlign();
3863 if (Alignment < Size && isTypeLegal(VT)) {
3864 unsigned IsFast;
3865 unsigned AS = LN->getAddressSpace();
3866
3867 // Expand unaligned loads earlier than legalization. Due to visitation order
3868 // problems during legalization, the emitted instructions to pack and unpack
3869 // the bytes again are not eliminated in the case of an unaligned copy.
3870     if (!allowsMisalignedMemoryAccesses(
3871             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3872 if (VT.isVector())
3873 return SplitVectorLoad(SDValue(LN, 0), DAG);
3874
3875 SDValue Ops[2];